Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Chunx

livebooks/ollama/chunx.livemd

Chunx

# Install dependencies for this notebook:
# - chunx:  text-chunking library (token / word / sentence / semantic strategies)
# - ollama: client for a local Ollama server (used to compute embeddings)
# - req:    HTTP client (used to download the sample text)
Mix.install([
  {:chunx, github: "preciz/chunx"},
  {:ollama, "~> 0.8"},
  {:req, "~> 0.5"}
])

Prepare text

# Download the sample Japanese text (Momotaro) and bind the response body to `text`.
source_url =
  "https://raw.githubusercontent.com/RyoWakabayashi/elixir-learning/main/livebooks/bumblebee/colab/momotaro.txt"

%{body: text} = Req.get!(source_url)

Prepare tokenizer

# Load the pre-trained tokenizer from the local file system.
# NOTE(review): assumes /tmp/ruri_base/tokenizer.json was placed there beforehand — confirm.
tokenizer_path = "/tmp/ruri_base/tokenizer.json"
{:ok, tokenizer} = Tokenizers.Tokenizer.from_file(tokenizer_path)

Token-based Chunking

# Split the text into chunks of at most 512 tokens each.
token_opts = [chunk_size: 512]
{:ok, token_chunks} = Chunx.Chunker.Token.chunk(text, tokenizer, token_opts)

Word-based Chunking

# Split the text into word-based chunks, capped at 512 tokens per chunk.
word_opts = [chunk_size: 512]
{:ok, word_chunks} = Chunx.Chunker.Word.chunk(text, tokenizer, word_opts)

Sentence-based Chunking

# Split the text into sentence-based chunks.
# Fix: the delimiters were written as ~w(。 \\n), but in a lowercase ~w sigil
# `\\n` yields the two-character string "\n" (backslash + n), not a newline —
# a literal newline cannot be expressed as a ~w word at all. Use an explicit
# list so chunks are split at the ideographic full stop AND at real newlines.
{:ok, sentence_chunks} =
  Chunx.Chunker.Sentence.chunk(
    text,
    tokenizer,
    delimiters: ["。", "\n"]
  )

Semantic Chunking

# Ollama client pointing at the host machine's Ollama HTTP API.
# Long receive timeout: computing embeddings can be slow on CPU.
client = Ollama.init(base_url: "http://host.docker.internal:11434/api", receive_timeout: 300_000)

# Make sure the embedding model is available locally.
# NOTE(review): presumably a no-op when the model is already pulled — confirm.
Ollama.pull_model(client, name: "kun432/cl-nagoya-ruri-base")

# Embedding function consumed by the semantic chunker: maps a list of texts
# to a list of Nx tensors (one embedding per text).
# NOTE(review): the "文章: " prefix appears to be the ruri model's required
# passage marker — confirm against the model card.
embedding_fn = fn texts ->
  Enum.map(texts, fn text ->
    # Fix: match the success tuple explicitly instead of the opaque
    # `elem(1) |> Map.get |> hd` chain, so a failed request raises a
    # clear MatchError here rather than an obscure error downstream.
    {:ok, %{"embeddings" => [embedding | _]}} =
      Ollama.embed(
        client,
        model: "kun432/cl-nagoya-ruri-base",
        input: "文章: #{text}"
      )

    Nx.tensor(embedding)
  end)
end
# Semantic chunking: group sentences into chunks by embedding similarity.
# Fix: the delimiters were written as ~w(。 \\n), which yields the literal
# two-character string "\n" (backslash + n) instead of a newline; use an
# explicit list so splitting happens at "。" and at real newlines.
Chunx.Chunker.Semantic.chunk(
  text,
  tokenizer,
  embedding_fn,
  delimiters: ["。", "\n"]
)