Chunx
Mix.install([
{:chunx, github: "preciz/chunx"},
{:ollama, "~> 0.8"},
{:req, "~> 0.5"}
])
Prepare text
# Download the sample text over HTTP and bind the response body to `text`.
# `Req.get!` raises on transport failure, so no error tuple is handled here.
%{body: text} =
  Req.get!(
    url:
      "https://raw.githubusercontent.com/RyoWakabayashi/elixir-learning/main/livebooks/bumblebee/colab/momotaro.txt"
  )
Prepare tokenizer
# Load the tokenizer definition from a local JSON file. The `{:ok, _}` match
# raises a MatchError if loading does not succeed.
# NOTE(review): nothing visible in this notebook creates this file — confirm it
# is provisioned at this path before the cell runs.
{:ok, tokenizer} =
  "/tmp/ruri_base/tokenizer.json"
  |> Tokenizers.Tokenizer.from_file()
Token-based Chunking
# Chunk `text` with the token-based chunker, using `tokenizer` and a
# `chunk_size` of 512 (presumably a per-chunk token limit — confirm in the
# Chunx docs). The match raises unless the chunker returns `{:ok, chunks}`.
{:ok, token_chunks} =
  text
  |> Chunx.Chunker.Token.chunk(tokenizer, chunk_size: 512)
Word-based Chunking
# Chunk `text` with the word-based chunker, using the same tokenizer and
# `chunk_size` as the token-based run so the two results can be compared.
# The match raises unless the chunker returns `{:ok, chunks}`.
{:ok, word_chunks} =
  text
  |> Chunx.Chunker.Word.chunk(tokenizer, chunk_size: 512)
Sentence-based Chunking
# Chunk `text` with the sentence-based chunker, treating the Japanese full stop
# and the newline character as sentence delimiters.
#
# The delimiters are written as an explicit list rather than with `~w`:
# `~w(。 \\n)` produces the two-character string backslash + "n" (not a
# newline), and `~w` cannot express a real newline at all because it splits on
# whitespace after unescaping.
# The match raises unless the chunker returns `{:ok, chunks}`.
{:ok, sentence_chunks} =
  Chunx.Chunker.Sentence.chunk(
    text,
    tokenizer,
    delimiters: ["。", "\n"]
  )
Semantic Chunking
# Build the Ollama API client. The URL targets `host.docker.internal`, so this
# appears to assume the notebook runs in a container with Ollama on the host —
# adjust `base_url` otherwise. `receive_timeout` is 300_000 (presumably
# milliseconds, i.e. 5 minutes — confirm in the Req/Ollama docs).
client =
  Ollama.init(
    base_url: "http://host.docker.internal:11434/api",
    receive_timeout: 300_000
  )

# Ask the Ollama server to pull the embedding model used below. The result is
# not matched, so a failed pull is not detected at this point.
Ollama.pull_model(
  client,
  name: "kun432/cl-nagoya-ruri-base"
)
# Embedding callback passed to the semantic chunker.
#
# Takes a list of texts and returns a list of Nx tensors, one per text, in the
# same order. Each text is embedded with a separate Ollama request, and the
# first vector of the "embeddings" field in the response is used.
#
# Raises a RuntimeError carrying the underlying reason when a request fails or
# the response has no embedding, instead of failing later with an unrelated
# error on a discarded `{:error, _}` tuple.
embedding_fn = fn texts ->
  Enum.map(texts, fn text ->
    # The "文章: " prefix is sent as part of the input (presumably the document
    # prefix this model expects — confirm against the model card).
    response =
      Ollama.embed(
        client,
        model: "kun432/cl-nagoya-ruri-base",
        input: "文章: #{text}"
      )

    case response do
      {:ok, %{"embeddings" => [embedding | _]}} ->
        Nx.tensor(embedding)

      {:error, reason} ->
        raise "Ollama embedding request failed: #{inspect(reason)}"

      other ->
        raise "Unexpected Ollama embedding response: #{inspect(other)}"
    end
  end)
end
# Chunk `text` with the semantic chunker, using `embedding_fn` to embed the
# text and the Japanese full stop and newline character as sentence delimiters.
#
# The delimiters are written as an explicit list rather than with `~w`:
# `~w(。 \\n)` produces the two-character string backslash + "n" (not a
# newline), and `~w` cannot express a real newline at all because it splits on
# whitespace after unescaping.
# The result is left unbound so it is shown as the cell's output.
Chunx.Chunker.Semantic.chunk(
  text,
  tokenizer,
  embedding_fn,
  delimiters: ["。", "\n"]
)