Arcana 🔮📚 Tutorial
Mix.install([
{:arcana, path: Path.expand("../"), force: true},
{:kino, "~> 0.14"},
{:kino_vega_lite, "~> 0.1"}
])
alias VegaLite, as: Vl
Intro
A hands-on guide to building RAG (Retrieval Augmented Generation) applications with Arcana. We’ll use the in-memory vector store, so no database setup is required.
What you’ll learn:
- Ingesting documents into a vector store
- Semantic, fulltext, and hybrid search
- Organizing documents into collections
- Building agentic RAG pipelines
1. Getting Started: In-Memory Vector Store
Arcana supports two storage backends: pgvector (PostgreSQL) and memory (HNSWLib). The memory backend is perfect for learning and experimentation.
Start the Memory Vector Store
# Start the in-memory vector store
{:ok, memory_pid} = Arcana.VectorStore.Memory.start_link(name: nil)
# We'll pass this pid to all operations
IO.puts("Memory vector store started: #{inspect(memory_pid)}")
Start Local Embeddings
Arcana uses Bumblebee to generate embeddings locally with bge-small-en-v1.5. This creates 384-dimensional vectors.
# Start the local embedding model
# This downloads the model on first run (~100MB)
{:ok, _} = Arcana.Embedder.Local.start_link([])
# Verify it's working
{:ok, embedding} = Arcana.Embedder.embed(Arcana.embedder(), "Hello, world!")
IO.puts("Embedding dimensions: #{length(embedding)}")
2. Storing and Searching Vectors
Let’s store some sample documents about programming languages.
Store Documents
alias Arcana.VectorStore
# Sample documents about programming languages
docs = [
%{
id: "elixir-intro",
text: "Elixir is a dynamic, functional language for building scalable applications. It runs on the Erlang VM (BEAM) and is known for fault-tolerance and concurrency."
},
%{
id: "elixir-syntax",
text: "Elixir uses pattern matching extensively. Functions can have multiple clauses that match different patterns. The pipe operator |> chains function calls elegantly."
},
%{
id: "erlang-intro",
text: "Erlang was designed for telecom systems requiring high availability. It pioneered the actor model with lightweight processes and message passing."
},
%{
id: "erlang-otp",
text: "OTP (Open Telecom Platform) provides behaviors like GenServer, Supervisor, and Application. These building blocks enable fault-tolerant system design."
},
%{
id: "python-intro",
text: "Python is a high-level, interpreted language emphasizing readability. It's popular for data science, web development, and scripting."
},
%{
id: "rust-intro",
text: "Rust provides memory safety without garbage collection through its ownership system. It's used for systems programming where performance is critical."
}
]
# Store each document
for doc <- docs do
{:ok, embedding} = Arcana.Embedder.embed(Arcana.embedder(), doc.text)
:ok = VectorStore.store(
"languages", # collection name
doc.id, # unique id
embedding, # vector embedding
%{text: doc.text}, # metadata
vector_store: {:memory, pid: memory_pid}
)
end
IO.puts("Stored #{length(docs)} documents in 'languages' collection")
Semantic Search
Semantic search finds documents with similar meaning, not just matching keywords.
# Search for documents about concurrent programming
query = "concurrent programming with message passing"
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), query)
results = VectorStore.search(
"languages",
query_embedding,
vector_store: {:memory, pid: memory_pid},
limit: 3
)
IO.puts("Query: #{query}\n")
IO.puts("Results:")
for result <- results do
IO.puts(" [#{Float.round(result.score, 3)}] #{result.id}")
IO.puts(" #{String.slice(result.metadata.text, 0, 80)}...")
end
Notice how it finds Elixir and Erlang documents even though we searched for “concurrent” and “message passing” - the model understands these concepts are related.
Fulltext Search
Fulltext search matches exact keywords using TF-IDF-like scoring.
# Search for exact term "pattern matching"
results = VectorStore.search_text(
"languages",
"pattern matching",
vector_store: {:memory, pid: memory_pid},
limit: 3
)
IO.puts("Fulltext search: 'pattern matching'\n")
IO.puts("Results:")
for result <- results do
IO.puts(" [#{Float.round(result.score, 3)}] #{result.id}")
IO.puts(" #{String.slice(result.metadata.text, 0, 80)}...")
end
This only returns documents containing the exact terms “pattern” and “matching”.
3. Search Mode Comparison
Let’s visualize how different search modes perform on the same query.
# Compare search modes
test_queries = [
"functional programming language",
"memory safety",
"fault tolerant systems",
"BEAM virtual machine"
]
comparison_data =
for query <- test_queries do
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), query)
# Semantic search
semantic = VectorStore.search("languages", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 1)
# Fulltext search
fulltext = VectorStore.search_text("languages", query,
vector_store: {:memory, pid: memory_pid}, limit: 1)
%{
query: query,
semantic_match: if(semantic != [], do: hd(semantic).id, else: "none"),
semantic_score: if(semantic != [], do: hd(semantic).score, else: 0),
fulltext_match: if(fulltext != [], do: hd(fulltext).id, else: "none"),
fulltext_score: if(fulltext != [], do: hd(fulltext).score, else: 0)
}
end
Kino.DataTable.new(comparison_data)
Key insight: Semantic search understands meaning (“BEAM virtual machine” matches Elixir), while fulltext requires exact terms.
4. Working with Collections
Collections let you organize documents into logical groups, like folders.
# Add some web framework documents to a new collection
frameworks = [
%{id: "phoenix", text: "Phoenix is a web framework for Elixir. It uses channels for real-time features and LiveView for server-rendered interactive UIs."},
%{id: "rails", text: "Ruby on Rails pioneered convention over configuration. It includes ActiveRecord ORM and follows the MVC pattern."},
%{id: "django", text: "Django is a Python web framework with batteries included. It has an admin interface, ORM, and authentication built-in."}
]
for doc <- frameworks do
{:ok, embedding} = Arcana.Embedder.embed(Arcana.embedder(), doc.text)
:ok = VectorStore.store("frameworks", doc.id, embedding, %{text: doc.text},
vector_store: {:memory, pid: memory_pid})
end
IO.puts("Stored #{length(frameworks)} documents in 'frameworks' collection")
# Search only in frameworks collection
query = "real-time web applications"
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), query)
framework_results = VectorStore.search("frameworks", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 2)
language_results = VectorStore.search("languages", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 2)
IO.puts("Query: #{query}\n")
IO.puts("From 'frameworks':")
for r <- framework_results, do: IO.puts(" #{r.id}: #{Float.round(r.score, 3)}")
IO.puts("\nFrom 'languages':")
for r <- language_results, do: IO.puts(" #{r.id}: #{Float.round(r.score, 3)}")
5. Visualizing Similarity Scores
Let’s create a heatmap showing how similar each document is to various queries.
queries = ["concurrency", "web framework", "memory management", "functional"]
doc_ids = ["elixir-intro", "erlang-intro", "python-intro", "rust-intro", "phoenix"]
# Get all documents
all_docs = docs ++ [%{id: "phoenix", text: hd(frameworks).text}]
# Calculate similarity matrix
heatmap_data =
for query <- queries,
doc <- all_docs,
doc.id in doc_ids do
{:ok, q_emb} = Arcana.Embedder.embed(Arcana.embedder(), query)
{:ok, d_emb} = Arcana.Embedder.embed(Arcana.embedder(), doc.text)
# Cosine similarity
dot = Enum.zip_with(q_emb, d_emb, &(&1 * &2)) |> Enum.sum()
norm_q = :math.sqrt(Enum.map(q_emb, &(&1 * &1)) |> Enum.sum())
norm_d = :math.sqrt(Enum.map(d_emb, &(&1 * &1)) |> Enum.sum())
similarity = dot / (norm_q * norm_d)
%{query: query, document: doc.id, similarity: Float.round(similarity, 3)}
end
Vl.new(width: 400, height: 200, title: "Query-Document Similarity")
|> Vl.data_from_values(heatmap_data)
|> Vl.mark(:rect)
|> Vl.encode_field(:x, "document", type: :nominal, title: "Document")
|> Vl.encode_field(:y, "query", type: :nominal, title: "Query")
|> Vl.encode_field(:color, "similarity",
type: :quantitative,
scale: [scheme: "blues"],
title: "Similarity"
)
6. Pipeline (Modular RAG)
For complex questions, use Arcana.Pipeline to compose retrieval steps like gating, reasoning, expansion, and reranking. Each step is a behaviour you can replace, and you compose the pipeline explicitly in code (you decide the order).
If you want the LLM to drive the control flow at runtime instead, see Arcana.Loop later in this notebook.
Define an LLM Function
The pipeline needs an LLM to make decisions. Let’s create a simple mock:
# Mock LLM that returns structured responses
# In production, use OpenAI, Anthropic, or another provider
mock_llm = fn prompt ->
cond do
# Gate: decide if retrieval is needed
prompt =~ "needs_retrieval" ->
if prompt =~ "2 + 2" or prompt =~ "basic" do
{:ok, ~s({"needs_retrieval": false, "reasoning": "Basic knowledge"})}
else
{:ok, ~s({"needs_retrieval": true, "reasoning": "Domain-specific"})}
end
prompt =~ "collections" ->
# Select relevant collections
{:ok, ~s({"collections": ["languages", "frameworks"], "reasoning": "Question covers both"})}
prompt =~ "sub_questions" or prompt =~ "decompose" ->
# Decompose into simpler questions
{:ok, ~s({"sub_questions": ["What is Elixir?", "What is Phoenix?"], "reasoning": "Two distinct topics"})}
# Reason: evaluate if results are sufficient
prompt =~ "sufficient" ->
{:ok, ~s({"sufficient": true, "reasoning": "Results contain needed info"})}
prompt =~ "Answer" ->
# Generate answer from context
{:ok, "Based on the context, Elixir is a functional language on BEAM, and Phoenix is its web framework with real-time capabilities."}
true ->
{:ok, "Default response"}
end
end
Simple RAG: Search → Answer
alias Arcana.Pipeline
# First, we need a repo-like interface for the pipeline
# Since we're using memory backend, we'll work directly with VectorStore
# Simple search and answer
ctx =
Pipeline.new("Tell me about Elixir and its web framework",
repo: nil, # Not using database
llm: mock_llm,
limit: 3
)
# Manual search since we're using custom vector store
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), ctx.question)
lang_results = VectorStore.search("languages", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 2)
fw_results = VectorStore.search("frameworks", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 2)
# Combine results
all_results = (lang_results ++ fw_results) |> Enum.map(fn r ->
%{id: r.id, text: r.metadata.text, score: r.score}
end)
IO.puts("Retrieved #{length(all_results)} chunks:")
for r <- all_results do
IO.puts(" - #{r.id} (#{Float.round(r.score, 3)})")
end
Generate Answer with Context
# Build context for the LLM
context_text = all_results
|> Enum.map(& &1.text)
|> Enum.join("\n\n---\n\n")
prompt = """
Answer the question based on the following context.
Question: "Tell me about Elixir and its web framework"
Context:
#{context_text}
"""
{:ok, answer} = mock_llm.(prompt)
IO.puts("Answer:\n#{answer}")
7. Hybrid Search with RRF
Hybrid search combines semantic and fulltext results. The approach differs by backend:
- pgvector backend: Uses a single SQL query with weighted score combination and min-max normalization. Supports semantic_weight and fulltext_weight options.
- Memory backend: Uses two separate queries combined with Reciprocal Rank Fusion (RRF).
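For intuition, here is a rough Elixir sketch of that weighted combination: min-max normalize each score list, then blend the two with the weights. It only illustrates the math; the pgvector backend does this inside a single SQL query, and the result shapes and weight values below are just examples.
# Illustrative only: weighted score fusion with min-max normalization.
# Assumes non-empty result lists shaped like VectorStore.search/search_text results.
min_max = fn scores ->
  {min, max} = Enum.min_max(scores)
  range = if max == min, do: 1.0, else: max - min
  Enum.map(scores, &((&1 - min) / range))
end

weighted_combine = fn semantic, fulltext, semantic_weight, fulltext_weight ->
  sem_ids = Enum.map(semantic, & &1.id)
  ft_ids = Enum.map(fulltext, & &1.id)
  sem_norm = Map.new(Enum.zip(sem_ids, min_max.(Enum.map(semantic, & &1.score))))
  ft_norm = Map.new(Enum.zip(ft_ids, min_max.(Enum.map(fulltext, & &1.score))))

  Enum.uniq(sem_ids ++ ft_ids)
  |> Enum.map(fn id ->
    score =
      semantic_weight * Map.get(sem_norm, id, 0.0) +
        fulltext_weight * Map.get(ft_norm, id, 0.0)

    %{id: id, score: score}
  end)
  |> Enum.sort_by(& &1.score, :desc)
end
# Example call (weights are arbitrary example values):
# weighted_combine.(semantic_results, fulltext_results, 0.7, 0.3)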
Since we’re using the memory backend in this tutorial, let’s implement RRF:
defmodule HybridSearch do
@doc """
Combines semantic and fulltext results using RRF.
RRF score = sum(1 / (k + rank)) where k=60
"""
def search(collection, query, vector_store_opts, limit \\ 5) do
k = 60
# Get query embedding for semantic search
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), query)
# Semantic results
semantic = Arcana.VectorStore.search(collection, query_embedding,
Keyword.merge(vector_store_opts, limit: limit * 2))
# Fulltext results
fulltext = Arcana.VectorStore.search_text(collection, query,
Keyword.merge(vector_store_opts, limit: limit * 2))
# Build rank maps
semantic_ranks = semantic
|> Enum.with_index(1)
|> Map.new(fn {r, rank} -> {r.id, rank} end)
fulltext_ranks = fulltext
|> Enum.with_index(1)
|> Map.new(fn {r, rank} -> {r.id, rank} end)
# Get all unique results
all_results = (semantic ++ fulltext)
|> Enum.uniq_by(& &1.id)
|> Map.new(fn r -> {r.id, r} end)
# Calculate RRF scores
all_results
|> Enum.map(fn {id, result} ->
s_rank = Map.get(semantic_ranks, id, 1000)
f_rank = Map.get(fulltext_ranks, id, 1000)
rrf_score = 1/(k + s_rank) + 1/(k + f_rank)
%{result | score: rrf_score}
end)
|> Enum.sort_by(& &1.score, :desc)
|> Enum.take(limit)
end
end
# Compare hybrid vs semantic only
query = "BEAM concurrency"
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), query)
semantic_only = VectorStore.search("languages", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 3)
hybrid = HybridSearch.search("languages", query,
[vector_store: {:memory, pid: memory_pid}], 3)
IO.puts("Query: '#{query}'\n")
IO.puts("Semantic only:")
for r <- semantic_only, do: IO.puts(" #{r.id}: #{Float.round(r.score, 3)}")
IO.puts("\nHybrid (RRF):")
for r <- hybrid, do: IO.puts(" #{r.id}: #{Float.round(r.score, 4)}")
8. Score Distribution Visualization
# Get all scores for a query
query = "programming language features"
{:ok, query_embedding} = Arcana.Embedder.embed(Arcana.embedder(), query)
all_scores = VectorStore.search("languages", query_embedding,
vector_store: {:memory, pid: memory_pid}, limit: 10)
|> Enum.map(fn r ->
%{document: r.id, score: r.score, type: "semantic"}
end)
fulltext_scores = VectorStore.search_text("languages", query,
vector_store: {:memory, pid: memory_pid}, limit: 10)
|> Enum.map(fn r ->
%{document: r.id, score: r.score, type: "fulltext"}
end)
combined = all_scores ++ fulltext_scores
Vl.new(width: 500, height: 300, title: "Score Comparison: '#{query}'")
|> Vl.data_from_values(combined)
|> Vl.mark(:bar)
|> Vl.encode_field(:x, "document", type: :nominal, axis: [label_angle: -45])
|> Vl.encode_field(:y, "score", type: :quantitative)
|> Vl.encode_field(:color, "type", type: :nominal)
|> Vl.encode_field(:x_offset, "type", type: :nominal)
9. Query Rewriting
Query rewriting cleans up conversational input into clear search queries. This helps when questions come from chatbots or voice interfaces.
How It Works
The Pipeline.rewrite/2 step uses an LLM to remove conversational noise while preserving important terms:
Original: "Hey, I was wondering if you could tell me about Elixir"
Rewritten: "about Elixir"
Original: "So like, can you compare Python and Rust for me?"
Rewritten: "compare Python and Rust"
Example with Mock LLM
# Mock LLM that rewrites queries
rewrite_llm = fn prompt ->
cond do
prompt =~ "Hey" or prompt =~ "wondering" ->
{:ok, "about Elixir"}
prompt =~ "So like" or prompt =~ "for me" ->
{:ok, "compare Python and Rust"}
true ->
{:ok, prompt}
end
end
# Using with Pipeline
# ctx = Pipeline.new("Hey, can you tell me about Elixir?", repo: nil, llm: rewrite_llm)
# |> Pipeline.rewrite()
# ctx.rewritten_query => "about Elixir"
IO.puts("Query rewriting removes conversational noise for better retrieval")
Rewrite vs Expand
| Step | Purpose | Use When |
|---|---|---|
| rewrite/2 | Cleans conversational input | Chatbots, voice interfaces |
| expand/2 | Adds synonyms | Short queries, abbreviations, jargon |
You can combine both (rewrite runs first):
# ctx = Pipeline.new("Hey, tell me about ML", ...)
# |> Pipeline.rewrite() # "Hey, tell me about ML" → "about ML"
# |> Pipeline.expand() # "about ML" → "about ML machine learning..."
# |> Pipeline.search()
10. Query Expansion
Query expansion adds synonyms and related terms to your query, which helps when users search with abbreviations (e.g., “ML” instead of “machine learning”).
How It Works
The Pipeline.expand/2 step uses an LLM to analyze your query and add related terms:
Original: "ML models"
Expanded: "ML machine learning artificial intelligence models algorithms neural networks"
Example with Mock LLM
# Mock LLM that expands queries
expand_llm = fn prompt ->
cond do
prompt =~ "ML" or prompt =~ "machine learning" ->
{:ok, "ML machine learning artificial intelligence models algorithms deep learning neural networks"}
prompt =~ "API" ->
{:ok, "API application programming interface REST GraphQL endpoints HTTP requests"}
true ->
{:ok, prompt}
end
end
# Using with Pipeline
alias Arcana.Pipeline
# First ingest a document about ML
{:ok, _} = Arcana.ingest(
"Deep learning is a subset of machine learning that uses neural networks.",
repo: nil,
vector_store: {:memory, pid: memory_pid},
collection: "tech-docs"
)
# The expand step would add synonyms before search
# ctx = Pipeline.new("ML", repo: nil, llm: expand_llm) |> Pipeline.expand()
# ctx.expanded_query => "ML machine learning artificial intelligence..."
IO.puts("Query expansion adds synonyms to improve recall")
Expand vs. Decompose
| Step | Purpose | Use When |
|---|---|---|
| expand/2 | Adds synonyms | Short queries, abbreviations, jargon |
| decompose/2 | Splits into sub-questions | Complex, multi-part questions |
You can combine both:
# ctx = Pipeline.new("What is ML and how does it work?", ...)
# |> Pipeline.expand() # Adds synonyms
# |> Pipeline.decompose() # Splits into sub-questions
# |> Pipeline.search()
11. Re-ranking
Re-ranking improves search result quality by scoring each retrieved chunk’s relevance to the question, then filtering and re-sorting by score.
How It Works
Retrieved chunks: [chunk1, chunk2, chunk3, chunk4, chunk5]
↓
LLM scores each: [ 9, 3, 8, 2, 7 ]
↓
Filter (≥7): [chunk1, chunk3, chunk5]
↓
Sort by score: [chunk1, chunk3, chunk5] # 9, 8, 7
Using the Default Reranker
# Mock LLM that scores relevance
scoring_llm = fn prompt ->
cond do
prompt =~ "functional programming" and prompt =~ "Rate how relevant" ->
{:ok, ~s({"score": 9, "reasoning": "directly relevant"})}
prompt =~ "weather" and prompt =~ "Rate how relevant" ->
{:ok, ~s({"score": 2, "reasoning": "not relevant"})}
prompt =~ "Rate how relevant" ->
{:ok, ~s({"score": 5, "reasoning": "somewhat relevant"})}
true ->
{:ok, "Elixir is a functional programming language."}
end
end
# Rerank filters low-scoring chunks
# ctx = Pipeline.new("What is Elixir?", ...)
# |> Pipeline.search()
# |> Pipeline.rerank(threshold: 7) # Keep only chunks scoring 7+
IO.puts("Re-ranking filters out irrelevant chunks")
Custom Reranker
Implement the Arcana.Reranker behaviour for custom scoring logic:
defmodule MyKeywordReranker do
@behaviour Arcana.Reranker
@impl Arcana.Reranker
def rerank(_question, chunks, opts) do
threshold = Keyword.get(opts, :threshold, 5)
scored_chunks =
chunks
|> Enum.map(fn chunk ->
score = if chunk.text =~ "Elixir", do: 10, else: 3
{chunk, score}
end)
|> Enum.filter(fn {_chunk, score} -> score >= threshold end)
|> Enum.sort_by(fn {_chunk, score} -> score end, :desc)
|> Enum.map(fn {chunk, _score} -> chunk end)
{:ok, scored_chunks}
end
end
# Use with: Pipeline.rerank(ctx, reranker: MyKeywordReranker)
Full Pipeline
# Complete agentic RAG pipeline:
# ctx = Pipeline.new("What is Elixir?", repo: MyApp.Repo, llm: my_llm)
# |> Pipeline.rewrite() # Clean up conversational input
# |> Pipeline.select() # Choose collections
# |> Pipeline.expand() # Add synonyms
# |> Pipeline.search() # Retrieve chunks
# |> Pipeline.rerank() # Filter by relevance
# |> Pipeline.answer(self_correct: true) # Generate and refine answer
IO.puts("The full pipeline: rewrite → select → expand → search → rerank → answer")
12. GraphRAG with Memory Backend
GraphRAG enhances retrieval by building a knowledge graph from your documents. Both pgvector and memory backends support GraphRAG.
Start the Memory Graph Store
# Start the in-memory graph store
{:ok, graph_pid} = Arcana.Graph.GraphStore.Memory.start_link(name: nil)
IO.puts("Memory graph store started: #{inspect(graph_pid)}")
How GraphRAG Works
When you ingest with graph: true:
- Entity extraction - Named entities (people, organizations, etc.) are extracted from each chunk
- Relationship extraction - Semantic relationships between entities are identified
- Graph storage - Entities, relationships, and chunk mentions are persisted
- Community detection - Clusters of related entities are identified
When you search with graph: true:
- Query entities - Entities are extracted from your query
- Graph traversal - Related chunks are found via entity relationships
- Fusion - Graph results are combined with vector results using Reciprocal Rank Fusion
Using GraphRAG with Memory Backend
# Note: In a real application, you'd use:
# Arcana.ingest(text, graph: true, graph_store: {:memory, pid: graph_pid})
# Arcana.search(query, graph: true, graph_store: {:memory, pid: graph_pid})
# The graph store can also be named for easier reference:
# {:ok, _} = Arcana.Graph.GraphStore.Memory.start_link(name: :my_graph)
# Arcana.ingest(text, graph: true, graph_store: {:memory, name: :my_graph})
IO.puts("""
GraphRAG with memory backend:
- No database setup required
- Ideal for testing and experimentation
- Data is not persisted (lost when process stops)
- Same API as Ecto backend
For production, use the Ecto backend which persists to PostgreSQL.
""")
Self-Correcting Answers
The answer/2 step can evaluate and refine answers to ensure they’re grounded in the context:
# ctx = Pipeline.new("What is Elixir?", repo: MyApp.Repo, llm: my_llm)
# |> Pipeline.search()
# |> Pipeline.rerank()
# |> Pipeline.answer(self_correct: true, max_corrections: 2)
#
# ctx.answer # Final (possibly refined) answer
# ctx.correction_count # Number of corrections made
# ctx.corrections # List of {previous_answer, feedback} tuples
IO.puts("""
Self-correction flow:
1. Generate initial answer
2. Evaluate if grounded in context
3. If not grounded, regenerate with feedback
4. Repeat up to max_corrections times
""")
Explicit Collection Selection
You can skip LLM-based collection selection by passing :collection or :collections directly to search/2:
# Search a specific collection without using select/2
# ctx = Pipeline.new("How do I deploy Phoenix?", repo: MyApp.Repo, llm: my_llm)
# |> Pipeline.search(collection: "deployment-docs")
# |> Pipeline.answer()
# Search multiple specific collections
# ctx = Pipeline.new("What are the best practices?", repo: MyApp.Repo, llm: my_llm)
# |> Pipeline.search(collections: ["docs", "tutorials"])
# |> Pipeline.answer()
IO.puts("""
Collection selection priority:
1. :collection/:collections option passed to search/2
2. ctx.collections (set by select/2)
3. Falls back to "default" collection
Use explicit collections when:
- You have only one collection
- User explicitly chooses the collection(s)
- You want deterministic routing without LLM overhead
""")
13. Grounding & Chunk Attribution
After generating an answer, ground/2 checks if each part is faithful to the retrieved context. Arcana ships two grounders:
- Hallmark (NLI): runs Vectara’s HHEM ModernBERT model locally via Bumblebee. Scores each sentence against the concatenated context. Fast, free, great for Pipeline where the chunk set is small and fixed.
- LLMJudge (claim decomposition): asks an LLM to decompose the answer into atomic claims and verify each against the chunks. Returns per-claim verdicts and chunk attribution. Better for Loop where many chunks accumulate and NLI context truncation is a problem.
Both return the same %Arcana.Grounding.Result{} struct with score, hallucinated_spans, and faithful_spans.
Using Grounding
# Pipeline: uses Hallmark (NLI) by default
# ctx = Pipeline.new("When was Elixir created?", repo: repo, llm: llm)
# |> Pipeline.search(collection: "docs")
# |> Pipeline.answer()
# |> Pipeline.ground()
# Loop: use LLMJudge for better handling of large chunk sets
# ctx = Arcana.Loop.ground(ctx,
# grounder: Arcana.Grounder.LLMJudge,
# judge_model: "anthropic:claude-haiku-4-5"
# )
#
# ctx.grounding.score # 0.85 (proportion of supported claims)
# ctx.grounding.hallucinated_spans # unsupported/contradicted claims
# ctx.grounding.faithful_spans # claims backed by context
IO.puts("""
Grounding result fields:
- score: faithfulness score (0.0 to 1.0)
- hallucinated_spans: unsupported claims with byte offsets and chunk attribution
- faithful_spans: supported claims with byte offsets and chunk attribution
""")
Chunk Attribution
Each span carries a :sources list linking it to the context chunks that support (or contradict) it:
# Every span has sources: [%{chunk_id: term(), score: float()}]
#
# For Hallmark: score is word overlap fraction
# For LLMJudge: score is 1.0 (the LLM directly identifies supporting chunks)
# See which chunks actually supported the answer
# cited_chunk_ids =
# ctx.grounding.faithful_spans
# |> Enum.flat_map(& &1.sources)
# |> Enum.map(& &1.chunk_id)
# |> Enum.uniq()
IO.puts("""
Attribution patterns:
- Empty sources on hallucinated span: fully invented
- Hallucinated with sources: words match but facts wrong (contradiction)
- faithful_spans sources: chunks that backed the answer
""")
Custom Grounder
Replace the built-in grounders with your own logic:
# Module-based
# defmodule MyGrounder do
# @behaviour Arcana.Grounder
#
# @impl true
# def ground(answer, chunks, _opts) do
# {:ok, %Arcana.Grounding.Result{
# score: 1.0,
# hallucinated_spans: [],
# faithful_spans: []
# }}
# end
# end
# Inline function
# Pipeline.ground(ctx, grounder: fn answer, chunks, _opts ->
# {:ok, %Arcana.Grounding.Result{score: 1.0, hallucinated_spans: [], faithful_spans: []}}
# end)
IO.puts("Grounding is pluggable: Hallmark, LLMJudge, a custom module, or an inline function")
14. Loop (Agentic RAG)
Pipeline is a deterministic composition: you decide the steps at call time and the context flows through them in order. Arcana.Loop is the opposite trade-off. You give the LLM a set of tools and let it decide which to call each turn, until it commits via answer or hits the iteration cap.
The default toolset is three tools: search (the only one that touches the repo), answer (ends the loop with the final text), and give_up (ends with a failure signal).
# In a real app:
# {:ok, ctx} =
# Arcana.Loop.new("Which Time Lords have betrayed the Doctor?",
# repo: MyApp.Repo,
# collection: "doctor-who"
# )
# |> Arcana.Loop.run(controller_llm: "openai:gpt-4o-mini")
#
# ctx.answer # the final text
# ctx.tool_history # list of tool calls in order
# ctx.terminated_by # :answered, :gave_up, :max_iterations, or :error
IO.puts("Loop lets the LLM drive retrieval instead of hard-coding a pipeline")
Collections: lock vs pick
The :collection / :collections options shape what the controller can express. This is a guardrail for multi-tenant and multi-collection apps.
- Lock: Loop.new(q, collection: "docs") removes the collection parameter from the tool schema entirely. The controller literally cannot search anything else.
- Pick: Loop.new(q, collections: ["docs", "wiki"]) adds an optional collection param the controller picks per call. The system prompt lists the allowed values.
- Unrestricted: omit both. Searches across whatever the configured default is.
Live demo: scripted controller
For the livebook we don’t have a real LLM, so we’ll drive the loop with a scripted controller that returns canned classified responses. This is the same technique Arcana.Loop’s own test suite uses.
alias Arcana.Loop
alias Arcana.Loop.{Context, Tools}
# Inspect the tool schema the controller will see for a multi-collection setup.
# Notice the `:collection` parameter that appears only because we passed a list.
multi_tools = Tools.default(["docs", "wiki"])
search_tool = Enum.find(multi_tools, &(&1.name == "search"))
IO.puts("Multi-collection search tool params:")
Enum.each(search_tool.parameter_schema, fn {name, opts} ->
IO.puts(" #{name}: #{inspect(opts[:type])}")
end)
# And the locked (single-collection) variant. No `:collection` param at all.
locked_tools = Tools.default(["docs"])
locked_search = Enum.find(locked_tools, &(&1.name == "search"))
locked_params = Enum.map(locked_search.parameter_schema, &elem(&1, 0))
IO.puts("\nLocked search tool params: #{inspect(locked_params)}")
# Scripted controller: call search, then answer.
{:ok, script} =
Agent.start_link(fn ->
[
%{
type: :tool_calls,
text: "",
thinking: "",
tool_calls: [%{id: "c1", name: "search", arguments: %{"query" => "elixir"}}],
finish_reason: :tool_calls
},
%{
type: :tool_calls,
text: "",
thinking: "",
tool_calls: [
%{id: "c2", name: "answer", arguments: %{"text" => "Elixir runs on the BEAM."}}
],
finish_reason: :tool_calls
}
]
end)
controller = fn _messages, _tools, _opts ->
next =
Agent.get_and_update(script, fn
[head | rest] -> {head, rest}
[] -> {nil, []}
end)
case next do
nil -> {:error, :script_exhausted}
classified -> {:ok, classified}
end
end
# Stub search_fn so the loop doesn't need a repo.
search_fn = fn _query, _opts ->
{:ok,
[
%{id: "c1", text: "Elixir is a functional language that runs on the BEAM.", score: 0.9}
]}
end
{:ok, ctx} =
Loop.new("what is elixir")
|> Loop.run(controller_llm: controller, search_fn: search_fn, max_iterations: 5)
IO.puts("\n=== Loop result ===")
IO.puts("terminated_by: #{inspect(ctx.terminated_by)}")
IO.puts("iterations: #{ctx.iterations}")
IO.puts("answer: #{ctx.answer}")
IO.puts("\nTool history:")
Enum.each(ctx.tool_history, fn entry ->
IO.puts(" [#{entry.iteration}] #{entry.tool} #{inspect(entry.args)}")
end)
In a real app you’d pass a ReqLLM model string (e.g. "openai:gpt-4o-mini") as the controller instead of a scripted function. The loop emits a [:arcana, :loop, :tool_call] telemetry event after each tool call, which is what drives the live trace in the Arcana dashboard.
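If you want to watch those events yourself, a plain :telemetry handler works. A minimal sketch (we don't assume anything about the measurement/metadata shapes and simply inspect them):
# Attach a simple logger to the tool-call event the loop emits.
:telemetry.attach(
  "arcana-loop-tool-call-logger",
  [:arcana, :loop, :tool_call],
  fn _event, measurements, metadata, _config ->
    IO.puts("loop tool call -> #{inspect(metadata)} #{inspect(measurements)}")
  end,
  nil
)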
Custom tools
You can extend the Loop with your own tools. Custom tools are invoked via their :callback when the controller calls them. The callback is a 1-arity function (args) -> {:ok, text} | {:error, text} where text is returned to the controller. Custom tools always continue the loop (only answer and give_up terminate it).
# Example: add a calculator tool
calculator = ReqLLM.Tool.new!(
name: "calculate",
description: "Evaluate an arithmetic expression.",
parameter_schema: [
expression: [type: :string, required: true, doc: "e.g. '2 + 2'"]
],
callback: fn %{expression: expr} ->
# In a real app, use a safe expression parser
{:ok, "Result: #{expr}"}
end
)
# Append to defaults and pass via :tools
# {:ok, ctx} = Loop.run(ctx,
# tools: Tools.default() ++ [calculator],
# controller_llm: "openai:gpt-4o-mini"
# )
IO.puts("Custom tools extend the default search/answer/give_up toolset")
When the loop hits max_iterations and falls back to synthesis, the synthesis step is recorded in ctx.tool_history as a :synthesis entry so the dashboard trace shows it alongside the controller’s regular tool calls.
Summary
| Concept | Description |
|---|---|
| Memory Backend | HNSWLib-based in-memory store, no database needed |
| Semantic Search | Finds similar meaning using embeddings |
| Fulltext Search | Matches exact keywords with TF-IDF scoring |
| Hybrid Search | Combines both using Reciprocal Rank Fusion |
| Collections | Organize documents into logical groups |
| Retrieval Gating | Skip retrieval for questions answerable from knowledge |
| Query Rewriting | Clean conversational input into clear queries |
| Query Expansion | Add synonyms for better recall |
| Multi-hop Reasoning | Search again if results are insufficient |
| Re-ranking | Filter chunks by LLM-scored relevance |
| Pipeline (Modular RAG) | Gate → Rewrite → Select → Expand → Search → Reason → Rerank → Answer → Ground. You compose the steps. |
| Loop (Agentic RAG) | LLM picks tools each turn (search / answer / give_up + custom tools). The model decides the control flow, including when to refine queries and when to issue separate searches for different aspects of a question. |
| Custom tools | Extend Loop’s toolset with domain-specific tools (web search, calculator, API calls). Callback-based: the tool’s function runs and the result goes back to the controller. |
| Controller / answerer split | In Arcana.Loop, pair a cheap controller LLM (picks tools) with a stronger answerer LLM (writes the user-facing answer) via controller_llm: and answer_llm:. |
| Grounding | Detect hallucinated vs faithful spans with chunk attribution. Two built-in grounders: Hallmark (NLI, fast, local) and LLMJudge (claim decomposition, better for large chunk sets). Available on both Pipeline (Pipeline.ground/2) and Loop (Loop.ground/2), sharing the same Arcana.Grounder behaviour. |
| GraphRAG | Knowledge graph with entity extraction (supports memory and Ecto backends) |
Next Steps
- Try with real documents (PDFs, markdown files)
- Connect to pgvector for persistence
- Use a real LLM (OpenAI, Anthropic, Z.ai) for the pipeline
- Try Arcana.Loop for open-ended questions where the right sequence of searches isn’t knowable upfront (see the Loop guide)
- Add to your Phoenix application with the dashboard
- Enable GraphRAG for entity-based retrieval (see the GraphRAG guide)
# Cleanup
GenServer.stop(memory_pid)
if Process.alive?(graph_pid), do: GenServer.stop(graph_pid)
IO.puts("Memory stores stopped")