Hallmark Guide
# Pull in Hallmark plus the EXLA backend for natively compiled inference.
deps = [
  hallmark: "~> 0.1.0",
  exla: "~> 0.9"
]

Mix.install(deps)
Loading the model
Hallmark uses Vectara’s HHEM (Hallucination Evaluation Model), a fine-tuned FLAN-T5-base that runs entirely locally. The first call downloads ~440MB of weights from Hugging Face; subsequent calls reuse the locally cached weights.
# Download (first run) or reuse the cached HHEM weights, compiling via EXLA.
load_opts = [compiler: EXLA]
{:ok, model} = Hallmark.load(load_opts)
Scoring a single pair
score/3 takes the model, a premise (source text), and a hypothesis (text to check), returning a float from 0.0 (hallucinated) to 1.0 (consistent).
# The hypothesis follows from the premise, so we expect a high score.
premise = "I am in California"
hypothesis = "I am in United States."
{:ok, score} = Hallmark.score(model, premise, hypothesis)
IO.puts("Score: #{score}")
# The hypothesis contradicts the premise, so we expect a low score.
premise = "The capital of France is Berlin."
hypothesis = "The capital of France is Paris."
{:ok, score} = Hallmark.score(model, premise, hypothesis)
IO.puts("Score: #{score}")
Getting a label
If you just need a :consistent or :hallucinated verdict, use evaluate/3. The default threshold is 0.5.
# The default 0.5 threshold converts the raw score into a verdict atom.
{:ok, label} =
  Hallmark.evaluate(model, "I am in California", "I am in United States.")

IO.puts("Label: #{label}")
# A stricter threshold makes the consistency check harder to pass.
strict_opts = [threshold: 0.8]

{:ok, label} =
  Hallmark.evaluate(model, "I am in California", "I am in United States.", strict_opts)

IO.puts("Label with 0.8 threshold: #{label}")
Batch scoring
For multiple pairs at once, score_batch/2 is more efficient than calling score/3 in a loop since it runs a single forward pass through the model.
pairs = [
  {"I am in California", "I am in United States."},
  {"I am in United States", "I am in California."},
  {"A person on a horse jumps over a broken down airplane.", "A person is outdoors, on a horse."},
  {"Mark Wahlberg was a fan of Manny.", "Manny was a fan of Mark Wahlberg."}
]

{:ok, scores} = Hallmark.score_batch(model, pairs)

# Walk each pair alongside its score and print a human-readable verdict.
pairs
|> Enum.zip(scores)
|> Enum.each(fn {{premise, hypothesis}, score} ->
  label =
    if score >= 0.5 do
      "consistent"
    else
      "hallucinated"
    end

  IO.puts("#{label} (#{Float.round(score, 3)})")
  IO.puts(" premise: #{premise}")
  IO.puts(" hypothesis: #{hypothesis}\n")
end)
Checking LLM output against source material
The main use case: you have some source text and an LLM generated a response from it. Is the response actually grounded in the source?
source = """
Elixir is a dynamic, functional language for building scalable and maintainable applications.
Elixir runs on the Erlang VM, known for creating low-latency, distributed, and fault-tolerant systems.
"""

# Score a claim against the source and round for display.
check = fn claim ->
  {:ok, s} = Hallmark.score(model, source, claim)
  Float.round(s, 3)
end

# First claim is grounded in the source; second is not stated there.
IO.puts("Grounded claim: #{check.("Elixir is a functional programming language.")}")
IO.puts("Hallucinated claim: #{check.("Elixir was created by José Valim in 2011.")}")
Important: entailment, not factual accuracy
Hallmark checks whether the hypothesis follows from the premise. It doesn’t check if something is true in general. A factually correct statement can still be flagged as hallucinated if it’s not supported by the given premise.
premise = "The dog sat on the mat."

# True in general, but the premise never states it — expect a low score.
{:ok, unsupported} = Hallmark.score(model, premise, "Dogs are mammals.")
IO.puts("'Dogs are mammals' score: #{Float.round(unsupported, 3)}")

# Genuinely entailed by the premise — expect a high score.
{:ok, entailed} = Hallmark.score(model, premise, "An animal was on the mat.")
IO.puts("'An animal was on the mat' score: #{Float.round(entailed, 3)}")