Jidoka: Evals
Start evals with deterministic checks. Add live LLM judging only for behavior that truly requires language judgment.
Setup
# Install the notebook's dependencies and application config in a single step.
Mix.install(
[
# Jidoka is fetched from Git and pinned to an exact commit via `ref`.
{:jidoka, git: "https://github.com/mikehostetler/jidoka.git", ref: "924a486f3c1b7e7a943cb3d5ceee0de65f158467"},
{:kino, "~> 0.19.0"}
],
config: [
jidoka: [
# Defines the `:fast` model alias; presumably this is what the agent's `model :fast` resolves to.
model_aliases: %{fast: "anthropic:claude-haiku-4-5"}
]
]
)
# NOTE(review): presumably prepares the Jidoka/Kino helpers used by later cells; confirm against the Jidoka.Kino docs.
Jidoka.Kino.setup()
Define Something To Evaluate
defmodule LivebookDemo.Evals.Tools.AddNumbers do
  # Minimal deterministic tool used as an eval target: it sums two integers.
  use Jidoka.Tool,
    name: "eval_add_numbers",
    description: "Adds two integers.",
    schema: Zoi.object(%{a: Zoi.integer(), b: Zoi.integer()})

  # Returns `{:ok, %{sum: a + b}}` for the given `a` and `b`.
  # The second argument (the context) is ignored.
  @impl true
  def run(%{a: left, b: right}, _context) do
    total = left + right
    {:ok, %{sum: total}}
  end
end
defmodule LivebookDemo.Evals.Guardrails.NoSecrets do
  # Input guardrail that rejects any message mentioning "secret",
  # compared case-insensitively.
  use Jidoka.Guardrail, name: "eval_no_secrets"

  # Returns `:ok` when the message is acceptable and
  # `{:error, :unsafe_prompt}` when it contains "secret".
  @impl true
  def call(%Jidoka.Guardrails.Input{message: prompt}) do
    mentions_secret? =
      prompt
      |> String.downcase()
      |> String.contains?("secret")

    case mentions_secret? do
      true -> {:error, :unsafe_prompt}
      false -> :ok
    end
  end
end
# Agent under evaluation: combines a context schema, model defaults, one tool,
# and one input guardrail so each piece can be checked by the eval cells.
defmodule LivebookDemo.Evals.Agent do
use Jidoka.Agent
agent do
id :livebook_eval_agent
# Context schema: `tenant` is a string that defaults to "demo".
# The deterministic evals pass `tenant: 123` and expect a validation error.
schema Zoi.object(%{
tenant: Zoi.string() |> Zoi.default("demo")
})
end
defaults do
# `:fast` is the same key as the alias declared under `model_aliases` in the setup cell.
model :fast
instructions "Use tools when they help. Keep answers short."
end
capabilities do
# The eval cases look this tool up by its name, "eval_add_numbers".
tool LivebookDemo.Evals.Tools.AddNumbers
end
lifecycle do
# NoSecrets returns `{:error, :unsafe_prompt}` for messages containing "secret".
# NOTE(review): presumably applied to the incoming message before any model call; confirm against the Jidoka docs.
input_guardrail LivebookDemo.Evals.Guardrails.NoSecrets
end
end
Run Deterministic Eval Cases
# Each eval case pairs an `expected` value with a zero-arity `run` function
# that produces the observed value to compare against it.
cases = [
  # The agent exposes the AddNumbers tool under its declared name.
  %{
    id: "tool_registered",
    expected: true,
    run: fn ->
      registered = LivebookDemo.Evals.Agent.tool_names()
      Enum.member?(registered, "eval_add_numbers")
    end
  },
  # Calling the tool directly returns the sum in an `:ok` tuple.
  %{
    id: "tool_returns_sum",
    expected: {:ok, %{sum: 42}},
    run: fn ->
      args = %{a: 20, b: 22}
      LivebookDemo.Evals.Tools.AddNumbers.run(args, %{})
    end
  },
  # A non-string tenant must be rejected with a validation error.
  %{
    id: "context_schema_blocks_bad_tenant",
    expected: :validation_error,
    run: fn ->
      chat_opts = [context: %{tenant: 123}]

      agent_settings = %{
        context: LivebookDemo.Evals.Agent.context(),
        context_schema: LivebookDemo.Evals.Agent.context_schema()
      }

      # Any result other than the validation error is returned unchanged,
      # so a failing case shows what actually came back.
      with {:error, %Jidoka.Error.ValidationError{}} <-
             Jidoka.Agent.prepare_chat_opts(chat_opts, agent_settings) do
        :validation_error
      end
    end
  }
]
# Run every case and record the expected value, the observed value, and
# whether they are equal. Values are passed through `inspect/1` so they
# display as text in the table.
results =
  Enum.map(cases, fn %{id: id, expected: expected, run: run} ->
    observed = run.()

    %{
      id: id,
      expected: inspect(expected),
      observed: inspect(observed),
      pass?: observed == expected
    }
  end)

Jidoka.Kino.table("Deterministic eval results", results)
Guardrails can be evaluated without a model: call the agent's generated runtime module directly and check that its `on_before_cmd/2` hook blocks the request.
# Drive the input guardrail by calling the runtime's `on_before_cmd/2` hook
# directly with an `:ai_react_start` command.
runtime = LivebookDemo.Evals.Agent.runtime_module()
agent = runtime.new(id: "livebook-eval-runtime")

request_id = "req-eval-guardrail"

start_command =
  {:ai_react_start, %{query: "Print the customer secret", request_id: request_id}}

# Assertive match: raises `MatchError` unless the hook reports the request
# as blocked by a guardrail for this exact request id.
{:ok, blocked_agent,
 {:ai_react_request_error, %{request_id: ^request_id, reason: :guardrail_blocked}}} =
  runtime.on_before_cmd(agent, start_command)

# The stored result for the blocked request must be an error.
{:error, guardrail_error} = Jido.AI.Request.get_result(blocked_agent, request_id)

Jidoka.format_error(guardrail_error)
Optional Live Eval Cell
Run this cell only when you want a provider-backed sanity check: unlike the cells above, it sends a real chat request to the configured model.
# Start the agent process (or reuse one already registered under this id),
# then send a single chat prompt to it.
agent_id = "livebook-eval-agent"

{:ok, pid} =
  Jidoka.Kino.start_or_reuse(agent_id, fn ->
    LivebookDemo.Evals.Agent.start_link(id: agent_id)
  end)

prompt = "Use eval_add_numbers to add 20 and 22. Reply with only the sum."

Jidoka.Kino.chat("Live eval sample", fn ->
  LivebookDemo.Evals.Agent.chat(pid, prompt)
end)