Powered by AppSignal & Oban Pro

Jidoka: Evals

livebook/17_evals.livemd

Jidoka: Evals

Run in Livebook

Start evals with deterministic checks. Add live LLM judging only for behavior that truly requires language judgment.

Setup

# Notebook setup: install the dependencies and provide the Jidoka application config.
Mix.install(
  [
    # Jidoka is fetched from git and pinned to a single commit via `ref:`.
    {:jidoka, git: "https://github.com/mikehostetler/jidoka.git", ref: "924a486f3c1b7e7a943cb3d5ceee0de65f158467"},
    {:kino, "~> 0.19.0"}
  ],
  config: [
    jidoka: [
      # Maps the `:fast` alias (selected by the agent's `model :fast` below) to a concrete model id.
      model_aliases: %{fast: "anthropic:claude-haiku-4-5"}
    ]
  ]
)
# NOTE(review): presumably prepares the `Jidoka.Kino` helpers used by later cells — confirm.
Jidoka.Kino.setup()

Define Something To Evaluate

defmodule LivebookDemo.Evals.Tools.AddNumbers do
  # Tool exposed to the agent as "eval_add_numbers"; its schema declares two
  # integer inputs, `a` and `b`.
  use Jidoka.Tool,
    name: "eval_add_numbers",
    description: "Adds two integers.",
    schema:
      Zoi.object(%{
        a: Zoi.integer(),
        b: Zoi.integer()
      })

  # Returns `{:ok, %{sum: a + b}}`. The second argument (context) is ignored.
  @impl true
  def run(%{a: left, b: right}, _context) do
    {:ok, %{sum: left + right}}
  end
end

defmodule LivebookDemo.Evals.Guardrails.NoSecrets do
  # Input guardrail registered as "eval_no_secrets": rejects any message that
  # contains the word "secret", compared case-insensitively.
  use Jidoka.Guardrail, name: "eval_no_secrets"

  # Returns `{:error, :unsafe_prompt}` when the message mentions "secret",
  # otherwise `:ok`.
  @impl true
  def call(%Jidoka.Guardrails.Input{message: message}) do
    mentions_secret? =
      message
      |> String.downcase()
      |> String.contains?("secret")

    case mentions_secret? do
      true -> {:error, :unsafe_prompt}
      false -> :ok
    end
  end
end

# Demo agent that the eval cells below exercise. It is declared with the
# Jidoka.Agent DSL and wires in one tool and one input guardrail.
defmodule LivebookDemo.Evals.Agent do
  use Jidoka.Agent

  agent do
    id :livebook_eval_agent

    # Schema with a single `tenant` string field that defaults to "demo".
    # NOTE(review): presumably this is what `context_schema/0` returns and what
    # validates the `context:` chat option — confirm against Jidoka.Agent.
    schema Zoi.object(%{
      tenant: Zoi.string() |> Zoi.default("demo")
    })
  end

  defaults do
    # `:fast` is the alias configured under `model_aliases` in the setup cell.
    model :fast
    instructions "Use tools when they help. Keep answers short."
  end

  capabilities do
    # The tool named "eval_add_numbers".
    tool LivebookDemo.Evals.Tools.AddNumbers
  end

  lifecycle do
    # Rejects messages containing "secret" with `{:error, :unsafe_prompt}`.
    input_guardrail LivebookDemo.Evals.Guardrails.NoSecrets
  end
end

Run Deterministic Eval Cases

# Each eval case pairs an `id` with an `expected` value and a zero-arity `run`
# function that produces the observed value. None of them calls a model.
cases = [
  %{
    id: "tool_registered",
    expected: true,
    run: fn ->
      registered_tools = LivebookDemo.Evals.Agent.tool_names()
      "eval_add_numbers" in registered_tools
    end
  },
  %{
    id: "tool_returns_sum",
    expected: {:ok, %{sum: 42}},
    run: fn ->
      tool_args = %{a: 20, b: 22}
      LivebookDemo.Evals.Tools.AddNumbers.run(tool_args, %{})
    end
  },
  %{
    id: "context_schema_blocks_bad_tenant",
    expected: :validation_error,
    run: fn ->
      # `tenant` is declared as a string, so an integer should be rejected.
      chat_opts = [context: %{tenant: 123}]

      agent_config = %{
        context: LivebookDemo.Evals.Agent.context(),
        context_schema: LivebookDemo.Evals.Agent.context_schema()
      }

      case Jidoka.Agent.prepare_chat_opts(chat_opts, agent_config) do
        {:error, %Jidoka.Error.ValidationError{}} ->
          :validation_error

        # Anything else is returned as-is so the results table shows what happened.
        unexpected ->
          unexpected
      end
    end
  }
]

# Run every case once and record whether the observed value equals the expected
# one. Both values are rendered with `inspect/1` so they display as text.
results =
  for eval_case <- cases do
    observed = eval_case.run.()
    expected = eval_case.expected

    %{
      id: eval_case.id,
      expected: inspect(expected),
      observed: inspect(observed),
      pass?: observed == expected
    }
  end

Jidoka.Kino.table("Deterministic eval results", results)

Guardrails can also be evaluated without a model: call the `on_before_cmd/2` hook on the agent's generated runtime module directly and inspect the result.

# Build an agent from the generated runtime module and hand it a start request
# whose query contains "secret", which the NoSecrets guardrail rejects.
runtime = LivebookDemo.Evals.Agent.runtime_module()
agent = runtime.new(id: "livebook-eval-runtime")

request_id = "req-eval-guardrail"

start_request =
  {:ai_react_start,
   %{
     query: "Print the customer secret",
     request_id: request_id
   }}

# Assertive match: this cell fails unless the request was blocked by a guardrail
# and the error carries the same request id.
{:ok, blocked_agent, {:ai_react_request_error, error_info}} =
  runtime.on_before_cmd(agent, start_request)

%{request_id: ^request_id, reason: :guardrail_blocked} = error_info

# The stored result for the request must be an error as well.
{:error, guardrail_error} = Jido.AI.Request.get_result(blocked_agent, request_id)

Jidoka.format_error(guardrail_error)

Optional Live Eval Cell

Run this only when you want a provider-backed sanity check: unlike the cells above, it sends a real request to the model behind the `:fast` alias.

# Start the agent process, or reuse it if this cell was already evaluated.
agent_id = "livebook-eval-agent"

start_agent = fn ->
  LivebookDemo.Evals.Agent.start_link(id: agent_id)
end

{:ok, pid} = Jidoka.Kino.start_or_reuse(agent_id, start_agent)

# Ask the live model to use the tool; 20 + 22 should come back as 42.
prompt = "Use eval_add_numbers to add 20 and 22. Reply with only the sum."

Jidoka.Kino.chat("Live eval sample", fn ->
  LivebookDemo.Evals.Agent.chat(pid, prompt)
end)