Architecture Comparison with Edifice
Setup
Choose one of the two cells below depending on how you started Livebook.
Standalone (default)
Use this if you started Livebook normally (livebook server).
Uncomment the EXLA lines for GPU acceleration.
# Resolve the Edifice dependency: prefer a local checkout at ~/edifice when
# one exists, otherwise fall back to the published Hex package.
local_edifice = Path.expand("~/edifice")

edifice_dep =
  if File.dir?(local_edifice),
    do: {:edifice, path: local_edifice},
    else: {:edifice, "~> 0.2.0"}

Mix.install([
  edifice_dep,
  # {:exla, "~> 0.10"},
  {:kino_vega_lite, "~> 0.1"},
  {:kino, "~> 0.14"}
])

# Nx.global_default_backend(EXLA.Backend)
alias VegaLite, as: Vl
Attached to project (recommended for Nix/CUDA)
Use this if you started Livebook via ./scripts/livebook.sh.
See the Architecture Zoo notebook for full setup instructions.
# Attached mode: the project node already ships EXLA, so route all Nx tensor
# operations through the EXLA backend (XLA-compiled, GPU-capable).
Nx.global_default_backend(EXLA.Backend)
alias VegaLite, as: Vl
IO.puts("Attached mode — using EXLA backend from project node")
Introduction
Edifice’s unified Edifice.build(:name, opts) API means swapping architectures
is a one-line change while the entire training pipeline stays identical.
This notebook puts that to the test: we train 8 architectures from 5 different families on the same classification task, measure accuracy and training time, and rank them in a head-to-head comparison.
What you’ll learn:
- How different architecture families compare on the same data
- That backbone choice matters more than hyperparameter tuning for some problems
- How Edifice makes systematic experimentation trivial
Generate Synthetic Data
We use a harder classification problem than the MLP notebook: 5 classes arranged in concentric rings and clusters. This tests whether architectures can learn non-linear decision boundaries.
# Fixed PRNG key so the dataset is reproducible across runs.
key = Nx.Random.key(42)
n_per_class = 200
num_classes = 5

# Class 0-1: two concentric rings
# Class 2-4: three offset clusters
{all_points, all_labels, _key} =
  Enum.reduce(0..(num_classes - 1), {[], [], key}, fn class, {pts, labs, k} ->
    # Draw fresh Gaussian noise and uniform angles for this class; the PRNG
    # key is threaded through the accumulator so every draw is independent.
    {noise, k} = Nx.Random.normal(k, shape: {n_per_class, 2})
    {angles, k} = Nx.Random.uniform(k, shape: {n_per_class, 1})

    points =
      case class do
        0 ->
          # Inner ring: radius ~ 1.0 plus 0.2-scaled Gaussian jitter.
          theta = Nx.multiply(angles, 2 * :math.pi())
          r = Nx.add(1.0, Nx.multiply(Nx.squeeze(noise[[.., 0..0]]), 0.2))
          x = Nx.multiply(r, Nx.cos(Nx.squeeze(theta)))
          y = Nx.multiply(r, Nx.sin(Nx.squeeze(theta)))
          Nx.stack([x, y], axis: 1)

        1 ->
          # Outer ring: radius ~ 2.5 with slightly wider jitter than the inner ring.
          theta = Nx.multiply(angles, 2 * :math.pi())
          r = Nx.add(2.5, Nx.multiply(Nx.squeeze(noise[[.., 0..0]]), 0.25))
          x = Nx.multiply(r, Nx.cos(Nx.squeeze(theta)))
          y = Nx.multiply(r, Nx.sin(Nx.squeeze(theta)))
          Nx.stack([x, y], axis: 1)

        c ->
          # Offset clusters at different positions: classes 2-4 become
          # Gaussian blobs (scale 0.5) around their respective centers.
          centers = [{-3.0, 2.5}, {3.0, 2.5}, {0.0, -3.5}]
          {cx, cy} = Enum.at(centers, c - 2)
          Nx.add(Nx.multiply(noise, 0.5), Nx.tensor([cx, cy]))
      end

    {pts ++ [points], labs ++ List.duplicate(class, n_per_class), k}
  end)

x_all = Nx.concatenate(all_points)
y_all = Nx.tensor(all_labels)

# Shuffle
# argsort over uniform noise yields a random permutation; a separate fixed
# key (99) keeps the shuffle — and hence the split below — reproducible.
n_total = num_classes * n_per_class
{shuffle_noise, _k} = Nx.Random.uniform(Nx.Random.key(99), shape: {n_total})
shuffle_idx = Nx.argsort(shuffle_noise)
x_all = Nx.take(x_all, shuffle_idx)
y_all = Nx.take(y_all, shuffle_idx)

# One-hot encode
# Broadcasting {n, 1} labels against the {1, num_classes} class-index row
# produces an {n, num_classes} 0/1 matrix; cast to f32 for cross-entropy.
y_onehot =
  Nx.equal(
    Nx.new_axis(y_all, 1),
    Nx.tensor([Enum.to_list(0..(num_classes - 1))])
  )
  |> Nx.as_type(:f32)

# 80/20 split
n_train = round(n_total * 0.8)
train_x = x_all[0..(n_train - 1)]
train_y = y_onehot[0..(n_train - 1)]
test_x = x_all[n_train..-1//1]
test_y = y_onehot[n_train..-1//1]
test_labels = y_all[n_train..-1//1]

batch_size = 32

# Materialize {input, target} mini-batch pairs for Axon.Loop.run.
train_data =
  Enum.zip(
    Nx.to_batched(train_x, batch_size) |> Enum.to_list(),
    Nx.to_batched(train_y, batch_size) |> Enum.to_list()
  )

"#{n_train} train / #{n_total - n_train} test, #{num_classes} classes, #{length(train_data)} batches"
# Flatten each tensor column once, then pair coordinates with their labels
# to build the row maps VegaLite expects.
xs = Nx.to_flat_list(x_all[[.., 0]])
ys = Nx.to_flat_list(x_all[[.., 1]])
labels = Nx.to_flat_list(y_all)

chart_data =
  [xs, ys, labels]
  |> Enum.zip()
  |> Enum.map(fn {x, y, label} ->
    %{"x" => x, "y" => y, "class" => "Class #{trunc(label)}"}
  end)

Vl.new(width: 500, height: 400, title: "5-Class Dataset: Rings + Clusters")
|> Vl.data_from_values(chart_data)
|> Vl.mark(:circle, size: 30, opacity: 0.6)
|> Vl.encode_field(:x, "x", type: :quantitative)
|> Vl.encode_field(:y, "y", type: :quantitative)
|> Vl.encode_field(:color, "class", type: :nominal)
Define the Contenders
We build 8 models from 5 families. Each produces a feature vector that we
pipe through a shared classification head: Dense(num_classes) -> Softmax.
# Each entry is {display_name, family_tag, build_fn}. The build functions are
# zero-arity closures so a model graph is only constructed when its training
# run actually starts. All backbones take 2-D inputs; probabilistic and
# neuromorphic variants emit a 16-dim feature vector for the shared head.
contenders = [
  # {name, family, build_fn}
  {"MLP", "feedforward",
   fn ->
     Edifice.build(:mlp, input_size: 2, hidden_sizes: [64, 32], activation: :relu, dropout: 0.0)
   end},
  {"TabNet", "feedforward",
   fn ->
     Edifice.build(:tabnet, input_size: 2, hidden_size: 32, num_steps: 3)
   end},
  {"Bayesian", "probabilistic",
   fn ->
     Edifice.build(:bayesian, input_size: 2, output_size: 16, hidden_sizes: [64, 32])
   end},
  {"MCDropout", "probabilistic",
   fn ->
     Edifice.build(:mc_dropout, input_size: 2, output_size: 16, hidden_sizes: [64, 32])
   end},
  {"EBM", "energy",
   fn ->
     Edifice.build(:ebm, input_size: 2, hidden_sizes: [64, 32])
   end},
  {"Hopfield", "energy",
   fn ->
     # NOTE: Hopfield uses input_dim/hidden_dim option names, unlike the rest.
     Edifice.build(:hopfield, input_dim: 2, hidden_dim: 64)
   end},
  {"SNN", "neuromorphic",
   fn ->
     Edifice.build(:snn, input_size: 2, output_size: 16, hidden_sizes: [64, 32])
   end},
  {"ANN2SNN", "neuromorphic",
   fn ->
     Edifice.build(:ann2snn, input_size: 2, output_size: 16, hidden_sizes: [64, 32])
   end}
]

IO.puts("#{length(contenders)} architectures from #{contenders |> Enum.map(&elem(&1, 1)) |> Enum.uniq() |> length()} families")
Train All Architectures
Same loss, optimizer, epochs, and data for every model. The only thing that changes is the backbone.
# Increase to 30-50 for better accuracy; 10 is enough to see differences
epochs = 10
lr = 1.0e-2

results =
  for {name, family, build_fn} <- contenders do
    IO.puts("Training #{name}...")

    # Shared classification head: every backbone's feature vector passes
    # through Dense(num_classes) -> softmax so outputs are comparable.
    model =
      build_fn.()
      |> Axon.dense(num_classes, name: "#{String.downcase(name)}_head")
      |> Axon.activation(:softmax)

    # :timer.tc returns {microseconds, result}, giving wall-clock training
    # time alongside the trained model state.
    {train_us, trained_state} =
      :timer.tc(fn ->
        model
        |> Axon.Loop.trainer(
          :categorical_cross_entropy,
          Polaris.Optimizers.adam(learning_rate: lr)
        )
        |> Axon.Loop.run(train_data, Axon.ModelState.empty(), epochs: epochs)
      end)

    # Build the inference function and score on the held-out test set.
    {_init_fn, predict_fn} = Axon.build(model)
    test_preds = predict_fn.(trained_state, test_x)

    # Accuracy = fraction of rows where predicted argmax matches the label.
    accuracy =
      Nx.equal(Nx.argmax(test_preds, axis: 1), Nx.argmax(test_y, axis: 1))
      |> Nx.mean()
      |> Nx.to_number()

    train_s = Float.round(train_us / 1_000_000, 1)
    IO.puts(" -> #{Float.round(accuracy * 100, 1)}% accuracy in #{train_s}s\n")

    # Keep model + state so the decision-boundary section can re-run inference.
    %{
      name: name,
      family: family,
      accuracy: accuracy,
      train_s: train_s,
      preds: test_preds,
      model: model,
      state: trained_state
    }
  end

:done
Results Table
# Rank models by test accuracy (best first) and print a fixed-width table.
ranked = Enum.sort_by(results, & &1.accuracy, :desc)

# Shared row formatter so the header and data rows use identical column widths.
pad = fn text, width -> String.pad_trailing(text, width) end

row = fn rank, name, family, acc, time ->
  " " <> pad.(rank, 6) <> pad.(name, 15) <> pad.(family, 16) <> pad.(acc, 12) <> time
end

IO.puts(String.duplicate("=", 65))
IO.puts(row.("Rank", "Architecture", "Family", "Accuracy", "Time"))
IO.puts(" " <> String.duplicate("-", 60))

for {r, rank} <- Enum.with_index(ranked, 1) do
  acc_str = "#{Float.round(r.accuracy * 100, 1)}%"
  IO.puts(row.("##{rank}", r.name, r.family, acc_str, "#{r.train_s}s"))
end

IO.puts(String.duplicate("=", 65))
Accuracy Chart
# One bar per architecture, colored by family, sorted best-to-worst on the x-axis.
chart_data =
  for r <- ranked do
    %{
      "Architecture" => r.name,
      "Accuracy" => Float.round(r.accuracy * 100, 1),
      "Family" => r.family
    }
  end

Vl.new(width: 500, height: 300, title: "Test Accuracy by Architecture")
|> Vl.data_from_values(chart_data)
|> Vl.mark(:bar)
|> Vl.encode_field(:x, "Architecture", type: :nominal, sort: "-y")
|> Vl.encode_field(:y, "Accuracy", type: :quantitative, scale: %{domain: [0, 100]}, title: "Accuracy (%)")
|> Vl.encode_field(:color, "Family", type: :nominal)
Decision Boundaries (Top 4)
# Sample a resolution x resolution grid over [-5, 5] x [-5, 5]; each grid
# point gets classified to paint the model's decision regions.
resolution = 50

grid_points =
  for gx <- 0..(resolution - 1), gy <- 0..(resolution - 1) do
    [-5.0 + 10.0 * gx / (resolution - 1), -5.0 + 10.0 * gy / (resolution - 1)]
  end

grid_tensor = Nx.tensor(grid_points)
top4 = Enum.take(ranked, 4)

boundary_charts =
  for r <- top4 do
    # Re-run inference over the whole grid using the stored trained state.
    {_init_fn, predict_fn} = Axon.build(r.model)
    grid_preds = predict_fn.(r.state, grid_tensor)
    grid_classes = Nx.argmax(grid_preds, axis: 1) |> Nx.to_flat_list()

    grid_data =
      Enum.zip_with([grid_points, grid_classes], fn [[x, y], class] ->
        %{"x" => x, "y" => y, "class" => "Class #{class}"}
      end)

    test_data =
      Enum.zip_with(
        [Nx.to_flat_list(test_x[[.., 0]]), Nx.to_flat_list(test_x[[.., 1]]), Nx.to_flat_list(test_labels)],
        fn [x, y, l] -> %{"x" => x, "y" => y, "class" => "Class #{trunc(l)}"} end
      )

    acc_str = Float.round(r.accuracy * 100, 1)

    # Two layers: translucent squares show the predicted region for each grid
    # cell; the held-out test points are drawn on top with a black outline.
    Vl.new(width: 250, height: 220, title: "#{r.name} (#{acc_str}%)")
    |> Vl.layers([
      Vl.new()
      |> Vl.data_from_values(grid_data)
      |> Vl.mark(:square, size: 15, opacity: 0.25)
      |> Vl.encode_field(:x, "x", type: :quantitative)
      |> Vl.encode_field(:y, "y", type: :quantitative)
      |> Vl.encode_field(:color, "class", type: :nominal, legend: nil),
      Vl.new()
      |> Vl.data_from_values(test_data)
      |> Vl.mark(:circle, size: 25, stroke: "black", stroke_width: 0.5)
      |> Vl.encode_field(:x, "x", type: :quantitative)
      |> Vl.encode_field(:y, "y", type: :quantitative)
      |> Vl.encode_field(:color, "class", type: :nominal, legend: nil)
    ])
  end

# Arrange as 2x2 grid
[row1, row2] = Enum.chunk_every(boundary_charts, 2)

Vl.new()
|> Vl.concat(
  [
    Vl.new() |> Vl.concat(row1, :horizontal),
    Vl.new() |> Vl.concat(row2, :horizontal)
  ],
  :vertical
)
Let’s visualize how the top 4 models carve up the input space.
Key Takeaways
- **Same API, different results**: Every model was built with `Edifice.build/2`, trained with the same loop, and evaluated identically. The only variable was the architecture name.
- **Architecture matters**: Different families have different inductive biases. Some handle non-linear boundaries better than others.
- **Experimentation is cheap**: With Edifice’s unified API, trying a new architecture is a one-line change. You don’t need to rewrite data pipelines or training loops.
What’s Next?
- **Try sequence models**: Reshape data to `{batch, seq_len, features}` and compare `:mamba`, `:gru`, `:retnet`, etc. See the Sequence Modeling notebook.
- **Scale up**: Use `scidata` for MNIST/CIFAR-10 with vision models like `:vit`.
- **Add EXLA**: `Nx.global_default_backend(EXLA.Backend)` for GPU acceleration.
- **More architectures**: `Edifice.list_architectures()` has 111+ options.