Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Chapter 15: Let's do development

15_development/15_development.livemd

Chapter 15: Let’s do development

# Install the notebook's dependencies: Nx for tensors, EXLA as the compiled
# backend, and VegaLite/Kino for rendering charts in Livebook.
Mix.install(
  [
    {:exla, "~> 0.5"},
    {:nx, "~> 0.5"},
    {:vega_lite, "~> 0.1.6"},
    {:kino, "~> 0.8.1"},
    {:kino_vega_lite, "~> 0.1.7"}
  ],
  # Make EXLA the default backend for every tensor created in this notebook.
  config: [nx: [default_backend: EXLA.Backend]]
)

Prepare Data - Standardizing Input Variables

The module that loads the MNIST data is based on the one used in Chapter 14.

defmodule C15.MNISTStandardized do
  @moduledoc """
  Use this Module to load the MNIST database (test, train and validation sets) with
  standardized inputs.

  MNIST dataset specifications can be found here: http://yann.lecun.com/exdb/mnist/
  """

  import Nx.Defn

  @data_path Path.join(__DIR__, "../data/mnist") |> Path.expand()

  @train_images_filename Path.join(@data_path, "train-images-idx3-ubyte.gz")
  @test_images_filename Path.join(@data_path, "t10k-images-idx3-ubyte.gz")
  @train_labels_filename Path.join(@data_path, "train-labels-idx1-ubyte.gz")
  @test_labels_filename Path.join(@data_path, "t10k-labels-idx1-ubyte.gz")

  @type t :: %__MODULE__{
          x_train: Nx.Tensor.t(),
          x_test: Nx.Tensor.t(),
          x_validation: Nx.Tensor.t(),
          y_train: Nx.Tensor.t(),
          y_test: Nx.Tensor.t(),
          y_validation: Nx.Tensor.t(),
          y_train_unencoded: Nx.Tensor.t()
        }
  defstruct [
    :x_train,
    :x_test,
    :x_validation,
    :y_train,
    :y_test,
    :y_validation,
    :y_train_unencoded
  ]

  @doc """
  Load the MNIST database and return the train, test and validation sets.

  ## Options

    * `:skip_standardization` - when `true`, the images are returned exactly as
      read from disk. Defaults to `false` (the images are standardized). The
      option is honoured regardless of which other options are passed.

  `y_train` is already one-hot encoded; the raw training labels are available
  as `y_train_unencoded`. The 10000 test examples are split into two batches of
  5000: the first becomes the validation set, the second the test set.

  Raises `File.Error` when one of the dataset files cannot be read.
  """
  @spec load(opts :: keyword()) :: t()
  def load(opts \\ []) do
    # 60000 images, each 784 elements (28 * 28 pixels)
    x_train_raw = load_images!(@train_images_filename)

    # 10000 images, each 784 elements, with the same structure as `x_train`
    x_test_raw = load_images!(@test_images_filename)

    {x_train, x_test_all} = maybe_standardize(x_train_raw, x_test_raw, opts)

    # Read the training labels once and reuse them for both the raw and the
    # one-hot encoded fields.
    y_train_unencoded = load_labels!(@train_labels_filename)

    # 10000 labels, each a single digit from 0 to 9
    y_test_all = load_labels!(@test_labels_filename)

    # Split the test data (10000 images/labels) in `validation` and `test` datasets
    [x_validation, x_test] = Nx.to_batched(x_test_all, 5000) |> Enum.to_list()
    [y_validation, y_test] = Nx.to_batched(y_test_all, 5000) |> Enum.to_list()

    %__MODULE__{
      x_train: x_train,
      x_validation: x_validation,
      x_test: x_test,
      y_train: one_hot_encode(y_train_unencoded),
      y_validation: y_validation,
      y_test: y_test,
      y_train_unencoded: y_train_unencoded
    }
  end

  # Loader wrappers used by `load/1`: they turn the `{:error, reason}` that the
  # public loaders return for an unreadable file into a `File.Error`, so the
  # failure surfaces here instead of deep inside a tensor operation.
  defp load_images!(filename), do: filename |> load_images() |> unwrap!(filename)
  defp load_labels!(filename), do: filename |> load_labels() |> unwrap!(filename)

  defp unwrap!({:error, reason}, filename) do
    raise File.Error, reason: reason, action: "read file", path: filename
  end

  defp unwrap!(tensor, _filename), do: tensor

  # Looks the flag up by key rather than matching the whole keyword list, so
  # `skip_standardization: true` is respected even when other options are given.
  defp maybe_standardize(training_set, test_set, opts) do
    if Keyword.get(opts, :skip_standardization, false) == true do
      {training_set, test_set}
    else
      standardize(training_set, test_set)
    end
  end

  # The function calculates the average and the standard deviation on the training set alone,
  # because that's the only information that we want to train on.
  # If we involve the validation and test set in those calculation,
  # we'll leak information from those sets into the neural network's training.
  defnp standardize(training_set, test_set) do
    average = Nx.mean(training_set)
    standard_deviation = Nx.standard_deviation(training_set)
    training_set_standardized = (training_set - average) / standard_deviation
    test_set_standardized = (test_set - average) / standard_deviation
    {training_set_standardized, test_set_standardized}
  end

  @doc """
  One-hot encode the given tensor (classes: from 0 to 9).

  The input is compared against the row `[0, 1, ..., 9]`, so a one-column
  matrix of labels broadcasts to a matrix with 10 columns holding a `1` in the
  column of the matching class and `0` elsewhere.
  """
  @spec one_hot_encode(y :: Nx.Tensor.t()) :: Nx.Tensor.t()
  def one_hot_encode(y) do
    Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
  end

  @doc """
  Load the MNIST labels from the given gzipped file
  and return them as a one-column matrix.

  Returns `{:error, reason}` unchanged when the file cannot be read.
  """
  @spec load_labels(Path.t()) :: Nx.Tensor.t() | {:error, File.posix()}
  def load_labels(filename) do
    # Open and unzip the file of labels
    with {:ok, binary} <- File.read(filename) do
      # Header: 32 bits that are skipped, then the 32-bit number of labels.
      <<_::32, n_labels::32, labels_binary::binary>> = :zlib.gunzip(binary)

      # Create a tensor from the binary and
      # reshape the list of labels into a one-column matrix.
      labels_binary
      |> Nx.from_binary({:u, 8})
      |> Nx.reshape({n_labels, 1})
    end
  end

  @doc """
  Load the MNIST images from the given gzipped file
  and return a matrix with one image per row.

  Returns `{:error, reason}` unchanged when the file cannot be read.
  """
  @spec load_images(Path.t()) :: Nx.Tensor.t() | {:error, File.posix()}
  def load_images(filename) do
    # Open and unzip the file of images
    with {:ok, binary} <- File.read(filename) do
      # Header: 32 bits that are skipped, then the 32-bit image, row and column counts.
      <<_::32, n_images::32, n_rows::32, n_cols::32, images_binary::binary>> =
        :zlib.gunzip(binary)

      # Create a tensor from the binary and
      # reshape the pixels into a matrix where each line is an image.
      images_binary
      |> Nx.from_binary({:u, 8})
      |> Nx.reshape({n_images, n_cols * n_rows})
    end
  end
end

Neural Network implementation

The neural network is based on the one implemented in Chapter 13.

defmodule C15.NeuralNetwork do
  @moduledoc """
  A three-layer neural network (input, one sigmoid hidden layer, softmax output)
  trained with mini-batch gradient descent.

  The numerical functions are `defn` definitions; `train/8`, `report/8` and
  `prepare_batches/3` are regular Elixir functions that drive them.
  """

  import Nx.Defn

  @doc """
  Element-wise logistic function `1 / (1 + exp(-z))`.
  """
  defn sigmoid(z) do
    Nx.divide(1, Nx.add(1, Nx.exp(Nx.negate(z))))
  end

  @doc """
  Row-wise softmax of a matrix of logits.

  Each row is shifted by its own maximum before exponentiating. The shift
  cancels out in the division, so the result is mathematically unchanged, but
  the largest exponent becomes `exp(0)` and large logits can no longer overflow.
  """
  defn softmax(logits) do
    row_max = Nx.reduce_max(logits, axes: [1]) |> Nx.reshape({:auto, 1})
    exponentials = Nx.exp(logits - row_max)

    Nx.divide(
      exponentials,
      Nx.sum(exponentials, axes: [1]) |> Nx.reshape({:auto, 1})
    )
  end

  @doc """
  Gradient of the sigmoid, expressed in terms of the sigmoid's own output.
  """
  defn sigmoid_gradient(sigmoid) do
    Nx.multiply(sigmoid, 1 - sigmoid)
  end

  @doc """
  Cross-entropy loss: `-sum(y * log(y_hat))` divided by the number of rows of `y`.
  """
  defn loss(y, y_hat) do
    -Nx.sum(y * Nx.log(y_hat)) / elem(Nx.shape(y), 0)
  end

  @doc """
  Prepend a column of ones (the bias column) to the matrix `x`.
  """
  defn prepend_bias(x) do
    bias = Nx.broadcast(1, {elem(Nx.shape(x), 0), 1})

    # Insert a column of 1s in the position 0 of x.
    # ("axis: 1" stands for: "insert a column, not a row")
    Nx.concatenate([bias, x], axis: 1)
  end

  @doc """
  Forward propagation.

  Returns `{y_hat, h}`: the softmax output and the hidden layer activations
  (the latter is needed by `back/5`).
  """
  defn forward(x, weight1, weight2) do
    h = sigmoid(Nx.dot(prepend_bias(x), weight1))
    y_hat = softmax(Nx.dot(prepend_bias(h), weight2))

    {y_hat, h}
  end

  @doc """
  Backpropagation.

  Returns `{w1_gradient, w2_gradient}`, both averaged over the number of rows
  of `x`.
  """
  defn back(x, y, y_hat, weight2, h) do
    w2_gradient =
      Nx.dot(
        Nx.transpose(prepend_bias(h)),
        Nx.subtract(y_hat, y)
      ) / elem(Nx.shape(x), 0)

    # `weight2[1..-1//1]` drops the first row of `weight2`, i.e. the row that
    # multiplies the bias column prepended to `h`.
    w1_gradient =
      Nx.dot(
        Nx.transpose(prepend_bias(x)),
        Nx.dot(y_hat - y, Nx.transpose(weight2[1..-1//1])) * sigmoid_gradient(h)
      ) / elem(Nx.shape(x), 0)

    {w1_gradient, w2_gradient}
  end

  @doc """
  Classify each row of `x`, returning a one-column matrix holding the index of
  the highest output for every row.
  """
  defn classify(x, weight1, weight2) do
    {y_hat, _h} = forward(x, weight1, weight2)
    labels = Nx.argmax(y_hat, axis: 1)
    Nx.reshape(labels, {:auto, 1})
  end

  @doc """
  Build the two initial weight matrices.

  ## Options

    * `:w1_shape` - shape of the first weight matrix
    * `:w2_shape` - shape of the second weight matrix

  The values are drawn from a normal distribution with mean `0.0` and standard
  deviation `0.01`, starting from the fixed PRNG key `Nx.Random.key(1234)`.
  """
  defn initialize_weights(opts \\ []) do
    opts = keyword!(opts, [:w1_shape, :w2_shape])
    mean = 0.0
    std_deviation = 0.01

    prng_key = Nx.Random.key(1234)

    {weight1, new_prng_key} =
      Nx.Random.normal(prng_key, mean, std_deviation, shape: opts[:w1_shape])

    {weight2, _new_prng_key} =
      Nx.Random.normal(new_prng_key, mean, std_deviation, shape: opts[:w2_shape])

    {weight1, weight2}
  end

  @doc """
  Split the training inputs and labels into lists of batches of `batch_size` rows.

  Returns `{x_batches, y_batches}`.
  """
  def prepare_batches(x_train, y_train, batch_size) do
    x_batches = Nx.to_batched(x_train, batch_size) |> Enum.to_list()
    y_batches = Nx.to_batched(y_train, batch_size) |> Enum.to_list()

    {x_batches, y_batches}
  end

  @doc """
  Print and return the current training loss and the accuracy on the given
  test inputs/labels.

  The loss is computed on the whole training set; the accuracy is the
  percentage of rows of `x_test` whose classification equals `y_test`.
  Returns `{training_loss, accuracy}` as plain numbers.
  """
  def report(epoch, batch, x_train, y_train, x_test, y_test, weight1, weight2) do
    {y_hat, _h} = forward(x_train, weight1, weight2)
    training_loss = loss(y_train, y_hat) |> Nx.to_number()
    classifications = classify(x_test, weight1, weight2)
    accuracy = Nx.multiply(Nx.mean(Nx.equal(classifications, y_test)), 100.0) |> Nx.to_number()

    IO.puts("#{epoch}-#{batch} > Loss: #{training_loss}, Accuracy: #{accuracy}%")

    {training_loss, accuracy}
  end

  @doc """
  Train the network for `epochs` epochs with mini-batches of `batch_size` rows
  and learning rate `lr`, reporting loss and accuracy after every batch.

  Returns the trained weights `{w1, w2}`. With `epochs` set to `0` no step is
  performed and the initial weights are returned.
  """
  def train(x_train, y_train, x_test, y_test, n_hidden_nodes, epochs, batch_size, lr) do
    n_input_variables = elem(Nx.shape(x_train), 1)
    n_classes = elem(Nx.shape(y_train), 1)

    initial_weights =
      initialize_weights(
        w1_shape: {n_input_variables + 1, n_hidden_nodes},
        w2_shape: {n_hidden_nodes + 1, n_classes}
      )

    {x_batches, y_batches} = prepare_batches(x_train, y_train, batch_size)

    # Pair every input batch with its labels and its index once, instead of
    # walking both lists with `Enum.at/2` on every step.
    indexed_batches = x_batches |> Enum.zip(y_batches) |> Enum.with_index()

    # The explicit `//1` step keeps the range empty when `epochs` is 0;
    # a plain `0..-1` would be a decreasing range and run two epochs.
    for epoch <- 0..(epochs - 1)//1,
        {{x_batch, y_batch}, batch} <- indexed_batches,
        reduce: initial_weights do
      {w1, w2} ->
        {updated_w1, updated_w2} = step(x_batch, y_batch, w1, w2, lr)

        {_loss, _accuracy} =
          report(epoch, batch, x_train, y_train, x_test, y_test, updated_w1, updated_w2)

        {updated_w1, updated_w2}
    end
  end

  @doc """
  Perform one gradient descent step on a single batch and return the updated
  weights `{w1, w2}`.
  """
  defn step(x_batch, y_batch, w1, w2, lr) do
    {y_hat, h} = forward(x_batch, w1, w2)
    {w1_gradient, w2_gradient} = back(x_batch, y_batch, y_hat, w2, h)
    w1 = w1 - w1_gradient * lr
    w2 = w2 - w2_gradient * lr

    {w1, w2}
  end
end

Regular MNIST vs Standardized MNIST

Regular MNIST

# Load MNIST with standardization skipped, so the images keep the values read from disk.
# `x_test_non_standardized` and `y_test` are bound here but not used by the training call below.
%{
  x_train: x_train_non_standardized,
  x_test: x_test_non_standardized,
  x_validation: x_validation_non_standardized,
  y_train: y_train,
  y_test: y_test,
  y_validation: y_validation
} = C15.MNISTStandardized.load(skip_standardization: true)
# Hyperparameters for this run.
n_hidden_nodes = 200
epochs = 2
batch_size = 60
learning_rate = 0.1

# The validation set is passed in the `x_test`/`y_test` positions of `train/8`,
# so the accuracy printed after every batch is measured on validation data.
{_w1, _w2} =
  C15.NeuralNetwork.train(
    x_train_non_standardized,
    y_train,
    x_validation_non_standardized,
    y_validation,
    n_hidden_nodes,
    epochs,
    batch_size,
    learning_rate
  )

Standardized MNIST

# Load MNIST with the default options, i.e. with standardized images.
# The variables bound here (`x_train`, `y_train`, `x_validation`, ...) are the
# ones the following cells of the notebook refer to.
%{
  x_train: x_train,
  x_test: x_test,
  x_validation: x_validation,
  y_train: y_train,
  y_test: y_test,
  y_validation: y_validation
} = C15.MNISTStandardized.load()
# Same hyperparameters as the non-standardized run, so the two outputs can be compared.
n_hidden_nodes = 200
epochs = 2
batch_size = 60
learning_rate = 0.1

# The validation set is passed in the `x_test`/`y_test` positions of `train/8`,
# so the accuracy printed after every batch is measured on validation data.
{_w1, _w2} =
  C15.NeuralNetwork.train(
    x_train,
    y_train,
    x_validation,
    y_validation,
    n_hidden_nodes,
    epochs,
    batch_size,
    learning_rate
  )

Tuning Hyperparameters

Neural Network implementation

This `train/8` replaces the previous one, and it differs from it in a few ways:

  • it runs for a specified time, rather than a specified number of epochs;
  • it runs quietly instead of reporting the loss and accuracy at each step;
  • at each step it stores the loss and time passed, and finally returns those histories to the caller;
  • it also returns the number of training epochs and the total number of gradient descent steps.
defmodule C15.NeuralNetworkWithTimeout do
  @moduledoc """
  This implementation extends the previous one `C15.NeuralNetwork` and
  it is based on the one implemented in Chapter 13.

  Instead of training for a fixed number of epochs it trains until a timeout
  expires, silently recording the training loss and the time of every
  measurement.
  """

  @doc """
  Train the network until `timeout_after_in_seconds` seconds have elapsed.

  `_x_validation` and `_y_validation` are accepted but not used.

  Returns the final training state, a map with the keys:

    * `:w1`, `:w2` - the trained weights
    * `:losses` - the training loss before the first step and after each step
    * `:times` - the `DateTime` at which each loss was recorded
    * `:steps` - total number of gradient descent steps performed
    * `:epochs` - number of complete passes over the batches
    * `:batch` - index of the batch the next step would have used
    * `:started_at`, `:timeout_at` - the `DateTime` bounds of the training
  """
  @spec train(
          Nx.Tensor.t(),
          Nx.Tensor.t(),
          term(),
          term(),
          pos_integer(),
          number(),
          pos_integer(),
          integer()
        ) :: map()
  def train(
        x_train,
        y_train,
        _x_validation,
        _y_validation,
        n_hidden_nodes,
        lr,
        batch_size,
        timeout_after_in_seconds
      ) do
    n_input_variables = elem(Nx.shape(x_train), 1)
    n_classes = elem(Nx.shape(y_train), 1)

    {initial_weight_1, initial_weight_2} =
      C15.NeuralNetwork.initialize_weights(
        w1_shape: {n_input_variables + 1, n_hidden_nodes},
        w2_shape: {n_hidden_nodes + 1, n_classes}
      )

    {x_batches, y_batches} = C15.NeuralNetwork.prepare_batches(x_train, y_train, batch_size)

    # A tuple of `{x_batch, y_batch}` pairs gives constant-time access by batch
    # index and a constant-time size, both of which are needed on every step.
    batches = x_batches |> Enum.zip(y_batches) |> List.to_tuple()

    now = DateTime.utc_now()

    initial_state = %{
      w1: initial_weight_1,
      w2: initial_weight_2,
      losses: [],
      times: [],
      steps: 0,
      epochs: 0,
      batch: 0,
      started_at: now,
      timeout_at: DateTime.add(now, timeout_after_in_seconds, :second)
    }

    do_train(x_train, y_train, batches, lr, initial_state)
  end

  # One iteration of the training loop: record the loss for the current
  # weights, then either stop (timeout expired) or perform one gradient descent
  # step and recurse. The batch index wraps around right after the last batch
  # has been used, which is also when a completed epoch is counted, so every
  # iteration performs exactly one step and records exactly one loss.
  defp do_train(
         x_train,
         y_train,
         batches,
         lr,
         %{w1: w1, w2: w2, timeout_at: timeout_at, batch: batch} = state
       ) do
    state = compute_loss_and_update_state(state, x_train, y_train, w1, w2)

    if DateTime.compare(DateTime.utc_now(), timeout_at) == :gt do
      state
    else
      {x_batch, y_batch} = elem(batches, batch)
      {updated_w1, updated_w2} = C15.NeuralNetwork.step(x_batch, y_batch, w1, w2, lr)

      {next_batch, epochs} =
        if batch + 1 == tuple_size(batches) do
          {0, state.epochs + 1}
        else
          {batch + 1, state.epochs}
        end

      do_train(
        x_train,
        y_train,
        batches,
        lr,
        %{
          state
          | w1: updated_w1,
            w2: updated_w2,
            batch: next_batch,
            epochs: epochs,
            steps: state.steps + 1
        }
      )
    end
  end

  @doc """
  Compute the loss on the whole training set for the weights `w1`/`w2` and
  return `state` with that loss appended to `:losses` and the current UTC time
  appended to `:times`.
  """
  @spec compute_loss_and_update_state(map(), Nx.Tensor.t(), Nx.Tensor.t(), Nx.Tensor.t(), Nx.Tensor.t()) ::
          map()
  def compute_loss_and_update_state(state, x_train, y_train, w1, w2) do
    {y_hat, _h} = C15.NeuralNetwork.forward(x_train, w1, w2)
    training_loss = C15.NeuralNetwork.loss(y_train, y_hat) |> Nx.to_number()

    %{
      state
      | losses: state.losses ++ [training_loss],
        times: state.times ++ [DateTime.utc_now()]
    }
  end
end

Utility for comparing the results of the different trainings.

defmodule C15.Comparator do
  @moduledoc """
  Utility for comparing the results of different trainings by plotting their
  loss histories on a single chart.
  """

  alias VegaLite, as: Vl

  @doc """
  Plot the loss over time of several training results, one line per result.

  `results_with_label` is a list of `{training_result_map, label}` tuples, where
  each training result map holds the keys `:losses`, `:times`, `:epochs`,
  `:steps` and `:started_at`. A one-line summary of every training is also
  printed.

  Returns the layered VegaLite chart.
  """
  @spec plot([{map(), String.t()}]) :: VegaLite.t()
  def plot(results_with_label) do
    all_plot_inputs = Enum.map(results_with_label, &plot_inputs/1)

    Vl.new(width: 600, height: 400)
    |> Vl.layers(
      Enum.map(all_plot_inputs, fn inputs ->
        Vl.new()
        |> Vl.data_from_values(inputs)
        |> Vl.mark(:line)
        |> Vl.encode_field(:x, "time", title: "time (seconds)", type: :quantitative)
        |> Vl.encode_field(:y, "loss", title: "loss", type: :quantitative)
        |> Vl.encode(:color, field: "type")
        |> Vl.encode(:stroke_dash, field: "type")
      end)
    )
  end

  # Prints a summary of one training and converts its loss/time histories into
  # a list of data points; `time` is expressed in seconds since the training
  # started and `type` carries the label used to tell the lines apart.
  defp plot_inputs({result, label}) do
    %{losses: losses, times: times, epochs: epochs, steps: steps, started_at: started_at} = result

    IO.puts(
      "Training #{label} Loss: #{List.last(losses)} (#{epochs} epochs completed, #{steps} total steps)"
    )

    Enum.zip_with([losses, times], fn [loss, time] ->
      %{
        type: label,
        loss: loss,
        time: DateTime.diff(time, started_at, :millisecond) / 1000
      }
    end)
  end
end

Tuning the Number of Hidden Nodes

# Compare different hidden layer sizes while keeping batch size and learning rate fixed.
batch_size = 128
lr = 0.1
# Each configuration trains for 10 minutes; the four trainings run one after the other.
timeout_after_in_seconds = 60 * 10

n_hidden_nodes_to_compare = [10, 100, 400, 1000]

# Collect `{training_result, label}` tuples, the input expected by `C15.Comparator.plot/1`.
n_hidden_nodes_training_results =
  Enum.map(n_hidden_nodes_to_compare, fn n_hidden_nodes ->
    result =
      C15.NeuralNetworkWithTimeout.train(
        x_train,
        y_train,
        x_validation,
        y_validation,
        n_hidden_nodes,
        lr,
        batch_size,
        timeout_after_in_seconds
      )

    {result, "h=#{n_hidden_nodes}"}
  end)

C15.Comparator.plot(n_hidden_nodes_training_results)

Tuning the Learning Rate

# Compare different learning rates while keeping batch size and hidden layer size fixed.
batch_size = 128
n_hidden_nodes = 100
# Each configuration trains for 10 minutes; the four trainings run one after the other.
timeout_after_in_seconds = 60 * 10

lr_to_compare = [0.001, 0.01, 0.1, 1]

# Collect `{training_result, label}` tuples, the input expected by `C15.Comparator.plot/1`.
lr_training_results =
  Enum.map(lr_to_compare, fn lr ->
    result =
      C15.NeuralNetworkWithTimeout.train(
        x_train,
        y_train,
        x_validation,
        y_validation,
        n_hidden_nodes,
        lr,
        batch_size,
        timeout_after_in_seconds
      )

    {result, "lr=#{lr}"}
  end)

C15.Comparator.plot(lr_training_results)

Tuning the Batch Size

# Compare different batch sizes while keeping hidden layer size and learning rate fixed.
n_hidden_nodes = 100
lr = 1
# Each configuration trains for 5 minutes; the four trainings run one after the other.
timeout_after_in_seconds = 60 * 5

# 60_000 - batch GD, all the examples in one batch
batch_sizes_to_compare = [60_000, 256, 128, 64]

# Collect `{training_result, label}` tuples, the input expected by `C15.Comparator.plot/1`.
batch_size_training_results =
  Enum.map(batch_sizes_to_compare, fn batch_size ->
    result =
      C15.NeuralNetworkWithTimeout.train(
        x_train,
        y_train,
        x_validation,
        y_validation,
        n_hidden_nodes,
        lr,
        batch_size,
        timeout_after_in_seconds
      )

    {result, "batch_size=#{batch_size}"}
  end)

C15.Comparator.plot(batch_size_training_results)

The Final Test

We’ve been training our network on the training set and measuring its performance on the validation set. It’s time to see how it performs on the test set, using the hyperparameters we’ve found in the previous sections.

# Hyperparameters for the final run.
n_hidden_nodes = 100
epochs = 10
batch_size = 256
lr = 1

# This time the test set is passed to `train/8`, so the accuracy printed after
# every batch is measured on the test data.
C15.NeuralNetwork.train(x_train, y_train, x_test, y_test, n_hidden_nodes, epochs, batch_size, lr)