
Chapter 14: The Zen of Testing

Mix.install(
  [
    {:nx, "~> 0.7.1"},
    {:exla, "~> 0.7.1"},
    {:kino_vega_lite, "~> 0.1.10"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)

Overfitting

Overfitting is a problem where supervised learning systems tend to memorize their training examples instead of generalizing from them.
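
A quick way to see overfitting in numbers is to track the gap between the training loss and the test loss across iterations: a gap that keeps widening means the network keeps improving on the examples it memorizes while it stops improving on unseen data. The helper below is a small illustrative sketch (not part of the book's code), and the loss values passed to it are made up.

# Illustrative only: compute the per-iteration gap between test and training loss.
# A gap that keeps growing is the typical signature of overfitting.
loss_gap = fn training_losses, test_losses ->
  Enum.zip_with(training_losses, test_losses, fn train, test -> test - train end)
end

# With these made-up losses the gap widens from roughly 0.05 to 0.35 over three iterations.
loss_gap.([0.9, 0.5, 0.3], [0.95, 0.7, 0.65])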

defmodule C7.MNIST do
  @moduledoc "MNIST functions from Chapter 7"
  def load_images(path) do
    # Open and unzip the file of images, then store the header information into variables
    <<_magic_number::32, n_images::32, n_rows::32, n_cols::32, images::binary>> =
      path
      |> File.read!()
      |> :zlib.gunzip()

    images
    # Create 1D tensor of type unsigned 8-bit integer from binary
    |> Nx.from_binary({:u, 8})
    # Reshape the pixels into a matrix where each row is an image
    |> Nx.reshape({n_images, n_cols * n_rows})
  end

  @doc """
  Inserts a column of 1s at position 0 of tensor x, along axis 1 (one bias element per row)
  """
  def prepend_bias(x) do
    {row, _col} = Nx.shape(x)
    bias = Nx.broadcast(Nx.tensor(1), {row, 1})
    Nx.concatenate([bias, x], axis: 1)
  end

  def load_labels(filename) do
    # Open and unzip the file of labels, then read all the labels into a binary
    <<_magic_number::32, n_items::32, labels::binary>> =
      filename
      |> File.read!()
      |> :zlib.gunzip()

    labels
    # Create 1D tensor of type unsigned 8-bit integer from binary
    |> Nx.from_binary({:u, 8})
    # Reshape the labels into a 1-column matrix
    |> Nx.reshape({n_items, 1})
  end

  @doc "Flip hot values to 1"
  def one_hot_encode(y) do
    Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
  end

  def load_datasets do
    files_path = __DIR__ |> Path.join("/files")

    training_images_path = Path.join(files_path, "train-images-idx3-ubyte.gz")
    training_labels_path = Path.join(files_path, "train-labels-idx1-ubyte.gz")

    test_images_path = Path.join(files_path, "t10k-images-idx3-ubyte.gz")
    test_labels_path = Path.join(files_path, "t10k-labels-idx1-ubyte.gz")

    y_train_unencoded = load_labels(training_labels_path)
    y_train = one_hot_encode(y_train_unencoded)
    y_test = load_labels(test_labels_path)

    x_train = load_images(training_images_path)
    x_test = load_images(test_images_path)

    %{
      x_train: x_train,
      y_train: y_train,
      x_test: x_test,
      y_test: y_test,
      y_train_unencoded: y_train_unencoded
    }
  end
end

Neural network from Chapter 11, but with train/7 modified to also return the training and test losses recorded at each iteration.

defmodule C14.NeuralNetwork do
  import C7.MNIST, only: [prepend_bias: 1]
  import Nx.Defn

  @doc """
  For w2_gradient:
  - Swap the operands of the multiplication then transpose one of them to get a result with the
  same dimension as w2.
  - We also need to do a prepend_bias/1 since we did it with forward/3, we need to do the same
  during backprop.
  - The matrix multiplication needs to be divide by the rows of x (number of examples) to get
  the average gradient

  For w1_gradient:
  - Use h as is, without a bias column since it has no effect on dL/dw1
  - Since we ignored the first column of h, we also have to ignore the first row of w1 to match
  columns by rows in matrix multiplication
  - `sigmoid_gradient/1` calculates sigmoid's gradient from sigmoid's output `h`
  - Lastly multiply x to the previous intermediate result with same rules when we calculate
  w2_gradient
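
  In short, with m the number of examples and ⊙ element-wise multiplication, the code below
  computes:

      w2_gradient = prepend_bias(h)ᵀ · (ŷ − y) / m
      w1_gradient = prepend_bias(x)ᵀ · ((ŷ − y) · w2[1..]ᵀ ⊙ h ⊙ (1 − h)) / m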
  """
  def back(x, y, y_hat, w2, h) do
    {num_samples, _} = Nx.shape(x)

    w2_gradient =
      h
      |> prepend_bias()
      |> Nx.transpose()
      |> Nx.dot(Nx.subtract(y_hat, y))
      |> Nx.divide(num_samples)

    w1_gradient =
      x
      |> prepend_bias()
      |> Nx.transpose()
      |> Nx.dot(
        y_hat
        |> Nx.subtract(y)
        |> Nx.dot(Nx.transpose(w2[1..-1//1]))
        |> Nx.multiply(sigmoid_gradient(h))
      )
      |> Nx.divide(num_samples)

    {w1_gradient, w2_gradient}
  end

  def initialize_weights(n_input_vars, n_hidden_nodes, n_classes) do
    key = Nx.Random.key(1234)
    mean = 0.0
    standard_deviation = 0.01

    w1_rows = n_input_vars + 1

    {w1, new_key} =
      Nx.Random.normal(key, mean, standard_deviation, shape: {w1_rows, n_hidden_nodes})

    w2_rows = n_hidden_nodes + 1

    {w2, _new_key} =
      Nx.Random.normal(new_key, mean, standard_deviation, shape: {w2_rows, n_classes})

    {w1, w2}
  end

  @doc """
  Hacked to also track the training and test losses at each iteration; they are returned as
  lists in the resulting map, together with the final weights.
  """
  def train(x_train, y_train, x_test, y_test, n_hidden_nodes, iterations, lr) do
    {_, n_input_variables} = Nx.shape(x_train)
    {_, n_classes} = Nx.shape(y_train)
    {w1, w2} = initialize_weights(n_input_variables, n_hidden_nodes, n_classes)

    state = %{
      w1: w1,
      w2: w2,
      iterations: [],
      training_losses: [],
      test_losses: []
    }

    Enum.reduce(1..iterations, state, fn iteration,
                                         %{
                                           w1: w1,
                                           w2: w2,
                                           iterations: iterations,
                                           training_losses: training_losses,
                                           test_losses: test_losses
                                         } ->
      {y_train_hat, h} = forward(x_train, w1, w2)
      {y_test_hat, _h} = forward(x_test, w1, w2)
      {w1_gradient, w2_gradient} = back(x_train, y_train, y_train_hat, w2, h)
      w1 = Nx.subtract(w1, Nx.multiply(w1_gradient, lr))
      w2 = Nx.subtract(w2, Nx.multiply(w2_gradient, lr))

      training_loss = loss(y_train, y_train_hat)
      test_loss = loss(y_test, y_test_hat)

      IO.puts(
        "#{iteration} > Training loss: #{Nx.to_number(training_loss)} - Test loss: #{Nx.to_number(test_loss)}"
      )

      %{
        state
        | w1: w1,
          w2: w2,
          iterations: [iteration | iterations],
          training_losses: [Nx.to_number(training_loss) | training_losses],
          test_losses: [Nx.to_number(test_loss) | test_losses]
      }
    end)
  end

  # Functions from `Ch 10: Building the Network` below

  defn sigmoid(z) do
    1 / (1 + Nx.exp(-z))
  end

  def softmax(logits) do
    exponentials = Nx.exp(logits)

    sum_of_exponentials_by_row =
      exponentials
      |> Nx.sum(axes: [1])
      |> Nx.reshape({:auto, 1})

    Nx.divide(exponentials, sum_of_exponentials_by_row)
  end

  def sigmoid_gradient(sigmoid) do
    Nx.multiply(sigmoid, Nx.subtract(1, sigmoid))
  end

  @doc """
  Cross-entropy loss: the loss formula specific to multiclass classifiers.
  It measures the distance between the classifier's predictions and the labels.
  A lower loss means a better classifier.
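  Computed as L = -(1/m) * Σ (Y ⊙ log(Ŷ)), where m is the number of examples and ⊙ is the
  element-wise product.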
  """
  def loss(y_train, y_hat) do
    {rows, _} = Nx.shape(y_train)

    y_train
    |> Nx.multiply(Nx.log(y_hat))
    |> Nx.sum()
    |> Nx.multiply(-1)
    |> Nx.divide(rows)
  end

  @doc """
  Implements the operation called "forward propagation"
  Steps:
  1. We add a bias to the inputs
  2. Compute the weighted sum using the 1st matrix of weights, w1
  3. Pass the result to the activation function (sigmoid or softmax)
  4. Repeat for all layers
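
  In formula form: h = sigmoid(prepend_bias(x) · w1) and ŷ = softmax(prepend_bias(h) · w2).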
  """
  def forward(x, w1, w2) do
    # Hidden layer
    h =
      x
      |> prepend_bias()
      |> then(&amp;Nx.dot(&amp;1, w1))
      |> sigmoid()

    # Output layer
    y_hat =
      h
      |> prepend_bias()
      |> then(&amp;Nx.dot(&amp;1, w2))
      |> softmax()

    {y_hat, h}
  end

  @doc """
  Same classify function from Ch 7, but modified for the neural network
  """
  def classify(x, w1, w2) do
    x
    |> forward(w1, w2)
    |> elem(0)
    |> Nx.argmax(axis: 1)
    |> Nx.reshape({:auto, 1})
  end

  def accuracy(x, y_unencoded, w1, w2) do
    x
    |> classify(w1, w2)
    |> Nx.equal(y_unencoded)
    |> Nx.mean()
    |> Nx.multiply(100.0)
    |> Nx.to_number()
  end
end

%{
  x_train: x_train,
  y_train: y_train,
  x_test: x_test,
  y_test: y_test,
  y_train_unencoded: y_train_unencoded
} = C7.MNIST.load_datasets()

n_hidden_nodes = 200
lr = 0.01
iterations = 10

%{
  w1: w1,
  w2: w2,
  iterations: iterations,
  training_losses: training_losses,
  test_losses: test_losses
} =
  C14.NeuralNetwork.train(
    x_train,
    y_train,
    x_test,
    C7.MNIST.one_hot_encode(y_test),
    n_hidden_nodes,
    iterations,
    lr
  )

training_accuracy =
  C14.NeuralNetwork.accuracy(x_train, y_train_unencoded, w1, w2) |> Float.round(2)

test_accuracy = C14.NeuralNetwork.accuracy(x_test, y_test, w1, w2) |> Float.round(2)
IO.puts("Training accuracy: #{training_accuracy}%, Test accuracy: #{test_accuracy}%")

training_losses =
  iterations
  |> Enum.zip(training_losses)
  |> Enum.map(fn {i, loss} ->
    %{iteration: i, loss: loss, dataset: "Training set"}
  end)

test_losses =
  iterations
  |> Enum.zip(test_losses)
  |> Enum.map(fn {i, loss} ->
    %{iteration: i, loss: loss, dataset: "Test set"}
  end)

VegaLite.new(width: 600, height: 500, title: "Training and Test Set Loss per Iteration")
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(training_losses)
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "iteration", type: :quantitative, title: "Iterations")
  |> VegaLite.encode_field(:y, "loss", type: :quantitative, title: "Loss")
  |> VegaLite.encode(:color, field: "dataset", title: "Dataset"),
  VegaLite.new()
  |> VegaLite.data_from_values(test_losses)
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "iteration", type: :quantitative, title: "Iterations")
  |> VegaLite.encode_field(:y, "loss", type: :quantitative, title: "Loss")
  |> VegaLite.encode(:color, field: "dataset", title: "Dataset")
])

The large difference between the network's accuracy on the training set and its accuracy on the test set is the effect of overfitting: the network is more accurate on the data it was trained on. This is the reason why we need to stick to the Blind Test Rule: test your system on data that it hasn't seen before.
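
To put a single number on that effect, we can compute the accuracy gap from the two values above (a small illustrative snippet that reuses the training_accuracy and test_accuracy variables bound in the previous cell; the overfitting_gap name is ours, not the book's):

# Difference between training and test accuracy, in percentage points.
overfitting_gap = Float.round(training_accuracy - test_accuracy, 2)
IO.puts("Overfitting gap: #{overfitting_gap} percentage points")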

A Testing Conundrum

Don’ts

  1. Use the test set during tuning
  2. Split the dataset into just two: train and test

Dos

Split the dataset into three: a training set, a validation set, and a test set. A common split is 60/20/20. (A minimal split sketch follows the numbered steps below.)

  1. Setup: put the test set aside
  2. Development lifecycle: train the network using the training set but use the validation set to gauge performance
  3. Final test: after tuning the hyperparameters, use the test set to estimate how the network will perform in production
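
Here is a minimal sketch of a generic 60/20/20 split using Nx.split/2, the same function the module below uses to carve a validation set out of the MNIST test set. The toy_* names and the tiny 10-example tensor are made up purely for illustration.

# Illustrative data: 10 "examples" with 3 features each (labels would be split the same way).
toy_x = Nx.iota({10, 3})

# Carve off the first 60% as the training set...
{toy_train, toy_rest} = Nx.split(toy_x, 0.6)

# ...then split the remaining 40% evenly into validation and test sets.
{toy_validation, toy_test} = Nx.split(toy_rest, 0.5)

# Shapes come out as {6, 3}, {2, 3}, and {2, 3}.
{Nx.shape(toy_train), Nx.shape(toy_validation), Nx.shape(toy_test)}
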
defmodule C14.MNISTThreeSets do
  @moduledoc "MNIST functions from Chapter 7"
  def load_images(path) do
    # Open and unzip the file of images, then store the header information into variables
    <<_magic_number::32, n_images::32, n_rows::32, n_cols::32, images::binary>> =
      path
      |> File.read!()
      |> :zlib.gunzip()

    images
    # Create 1D tensor of type unsigned 8-bit integer from binary
    |> Nx.from_binary({:u, 8})
    # Reshape the pixels into a matrix where each row is an image
    |> Nx.reshape({n_images, n_cols * n_rows})
  end

  @doc """
  Inserts a column of 1s at position 0 of tensor x, along axis 1 (one bias element per row)
  """
  def prepend_bias(x) do
    {row, _col} = Nx.shape(x)
    bias = Nx.broadcast(Nx.tensor(1), {row, 1})
    Nx.concatenate([bias, x], axis: 1)
  end

  def load_labels(filename) do
    # Open and unzip the file of labels, then read all the labels into a binary
    <<_magic_number::32, n_items::32, labels::binary>> =
      filename
      |> File.read!()
      |> :zlib.gunzip()

    labels
    # Create 1D tensor of type unsigned 8-bit integer from binary
    |> Nx.from_binary({:u, 8})
    # Reshape the labels into a 1-column matrix
    |> Nx.reshape({n_items, 1})
  end

  @doc "Flip hot values to 1"
  def one_hot_encode(y) do
    Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
  end

  def load_datasets do
    files_path = __DIR__ |> Path.join("/files")

    training_images_path = Path.join(files_path, "train-images-idx3-ubyte.gz")
    training_labels_path = Path.join(files_path, "train-labels-idx1-ubyte.gz")

    test_images_path = Path.join(files_path, "t10k-images-idx3-ubyte.gz")
    test_labels_path = Path.join(files_path, "t10k-labels-idx1-ubyte.gz")

    # 60K labels, each a single digit from 0 to 9
    y_train_unencoded = load_labels(training_labels_path)

    # X_train/X_validation/X_test: 60K/5K/5K images
    # Each image has 784 elements (28 * 28 pixels)
    x_train = load_images(training_images_path)
    x_test_all = load_images(test_images_path)
    {x_test, x_validation} = Nx.split(x_test_all, 0.5)

    # Y_train: 60K labels, each consisting of 10 one-hot-encoded elements
    y_train = one_hot_encode(y_train_unencoded)
    # Y_test/Y_validation: 5K/5K labels, each a single digit from 0 to 9
    y_test_all = load_labels(test_labels_path)
    {y_test, y_validation} = Nx.split(y_test_all, 0.5)

    %{
      x_test_all: x_test_all,
      y_test_all: y_test_all,
      x_train: x_train,
      y_train: y_train,
      x_test: x_test,
      y_test: y_test,
      x_validation: x_validation,
      y_validation: y_validation,
      y_train_unencoded: y_train_unencoded
    }
  end
end

%{
  x_test_all: x_test_all,
  y_test_all: y_test_all,
  x_test: x_test,
  x_validation: x_val,
  y_test: y_test,
  y_validation: y_val
} = C14.MNISTThreeSets.load_datasets()

{x_total_rows, x_total_columns} = Nx.shape(x_test_all)
{y_total_rows, y_total_columns} = Nx.shape(y_test_all)

expected_shape = {div(x_total_rows, 2), x_total_columns}
^expected_shape = Nx.shape(x_test)
^expected_shape = Nx.shape(x_val)

expected_shape = {div(y_total_rows, 2), y_total_columns}
^expected_shape = Nx.shape(y_test)
^expected_shape = Nx.shape(y_val)