Chapter 11: Training the Network

11_training/neural_network.livemd

Mix.install(
  [
    {:exla, "~> 0.5"},
    {:nx, "~> 0.5"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)

Load MNIST dataset

The module that loads the MNIST data is based on the one developed in Chapter 10, but the returned `y_train` is already one-hot encoded.

defmodule C11.MNIST do
  @moduledoc """
  Use this Module to load the MNIST database (test, train, and labels).

  MNIST dataset specifications can be found here: http://yann.lecun.com/exdb/mnist/
  """

  @data_path Path.join(__DIR__, "../data/mnist") |> Path.expand()

  @train_images_filename Path.join(@data_path, "train-images-idx3-ubyte.gz")
  @test_images_filename Path.join(@data_path, "t10k-images-idx3-ubyte.gz")
  @train_labels_filename Path.join(@data_path, "train-labels-idx1-ubyte.gz")
  @test_labels_filename Path.join(@data_path, "t10k-labels-idx1-ubyte.gz")

  defstruct [:x_train, :x_test, :y_train, :y_test]

  @doc """
  Load the MNIST database and return the train and test images and labels.

  `y_train` is already one-hot encoded.
  """
  def load() do
    %__MODULE__{
      # 60000 images, each 784 elements (28 * 28 pixels)
      x_train: load_images(@train_images_filename),
      # 10000 images, each 784 elements, with the same structure as `x_train`
      x_test: load_images(@test_images_filename),
      # 60000 labels
      y_train: load_labels(@train_labels_filename) |> one_hot_encode(),
      # 10000 labels, NOT one-hot encoded (used as-is to compare with the classifier's predictions)
      y_test: load_labels(@test_labels_filename)
    }
  end

  @doc """
  One-hot encode the given tensor (classes: from 0 to 9).
  """
  def one_hot_encode(y) do
    Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
  end

  @doc """
  Load the MNIST labels from the given file
  and return a matrix.
  """
  def load_labels(filename) do
    # Open and unzip the file of labels
    with {:ok, binary} <- File.read(filename) do
      <<_::32, n_labels::32, labels_binary::binary>> = :zlib.gunzip(binary)

      # Create a tensor from the binary and
      # reshape the list of labels into a one-column matrix.
      labels_binary
      |> Nx.from_binary({:u, 8})
      |> Nx.reshape({n_labels, 1})
    end
  end

  @doc """
  Load the MNIST images from the given file
  and return a matrix.
  """
  def load_images(filename) do
    # Open and unzip the file of images
    with {:ok, binary} <- File.read(filename) do
      <<_::32, n_images::32, n_rows::32, n_cols::32, images_binary::binary>> =
        :zlib.gunzip(binary)

      # Create a tensor from the binary and
      # reshape the pixels into a matrix where each line is an image.
      images_binary
      |> Nx.from_binary({:u, 8})
      |> Nx.reshape({n_images, n_cols * n_rows})
    end
  end
end
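
As a quick illustration (an extra cell, not part of the original notebook), `one_hot_encode/1` turns each label into a row of ten elements with a single 1 at the index of its class:

# 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# 0 -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
C11.MNIST.one_hot_encode(Nx.tensor([[3], [0]]))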

Load the data.

# Use the public API to get train and test images
%{x_train: x_train, x_test: x_test, y_train: y_train, y_test: y_test} = data = C11.MNIST.load()
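
A quick sanity check on the shapes (expected values taken from the comments in the module above): `x_train` should be `{60000, 784}`, `y_train` `{60000, 10}` after the one-hot encoding, `x_test` `{10000, 784}` and `y_test` `{10000, 1}`.

# Inspect the shapes of the loaded tensors
{Nx.shape(x_train), Nx.shape(y_train), Nx.shape(x_test), Nx.shape(y_test)}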

Backpropagation

Local gradient for `w2`:

$$
\frac{\partial L}{\partial w2} = SML' \cdot \frac{\partial b}{\partial w2} = (\hat y - y) \cdot h
$$

Local gradient for `w1`:

$$
\frac{\partial L}{\partial w1} = SML' \cdot \frac{\partial b}{\partial h} \cdot \sigma' \cdot \frac{\partial a}{\partial w1} = (\hat y - y) \cdot w2 \cdot \sigma \cdot (1 - \sigma) \cdot x
$$

Here `SML'` is the combined local gradient of the softmax and the cross-entropy loss, with `a = x · w1`, `h = σ(a)` and `b = h · w2` (bias terms omitted for brevity).
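
To make the shapes in these formulas concrete, here is a minimal Nx sketch with toy dimensions (random tensors stand in for real data, and the bias columns are left out for brevity) that mirrors the two matrix products implemented below in `back/3`:

# Toy dimensions: 4 examples, 3 input variables, 2 hidden nodes, 10 classes
key = Nx.Random.key(42)
{x, key} = Nx.Random.uniform(key, shape: {4, 3})
{h, key} = Nx.Random.uniform(key, shape: {4, 2})
{w2, key} = Nx.Random.uniform(key, shape: {2, 10})
{y_hat, key} = Nx.Random.uniform(key, shape: {4, 10})
{y, _key} = Nx.Random.uniform(key, shape: {4, 10})

# dL/dw2 = transpose(h) . (y_hat - y), averaged over the examples
w2_gradient = Nx.dot(Nx.transpose(h), Nx.subtract(y_hat, y)) |> Nx.divide(4)

# dL/dw1 = transpose(x) . ((y_hat - y) . transpose(w2) * sigmoid_gradient), averaged
sigmoid_gradient = Nx.multiply(h, Nx.subtract(1, h))

w1_gradient =
  Nx.dot(
    Nx.transpose(x),
    Nx.multiply(Nx.dot(Nx.subtract(y_hat, y), Nx.transpose(w2)), sigmoid_gradient)
  )
  |> Nx.divide(4)

# The gradients have the same shapes as the weights they update: {2, 10} and {3, 2}
{Nx.shape(w2_gradient), Nx.shape(w1_gradient)}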

The Finished Network

The neural network is based on the classifier implemented in Chapter 7, but adds a backpropagation step when training the model.

defmodule C11.Classifier do
  import Nx.Defn

  @doc """
  A sigmoid function is a mathematical function having
  a characteristic "S"-shaped curve or sigmoid curve.

  A sigmoid function:
  - is monotonic
  - has no local minimums
  - has a non-negative derivative for each point

  It is used as activation function in the intermediate
  layers of a neural network.

  More here https://en.wikipedia.org/wiki/Sigmoid_function
  """
  defn sigmoid(z) do
    Nx.divide(1, Nx.add(1, Nx.exp(Nx.negate(z))))
  end

  @doc """
  A softmax function turns a list of numbers (logits)
  into probabilities that sum to one.

  It is used as activation function in the last
  layer of a neural network.

  More here https://en.wikipedia.org/wiki/Softmax_function

  For the MNIST training set, `logits` is a `{60_000, 10}` tensor:
  - one row for each MNIST image (60_000)
  - one column per class (0..9)
  and it must return a matrix of the same shape.
  """
  defn softmax(logits) do
    exponentials = Nx.exp(logits)

    Nx.divide(
      exponentials,
      Nx.sum(exponentials, axes: [1]) |> Nx.reshape({:auto, 1})
    )
  end

  @doc """
  Helper function that calculates the sigmoid's gradient
  from the sigmoid's output.
  """
  defn sigmoid_gradient(sigmoid) do
    Nx.multiply(sigmoid, 1 - sigmoid)
  end

  @doc """
  Cross-entropy loss.

  It measures the distance between the classifier's prediction
  and the labels.
  """
  defn loss(y, y_hat) do
    -Nx.sum(y * Nx.log(y_hat)) / elem(Nx.shape(y), 0)
  end

  @doc """
  Prepend the bias, an extra column of 1s, to
  the given tensor.
  """
  defn prepend_bias(x) do
    bias = Nx.broadcast(1, {elem(Nx.shape(x), 0), 1})

    # Insert a column of 1s in the position 0 of x.
    # ("axis: 1" stands for: "insert a column, not a row")
    Nx.concatenate([bias, x], axis: 1)
  end

  @doc """
  Forward propagation: it propagates data "forward" through the
  network's layers, from input to hidden layer to output.
  It returns a tuple `{ŷ, h}` with the prediction tensor `ŷ` (`y_hat`)
  and the tensor `h` for the hidden layer given the inputs and weights.

  Each element in the tensors is now constrained between 0 and 1,
  but the activation functions used for `h` and `y_hat` are
  different:
  - `sigmoid` for the hidden layer `h`
  - `softmax` for the prediction tensor `y_hat`

  Tensors shapes:
  - `weight1` shape: `{785, 200}`
  - `h` shape: `{60000, 200}`
  - `weight2` shape: `{201, 10}`
  - `y_hat` shape: `{60000, 10}`
  """
  defn forward(x, weight1, weight2) do
    h = sigmoid(Nx.dot(prepend_bias(x), weight1))
    y_hat = softmax(Nx.dot(prepend_bias(h), weight2))

    {y_hat, h}
  end

  @doc """
  Calculates the gradients of the weights by multiplying the local
  gradients of individual operations, from the loss to the weights.

  It uses the chain rule: to calculate the gradient of any
  node `y` with respect to any other node `x`, we multiply
  the local gradients of all the nodes on the way back from `y` to `x`.
  Thanks to the chain rule, we can calculate a complicated gradient
  as a multiplication of many simple gradients.
  """
  defn back(x, y, y_hat, weight2, h) do
    # - The swapping and transposition are needed to get the correct dimension
    # - The bias column is prepended to `h` as it is done in `forward/3`
    # - It is divided by `elem(Nx.shape(x), 0)` because the matrix multiplication
    # gives us the accumulated gradient over all the examples, but we want the
    # average gradient
    #
    # numpy:
    # w2_gradient = np.matmul(prepend_bias(h).T, y_hat - Y) / X.shape[0]
    w2_gradient =
      Nx.dot(
        Nx.transpose(prepend_bias(h)),
        Nx.subtract(y_hat, y)
      ) / elem(Nx.shape(x), 0)

    # - The swapping and transposition are needed to get a result with
    # the same dimensions as `w2`.
    # - In this case, we don't need to add a bias column to `h` because
    # the bias is added after its calculation (when computing `y_hat`)
    # Instead, the bias is prepended to `x` as it is done in the `forward/3`
    # function
    # - And since we ignored the bias prepended to `h`, we need to
    # ignore its weights (the 1st row of `weight2`).
    # - It is divided by `elem(Nx.shape(x), 0)` because the matrix multiplication
    # gives us the accumulated gradient over all the examples, but we want the
    # average gradient
    #
    # numpy:
    # w1_gradient = np.matmul(prepend_bias(X).T, np.matmul(y_hat - Y, w2[1:].T) * sigmoid_gradient(h)) / X.shape[0]
    w1_gradient =
      Nx.dot(
        Nx.transpose(prepend_bias(x)),
        Nx.dot(y_hat - y, Nx.transpose(weight2[1..-1//1])) * sigmoid_gradient(h)
      ) / elem(Nx.shape(x), 0)

    {w1_gradient, w2_gradient}
  end

  @doc """
  Return a single-column matrix of predictions, where each value is between 0 and 9.
  """
  defn classify(x, weight1, weight2) do
    {y_hat, _h} = forward(x, weight1, weight2)

    # Get the index of the maximum value in each row of `y_hat`
    # (the value that's closer to 1).
    # NOTE: in the case of the MNIST dataset, the returned index is also the
    # decoded label (0..9).
    labels = Nx.argmax(y_hat, axis: 1)

    Nx.reshape(labels, {:auto, 1})
  end

  @doc """
  Initialize the weights `weight1` and `weight2` with
  the given shape passed as options.

  - The weights are initialized with random numbers to "break the symmetry",
  otherwise our neural network would behave as if it had only
  one hidden node.
  - The initial values must be small because large values
  can cause problems if the network's functions are not numerically
  stable (overflow). Plus, large values make training slower and can
  even halt learning completely ("dead neurons").

  The initialization of the weights is done via `Nx.Random.normal/4`
  https://hexdocs.pm/nx/Nx.Random.html#normal/4

  To do that, we need to pass a pseudo-random number generator (PRNG) key
  to the function as an argument, using a new key for each Nx random number
  generation call.
  https://hexdocs.pm/nx/Nx.Random.html#module-design-and-context
  """
  defn initialize_weights(opts \\ []) do
    opts = keyword!(opts, [:w1_shape, :w2_shape])
    mean = 0.0
    std_deviation = 0.01

    prng_key = Nx.Random.key(1234)

    {weight1, new_prng_key} =
      Nx.Random.normal(prng_key, mean, std_deviation, shape: opts[:w1_shape])

    {weight2, _new_prng_key} =
      Nx.Random.normal(new_prng_key, mean, std_deviation, shape: opts[:w2_shape])

    {weight1, weight2}
  end

  @doc """
  Utility to report (to stdout) the loss per iteration.
  """
  def report(iteration, x_train, y_train, x_test, y_test, weight1, weight2) do
    {y_hat, _h} = forward(x_train, weight1, weight2)
    training_loss = loss(y_train, y_hat) |> Nx.to_number()
    classifications = classify(x_test, weight1, weight2)
    accuracy = Nx.multiply(Nx.mean(Nx.equal(classifications, y_test)), 100.0) |> Nx.to_number()

    IO.puts("Iteration #{iteration}, Loss: #{training_loss}, Accuracy: #{accuracy}%")
  end

  @doc """
  Computes the weights `w1` and `w2` by training the network
  with the given inputs and labels, iterating over the examples
  the specified number of times.

  For each iteration, it prints the loss and the accuracy.
  """
  def train(x_train, y_train, x_test, y_test, n_hidden_nodes, iterations, lr) do
    n_input_variables = elem(Nx.shape(x_train), 1)
    n_classes = elem(Nx.shape(y_train), 1)

    {initial_weight_1, initial_weight_2} =
      initialize_weights(
        w1_shape: {n_input_variables + 1, n_hidden_nodes},
        w2_shape: {n_hidden_nodes + 1, n_classes}
      )

    Enum.reduce(0..(iterations - 1), {initial_weight_1, initial_weight_2}, fn i, {w1, w2} ->
      {updated_w1, updated_w2} = step(x_train, y_train, w1, w2, lr)
      report(i, x_train, y_train, x_test, y_test, updated_w1, updated_w2)
      {updated_w1, updated_w2}
    end)
  end

  defnp step(x_train, y_train, w1, w2, lr) do
    {y_hat, h} = forward(x_train, w1, w2)
    {w1_gradient, w2_gradient} = back(x_train, y_train, y_hat, w2, h)
    w1 = w1 - w1_gradient * lr
    w2 = w2 - w2_gradient * lr

    {w1, w2}
  end
end
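
Before training, the building blocks can be exercised on tiny tensors as a sanity check (an extra cell, not part of the original notebook): each row of the `softmax/1` output should sum to 1, and `prepend_bias/1` should add a leading column of 1s.

# Each row of the softmax output sums to 1
probabilities = C11.Classifier.softmax(Nx.tensor([[1.0, 2.0, 3.0], [1.0, 1.0, 1.0]]))
row_sums = Nx.sum(probabilities, axes: [1])

# prepend_bias turns a {2, 3} tensor into a {2, 4} one, with 1s in the first column
with_bias = C11.Classifier.prepend_bias(Nx.tensor([[0.5, 0.5, 0.5], [0.1, 0.2, 0.3]]))

{row_sums, with_bias}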

Training the network

hidden_nodes = 200
# In the book the NN is trained for 10_000 iterations,
# but 3_500 iterations already lead to the same result in terms of
# loss and accuracy
iterations = 3500
learning_rate = 0.01

{w1, w2} =
  C11.Classifier.train(
    x_train,
    y_train,
    x_test,
    y_test,
    hidden_nodes,
    iterations,
    learning_rate
  )
Iteration 0, Loss: 2.2787163257598877, Accuracy: 13.799999237060547%
Iteration 1, Loss: 2.2671103477478027, Accuracy: 18.26999855041504%
Iteration 2, Loss: 2.2557082176208496, Accuracy: 24.209999084472656%
Iteration 3, Loss: 2.2444677352905273, Accuracy: 29.920000076293945%
Iteration 4, Loss: 2.2333567142486572, Accuracy: 34.720001220703125%
...
...
Iteration 3497, Loss: 0.13987472653388977, Accuracy: 94.20999908447266%
Iteration 3498, Loss: 0.13985344767570496, Accuracy: 94.20999908447266%
Iteration 3499, Loss: 0.13983216881752014, Accuracy: 94.20999908447266%

After roughly 35 minutes and 3500 iterations:

  • Loss: 0.13983216881752014
  • Accuracy: 94.20999908447266%
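
With the trained weights bound to `w1` and `w2`, the test accuracy can also be recomputed directly, using the same computation as `report/7` (a small extra cell for illustration):

# Classify the test set with the trained weights and compare against the labels
classifications = C11.Classifier.classify(x_test, w1, w2)

Nx.equal(classifications, y_test)
|> Nx.mean()
|> Nx.multiply(100.0)
|> Nx.to_number()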