Chapter 11: Training the Network
Mix.install(
[
{:exla, "~> 0.5"},
{:nx, "~> 0.5"}
],
config: [nx: [default_backend: EXLA.Backend]]
)
Load MNIST dataset
The module that loads the MNIST data is based on the one developed in Chapter 10, but the returned `y_train`
is already one-hot encoded.
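As a quick illustration of that encoding (a sketch, not part of the original module), `Nx.equal/2` broadcasts a column of labels against the row of classes `0..9`; this is the same technique used in `one_hot_encode/1` below:
# Sketch: one-hot encode a column of labels by broadcasting against the classes 0..9.
# Each row gets a 1 in the position of its label and 0 elsewhere.
labels = Nx.tensor([[5], [0]])
Nx.equal(labels, Nx.tensor(Enum.to_list(0..9)))
# => [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
#     [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]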
defmodule C11.MNIST do
@moduledoc """
Use this Module to load the MNIST database (test, train, and labels).
MNIST dataset specifications can be found here: http://yann.lecun.com/exdb/mnist/
"""
@data_path Path.join(__DIR__, "../data/mnist") |> Path.expand()
@train_images_filename Path.join(@data_path, "train-images-idx3-ubyte.gz")
@test_images_filename Path.join(@data_path, "t10k-images-idx3-ubyte.gz")
@train_labels_filename Path.join(@data_path, "train-labels-idx1-ubyte.gz")
@test_labels_filename Path.join(@data_path, "t10k-labels-idx1-ubyte.gz")
defstruct [:x_train, :x_test, :y_train, :y_test]
@doc """
Load the MNIST database and return the train and test images.
`y_train` is already one-hot encoded.
"""
def load() do
%__MODULE__{
# 60000 images, each 784 elements (28 * 28 pixels)
x_train: load_images(@train_images_filename),
# 10000 images, each 784 elements, with the same structure as `x_train`
x_test: load_images(@test_images_filename),
# 60000 labels
y_train: load_labels(@train_labels_filename) |> one_hot_encode(),
# 10000 labels, not one-hot encoded (raw labels from 0 to 9)
y_test: load_labels(@test_labels_filename)
}
end
@doc """
One-hot encode the given tensor (classes: from 0 to 9).
"""
def one_hot_encode(y) do
Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
end
@doc """
Load the MNIST labels from the given file
and return a matrix.
"""
def load_labels(filename) do
# Open and unzip the file of labels
with {:ok, binary} <- File.read(filename) do
<<_::32, n_labels::32, labels_binary::binary>> = :zlib.gunzip(binary)
# Create a tensor from the binary and
# reshape the list of labels into a one-column matrix.
labels_binary
|> Nx.from_binary({:u, 8})
|> Nx.reshape({n_labels, 1})
end
end
@doc """
Load the MNIST images from the given file
and return a matrix.
"""
def load_images(filename) do
# Open and unzip the file of images
with {:ok, binary} <- File.read(filename) do
<<_::32, n_images::32, n_rows::32, n_cols::32, images_binary::binary>> =
:zlib.gunzip(binary)
# Create a tensor from the binary and
# reshape the pixels into a matrix where each line is an image.
images_binary
|> Nx.from_binary({:u, 8})
|> Nx.reshape({n_images, n_cols * n_rows})
end
end
end
Load the data.
# Use the public API to get train and test images
%{x_train: x_train, x_test: x_test, y_train: y_train, y_test: y_test} = data = C11.MNIST.load()
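An optional sanity check (not part of the original notebook): inspect the shapes of the loaded tensors before training.
# Optional sanity check: inspect the shapes of the loaded tensors.
# Expected: x_train {60000, 784}, y_train {60000, 10} (one-hot encoded),
# x_test {10000, 784}, y_test {10000, 1} (raw labels).
IO.inspect(Nx.shape(x_train), label: "x_train shape")
IO.inspect(Nx.shape(y_train), label: "y_train shape")
IO.inspect(Nx.shape(x_test), label: "x_test shape")
IO.inspect(Nx.shape(y_test), label: "y_test shape")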
Backpropagation
Local gradient for `w2`:
$$
\frac{\partial L}{\partial w_2} = \text{SML}' \cdot \frac{\partial b}{\partial w_2} = (\hat y - y) \cdot h
$$
Local gradient for `w1`:
$$
\frac{\partial L}{\partial w_1} = \text{SML}' \cdot \frac{\partial b}{\partial h} \cdot \sigma' \cdot \frac{\partial a}{\partial w_1} = (\hat y - y) \cdot w_2 \cdot \sigma \cdot (1 - \sigma) \cdot x
$$
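Averaged over the $m$ training examples and written in matrix form, these are the gradients computed by `back/5` below (mirroring its `numpy` comments): here $\bar{h}$ and $\bar{x}$ are $h$ and $x$ with the bias column prepended, $\tilde{w}_2$ is $w_2$ without its first row (the bias weights), and $\odot$ is the element-wise product.
$$
\nabla w_2 = \frac{\bar{h}^{T} \cdot (\hat y - y)}{m}
\qquad
\nabla w_1 = \frac{\bar{x}^{T} \cdot \big( ((\hat y - y) \cdot \tilde{w}_2^{T}) \odot h \odot (1 - h) \big)}{m}
$$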
The Finished Network
The neural network is based on the classifier implemented in Chapter 7, with the addition of the backpropagation step when training the model.
defmodule C11.Classifier do
import Nx.Defn
@doc """
A sigmoid function is a mathematical function having
a characteristic "S"-shaped curve or sigmoid curve.
A sigmoid function:
- is monotonic
- has no local minima
- has a non-negative derivative at each point
It is used as activation function in the intermediate
layers of a neural network.
More here https://en.wikipedia.org/wiki/Sigmoid_function
"""
defn sigmoid(z) do
Nx.divide(1, Nx.add(1, Nx.exp(Nx.negate(z))))
end
@doc """
A softmax function turns a list of numbers (logits)
into probabilities that sum to one.
It is used as activation function in the last
layer of a neural network.
More here https://en.wikipedia.org/wiki/Softmax_function
For the MNIST dataset, `logits` is a tensor of shape `{60_000, 10}`:
- one row for each MNIST image (60_000)
- one column per class (0..9)
and it must return a matrix of the same shape.
"""
defn softmax(logits) do
exponentials = Nx.exp(logits)
Nx.divide(
exponentials,
Nx.sum(exponentials, axes: [1]) |> Nx.reshape({:auto, 1})
)
end
@doc """
Helper function that calculates the sigmoid's gradient
from the sigmoid's output.
"""
defn sigmoid_gradient(sigmoid) do
Nx.multiply(sigmoid, 1 - sigmoid)
end
@doc """
Cross-entropy loss.
It measures the distance between the classifier's prediction
and the labels.
"""
defn loss(y, y_hat) do
-Nx.sum(y * Nx.log(y_hat)) / elem(Nx.shape(y), 0)
end
@doc """
Prepend the bias, an extra column of 1s, to
the given tensor.
"""
defn prepend_bias(x) do
bias = Nx.broadcast(1, {elem(Nx.shape(x), 0), 1})
# Insert a column of 1s in the position 0 of x.
# ("axis: 1" stands for: "insert a column, not a row")
Nx.concatenate([bias, x], axis: 1)
end
@doc """
Forward propagation: it propagates data "forward" through the
network's layers, from the input to the hidden layer to the output.
It returns a tuple `{ŷ, h}` with the prediction tensor `ŷ` (`y_hat`)
and the tensor `h` for the hidden layer given the inputs and weights.
Each element in the tensors is now constrained between 0 and 1,
but the activation functions used for `h` and `y_hat` are
different:
- `sigmoid` for the hidden layer `h`
- `softmax` for the prediction tensor `y_hat`
Tensor shapes:
- `weight1` shape: `{785, 200}`
- `h` shape: `{60000, 200}`
- `weight2` shape: `{201, 10}`
- `y_hat` shape: `{60000, 10}`
"""
defn forward(x, weight1, weight2) do
h = sigmoid(Nx.dot(prepend_bias(x), weight1))
y_hat = softmax(Nx.dot(prepend_bias(h), weight2))
{y_hat, h}
end
@doc """
Calculates the gradients of the weights by multiplying the local
gradients of individual operations, from the loss to the weights.
It uses the chain rule: to calculate the gradient of any
node `y` with respect to any other node `x`, we multiply
the local gradients of all the nodes on the way back from `y` to `x`.
Thanks to the chain rule, we can calculate a complicated gradient
as a multiplication of many simple gradients.
"""
defn back(x, y, y_hat, weight2, h) do
# - The swapping and transposition are needed to get the correct dimensions
# - The bias column is prepended to `h` as it is done in `forward/3`
# - It is divided by `elem(Nx.shape(x), 0)` because the matrix multiplication
# gives us the accumulated gradient over all the examples, but we want the
# average gradient
#
# numpy:
# w2_gradient = np.matmul(prepend_bias(h).T, y_hat - Y) / X.shape[0]
w2_gradient =
Nx.dot(
Nx.transpose(prepend_bias(h)),
Nx.subtract(y_hat, y)
) / elem(Nx.shape(x), 0)
# - The swapping and transposition are needed to get a result with
# the same dimensions as `w2`.
# - In this case, we don't need to add a bias column to `h` because
# the bias is added after its calculation (when computing `y_hat`).
# Instead, the bias is prepended to `x` as it is done in the `forward/3`
# function.
# - And since we ignored the bias prepended to `h`, we also need to
# ignore its weights (the 1st row of `weight2`).
# - It is divided by `elem(Nx.shape(x), 0)` because the matrix multiplication
# gives us the accumulated gradient over all the examples, but we want the
# average gradient
#
# numpy:
# w1_gradient = np.matmul(prepend_bias(X).T, np.matmul(y_hat - Y, w2[1:].T) * sigmoid_gradient(h)) / X.shape[0]
w1_gradient =
Nx.dot(
Nx.transpose(prepend_bias(x)),
Nx.dot(y_hat - y, Nx.transpose(weight2[1..-1//1])) * sigmoid_gradient(h)
) / elem(Nx.shape(x), 0)
{w1_gradient, w2_gradient}
end
@doc """
Return a single-column matrix of predictions, where each value is between 0 and 9.
"""
defn classify(x, weight1, weight2) do
{y_hat, _h} = forward(x, weight1, weight2)
# Get the index of the maximum value in each row of `y_hat`
# (the value that's closer to 1).
# NOTE: in case of MNIST dataset, the returned index is also the
# decoded label (0..9).
labels = Nx.argmax(y_hat, axis: 1)
Nx.reshape(labels, {:auto, 1})
end
@doc """
Initialize the weights `weight1` and `weight2` with
the given shape passed as options.
- The weights are initialized with random numbers to "break the symmetry",
otherwise our neural network would behave as if it had only
one hidden node.
- The initial values must be small because large values
can cause problems if the network's functions are not numerically
stable (overflow). Plus, large values make the training slower and can
even halt the learning completely ("dead neurons").
The initialization of the weights is done via `Nx.Random.normal/4`
https://hexdocs.pm/nx/Nx.Random.html#normal/4
To do that, we need to pass a pseudo-random number generator (PRNG) key
to the function as an argument, using a new key for each of Nx's random
number generation calls.
https://hexdocs.pm/nx/Nx.Random.html#module-design-and-context
"""
defn initialize_weights(opts \\ []) do
opts = keyword!(opts, [:w1_shape, :w2_shape])
mean = 0.0
std_deviation = 0.01
prng_key = Nx.Random.key(1234)
{weight1, new_prng_key} =
Nx.Random.normal(prng_key, mean, std_deviation, shape: opts[:w1_shape])
{weight2, _new_prng_key} =
Nx.Random.normal(new_prng_key, mean, std_deviation, shape: opts[:w2_shape])
{weight1, weight2}
end
@doc """
Utility to report (to stdout) the loss and accuracy per iteration.
"""
def report(iteration, x_train, y_train, x_test, y_test, weight1, weight2) do
{y_hat, _h} = forward(x_train, weight1, weight2)
training_loss = loss(y_train, y_hat) |> Nx.to_number()
classifications = classify(x_test, weight1, weight2)
accuracy = Nx.multiply(Nx.mean(Nx.equal(classifications, y_test)), 100.0) |> Nx.to_number()
IO.puts("Iteration #{iteration}, Loss: #{training_loss}, Accuracy: #{accuracy}%")
end
@doc """
Computes the weights `w1` and `w2` by training the network
on the given inputs and labels, iterating
over the examples the specified number of times.
For each iteration, it prints the loss and the accuracy.
"""
def train(x_train, y_train, x_test, y_test, n_hidden_nodes, iterations, lr) do
n_input_variables = elem(Nx.shape(x_train), 1)
n_classes = elem(Nx.shape(y_train), 1)
{initial_weight_1, initial_weight_2} =
initialize_weights(
w1_shape: {n_input_variables + 1, n_hidden_nodes},
w2_shape: {n_hidden_nodes + 1, n_classes}
)
Enum.reduce(0..(iterations - 1), {initial_weight_1, initial_weight_2}, fn i, {w1, w2} ->
{updated_w1, updated_w2} = step(x_train, y_train, w1, w2, lr)
report(i, x_train, y_train, x_test, y_test, updated_w1, updated_w2)
{updated_w1, updated_w2}
end)
end
defnp step(x_train, y_train, w1, w2, lr) do
{y_hat, h} = forward(x_train, w1, w2)
{w1_gradient, w2_gradient} = back(x_train, y_train, y_hat, w2, h)
w1 = w1 - w1_gradient * lr
w2 = w2 - w2_gradient * lr
{w1, w2}
end
end
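Before training, a quick optional check of the activation functions on a toy tensor: softmax rows should sum to 1 and sigmoid should map values into the (0, 1) interval. This snippet is a sketch and not part of the original notebook; `toy_logits` is just an illustrative value.
# Optional check: softmax turns each row of logits into probabilities summing to 1,
# and sigmoid squashes values into the (0, 1) interval.
toy_logits = Nx.tensor([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])

C11.Classifier.softmax(toy_logits)
|> Nx.sum(axes: [1])
|> IO.inspect(label: "softmax row sums")

C11.Classifier.sigmoid(Nx.tensor([-2.0, 0.0, 2.0]))
|> IO.inspect(label: "sigmoid")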
Training the network
hidden_nodes = 200
# In the book the NN is trained for 10_000 iterations,
# but 3500 iterations already lead to roughly the same result
# in terms of loss and accuracy.
iterations = 3500
learning_rate = 0.01
{w1, w2} =
C11.Classifier.train(
x_train,
y_train,
x_test,
y_test,
hidden_nodes,
iterations,
learning_rate
)
Iteration 0, Loss: 2.2787163257598877, Accuracy: 13.799999237060547%
Iteration 1, Loss: 2.2671103477478027, Accuracy: 18.26999855041504%
Iteration 2, Loss: 2.2557082176208496, Accuracy: 24.209999084472656%
Iteration 3, Loss: 2.2444677352905273, Accuracy: 29.920000076293945%
Iteration 4, Loss: 2.2333567142486572, Accuracy: 34.720001220703125%
...
...
Iteration 3497, Loss: 0.13987472653388977, Accuracy: 94.20999908447266%
Iteration 3498, Loss: 0.13985344767570496, Accuracy: 94.20999908447266%
Iteration 3499, Loss: 0.13983216881752014, Accuracy: 94.20999908447266%
After roughly 35 minutes and 3500 iterations:
- Loss: 0.13983216881752014
- Accuracy: 94.20999908447266%
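With the trained weights `w1` and `w2`, the test set can be classified directly, mirroring what `report/7` does during training (a short sketch; `predictions` and `accuracy` are just illustrative names):
# Classify the test images with the trained weights and recompute the accuracy,
# mirroring the calculation performed by report/7.
predictions = C11.Classifier.classify(x_test, w1, w2)

accuracy =
  predictions
  |> Nx.equal(y_test)
  |> Nx.mean()
  |> Nx.multiply(100.0)
  |> Nx.to_number()

IO.puts("Final accuracy: #{accuracy}%")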