Chapter 14: The Zen of Testing
Mix.install(
[
{:nx, "~> 0.7.1"},
{:exla, "~> 0.7.1"},
{:kino_vega_lite, "~> 0.1.10"}
],
config: [nx: [default_backend: EXLA.Backend]]
)
Overfitting
Overfitting is the problem where a supervised learning system memorizes its training examples instead of generalizing from them: it performs well on the data it was trained on, and noticeably worse on data it has never seen.
defmodule C7.MNIST do
@moduledoc "MNIST functions from Chapter 7"
def load_images(path) do
# Open and unzip the file of images then store header information into variables
<<_magic_number::32, n_images::32, n_rows::32, n_cols::32, images::binary>> =
path
|> File.read!()
|> :zlib.gunzip()
images
# Create 1D tensor of type unsigned 8-bit integer from binary
|> Nx.from_binary({:u, 8})
# Reshape the pixels into a matrix where each row is an image
|> Nx.reshape({n_images, n_cols * n_rows})
end
@doc """
Inserts a column of 1's at position 0 of tensor X along axis 1, so every row gets a bias value of 1.
(A usage sketch follows this module.)
"""
def prepend_bias(x) do
{row, _col} = Nx.shape(x)
bias = Nx.broadcast(Nx.tensor(1), {row, 1})
Nx.concatenate([bias, x], axis: 1)
end
def load_labels(filename) do
# Open and unzip the file of labels, then read the header and all the labels
<<_magic_number::32, n_items::32, labels::binary>> =
filename
|> File.read!()
|> :zlib.gunzip()
labels
# Create 1D tensor of type unsigned 8-bit integer from binary
|> Nx.from_binary({:u, 8})
# Reshape the labels into a 1-column matrix
|> Nx.reshape({n_items, 1})
end
@doc "Flip hot values to 1"
def one_hot_encode(y) do
Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
end
def load_datasets do
files_path = __DIR__ |> Path.join("/files")
training_images_path = Path.join(files_path, "train-images-idx3-ubyte.gz")
training_labels_path = Path.join(files_path, "train-labels-idx1-ubyte.gz")
test_images_path = Path.join(files_path, "t10k-images-idx3-ubyte.gz")
test_labels_path = Path.join(files_path, "t10k-labels-idx1-ubyte.gz")
y_train_unencoded = load_labels(training_labels_path)
y_train = one_hot_encode(y_train_unencoded)
y_test = load_labels(test_labels_path)
x_train = load_images(training_images_path)
x_test = load_images(test_images_path)
%{
x_train: x_train,
y_train: y_train,
x_test: x_test,
y_test: y_test,
y_train_unencoded: y_train_unencoded
}
end
end
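To make these helpers concrete, here is a quick sketch with made-up tiny tensors (not MNIST data) showing what prepend_bias/1 and one_hot_encode/1 return:
# Hypothetical tiny inputs, purely for illustration.
C7.MNIST.prepend_bias(Nx.tensor([[2, 3], [4, 5]]))
# => a {2, 3} tensor [[1, 2, 3], [1, 4, 5]]: a column of 1's prepended to each row
C7.MNIST.one_hot_encode(Nx.tensor([[2], [0]]))
# => a {2, 10} tensor of 0's and 1's with a single 1 per row, at columns 2 and 0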
Neural network from Chapter 11, but with train/7 modified to also return the training-set and test-set losses recorded at each iteration.
defmodule C14.NeuralNetwork do
import C7.MNIST, only: [prepend_bias: 1]
import Nx.Defn
@doc """
For w2_gradient:
- Swap the operands of the multiplication, then transpose one of them, so the result has the
same dimensions as w2.
- We also need to call prepend_bias/1 on h because forward/3 prepends a bias, so backprop has
to mirror it.
- The matrix multiplication is divided by the number of rows of x (the number of examples) to
get the average gradient.
For w1_gradient:
- Use h as is, without a bias column, since the bias has no effect on dL/dw1.
- Because we ignore the bias column of h, we also have to drop the first row of w2 to keep the
columns and rows aligned in the matrix multiplication.
- sigmoid_gradient/1 calculates the sigmoid's gradient from the sigmoid's output h.
- Lastly, multiply by x following the same rules used for w2_gradient.
(A small shape-check sketch follows this module.)
"""
def back(x, y, y_hat, w2, h) do
{num_samples, _} = Nx.shape(x)
w2_gradient =
h
|> prepend_bias()
|> Nx.transpose()
|> Nx.dot(Nx.subtract(y_hat, y))
|> Nx.divide(num_samples)
w1_gradient =
x
|> prepend_bias()
|> Nx.transpose()
|> Nx.dot(
y_hat
|> Nx.subtract(y)
|> Nx.dot(Nx.transpose(w2[1..-1//1]))
|> Nx.multiply(sigmoid_gradient(h))
)
|> Nx.divide(num_samples)
{w1_gradient, w2_gradient}
end
def initialize_weights(n_input_vars, n_hidden_nodes, n_classes) do
key = Nx.Random.key(1234)
mean = 0.0
standard_deviation = 0.01
w1_rows = n_input_vars + 1
{w1, new_key} =
Nx.Random.normal(key, mean, standard_deviation, shape: {w1_rows, n_hidden_nodes})
w2_rows = n_hidden_nodes + 1
{w2, _new_key} =
Nx.Random.normal(new_key, mean, standard_deviation, shape: {w2_rows, n_classes})
{w1, w2}
end
@doc """
Modified to also return the training and test losses recorded at each iteration, together with the final weights.
"""
def train(x_train, y_train, x_test, y_test, n_hidden_nodes, iterations, lr) do
{_, n_input_variables} = Nx.shape(x_train)
{_, n_classes} = Nx.shape(y_train)
{w1, w2} = initialize_weights(n_input_variables, n_hidden_nodes, n_classes)
state = %{
w1: w1,
w2: w2,
iterations: [],
training_losses: [],
test_losses: []
}
Enum.reduce(1..iterations, state, fn iteration,
%{
w1: w1,
w2: w2,
iterations: iterations,
training_losses: training_losses,
test_losses: test_losses
} ->
{y_train_hat, h} = forward(x_train, w1, w2)
{y_test_hat, _h} = forward(x_test, w1, w2)
{w1_gradient, w2_gradient} = back(x_train, y_train, y_train_hat, w2, h)
w1 = Nx.subtract(w1, Nx.multiply(w1_gradient, lr))
w2 = Nx.subtract(w2, Nx.multiply(w2_gradient, lr))
training_loss = loss(y_train, y_train_hat)
test_loss = loss(y_test, y_test_hat)
IO.puts(
"#{iteration} > Training loss: #{Nx.to_number(training_loss)} - Test loss: #{Nx.to_number(test_loss)}"
)
%{
state
| w1: w1,
w2: w2,
iterations: [iteration | iterations],
training_losses: [Nx.to_number(training_loss) | training_losses],
test_losses: [Nx.to_number(test_loss) | test_losses]
}
end)
end
# Functions from `Ch 10: Building the Network` below
defn sigmoid(z) do
1 / (1 + Nx.exp(-z))
end
def softmax(logits) do
exponentials = Nx.exp(logits)
sum_of_exponentials_by_row =
exponentials
|> Nx.sum(axes: [1])
|> Nx.reshape({:auto, 1})
Nx.divide(exponentials, sum_of_exponentials_by_row)
end
def sigmoid_gradient(sigmoid) do
Nx.multiply(sigmoid, Nx.subtract(1, sigmoid))
end
@doc """
Cross-entropy loss: the loss formula for multiclass classifiers.
It measures the distance between the classifier's predictions and the labels;
a lower loss means a better classifier. (A tiny worked example follows this module.)
"""
def loss(y_train, y_hat) do
{rows, _} = Nx.shape(y_train)
y_train
|> Nx.multiply(Nx.log(y_hat))
|> Nx.sum()
|> Nx.multiply(-1)
|> Nx.divide(rows)
end
@doc """
Implements the operation called "forward propagation".
Steps:
1. Add a bias column to the inputs
2. Compute the weighted sum using the first matrix of weights, w1
3. Pass the result through the activation function (sigmoid for the hidden layer, softmax for the output layer)
4. Repeat for every layer
"""
def forward(x, w1, w2) do
# Hidden layer
h =
x
|> prepend_bias()
|> then(&Nx.dot(&1, w1))
|> sigmoid()
# Output layer
y_hat =
h
|> prepend_bias()
|> then(&Nx.dot(&1, w2))
|> softmax()
{y_hat, h}
end
@doc """
Same classify function from Chapter 7, but adapted to the neural network: it runs forward/3 with two weight matrices and picks the class with the highest predicted probability.
"""
def classify(x, w1, w2) do
x
|> forward(w1, w2)
|> elem(0)
|> Nx.argmax(axis: 1)
|> Nx.reshape({:auto, 1})
end
def accuracy(x, y_unencoded, w1, w2) do
x
|> classify(w1, w2)
|> Nx.equal(y_unencoded)
|> Nx.mean()
|> Nx.multiply(100.0)
|> Nx.to_number()
end
end
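As a quick sanity check before the full training run, here is a sketch with tiny hypothetical shapes (not MNIST): it confirms that the gradients returned by back/5 have the same shapes as the weights they update, and works the cross-entropy loss out on two hand-picked predictions.
# Hypothetical tiny network: 3 inputs, 5 hidden nodes, 2 classes, 4 examples.
{x, _key} = Nx.Random.normal(Nx.Random.key(0), 0.0, 1.0, shape: {4, 3})
y = Nx.tensor([[1, 0], [0, 1], [1, 0], [0, 1]])
{w1, w2} = C14.NeuralNetwork.initialize_weights(3, 5, 2)
{y_hat, h} = C14.NeuralNetwork.forward(x, w1, w2)
{w1_grad, w2_grad} = C14.NeuralNetwork.back(x, y, y_hat, w2, h)
# Each gradient has the same shape as the weight matrix it updates.
{Nx.shape(w1_grad) == Nx.shape(w1), Nx.shape(w2_grad) == Nx.shape(w2)}
# => {true, true}

# Cross-entropy on two hand-picked predictions: the mean of -log(0.8) and -log(0.9).
C14.NeuralNetwork.loss(Nx.tensor([[0, 1], [1, 0]]), Nx.tensor([[0.2, 0.8], [0.9, 0.1]]))
# => roughly 0.164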
%{
x_train: x_train,
y_train: y_train,
x_test: x_test,
y_test: y_test,
y_train_unencoded: y_train_unencoded
} = C7.MNIST.load_datasets()
n_hidden_nodes = 200
lr = 0.01
iterations = 10
%{
w1: w1,
w2: w2,
iterations: iterations,
training_losses: training_losses,
test_losses: test_losses
} =
C14.NeuralNetwork.train(
x_train,
y_train,
x_test,
C7.MNIST.one_hot_encode(y_test),
n_hidden_nodes,
iterations,
lr
)
training_accuracy =
C14.NeuralNetwork.accuracy(x_train, y_train_unencoded, w1, w2) |> Float.round(2)
test_accuracy = C14.NeuralNetwork.accuracy(x_test, y_test, w1, w2) |> Float.round(2)
IO.puts("Training accuracy: #{training_accuracy}%, Test accuracy: #{test_accuracy}%")
training_losses =
iterations
|> Enum.zip(training_losses)
|> Enum.map(fn {i, loss} ->
%{iteration: i, loss: loss, dataset: "Training set"}
end)
test_losses =
iterations
|> Enum.zip(test_losses)
|> Enum.map(fn {i, loss} ->
%{iteration: i, loss: loss, dataset: "Test set"}
end)
VegaLite.new(width: 600, height: 500, title: "Training and Test sets Loss per Iteration")
|> VegaLite.layers([
VegaLite.new()
|> VegaLite.data_from_values(training_losses)
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "iteration", type: :quantitative, title: "Iterations")
|> VegaLite.encode_field(:y, "loss", type: :quantitative, title: "Loss")
|> VegaLite.encode(:color, field: "dataset", title: "Dataset"),
VegaLite.new()
|> VegaLite.data_from_values(test_losses)
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "iteration", type: :quantitative, title: "Iterations")
|> VegaLite.encode_field(:y, "loss", type: :quantitative, title: "Loss")
|> VegaLite.encode(:color, field: "dataset", title: "Dataset")
])
The large gap between the network's accuracy on the training set and its accuracy on the test set is the effect of overfitting: the network is more accurate on the data it was trained on. This is why we need to stick to the Blind Test Rule: always test your system on data that it has never seen before.
A Testing Conundrum
Don’ts
- Use the test set during tuning
- Split the dataset into just two: train and test
Dos
Split the dataset into three sets: training, validation, and test. A common split is 60/20/20. The workflow then looks like this (a generic split is sketched right after this list):
- Setup: put the test set aside
- Development lifecycle: train the network on the training set, and use the validation set to gauge its performance while tuning
- Final test: once the hyperparameters are tuned, use the test set to estimate how the network will perform in production
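The chapter's code below keeps the standard MNIST files and simply halves the 10K-image test file into validation and test sets. For a generic dataset, a 60/20/20 split can be sketched with two calls to Nx.split/3 (the data tensor here is hypothetical):
# Hypothetical dataset: 100 examples with 4 features each.
data = Nx.iota({100, 4})
# Split off 60% for training, then halve the remaining 40% into validation and test.
{train, rest} = Nx.split(data, 0.6)
{validation, test} = Nx.split(rest, 0.5)
{Nx.shape(train), Nx.shape(validation), Nx.shape(test)}
# => {{60, 4}, {20, 4}, {20, 4}}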
defmodule C14.MNISTThreeSets do
@moduledoc "MNIST functions from Chapter 7, modified to split the MNIST test data into separate validation and test sets"
def load_images(path) do
# Open and unzip the file of images then store header information into variables
<<_magic_number::32, n_images::32, n_rows::32, n_cols::32, images::binary>> =
path
|> File.read!()
|> :zlib.gunzip()
images
# Create 1D tensor of type unsigned 8-bit integer from binary
|> Nx.from_binary({:u, 8})
# Reshape the pixels into a matrix where each row is an image
|> Nx.reshape({n_images, n_cols * n_rows})
end
@doc """
Inserts a column of 1's at position 0 of tensor X along axis 1, so every row gets a bias value of 1.
"""
def prepend_bias(x) do
{row, _col} = Nx.shape(x)
bias = Nx.broadcast(Nx.tensor(1), {row, 1})
Nx.concatenate([bias, x], axis: 1)
end
def load_labels(filename) do
# Open and unzip the file of labels, then read the header and all the labels
<<_magic_number::32, n_items::32, labels::binary>> =
filename
|> File.read!()
|> :zlib.gunzip()
labels
# Create 1D tensor of type unsigned 8-bit integer from binary
|> Nx.from_binary({:u, 8})
# Reshape the labels into a 1-column matrix
|> Nx.reshape({n_items, 1})
end
@doc "Flip hot values to 1"
def one_hot_encode(y) do
Nx.equal(y, Nx.tensor(Enum.to_list(0..9)))
end
def load_datasets do
files_path = __DIR__ |> Path.join("/files")
training_images_path = Path.join(files_path, "train-images-idx3-ubyte.gz")
training_labels_path = Path.join(files_path, "train-labels-idx1-ubyte.gz")
test_images_path = Path.join(files_path, "t10k-images-idx3-ubyte.gz")
test_labels_path = Path.join(files_path, "t10k-labels-idx1-ubyte.gz")
# 60K labels, each a single digit from 0 to 9
y_train_unencoded = load_labels(training_labels_path)
# X_train/X_validation/X_test: 60K/5K/5K images
# Each image has 784 elements (28 * 28 pixels)
x_train = load_images(training_images_path)
x_test_all = load_images(test_images_path)
{x_test, x_validation} = Nx.split(x_test_all, 0.5)
# Y_train: 60K labels, each consisting of 10 one-hot-encoded elements
y_train = one_hot_encode(y_train_unencoded)
# Y_validation/Y_test: 5K/5K labels, each a single digit from 0 to 9
y_test_all = load_labels(test_labels_path)
{y_test, y_validation} = Nx.split(y_test_all, 0.5)
%{
x_test_all: x_test_all,
y_test_all: y_test_all,
x_train: x_train,
y_train: y_train,
x_test: x_test,
y_test: y_test,
x_validation: x_validation,
y_validation: y_validation,
y_train_unencoded: y_train_unencoded
}
end
end
%{
x_test_all: x_test_all,
y_test_all: y_test_all,
x_test: x_test,
x_validation: x_val,
y_test: y_test,
y_validation: y_val
} = C14.MNISTThreeSets.load_datasets()
{x_total_rows, x_total_columns} = Nx.shape(x_test_all)
{y_total_rows, y_total_columns} = Nx.shape(y_test_all)
expected_shape = {div(x_total_rows, 2), x_total_columns}
^expected_shape = Nx.shape(x_test)
^expected_shape = Nx.shape(x_val)
expected_shape = {div(y_total_rows, 2), y_total_columns}
^expected_shape = Nx.shape(y_test)
^expected_shape = Nx.shape(y_val)