
Chapter 17: Defeating Overfitting

17_overfitting/17_overfitting.livemd

Mix.install(
  [
    {:exla, "~> 0.5"},
    {:nx, "~> 0.5"},
    {:axon, "~> 0.5"},
    {:kino, "~> 0.8.1"},
    {:kino_vega_lite, "~> 0.1.7"},
    {:vega_lite, "~> 0.1.6"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)

Regularizing the Model

Reviewing the Deep Network

Load Echidna Dataset

defmodule C17.EchidnaDataset do
  import Nx.Defn

  @data_path Path.join(__DIR__, "../data") |> Path.expand()

  @filename Path.join(@data_path, "echidna.txt")

  @doc """
  Loads the echidna dataset and returns the input `x` and label `y` tensors.

  - the dataset has been shuffled
  - the input tensor is already normalized
  """
  def load() do
    with {:ok, binary} <- read_file() do
      # seed the random algorithm
      :rand.seed(:exsss, {1, 2, 3})

      tensor =
        binary
        |> parse()
        |> Enum.shuffle()
        |> Nx.tensor()

      # all the rows, only first 2 columns
      x = tensor[[0..-1//1, 0..1//1]] |> normalize_inputs()

      # all the rows, only 3rd column
      y =
        tensor[[0..-1//1, 2]]
        |> Nx.reshape({:auto, 1})
        |> Nx.as_type(:u8)

      %{x: x, y: y}
    end
  end

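  # Parses the raw file contents into rows of floats, skipping the header line.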
  def parse(binary) do
    binary
    |> String.split("\n", trim: true)
    |> Enum.slice(1..-1//1)
    |> Enum.map(fn row ->
      row
      |> String.split(" ", trim: true)
      |> Enum.map(&parse_float/1)
    end)
  end

  # Normalization (Min-Max Scaler)
  #
  # In this approach, the data is scaled to a fixed range, usually 0 to 1.
  # In contrast to standardization, the cost of having this bounded range
  # is that we end up with smaller standard deviations,
  # which can suppress the effect of outliers.
  # Thus the Min-Max Scaler is sensitive to outliers.
  defnp normalize_inputs(x_raw) do
    # Compute the min/max over the first axis
    min = Nx.reduce_min(x_raw, axes: [0])
    max = Nx.reduce_max(x_raw, axes: [0])

    # After MinMaxScaling, the distributions are not centered
    # at zero and the standard deviation is not 1.
    # Therefore, subtract 0.5 to rescale data between -0.5 and 0.5
    (x_raw - min) / (max - min) - 0.5
  end

  # to handle both integer and float numbers
  defp parse_float(stringified_float) do
    {float, ""} = Float.parse(stringified_float)
    float
  end

  def read_file() do
    if File.exists?(@filename) do
      File.read(@filename)
    else
      {:error, "The file #{@filename} is missing!"}
    end
  end
end

Load the data and split the input/label tensors into training, validation, and test sets for the different stages.

%{x: x_all, y: y_all} = C17.EchidnaDataset.load()

size = (elem(Nx.shape(x_all), 0) / 3) |> ceil()

[x_train, x_validation, x_test] = Nx.to_batched(x_all, size) |> Enum.to_list()
[y_train, y_validation, y_test] = Nx.to_batched(y_all, size) |> Enum.to_list()

# One-hot encode the labels
y_train = Nx.equal(y_train, Nx.tensor([0, 1]))
y_validation = Nx.equal(y_validation, Nx.tensor([0, 1]))

Building a Neural Network with Axon

batch_size = 25

train_inputs = Nx.to_batched(x_train, batch_size)
train_labels = Nx.to_batched(y_train, batch_size)
train_batches = Stream.zip(train_inputs, train_labels)

validation_data = [{x_validation, y_validation}]

epochs = 30_000

# Set the `eps` option in RMSprop to prevent division by zero (NaN).
# Axon's default is 1.0e-8; I tried 1.0e-7 (the Keras default) and
# it was still returning NaN, so a larger value is used here.
epsilon = 1.0e-4

model =
  Axon.input("data")
  |> Axon.dense(100, activation: :sigmoid)
  |> Axon.dense(30, activation: :sigmoid)
  |> Axon.dense(2, activation: :softmax)

# `output_transform/1` applies a transformation on the final accumulated loop state.
#
# At the moment Axon does not provide a clean API to override/set it,
# therefore we use a hack (`Map.update/4`) to override its value in the loop's state.
#
# https://hexdocs.pm/axon/Axon.Loop.html#loop/3
# https://github.com/elixir-nx/axon/blob/d180f074c33cf841fcbaf44c8e66d677c364d713/test/axon/loop_test.exs#L1073-L1080
output_transform = fn %Axon.Loop.State{step_state: step_state, metrics: metrics} ->
  %{params: step_state[:model_state], metrics: metrics}
end

# (~450 seconds on CPU)
%{params: params, metrics: metrics} =
  model
  |> Axon.Loop.trainer(:categorical_cross_entropy, Axon.Optimizers.rmsprop(0.001, eps: epsilon))
  |> Axon.Loop.validate(model, validation_data)
  |> Map.update(:output_transform, nil, fn _original_output_transform ->
    fn state -> output_transform.(state) end
  end)
  |> Axon.Loop.run(train_batches, %{}, epochs: epochs, compiler: EXLA)

training_losses =
  metrics
  |> Enum.sort_by(fn {index, _metric} -> index end)
  |> Enum.map(fn {index, %{"loss" => loss}} ->
    %{loss: Nx.to_number(loss), epoch: index, type: "training"}
  end)

validation_losses =
  metrics
  |> Enum.sort_by(fn {index, _metric} -> index end)
  |> Enum.map(fn {index, %{"validation_loss" => validation_loss}} ->
    %{loss: Nx.to_number(validation_loss), epoch: index, type: "validation"}
  end)

VegaLite.new(width: 600, height: 400)
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(training_losses, only: ["epoch", "loss", "type"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "epoch", type: :quantitative)
  |> VegaLite.encode_field(:y, "loss", type: :quantitative)
  |> VegaLite.encode_field(:color, "type", type: :nominal),
  VegaLite.new()
  |> VegaLite.data_from_values(validation_losses, only: ["epoch", "loss", "type"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "epoch", type: :quantitative)
  |> VegaLite.encode_field(:y, "loss", type: :quantitative)
  |> VegaLite.encode_field(:color, "type", type: :nominal)
])

L1 and L2 regularization

I couldn’t replicate this section of the book because L1/L2 regularization is not supported by Axon out of the box.

More details in this post in the Elixir Forum.

Interestingly enough, it was possible with a previous version of Axon, but the feature was later removed for the following reasons:

  • It’s not in PyTorch, and it didn’t seem very commonly used in TensorFlow
  • Regularization is a concern of training/optimization and not the model

It should be possible to achieve this by customizing the training loop or the optimizer to apply L1/L2 regularization per layer. I tried, but I couldn’t manage to make it work 😞. A rough sketch of one possible direction is below.
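
One way to approximate L2 regularization without changing the model is to compose a weight-decay transformation with the optimizer. This is only a sketch under my assumptions about the Axon 0.5 `Axon.Updates` API (the `:decay` and `:eps` options and the pipeline-style composition); `l2_strength` is a placeholder value, not a tuned hyperparameter.

# Sketch: classic L2 regularization expressed as weight decay on the updates.
# `add_decayed_weights/1` adds `decay * weight` to each gradient before the
# RMSprop scaling, which is equivalent to adding an L2 penalty to the loss.
l2_strength = 1.0e-4

l2_rmsprop =
  Axon.Updates.add_decayed_weights(decay: l2_strength)
  |> Axon.Updates.scale_by_rms(eps: epsilon)
  |> Axon.Updates.scale(-0.001)

# It could then replace the stock optimizer in the trainer:
#
#   model
#   |> Axon.Loop.trainer(:categorical_cross_entropy, l2_rmsprop)
#   |> Axon.Loop.run(train_batches, %{}, epochs: epochs, compiler: EXLA)

L1 regularization would instead need an update that adds `decay * Nx.sign(weight)` to the gradients; as far as I can tell, Axon.Updates does not ship such a transformation, so it would require a custom one.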