Chapter 3: Walking the Gradient

03_gradient/03_gradient.livemd

@dgigafox

programming_machine_learn...

Share to X

Share to Bluesky

More notebooks

Chapter 3: Walking the Gradient

Mix.install([
  {:nx, "~> 0.5.3"},
  {:kino_vega_lite, "~> 0.1.7"}
])

The Problem with Chapter 2 Algorithm

The train/4 function from Chapter 2 looks like this

def train(x, y, iterations, lr) do
    w = b = 0

    Enum.reduce_while(0..iterations, {w, b}, fn i, {w, b} ->
      current_loss = loss(x, y, w, b) |> Nx.to_number()
      IO.puts("Iteration #{i} => Loss: #{current_loss}")

      cond do
        loss(x, y, w + lr, b) |> Nx.to_number() < current_loss -> {:cont, {w + lr, b}}
        loss(x, y, w - lr, b) |> Nx.to_number() < current_loss -> {:cont, {w - lr, b}}
        loss(x, y, w, b + lr) |> Nx.to_number() < current_loss -> {:cont, {w, b + lr}}
        loss(x, y, w, b - lr) |> Nx.to_number() < current_loss -> {:cont, {w, b - lr}}
        true -> {:halt, {w, b}}
      end
    end)
  end

The problem with this algorithm is that we are tweaking bias b and weight w separately i.e. we first rotate the line that estimates the model by adjusting w. If current loss cannot go down further, then we shift the line up and down by adjusting b. This approach could go wrong: as we tweak b we might be increasing the loss caused by w and vice versa. In conclusion, we need to simultaneously tweak both w and b and compute the losses.

We could update the train/4 to try all the possible combinations of tweaks but would be impractical because combinations will increase as we increase the number of parameters and in our case there will be three raise to two (3^2) or nine (9) combination of calls per iteration i.e. three (3) possibilities for two (2) parameters w and b values are to increase, decrease, or unchange

Importing dataset

path = __DIR__ |> Path.join("files/pizza.txt") |> Path.expand()

{x, y} =
  path
  |> File.stream!()
  |> Stream.map(&amp;String.split/1)
  |> Stream.map(&amp;List.to_tuple/1)
  |> Enum.into([])
  # Drop header
  |> List.delete_at(0)
  |> Enum.unzip()

x = Enum.map(x, &amp;String.to_integer/1) |> Nx.tensor()
y = Enum.map(y, &amp;String.to_integer/1) |> Nx.tensor()

Loss Curve

Here’s a simple program that graphs the value of losses when we tweak the weight from -1 to 4. Our aim is to get the weight that results to the smallest possible loss in the graph by using gradient descent

defmodule C2.LinearRegression do
  import Nx.Defn

  defn predict(x, w) do
    x * w
  end

  defn loss(x, y, w) do
    x
    |> predict(w)
    |> Nx.subtract(y)
    |> Nx.pow(2)
    |> Nx.mean()
  end
end

range = -1..4

weight_vs_loss = %{
  weight: range,
  loss: Enum.map(range, &amp;(C2.LinearRegression.loss(x, y, &amp;1) |> Nx.to_number()))
}

VegaLite.new(title: "Weight vs Loss")
|> VegaLite.data_from_values(weight_vs_loss, only: ["weight", "loss"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "weight", type: :quantitative)
|> VegaLite.encode_field(:y, "loss", type: :quantitative)

Gradient Descent

Gradient is the slope of the loss curve. It is measured by getting the derivative of the loss with respect to the weight dL/dW

defmodule C3.GradientDescentWithoutBias do
  import Nx.Defn

  defn predict(x, w, b) do
    x * w + b
  end

  def loss(x, y, w, b) do
    x
    |> predict(w, b)
    |> Nx.subtract(y)
    |> Nx.pow(2)
    |> Nx.mean()
  end

  defn gradient(x, y, w) do
    x
    |> predict(w, 0)
    |> Nx.subtract(y)
    |> Nx.multiply(x)
    |> Nx.mean()
    |> Nx.multiply(2)
  end

  def train(x, y, iterations, lr) do
    w = 0

    Enum.reduce(0..iterations, w, fn i, w ->
      IO.puts("Iteration #{i} => Loss #{loss(x, y, w, 0) |> Nx.to_number()}")
      w - Nx.to_number(gradient(x, y, w)) * lr
    end)
  end
end

Running the code

w = C3.GradientDescentWithoutBias.train(x, y, 100, 0.001)
IO.puts("w=#{Nx.to_number(w)}")

Escape from flatland

Changing b from a constant to a variable changes loss vs bias graph from 2D to 3D curve

Partial derivatives

Since the gradient is now a function of b and w we will not be able to use C3.GradientDescentWithoutBias.gradient/3 anymore since in that function b is a constant. The gradient should now be calculated by using partial derivatives where we calculate the derivative of the loss function while pretending b is a constant and another derivative while pretending w is a constant

defmodule C3.GradientDescentFinal do
  import Nx.Defn

  defn predict(x, w, b) do
    x * w + b
  end

  def loss(x, y, w, b) do
    x
    |> predict(w, b)
    |> Nx.subtract(y)
    |> Nx.pow(2)
    |> Nx.mean()
  end

  def gradient(x, y, w, b) do
    # Derivative of L with respect to w where b is constant
    w_gradient =
      x
      |> predict(w, b)
      |> Nx.subtract(y)
      |> Nx.multiply(x)
      |> Nx.mean()
      |> Nx.multiply(2)

    # Derivative of L with respect to b where w is constant
    b_gradient =
      x
      |> predict(w, b)
      |> Nx.subtract(y)
      |> Nx.mean()
      |> Nx.multiply(2)

    {w_gradient, b_gradient}
  end

  def train(x, y, iterations, lr) do
    w = b = 0

    Enum.reduce(0..iterations, {w, b}, fn i, {w, b} ->
      IO.puts("Iteration #{i} => Loss #{loss(x, y, w, b) |> Nx.to_number()}")
      {w_gradient, b_gradient} = gradient(x, y, w, b)
      w = w - Nx.to_number(w_gradient) * lr
      b = b - Nx.to_number(b_gradient) * lr
      {w, b}
    end)
  end
end

Copying linear regression with bias from Chapter 2 so we can compare

defmodule C2.LinearRegressionWithBias do
  import Nx.Defn

  defn predict(x, w, b) do
    x * w + b
  end

  def loss(x, y, w, b) do
    x
    |> predict(w, b)
    |> Nx.subtract(y)
    |> Nx.pow(2)
    |> Nx.mean()
  end

  def train(x, y, iterations, lr) do
    w = b = 0

    Enum.reduce_while(0..iterations, {w, b}, fn i, {w, b} ->
      current_loss = loss(x, y, w, b) |> Nx.to_number()
      IO.puts("Iteration #{i} => Loss: #{current_loss}")

      cond do
        loss(x, y, w + lr, b) |> Nx.to_number() < current_loss -> {:cont, {w + lr, b}}
        loss(x, y, w - lr, b) |> Nx.to_number() < current_loss -> {:cont, {w - lr, b}}
        loss(x, y, w, b + lr) |> Nx.to_number() < current_loss -> {:cont, {w, b + lr}}
        loss(x, y, w, b - lr) |> Nx.to_number() < current_loss -> {:cont, {w, b - lr}}
        true -> {:halt, {w, b}}
      end
    end)
  end
end

First, let’s run the earlier version with plenty of iterations and low lr of 0.0001

{w, b} = C2.LinearRegressionWithBias.train(x, y, 157_777, 0.0001)
IO.puts("w=#{w}, b=#{b}")
x_hat = 20
y_hat = C3.GradientDescentFinal.predict(x_hat, w, b)
IO.puts("Prediction x=#{x_hat} => y=#{Nx.to_number(y_hat)}")

With the new Gradient Descent implementation with just 20,000 iterations we get:

{w, b} = C3.GradientDescentFinal.train(x, y, 20_000, 0.001)
IO.puts("w=#{w}, b=#{b}")
x_hat = 20
y_hat = C3.GradientDescentFinal.predict(x_hat, w, b)
IO.puts("Prediction x=#{x_hat} => y=#{Nx.to_number(y_hat)}")

When Gradient Descent Fails

GD works well as long as the loss surface has a few characteristics:

Convex - doesn’t have bumps
Continuous - doesn’t have vertical cliffs or gaps
Differentiable - smooth without cusps and weird spots (also the reason why we use mean squared error to create a steep surface instead of absolute value)

Other notebooks:

Michal Slaski
@michalslaski

livebook_examples

Salary predictions

salary_prediction.livemd

exla axon nx

2022-8-18
Dr. Christian Geuer-Pollmann
@chgeuer

livebook_on_azure

Christian's first LiveBook test

notebook1.livemd

axon exla nx

2022-8-18
@andyl

elix_util

MNIST

mnist.livemd

req axon exla nx

2022-8-18
@TomBers

livebookNotes

Trying Nx

NX.livemd

exla axon nx

2022-8-18
Ryo Wakabayashi
@RyoWakabayashi

elixir-learning

Image Classification Standalone

Image_classification_standalone.livemd

nx exla axon_onnx evision flow req kino

2023-8-21
Shozo Fukuda
@shoz-f

tfl_interp

Pose Estimation: MoveNet

MoveNet.livemd

tfl_interp nx cimg exprintf kino

2024-12-1
Stewart
@imakestews

cur

Todo List

deprecated_todo_list.livemd

jason kino youtube hidden_cell

2025-6-30

Back