
Chapter 5: A Discerning Machine

05_discerning_machine.livemd

Mix.install([
  {:nx, "~> 0.6.3"},
  {:kino_vega_lite, "~> 0.1.10"}
])

Classifier and Where Linear Regression Fails

Classifier - a model that works with categorical labels instead of numerical labels

path = __DIR__ |> Path.join("files/police.txt") |> Path.expand()

dataset =
  path
  |> File.stream!()
  |> Stream.map(&String.split/1)
  # Drop header
  |> Stream.drop(1)
  |> Stream.map(fn row -> Enum.map(row, &String.to_integer/1) end)
  |> Enum.to_list()
  |> Nx.tensor()
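
For orientation (assuming the four-column layout this chapter slices later: reservations, temperature, tourists, police), each row of the tensor is one example:

Nx.shape(dataset)
# => {n_rows, 4}: reservations, temperature, tourists, police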

reservations = dataset[[.., 0]] |> Nx.to_list()
police = dataset[[.., -1]] |> Nx.to_list()

reservations_vs_police = %{reservations: reservations, police: police}
VegaLite.new(width: 500, height: 250, title: "Reservations vs Police Call")
|> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
|> VegaLite.mark(:point)
|> VegaLite.encode_field(:x, "reservations", type: :quantitative)
|> VegaLite.encode_field(:y, "police", type: :quantitative)

In the chart above of reservations vs the likelihood of a police call, linear regression is not applicable, for two reasons:

  1. In linear regression, we assume that the data points are roughly aligned to begin with
  2. Adding an outlier, a data point that’s very far from the others, greatly moves the line generated by linear regression (see the sketch below)
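
To make point 2 concrete, here is a small sketch (not from the original notebook; the fit helper and the sample points are made up for illustration) that fits a least-squares line in closed form, once on plain data and once with a single outlier added. The fitted slope collapses as soon as the outlier appears:

# w = (XᵀX)⁻¹ Xᵀ y, the closed-form least-squares fit (slope and intercept)
fit = fn xs, ys ->
  x = Nx.tensor(Enum.map(xs, &[&1 / 1, 1.0]))
  y = Nx.tensor(Enum.map(ys, &[&1 / 1]))
  xt = Nx.transpose(x)
  Nx.LinAlg.solve(Nx.dot(xt, x), Nx.dot(xt, y))
end

fit.([1, 2, 3, 4], [0, 0, 1, 1]) |> IO.inspect(label: "without outlier")
fit.([1, 2, 3, 4, 50], [0, 0, 1, 1, 1]) |> IO.inspect(label: "with outlier")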

Invasion of the Sigmoids

Logistic function - a wrapper function that takes the output of the weighted sum and squashes it into the range from 0 to 1, something like: ŷ = wrapper_function(x1*w1 + x2*w2 + x3*w3 + ...). It changes smoothly across the range from 0 to 1 and belongs to a family of S-shaped functions called sigmoids.

σ(z) = 1 / (1 + e^(-z))
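
As a quick throwaway sketch (not part of the final module), evaluating the formula directly with Nx shows the squashing:

sigmoid = fn z -> Nx.divide(1, Nx.add(1, Nx.exp(Nx.negate(z)))) end
sigmoid.(Nx.tensor([-10.0, -1.0, 0.0, 1.0, 10.0]))
# => roughly [0.00005, 0.2689, 0.5, 0.7311, 0.99995], every output strictly between 0 and 1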

Log Loss

Introducing the sigmoid into our program made gradient descent less reliable. The resulting loss function (mean squared error)

def mse_loss(x, y, w) do
  x
  |> forward(w)
  |> Nx.subtract(y)
  |> Nx.pow(2)
  |> Nx.mean()
end

will have deep canyons leading straight into holes. Those holes are called “local minima”. The algorithm can stop in one of them, falsely convinced that it has reached the “global minimum” it should be aiming for. The appropriate loss function for a sigmoid-based model is called log loss.
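
In formula form, averaged over the m training examples:

L = -1/m * Σ(y * log(ŷ) + (1 - y) * log(1 - ŷ))

In Nx, that translates to: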

def loss(x, y, w) do
  y_hat = forward(x, w)
  first_term = Nx.multiply(y, Nx.log(y_hat))
  second_term = Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat)))
  Nx.negate(Nx.mean(Nx.add(first_term, second_term)))
end
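
A quick sanity check with hypothetical predictions (values made up for illustration) shows why this is a sensible loss: when the true label is 1, a confident correct prediction costs almost nothing, while a confident wrong one costs a lot:

# true label y = 1, so the loss reduces to -log(ŷ)
Nx.negate(Nx.log(Nx.tensor(0.99)))  # => ~0.01 (confident and right)
Nx.negate(Nx.log(Nx.tensor(0.01)))  # => ~4.61 (confident and wrong)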

Upgrading the Gradient

Let’s change the gradient by taking the partial derivative of the new loss function.
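
For log loss composed with the sigmoid, the partial derivatives collapse into a compact form (m is the number of examples, X the input matrix, ŷ the predictions):

∂L/∂w = 1/m * Xᵀ * (ŷ - y)

which is exactly what the function below computes: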

def gradient(x, y, w) do
  {num_samples, _} = Nx.shape(x)

  x
  |> forward(w)
  |> Nx.subtract(y)
  |> then(&Nx.dot(Nx.transpose(x), &1))
  |> Nx.divide(num_samples)
end

It’s almost the same as the gradient function from multiple linear regression, just without the multiplication by 2.

defmodule C5.LogisticRegression do
  import Nx.Defn

  defn sigmoid(z) do
    1 / (1 + Nx.exp(-z))
  end

  # previously called predict, renamed to forward
  def forward(x, w) do
    weighted_sum = Nx.dot(x, w)
    sigmoid(weighted_sum)
  end

  # round the sigmoid output to the closest label: 0 or 1
  def classify(x, w) do
    x
    |> forward(w)
    |> Nx.round()
  end

  # log loss (see the formula above)
  def loss(x, y, w) do
    y_hat = forward(x, w)
    first_term = Nx.multiply(y, Nx.log(y_hat))
    second_term = Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat)))

    first_term
    |> Nx.add(second_term)
    |> Nx.mean()
    |> Nx.multiply(-1)
  end

  # ∂L/∂w = 1/m * Xᵀ(ŷ - y)
  def gradient(x, y, w) do
    {num_samples, _} = Nx.shape(x)

    x
    |> forward(w)
    |> Nx.subtract(y)
    |> then(&Nx.dot(Nx.transpose(x), &1))
    |> Nx.divide(num_samples)
  end

  # gradient descent: start from zero weights, then repeatedly step against the gradient by lr
  def train(x, y, iterations, lr) do
    {_, x_cols} = Nx.shape(x)
    w = Nx.broadcast(0, {x_cols, 1})

    Enum.reduce(0..iterations, w, fn i, w ->
      IO.puts("Iteration #{i} => Loss #{loss(x, y, w) |> Nx.to_number()}")
      gradient = gradient(x, y, w)
      Nx.subtract(w, Nx.multiply(gradient, lr))
    end)
  end

  # count how many predictions match the labels and report the accuracy
  def test(x, y, w) do
    {total_examples, _} = Nx.shape(x)
    correct_results = Nx.sum(classify(x, w) |> Nx.equal(y)) |> Nx.to_number()

    success_percent =
      Nx.multiply(correct_results, 100) |> Nx.divide(total_examples) |> Nx.to_number()

    IO.puts("Success: #{correct_results}/#{total_examples} (#{success_percent}%)")
  end
end

Visualizing the new model

Below is a script that trains the classifier on the first column (reservations) alone, replacing the other fields (temperature and tourists) with a constant column of ones:

# Get reservations column only
reservations = dataset[[.., 0..0]]
# Get label
police_call = dataset[[.., -1..-1//1]]
# add constant
constant = Nx.broadcast(1, Nx.shape(reservations))
# reservations with other fields (temperature, tourists) as constant
x = Nx.concatenate([reservations, constant], axis: 1)

w = C5.LogisticRegression.train(x, police_call, 100_000, 0.001)
forward = C5.LogisticRegression.forward(x, w)
classify = C5.LogisticRegression.classify(x, w)

{n_row, _} = Nx.shape(reservations)
reservations = Nx.reshape(reservations, {n_row}) |> Nx.to_list()
forward = Nx.reshape(forward, {n_row}) |> Nx.to_list()
classify = Nx.reshape(classify, {n_row}) |> Nx.to_list()

output_plot = %{reservations: reservations, forward: forward, classify: classify}

As a result, passing the weighted sum through the sigmoid function turns the straight line into something more sigmoid-y, as seen below:

VegaLite.new(width: 500, height: 500, title: "Plot of the forward() function")
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
  |> VegaLite.mark(:point)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "police", type: :quantitative),
  VegaLite.new()
  |> VegaLite.data_from_values(output_plot, only: ["reservations", "forward"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "forward", type: :quantitative)
])

To predict the labels, the output of the forward function is passed through the classify function, resulting in a sharper, step-like shape, as seen below:

VegaLite.new(width: 500, height: 500, title: "Plot of the classify() function")
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
  |> VegaLite.mark(:point)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "police", type: :quantitative),
  VegaLite.new()
  |> VegaLite.data_from_values(output_plot, only: ["reservations", "classify"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "classify", type: :quantitative)
])

Classification in Action

# Input columns: everything except the label
x = dataset[[.., 0..-2//1]]
# Label: the last column only
y = dataset[[.., -1..-1//1]]
w = C5.LogisticRegression.train(x, y, 100_000, 0.001)
C5.LogisticRegression.test(x, y, w)
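
Finally, the trained weights can score unseen input. The values below are made up purely for illustration (20 reservations, 32 degrees, 14 tourists):

new_day = Nx.tensor([[20, 32, 14]])
# probability of a police call, a {1, 1} tensor between 0 and 1
C5.LogisticRegression.forward(new_day, w)
# the predicted label, rounded to 0 or 1
C5.LogisticRegression.classify(new_day, w)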