
Chapter 5: A Discerning Machine

05_discerning_machine.livemd

Mix.install([
  {:nx, "~> 0.6.3"},
  {:kino_vega_lite, "~> 0.1.10"}
])

Classifier and Where Linear Regression Fails

Classifier - a model that works with categorical labels instead of numerical labels

path = __DIR__ |> Path.join("files/police.txt") |> Path.expand()

dataset =
  path
  |> File.stream!()
  |> Stream.map(&String.split/1)
  # Drop header
  |> Stream.drop(1)
  |> Stream.map(fn row -> Enum.map(row, &String.to_integer/1) end)
  |> Enum.to_list()
  |> Nx.tensor()
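
For orientation (assuming the four-column layout this chapter slices later: reservations, temperature, tourists, police), each row of the tensor is one example:

Nx.shape(dataset)
# => {n_rows, 4}: reservations, temperature, tourists, police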

reservations = dataset[[.., 0]] |> Nx.to_list()
police = dataset[[.., -1]] |> Nx.to_list()

reservations_vs_police = %{reservations: reservations, police: police}
VegaLite.new(width: 500, height: 250, title: "Reservations vs Police Call")
|> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
|> VegaLite.mark(:point)
|> VegaLite.encode_field(:x, "reservations", type: :quantitative)
|> VegaLite.encode_field(:y, "police", type: :quantitative)

In the chart above of reservations vs the likelihood of a police call, linear regression is not applicable, for two reasons:

  1. In linear regression, we assume that the data points are roughly aligned to begin with
  2. Adding an outlier, a data point that’s very far from the others, greatly moves the line generated by linear regression (see the sketch below)
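
To make point 2 concrete, here is a small sketch (not from the original notebook; the fit helper and the sample points are made up for illustration) that fits a least-squares line in closed form, once on plain data and once with a single outlier added. The fitted slope collapses as soon as the outlier appears:

# w = (XᵀX)⁻¹ Xᵀ y, the closed-form least-squares fit (slope and intercept)
fit = fn xs, ys ->
  x = Nx.tensor(Enum.map(xs, &[&1 / 1, 1.0]))
  y = Nx.tensor(Enum.map(ys, &[&1 / 1]))
  xt = Nx.transpose(x)
  Nx.LinAlg.solve(Nx.dot(xt, x), Nx.dot(xt, y))
end

fit.([1, 2, 3, 4], [0, 0, 1, 1]) |> IO.inspect(label: "without outlier")
fit.([1, 2, 3, 4, 50], [0, 0, 1, 1, 1]) |> IO.inspect(label: "with outlier")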

Invasion of the Sigmoids

Logistic function - a wrapper function that takes the output of the weighted sum and squashes it into the range from 0 to 1, something like: ŷ = wrapper_function(x1*w1 + x2*w2 + x3*w3 + ...). It changes smoothly across the range from 0 to 1 and belongs to a family of S-shaped functions called sigmoids.

σ(z) = 1 / (1 + e^(-z))
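
As a quick throwaway sketch (not part of the final module), evaluating the formula directly with Nx shows the squashing:

sigmoid = fn z -> Nx.divide(1, Nx.add(1, Nx.exp(Nx.negate(z)))) end
sigmoid.(Nx.tensor([-10.0, -1.0, 0.0, 1.0, 10.0]))
# => roughly [0.00005, 0.2689, 0.5, 0.7311, 0.99995], every output strictly between 0 and 1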

Log Loss

Introducing the sigmoid into our program made gradient descent less reliable. The resulting loss function (mean squared error)

def mse_loss(x, y, w) do
  x
  |> forward(w)
  |> Nx.subtract(y)
  |> Nx.pow(2)
  |> Nx.mean()
end

will have deep canyons leading straight into holes. Those holes are called “local minima”. The algorithm can stop in one of them, falsely convinced that it has reached the “global minimum” it should be aiming for. The appropriate loss function for a sigmoid-based model is called log loss.
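
In formula form, averaged over the m training examples:

L = -1/m * Σ(y * log(ŷ) + (1 - y) * log(1 - ŷ))

In Nx, that translates to: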

def loss(x, y, w) do
  y_hat = forward(x, w)
  first_term = Nx.multiply(y, Nx.log(y_hat))
  second_term = Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat)))
  Nx.negate(Nx.mean(Nx.add(first_term, second_term)))
end
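
A quick sanity check with hypothetical predictions (values made up for illustration) shows why this is a sensible loss: when the true label is 1, a confident correct prediction costs almost nothing, while a confident wrong one costs a lot:

# true label y = 1, so the loss reduces to -log(ŷ)
Nx.negate(Nx.log(Nx.tensor(0.99)))  # => ~0.01 (confident and right)
Nx.negate(Nx.log(Nx.tensor(0.01)))  # => ~4.61 (confident and wrong)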

Upgrading the Gradient

Let’s change the gradient by taking the partial derivative of the new loss function.
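
For log loss composed with the sigmoid, the partial derivatives collapse into a compact form (m is the number of examples, X the input matrix, ŷ the predictions):

∂L/∂w = 1/m * Xᵀ * (ŷ - y)

which is exactly what the function below computes: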

def gradient(x, y, w) do
  {num_samples, _} = Nx.shape(x)

  x
  |> forward(w)
  |> Nx.subtract(y)
  |> then(&Nx.dot(Nx.transpose(x), &1))
  |> Nx.divide(num_samples)
end

It’s almost the same as the gradient function from multiple linear regression, just without the multiplication by 2.

defmodule C5.LogisticRegression do
  import Nx.Defn

  defn sigmoid(z) do
    1 / (1 + Nx.exp(-z))
  end

  # previously called predict, renamed to forward
  def forward(x, w) do
    weighted_sum = Nx.dot(x, w)
    sigmoid(weighted_sum)
  end

  # round the sigmoid output to the closest label: 0 or 1
  def classify(x, w) do
    x
    |> forward(w)
    |> Nx.round()
  end

  # log loss (see the formula above)
  def loss(x, y, w) do
    y_hat = forward(x, w)
    first_term = Nx.multiply(y, Nx.log(y_hat))
    second_term = Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat)))

    first_term
    |> Nx.add(second_term)
    |> Nx.mean()
    |> Nx.multiply(-1)
  end

  # ∂L/∂w = 1/m * Xᵀ(ŷ - y)
  def gradient(x, y, w) do
    {num_samples, _} = Nx.shape(x)

    x
    |> forward(w)
    |> Nx.subtract(y)
    |> then(&Nx.dot(Nx.transpose(x), &1))
    |> Nx.divide(num_samples)
  end

  # gradient descent: start from zero weights, then repeatedly step against the gradient by lr
  def train(x, y, iterations, lr) do
    {_, x_cols} = Nx.shape(x)
    w = Nx.broadcast(0, {x_cols, 1})

    Enum.reduce(0..iterations, w, fn i, w ->
      IO.puts("Iteration #{i} => Loss #{loss(x, y, w) |> Nx.to_number()}")
      gradient = gradient(x, y, w)
      Nx.subtract(w, Nx.multiply(gradient, lr))
    end)
  end

  # count how many predictions match the labels and report the accuracy
  def test(x, y, w) do
    {total_examples, _} = Nx.shape(x)
    correct_results = Nx.sum(classify(x, w) |> Nx.equal(y)) |> Nx.to_number()

    success_percent =
      Nx.multiply(correct_results, 100) |> Nx.divide(total_examples) |> Nx.to_number()

    IO.puts("Success: #{correct_results}/#{total_examples} (#{success_percent}%)")
  end
end

Visualizing the new model

Below is a script that trains the classifier on the first column (reservations) alone, replacing the other fields (temperature and tourists) with a constant column of ones:

# Get reservations column only
reservations = dataset[[.., 0..0]]
# Get label
police_call = dataset[[.., -1..-1//1]]
# add constant
constant = Nx.broadcast(1, Nx.shape(reservations))
# reservations with other fields (temperature, tourists) as constant
x = Nx.concatenate([reservations, constant], axis: 1)

w = C5.LogisticRegression.train(x, police_call, 100_000, 0.001)
forward = C5.LogisticRegression.forward(x, w)
classify = C5.LogisticRegression.classify(x, w)

{n_row, _} = Nx.shape(reservations)
reservations = Nx.reshape(reservations, {n_row}) |> Nx.to_list()
forward = Nx.reshape(forward, {n_row}) |> Nx.to_list()
classify = Nx.reshape(classify, {n_row}) |> Nx.to_list()

output_plot = %{reservations: reservations, forward: forward, classify: classify}

As a result, passing the weighted sum through the sigmoid function turns the straight line into something more sigmoid-y, as seen below:

VegaLite.new(width: 500, height: 500, title: "Plot of the forward() function")
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
  |> VegaLite.mark(:point)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "police", type: :quantitative),
  VegaLite.new()
  |> VegaLite.data_from_values(output_plot, only: ["reservations", "forward"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "forward", type: :quantitative)
])

To predict the labels, the output of the forward function is passed through the classify function, resulting in a sharper, step-like shape, as seen below:

VegaLite.new(width: 500, height: 500, title: "Plot of the classify() function")
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
  |> VegaLite.mark(:point)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "police", type: :quantitative),
  VegaLite.new()
  |> VegaLite.data_from_values(output_plot, only: ["reservations", "classify"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "classify", type: :quantitative)
])

Classification in Action

# Input columns: everything except the label
x = dataset[[.., 0..-2//1]]
# Label: the last column only
y = dataset[[.., -1..-1//1]]
w = C5.LogisticRegression.train(x, y, 100_000, 0.001)
C5.LogisticRegression.test(x, y, w)
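
Finally, the trained weights can score unseen input. The values below are made up purely for illustration (20 reservations, 32 degrees, 14 tourists):

new_day = Nx.tensor([[20, 32, 14]])
# probability of a police call, a {1, 1} tensor between 0 and 1
C5.LogisticRegression.forward(new_day, w)
# the predicted label, rounded to 0 or 1
C5.LogisticRegression.classify(new_day, w)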