Chapter 5: A Discerning Machine
Mix.install([
  {:nx, "~> 0.6.3"},
  {:kino_vega_lite, "~> 0.1.10"}
])
Classifier and Where Linear Regression Fails
Classifier - a model that works with categorical labels (discrete classes) instead of continuous numerical labels
path = __DIR__ |> Path.join("files/police.txt") |> Path.expand()
dataset =
  path
  |> File.stream!()
  |> Stream.map(&String.split/1)
  # Drop the header row
  |> Stream.drop(1)
  |> Stream.map(fn row -> Enum.map(row, &String.to_integer/1) end)
  |> Enum.to_list()
  |> Nx.tensor()
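Before going further, a quick shape check confirms what we loaded (a sketch assuming the file has four integer columns: reservations, temperature, tourists, and police calls):
# Expect {n, 4}: one row per evening, with columns for
# reservations, temperature, tourists, and police calls
Nx.shape(dataset)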
reservations = dataset[[.., 0]] |> Nx.to_list()
police = dataset[[.., -1]] |> Nx.to_list()
reservations_vs_police = %{reservations: reservations, police: police}
VegaLite.new(width: 500, height: 250, title: "Reservations vs Police Call")
|> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
|> VegaLite.mark(:point)
|> VegaLite.encode_field(:x, "reservations", type: :quantitative)
|> VegaLite.encode_field(:y, "police", type: :quantitative)
In the above chart of reservations versus the likelihood of a police call, linear regression is not applicable for the following reasons:
- Linear regression assumes the data points are roughly aligned to begin with, while these labels only take the values 0 and 1
- Adding an outlier, a data point that's very far from the others, drastically tilts the line that linear regression produces
Invasion of the Sigmoids
Logistic function - a wrapper function that takes the output of the weighted sum and squashes it into the range from 0 to 1. Something like: ŷ = wrapper_function(x1*w1 + x2*w2 + x3*w3 + ...). It should change smoothly across the range from 0 to 1, and it belongs to a family of S-shaped functions called sigmoids.
σ(z) = 1 / (1 + e^(-z))
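To see the squashing in action, here is a minimal sketch evaluating σ at a few points with Nx (the outputs shown are approximate):
# Large negative inputs approach 0, zero maps to 0.5,
# large positive inputs approach 1
sigmoid = fn z -> Nx.divide(1, Nx.add(1, Nx.exp(Nx.negate(z)))) end
sigmoid.(Nx.tensor([-10.0, -1.0, 0.0, 1.0, 10.0]))
# => roughly [0.00005, 0.2689, 0.5, 0.7311, 0.99995]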
Log Loss
Introducing the sigmoid to our program makes gradient descent less reliable. The resulting loss function (mean squared error)
def mse_loss(x, y, w) do
  x
  |> forward(w)
  |> Nx.subtract(y)
  |> Nx.pow(2)
  |> Nx.mean()
end
will have deep canyons leading straight into holes. Those holes are called “local minima”. The algorithm can get stuck in one of them, falsely convinced that it has reached the “global minimum” it should be aiming for. The appropriate function for calculating the loss of a sigmoid-based model is called log loss:
L = -1/m * Σ (y * log(ŷ) + (1 - y) * log(1 - ŷ))
def loss(x, y, w) do
  y_hat = forward(x, w)
  first_term = Nx.multiply(y, Nx.log(y_hat))
  second_term = Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat)))

  # Outside of defn, the + and unary - operators don't work on tensors,
  # so we use Nx.add/2 and Nx.negate/1 explicitly
  first_term
  |> Nx.add(second_term)
  |> Nx.mean()
  |> Nx.negate()
end
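To build intuition for why log loss fits the sigmoid, here is a small sketch with made-up prediction values: the loss is tiny when a confident prediction is right and grows sharply when it is wrong:
# Per-example log loss for a single label/prediction pair
log_loss = fn y, y_hat ->
  Nx.multiply(y, Nx.log(y_hat))
  |> Nx.add(Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat))))
  |> Nx.negate()
end

log_loss.(1, 0.99) |> Nx.to_number()
# => ~0.01 (confidently right)
log_loss.(1, 0.01) |> Nx.to_number()
# => ~4.6 (confidently wrong)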
Upgrading the Gradient
Let’s update the gradient by taking the partial derivative of the new loss function:
def gradient(x, y, w) do
  {num_samples, _} = Nx.shape(x)

  x
  |> forward(w)
  |> Nx.subtract(y)
  |> then(&Nx.dot(Nx.transpose(x), &1))
  |> Nx.divide(num_samples)
end
It’s almost the same as multiple linear regression’s gradient function, just without the multiplication by 2.
defmodule C5.LogisticRegression do
  import Nx.Defn

  # σ(z) = 1 / (1 + e^(-z)), applied element-wise
  defn sigmoid(z) do
    1 / (1 + Nx.exp(-z))
  end

  # The forward pass: the weighted sum passed through the sigmoid.
  # (Renamed from predict/2 in the previous chapters.)
  def forward(x, w) do
    weighted_sum = Nx.dot(x, w)
    sigmoid(weighted_sum)
  end

  # Round the sigmoid's output to the nearest label: 0 or 1
  def classify(x, w) do
    x
    |> forward(w)
    |> Nx.round()
  end

  # Log loss: heavily penalizes confident wrong predictions
  def loss(x, y, w) do
    y_hat = forward(x, w)
    first_term = Nx.multiply(y, Nx.log(y_hat))
    second_term = Nx.multiply(Nx.subtract(1, y), Nx.log(Nx.subtract(1, y_hat)))

    first_term
    |> Nx.add(second_term)
    |> Nx.mean()
    |> Nx.multiply(-1)
  end

  # Partial derivative of the log loss with respect to w:
  # transpose(x) . (ŷ - y) / num_samples
  def gradient(x, y, w) do
    {num_samples, _} = Nx.shape(x)

    x
    |> forward(w)
    |> Nx.subtract(y)
    |> then(&Nx.dot(Nx.transpose(x), &1))
    |> Nx.divide(num_samples)
  end

  # Batch gradient descent: start from zero weights and step
  # against the gradient for the given number of iterations
  def train(x, y, iterations, lr) do
    {_, x_cols} = Nx.shape(x)
    w = Nx.broadcast(0, {x_cols, 1})

    Enum.reduce(0..iterations, w, fn i, w ->
      IO.puts("Iteration #{i} => Loss #{loss(x, y, w) |> Nx.to_number()}")
      gradient = gradient(x, y, w)
      Nx.subtract(w, Nx.multiply(gradient, lr))
    end)
  end

  # Report how many examples the trained weights classify correctly
  def test(x, y, w) do
    {total_examples, _} = Nx.shape(x)
    correct_results = Nx.sum(classify(x, w) |> Nx.equal(y)) |> Nx.to_number()

    success_percent =
      Nx.multiply(correct_results, 100) |> Nx.divide(total_examples) |> Nx.to_number()

    IO.puts("Success: #{correct_results}/#{total_examples} (#{success_percent}%)")
  end
end
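As a sanity check, the hand-derived gradient can be compared against Nx's automatic differentiation. This is a sketch under assumptions: C5.GradCheck is a hypothetical helper and the tensors below are made-up values; both calls should return approximately the same tensor:
defmodule C5.GradCheck do
  import Nx.Defn

  # Gradient of the log loss with respect to w, computed by Nx's
  # autodiff instead of our hand-derived formula
  defn loss_grad(x, y, w) do
    grad(w, &C5.LogisticRegression.loss(x, y, &1))
  end
end

# Made-up values purely for the comparison
x_check = Nx.tensor([[1.0, 1.0], [2.0, 1.0], [3.0, 1.0]])
y_check = Nx.tensor([[0.0], [0.0], [1.0]])
w_check = Nx.tensor([[0.1], [-0.2]])

C5.GradCheck.loss_grad(x_check, y_check, w_check)
C5.LogisticRegression.gradient(x_check, y_check, w_check)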
Visualizing the new model
Below is a script that trains the classifier on the first column (reservations) while representing the other fields (temperature and tourists) with a single constant column of ones:
# Get reservations column only
reservations = dataset[[.., 0..0]]
# Get label
police_call = dataset[[.., -1..-1//1]]
# Column of ones standing in for the constant fields
constant = Nx.broadcast(1, Nx.shape(reservations))
# Reservations plus the constant column, so the model stays two-dimensional
x = Nx.concatenate([reservations, constant], axis: 1)
w = C5.LogisticRegression.train(x, police_call, 100_000, 0.001)
forward = C5.LogisticRegression.forward(x, w)
classify = C5.LogisticRegression.classify(x, w)
{n_row, _} = Nx.shape(reservations)
reservations = Nx.reshape(reservations, {n_row}) |> Nx.to_list()
forward = Nx.reshape(forward, {n_row}) |> Nx.to_list()
classify = Nx.reshape(classify, {n_row}) |> Nx.to_list()
output_plot = %{reservations: reservations, forward: forward, classify: classify}
As a result, passing the weighted sum through the sigmoid function turns the straight line into something more sigmoid-y, as seen below:
VegaLite.new(width: 500, height: 500, title: "Plot of the forward() function")
|> VegaLite.layers([
VegaLite.new()
|> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
|> VegaLite.mark(:point)
|> VegaLite.encode_field(:x, "reservations", type: :quantitative)
|> VegaLite.encode_field(:y, "police", type: :quantitative),
VegaLite.new()
|> VegaLite.data_from_values(output_plot, only: ["reservations", "forward"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "reservations", type: :quantitative)
|> VegaLite.encode_field(:y, "forward", type: :quantitative)
])
To predict the label, the output of the forward function is passed through the classify function, resulting in a sharper, step-like shape, as seen below:
VegaLite.new(width: 500, height: 500, title: "Plot of the classify() function")
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(reservations_vs_police, only: ["reservations", "police"])
  |> VegaLite.mark(:point)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "police", type: :quantitative),
  VegaLite.new()
  |> VegaLite.data_from_values(output_plot, only: ["reservations", "classify"])
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "reservations", type: :quantitative)
  |> VegaLite.encode_field(:y, "classify", type: :quantitative)
])
Classification in Action
# Input features: every column except the last (reservations, temperature, tourists)
x = dataset[[.., 0..-2//1]]
# Label: the last column (police calls)
y = dataset[[.., -1..-1//1]]
w = C5.LogisticRegression.train(x, y, 100_000, 0.001)
C5.LogisticRegression.test(x, y, w)
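With the trained weights in hand, classifying a new, unseen evening takes one call (the feature values below are made up for illustration):
# Hypothetical evening: 20 reservations, 28 degrees, 5 tourists
new_evening = Nx.tensor([[20, 28, 5]])
C5.LogisticRegression.classify(new_evening, w)
# => a {1, 1} tensor containing 0 (no police call) or 1 (police call)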