
Predicting Titanic survivors with Explorer and ML 🧊🛳️

predicting-titanic-survivors-with-ml.livemd

Mix.install([
  {:scholar, "~> 0.2.1"},
  {:explorer, "~> 0.7.1"},
  {:exgboost, "~> 0.3"},
  {:kino_explorer, "~> 0.1.12"},
  {:kino_vega_lite, "~> 0.1.10"}
])

Before starting…

Download train.csv from the Kaggle Titanic competition and add it to the notebook's files, so that Kino.FS.file_path/1 can resolve it below.

Importing Data

df =
  Kino.FS.file_path("train.csv")
  |> Explorer.DataFrame.from_csv!()

Exploring the data

VegaLite.new(width: 400, height: 400, title: "Survived distribution (0 = NO, 1 = YES)")
|> VegaLite.data_from_values(df, only: ["Survived"])
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "Survived", type: :nominal)
|> VegaLite.encode(:y, aggregate: :count)

VegaLite.new(width: 400, height: 400, title: "Age wrt Survived")
|> VegaLite.data_from_values(df, only: ["Survived", "Age"])
|> VegaLite.mark(:point)
|> VegaLite.encode_field(:x, "Survived", type: :nominal)
|> VegaLite.encode_field(:y, "Age", type: :quantitative)
|> VegaLite.encode(:color, aggregate: :count)

VegaLite.new(width: 400, height: 600, title: "Class distribution")
|> VegaLite.data_from_values(df, only: ["Pclass"])
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "Pclass", type: :nominal)
|> VegaLite.encode(:y, aggregate: :count)
# https://vega.github.io/vega-lite/docs/density.html
# https://mathisonian.github.io/kde/

VegaLite.new(width: 400, height: 400, title: "Class density (KDE)")
|> VegaLite.data_from_values(df, only: ["Age", "Pclass"])
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.transform(filter: "datum.Pclass == 1")
  |> VegaLite.transform(density: "Age", extent: [-40, 120])
  |> VegaLite.mark(:line, color: "red")
  |> VegaLite.encode_field(:x, "value", type: :quantitative, title: "Age")
  |> VegaLite.encode_field(:y, "density", type: :quantitative),
  VegaLite.new()
  |> VegaLite.transform(filter: "datum.Pclass == 2")
  |> VegaLite.transform(density: "Age", extent: [-40, 120])
  |> VegaLite.mark(:line, color: "blue")
  |> VegaLite.encode_field(:x, "value", type: :quantitative, title: "Age")
  |> VegaLite.encode_field(:y, "density", type: :quantitative),
  VegaLite.new()
  |> VegaLite.transform(filter: "datum.Pclass == 3")
  |> VegaLite.transform(density: "Age", extent: [-40, 120])
  |> VegaLite.mark(:line, color: "green")
  |> VegaLite.encode_field(:x, "value", type: :quantitative, title: "Age")
  |> VegaLite.encode_field(:y, "density", type: :quantitative)
])

Sex

VegaLite.new(width: 400, height: 400, title: "Sex distribution")
|> VegaLite.data_from_values(df, only: ["Sex"])
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "Sex", type: :nominal)
|> VegaLite.encode(:y, aggregate: :count)

VegaLite.new(title: "Survived wrt Sex")
|> VegaLite.data_from_values(df, only: ["Sex", "Survived"])
|> VegaLite.facet(
  [field: "Sex"],
  VegaLite.new(width: 300, height: 300)
  |> VegaLite.mark(:bar)
  |> VegaLite.encode_field(:x, "Survived", type: :nominal)
  |> VegaLite.encode(:y, aggregate: :count, type: :quantitative)
)

# It would be nice to compute the percentage here (see the sketch after this chart)!
# [SKIP] Same data, but as a stacked, normalized bar chart
VegaLite.new(width: 400, height: 400)
|> VegaLite.data_from_values(df, only: ["Sex", "Survived"])
|> VegaLite.mark(:bar, tooltip: true)
|> VegaLite.encode_field(:x, "Survived", type: :nominal)
|> VegaLite.encode(:y, aggregate: :count, type: :quantitative, stack: :normalize)
|> VegaLite.encode(:color, field: "Sex", scale: %{"range" => ["lightblue", "aquamarine"]})
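
The percentage asked for in the comment above can be computed by grouping on Sex and averaging the 0/1 Survived column (a minimal sketch; the grouping approach is my choice):

require Explorer.DataFrame, as: DF

# The mean of a 0/1 column is the survival rate of each group.
df
|> DF.group_by("Sex")
|> DF.summarise(survival_rate: mean(col("Survived")))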
VegaLite.new(width: 400, height: 400, title: "Survived density wrt Class")
|> VegaLite.data_from_values(df, only: ["Survived", "Pclass"])
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.transform(filter: "datum.Pclass == 1")
  |> VegaLite.transform(density: "Survived", extent: [-0.5, 1.5])
  |> VegaLite.mark(:line, color: "red")
  |> VegaLite.encode_field(:x, "value", type: :quantitative, title: "Survived")
  |> VegaLite.encode_field(:y, "density", type: :quantitative),
  VegaLite.new()
  |> VegaLite.transform(filter: "datum.Pclass == 2")
  |> VegaLite.transform(density: "Survived", extent: [-0.5, 1.5])
  |> VegaLite.mark(:line, color: "blue")
  |> VegaLite.encode_field(:x, "value", type: :quantitative, title: "Survived")
  |> VegaLite.encode_field(:y, "density", type: :quantitative),
  VegaLite.new()
  |> VegaLite.transform(filter: "datum.Pclass == 3")
  |> VegaLite.transform(density: "Survived", extent: [-0.5, 1.5])
  |> VegaLite.mark(:line, color: "green")
  |> VegaLite.encode_field(:x, "value", type: :quantitative, title: "Survived")
  |> VegaLite.encode_field(:y, "density", type: :quantitative)
])

Combining sex and class

VegaLite.new(title: "Survived grouped by Class and Sex")
|> VegaLite.data_from_values(df, only: ["Sex", "Pclass", "Survived"])
|> VegaLite.facet(
  [row: [field: "Pclass", title: "Class"], column: [field: "Sex", title: "Sex"]],
  VegaLite.new(width: 200, height: 300)
  |> VegaLite.mark(:bar, tooltip: true)
  |> VegaLite.encode_field(:x, "Survived", type: :nominal)
  |> VegaLite.encode(:y, aggregate: :count, type: :quantitative)
)

Predicting survivors with algorithms

Random

Guessing at random gives a 50% chance of getting the right answer.
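
To see that in the notebook, here is a minimal sketch of the random baseline (the seed and variable names are mine):

:rand.seed(:exsss, {42, 42, 42})

# Guess 0 or 1 uniformly at random for every passenger...
random_hyp = Explorer.Series.transform(df["Survived"], fn _ -> Enum.random(0..1) end)

# ...and measure how often the guess matches reality (hovers around 50%).
matches = Explorer.Series.equal(df["Survived"], random_hyp)

matches
|> Explorer.Series.cast(:integer)
|> Explorer.Series.sum()
|> Kernel./(Explorer.Series.size(matches))
|> Kernel.*(100)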

Based on the Sex value

  • Women survive
  • Men do not survive

require Explorer.DataFrame, as: DF

df_modified =
  df
  # add a column `hyp` and prefill it based on the sex
  |> DF.mutate(
    hyp:
      if col("Sex") == "female" do
        1
      else
        0
      end
  )
  # add a column `result` to check if the hypothesis is correct
  |> DF.mutate(
    result:
      if col("hyp") == col("Survived") do
        1
      else
        0
      end
  )

total_rows = DF.n_rows(df_modified)

matches_count =
  df_modified
  |> DF.filter(result == 1)
  |> DF.n_rows()

# Percentage of correct guesses
matches_count / total_rows * 100

✨ Fancy stuff ✨

Prepare the data

  • Fill missing values
  • Categorize columns (cast strings to the category dtype)

df =
  df
  # |> DF.lazy()
  |> DF.discard("Cabin")
  |> DF.mutate(Age: fill_missing(col("Age"), :mean))
  |> DF.mutate(Embarked: fill_missing(col("Embarked"), "S"))
  |> DF.mutate(Embarked: cast(col("Embarked"), :category))
  |> DF.mutate(Sex: cast(col("Sex"), :category))
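
As a quick sanity check that the cleanup left no missing values, Explorer can count nils per column (a one-liner sketch):

# Should report 0 for every remaining column.
DF.nil_count(df)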

Logistic Regression

target =
  df["Survived"]
  |> Explorer.Series.to_tensor()
  |> dbg()

features =
  df[["Pclass", "Age", "Sex", "SibSp", "Parch"]]
  # df[["Pclass", "Age", "Sex", "SibSp", "Parch", "Fare", "Embarked"]]
  |> Nx.stack(axis: -1)

classifier =
  Scholar.Linear.LogisticRegression.fit(
    features,
    target,
    num_classes: 2,
    iterations: 100,
    # Optimizers: https://hexdocs.pm/polaris/Polaris.Optimizers.html
    optimizer: :sgd
    # optimizer: :yogi
  )

predictions = Scholar.Linear.LogisticRegression.predict(classifier, features)

Scholar.Metrics.Classification.accuracy(target, predictions)
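
Accuracy alone hides which class the model gets wrong; a confusion matrix is more informative (a sketch using the same Scholar metrics module):

# Rows are true labels, columns are predicted labels.
Scholar.Metrics.Classification.confusion_matrix(target, predictions, num_classes: 2)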

Polynomial Regression

poly_features =
  Scholar.Linear.PolynomialRegression.transform(features, degree: 2)

classifier =
  Scholar.Linear.LogisticRegression.fit(
    poly_features,
    target,
    num_classes: 2,
    iterations: 100,
    optimizer: :yogi
  )

predictions = Scholar.Linear.LogisticRegression.predict(classifier, poly_features)

Scholar.Metrics.Classification.accuracy(target, predictions)

Decision Tree

features =
  df[["Pclass", "Age", "Sex", "SibSp", "Parch", "Fare", "Embarked"]]
  |> Nx.stack(axis: -1)

target_hot_encoded =
  target
  |> Scholar.Preprocessing.one_hot_encode(num_classes: 2)
  |> dbg()

model =
  EXGBoost.train(
    features,
    target_hot_encoded,
    obj: :binary_logistic,
    max_depth: 5
    # max_depth: 10
  )

predictions =
  EXGBoost.predict(model, features)
  |> Nx.argmax(axis: 1)

Scholar.Metrics.Classification.accuracy(target, predictions)

Avoid overfitting with cross-validation

All of the accuracies above were measured on the same data the models were trained on, so they are optimistic. k-fold cross-validation trains on part of the data and scores on the held-out rest, giving a more honest estimate.

https://notes.club/elixir-nx/scholar/notebooks/cv_gradient_boosting_tree

alias Scholar.ModelSelection
alias Scholar.Preprocessing
alias Scholar.Metrics.Classification

folding_fn = fn x -> ModelSelection.k_fold_split(x, 5) end

scoring_fn = fn x, y ->
  {x_train, x_test} = x
  {y_train, y_test} = y

  y_train_hot_encoded = Preprocessing.one_hot_encode(y_train, num_classes: 2)

  y_pred =
    EXGBoost.train(
      x_train,
      y_train_hot_encoded,
      obj: :binary_logistic,
      max_depth: 5,
      evals: [{x_train, y_train_hot_encoded, "training"}],
      verbose_eval: true
    )
    |> EXGBoost.predict(x_test)
    |> Nx.argmax(axis: 1)

  Classification.accuracy(y_test, y_pred)
end

cv_scores =
  ModelSelection.cross_validate(
    features,
    target,
    folding_fn,
    scoring_fn
  )
  |> Nx.squeeze()

Nx.mean(cv_scores)
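
The fold-to-fold spread is worth a glance as well (a one-liner sketch):

# A large standard deviation across folds means the score is unstable.
Nx.standard_deviation(cv_scores)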

Plotting

# https://stackoverflow.com/questions/60186747/how-do-i-include-feature-names-in-the-plot-tree-function-from-the-xgboost-librar
["Pclass", "Age", "Sex", "SibSp", "Parch", "Fare", "Embarked"]
|> Enum.with_index(fn element, index -> "#{index} #{element} q" end)
|> Enum.join("\n")
|> then(&File.write!("/Users/nicolo.gnudi/fmap.txt", &1))

EXGBoost.Booster.get_dump(model, fmap: "/Users/nicolo.gnudi/fmap.txt", format: :json)
# |> Jason.Formatter.pretty_print()
# |> then(& File.write!("/Users/nicolo.gnudi/dt.json", &1))

Export model and import it in Python for plotting

https://github.com/acalejos/exgboost/issues/29

# Dump the model weights (the file ends up as dtw.json, which the Python snippet below loads)
EXGBoost.write_weights(model, "/Users/nicolo.gnudi/dtw")

Then, install the required Python packages

pip3 install xgboost
pip3 install graphviz

And finally plot the Decision Tree

❯ python3

>>> import xgboost as xgb
>>> model = xgb.Booster()
>>> model.load_model("/Users/nicolo.gnudi/dtw.json")
>>> g = xgb.to_graphviz(model, fmap="/Users/nicolo.gnudi/fmap.txt")
>>> g.render(filename="/Users/nicolo.gnudi/dtg")
'/Users/nicolo.gnudi/dtg.pdf'