# EXGBoost Titanic optimized
Mix.install([
{:exgboost, "~> 0.5"},
{:explorer, "~> 0.9"},
{:nx, "~> 0.8"},
{:kino, "~> 0.14"},
{:kino_vega_lite, "~> 0.1"}
])
## Alias
alias Explorer.DataFrame
alias Explorer.Series
require Explorer.DataFrame
## Load data
# File-upload widgets for the two CSV files used by the notebook.
train_data_input = Kino.Input.file("train data")
test_data_input = Kino.Input.file("test data")

# Resolves the uploaded file behind a Kino file input to a local path and
# parses it as CSV. Raises if the CSV cannot be parsed (`from_csv!/1`).
read_uploaded_csv = fn file_input ->
  upload = Kino.Input.read(file_input)
  csv_path = Kino.Input.file_path(Map.get(upload, :file_ref))
  DataFrame.from_csv!(csv_path)
end

train_data = read_uploaded_csv.(train_data_input)
Kino.DataTable.new(train_data)

test_data = read_uploaded_csv.(test_data_input)
Kino.DataTable.new(test_data)
## Preprocess
defmodule PreProcess do
  @moduledoc """
  Feature-engineering helpers that turn an uploaded passenger CSV into the
  tensors consumed by the training and prediction steps.
  """

  # Declared here as well so the module does not depend on notebook-level
  # aliases; `require` is needed for the `DataFrame.mutate/2` query macro.
  alias Explorer.DataFrame
  alias Explorer.Series
  require Explorer.DataFrame

  # Ages substituted for missing values, chosen by the `prob_child` flag.
  @child_default_age 9
  @adult_default_age 30

  # Source columns dropped once the derived features have been computed.
  @raw_columns ["Cabin", "Name", "Ticket", "Sex", "Fare", "Age", "SibSp", "Parch"]

  @doc """
  Reads the CSV uploaded through a Kino file input into a data frame.

  Raises if the file cannot be parsed (`DataFrame.from_csv!/1`).
  """
  def load_csv(kino_input) do
    kino_input
    |> Kino.Input.read()
    |> Map.get(:file_ref)
    |> Kino.Input.file_path()
    |> DataFrame.from_csv!()
  end

  @doc """
  Fills missing values column by column.

  `fill_map` maps a column name to its replacement. The special value
  `:median` is replaced by the column's median; any other value is handed to
  `Series.fill_missing/2` unchanged.
  """
  def fill_empty(data, fill_map) do
    Enum.reduce(fill_map, data, fn {column_name, fill_value}, acc ->
      # Read from the accumulator so every step sees the frame as filled so far.
      column = acc[column_name]
      filled = Series.fill_missing(column, resolve_fill_value(column, fill_value))
      DataFrame.put(acc, column_name, filled)
    end)
  end

  @doc """
  Replaces the columns in `columns_names` with their dummy (one-hot) columns.

  The dummy columns come first, followed by the remaining original columns.
  """
  def replace_dummy(data, columns_names) do
    data
    |> DataFrame.dummies(columns_names)
    |> DataFrame.concat_columns(DataFrame.discard(data, columns_names))
  end

  @doc """
  Converts a data frame into a tensor with one row per data-frame row.

  Columns are taken via `Map.values/1` on the column map, so the feature
  order follows the map's key order rather than the frame's column order.
  """
  def to_tensor(data) do
    data
    |> DataFrame.to_columns()
    |> Map.values()
    |> Nx.tensor()
    |> Nx.transpose()
  end

  @doc """
  Loads the uploaded CSV and builds the model inputs.

  Returns `{id_list, labels, inputs}` where `id_list` holds the values of the
  `id_key` column, `labels` is a tensor of the `label_key` column (or `nil`
  when the file has no such column), and `inputs` is the feature tensor.

  `followers_df` is left-joined onto the data; rows without a match get a
  `"followers"` value of `0`.
  """
  def process(kino_input, id_key, label_key, followers_df) do
    data_org = load_csv(kino_input)
    id_list = Series.to_list(data_org[id_key])
    labelled? = label_key in DataFrame.names(data_org)

    # `if` without `else` yields nil for unlabelled (test) data.
    labels = if labelled?, do: Series.to_tensor(data_org[label_key])

    non_feature_columns = if labelled?, do: [id_key, label_key], else: [id_key]

    inputs =
      data_org
      |> DataFrame.discard(non_feature_columns)
      |> flag_probable_children()
      |> fill_missing_ages()
      # NOTE(review): assumes the left join keeps the original row order so the
      # rows stay aligned with `id_list` and `labels` — confirm.
      |> DataFrame.join(followers_df, how: :left)
      |> fill_empty(%{"followers" => 0, "Embarked" => "S", "Fare" => :median})
      |> replace_dummy(["Embarked", "Pclass"])
      |> DataFrame.mutate(
        is_man: col("Sex") == "male",
        fare_group: (col("Fare") / 50) |> floor(),
        age_group: (col("Age") / 10) |> floor()
      )
      |> DataFrame.discard(@raw_columns)
      |> to_tensor()

    {id_list, labels, inputs}
  end

  # Resolves the `:median` marker to the column's median; passes other values through.
  defp resolve_fill_value(column, :median), do: Series.median(column)
  defp resolve_fill_value(_column, fill_value), do: fill_value

  # Adds `prob_child`: name contains "Master", or contains "Miss" with Parch > 0.
  defp flag_probable_children(data) do
    DataFrame.mutate(data,
      prob_child:
        col("Name") |> contains("Master") or
          (col("Name") |> contains("Miss") and col("Parch") > 0)
    )
  end

  # Replaces missing ages with a default picked by the `prob_child` flag.
  defp fill_missing_ages(data) do
    filled_age =
      data["Age"]
      |> Series.to_list()
      |> Enum.zip_with(Series.to_list(data["prob_child"]), fn
        nil, true -> @child_default_age
        nil, false -> @adult_default_age
        age, _prob_child -> age
      end)
      |> Series.from_list()

    DataFrame.put(data, "Age", filled_age)
  end
end
# Stack train and test rows into one frame; the label column exists only in
# the training data, so it is dropped first.
train_without_label = DataFrame.discard(train_data, "Survived")
full_data = DataFrame.concat_rows(train_without_label, test_data)
Kino.DataTable.new(full_data)

# Per ticket: how many *other* rows in the combined data share that ticket.
ticket_counts = Series.frequencies(full_data["Ticket"])

followers_df =
  ticket_counts
  |> DataFrame.rename(["Ticket", "followers"])
  |> DataFrame.mutate(followers: followers - 1)
  # Rows with the ticket value "LINE" are excluded from the lookup table.
  # NOTE(review): presumably "LINE" is a placeholder rather than a real shared
  # ticket — confirm.
  |> DataFrame.filter(col("Ticket") != "LINE")

Kino.DataTable.new(followers_df)

# Both files go through the same preprocessing with the same column keys.
id_key = "PassengerId"
label_key = "Survived"

train_result = PreProcess.process(train_data_input, id_key, label_key, followers_df)
{train_id_list, train_labels, train_inputs} = train_result

# The test file is expected to carry no label column, hence the `nil` match.
test_result = PreProcess.process(test_data_input, id_key, label_key, followers_df)
{test_id_list, nil, test_inputs} = test_result
## Training
# Live line chart of the training loss; points are pushed by the callback below.
loss_plot =
  [width: 700]
  |> VegaLite.new()
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "step", type: :quantitative)
  |> VegaLite.encode_field(:y, "mlogloss", type: :quantitative)
  |> Kino.VegaLite.new()

# Pushes one chart point every 1000 iterations and returns the state unchanged.
report_loss = fn state ->
  if rem(state.iteration, 1000) == 0 do
    mlogloss = state.metrics["training"]["mlogloss"]
    Kino.VegaLite.push(loss_plot, %{"step" => state.iteration, "mlogloss" => mlogloss})
  end

  state
end

step_callback = EXGBoost.Training.Callback.new(:after_iteration, report_loss, :loss_plot)

# NOTE(review): the only evaluation set is the training data itself, so early
# stopping watches the training loss — confirm this is intended.
training_opts = [
  num_class: 2,
  objective: :multi_softprob,
  num_boost_rounds: 100_000,
  # Constant learning rate of 0.1, whatever argument the function receives.
  learning_rates: fn _ -> 0.1 end,
  max_depth: 3,
  early_stopping_rounds: 10,
  evals: [{train_inputs, train_labels, "training"}],
  callbacks: [step_callback]
]

booster = EXGBoost.train(train_inputs, train_labels, training_opts)

# Suppress the cell output.
Kino.nothing()
## Prediction
# Take the index of the largest value along the last axis as the predicted class.
class_scores = EXGBoost.predict(booster, test_inputs)
preds = Nx.argmax(class_scores, axis: -1)

survived = Enum.map(Nx.to_flat_list(preds), &round/1)

# Two-column result frame: passenger id alongside its predicted class.
results =
  DataFrame.new(%{
    "PassengerId" => test_id_list,
    "Survived" => survived
  })

Kino.DataTable.new(results)

# Serialize once up front; the download button serves this CSV as "result.csv".
results_csv = DataFrame.dump_csv!(results)
Kino.Download.new(fn -> results_csv end, filename: "result.csv")