Titanic with Explorer
Mix.install([
{:exla, "~> 0.8"},
{:axon, "~> 0.6", git: "https://github.com/elixir-nx/axon/"},
{:kino, "~> 0.14"},
{:kino_vega_lite, "~> 0.1"},
{:explorer, "~> 0.9"},
{:table_rex, "~> 4.0", override: true}
])
エイリアス
alias Explorer.DataFrame
alias Explorer.Series
require Explorer.DataFrame
データの準備
Kaggle のタイタニックデータをダウンロードする
https://www.kaggle.com/competitions/titanic/data
# File-upload widgets for the training and test data sets. The uploaded files
# are resolved later with Kino.Input.read/1 and Kino.Input.file_path/1.
train_data_input = Kino.Input.file("train data")
test_data_input = Kino.Input.file("test data")
前処理
# Resolve the uploaded training file to a path on disk, then parse it as CSV.
train_csv_path =
  train_data_input
  |> Kino.Input.read()
  |> Map.get(:file_ref)
  |> Kino.Input.file_path()

train_data = DataFrame.from_csv!(train_csv_path)

Kino.DataTable.new(train_data)

# Passenger ids as a plain Elixir list.
passenger_id_column = train_data["PassengerId"]
id_list = Series.to_list(passenger_id_column)

# Labels as an {n, 1} f32 tensor, split into one {1, 1} tensor per row.
survived_matrix =
  train_data["Survived"]
  |> Series.to_tensor(backend: EXLA.Backend)
  |> Nx.as_type(:f32)
  |> Nx.new_axis(1)

label_tensor =
  survived_matrix
  |> Nx.to_batched(1)
  |> Enum.to_list()

# Remove the id and label columns so only candidate features remain.
non_feature_columns = ["PassengerId", "Survived"]
inputs = DataFrame.discard(train_data, non_feature_columns)

Kino.DataTable.new(inputs)

# Remove further columns that are not fed to the model.
unused_columns = ["Cabin", "Name", "Ticket"]
dropped_inputs = DataFrame.discard(inputs, unused_columns)

Kino.DataTable.new(dropped_inputs)
# Fill missing values: "Age" with the column mean, "Embarked" with "S".
# The replacement series are computed from the frame being transformed
# (`dropped_inputs`) rather than from `train_data`, so the columns always stay
# row-aligned with the frame they are put back into.
filled_inputs =
  dropped_inputs
  |> DataFrame.put("Age", Series.fill_missing(dropped_inputs["Age"], :mean))
  |> DataFrame.put("Embarked", Series.fill_missing(dropped_inputs["Embarked"], "S"))

Kino.DataTable.new(filled_inputs)
# Replace the categorical columns with dummy (one-hot) columns: build the dummy
# columns, drop the originals, and join both parts side by side.
categorical_columns = ["Sex", "Embarked"]

one_hot_part = DataFrame.dummies(filled_inputs, categorical_columns)
remaining_part = DataFrame.discard(filled_inputs, categorical_columns)

dummied_inputs = DataFrame.concat_columns(one_hot_part, remaining_part)

Kino.DataTable.new(dummied_inputs)
# Convert the frame into a list of {1, n_columns} tensors, one per row.
# Columns are collected in DataFrame column order (DataFrame.names/1) instead
# of Map.values/1 order: maps are unordered, so relying on their iteration
# order would make the feature order an implementation detail.
dummied_columns = DataFrame.to_columns(dummied_inputs)

input_tensor_list =
  dummied_inputs
  |> DataFrame.names()
  |> Enum.map(&Map.fetch!(dummied_columns, &1))
  |> Nx.tensor(backend: EXLA.Backend)
  # {n_columns, n_rows} -> {n_rows, n_columns}
  |> Nx.transpose()
  |> Nx.to_batched(1)
  |> Enum.to_list()
defmodule PreProcess do
  @moduledoc """
  Preprocessing helpers that turn an uploaded CSV into per-row tensors.

  `process/3` is the entry point; the other public functions are the individual
  steps it chains together.
  """

  alias Explorer.DataFrame
  alias Explorer.Series

  # Columns removed before building features.
  @dropped_columns ["Cabin", "Name", "Ticket"]

  # Column name => fill strategy/value handed to `Series.fill_missing/2`.
  @fill_values %{"Age" => :mean, "Embarked" => "S", "Fare" => :mean}

  # Columns replaced by dummy (one-hot) columns.
  @categorical_columns ["Sex", "Embarked"]

  @doc """
  Reads the file uploaded through `kino_input` and parses it as CSV.

  Raises `ArgumentError` when no file has been uploaded yet, and whatever
  `DataFrame.from_csv!/1` raises when the file cannot be parsed.
  """
  @spec load_csv(Kino.Input.t()) :: DataFrame.t()
  def load_csv(kino_input) do
    case Kino.Input.read(kino_input) do
      %{file_ref: file_ref} ->
        file_ref
        |> Kino.Input.file_path()
        |> DataFrame.from_csv!()

      nil ->
        # A file input reads as nil until something is uploaded; fail with a
        # clear message instead of a BadMapError from Map.get/2.
        raise ArgumentError, "no file has been uploaded to the given input yet"
    end
  end

  @doc """
  Fills missing values in `data`.

  `fill_map` maps a column name to the value or strategy passed to
  `Series.fill_missing/2` for that column.
  """
  @spec fill_empty(DataFrame.t(), %{optional(String.t()) => term()}) :: DataFrame.t()
  def fill_empty(data, fill_map) do
    Enum.reduce(fill_map, data, fn {column_name, fill_value}, acc ->
      # Read the column from the accumulator so every step works on the
      # latest version of the frame.
      DataFrame.put(acc, column_name, Series.fill_missing(acc[column_name], fill_value))
    end)
  end

  @doc """
  Replaces each column in `columns_names` with its dummy columns.

  The dummy columns come first, followed by the remaining original columns.
  """
  @spec replace_dummy(DataFrame.t(), [String.t()]) :: DataFrame.t()
  def replace_dummy(data, columns_names) do
    data
    |> DataFrame.dummies(columns_names)
    |> DataFrame.concat_columns(DataFrame.discard(data, columns_names))
  end

  @doc """
  Converts `data` into a list of `{1, n_columns}` tensors, one per row.

  Features follow the column order of `data` (`DataFrame.names/1`); map
  iteration order is deliberately not relied upon.
  """
  @spec to_tensor(DataFrame.t()) :: [Nx.Tensor.t()]
  def to_tensor(data) do
    columns = DataFrame.to_columns(data)

    data
    |> DataFrame.names()
    |> Enum.map(&Map.fetch!(columns, &1))
    |> Nx.tensor(backend: EXLA.Backend)
    # {n_columns, n_rows} -> {n_rows, n_columns}
    |> Nx.transpose()
    |> Nx.to_batched(1)
    |> Enum.to_list()
  end

  @doc """
  Loads the CSV behind `kino_input` and returns `{id_list, label_list, inputs}`.

    * `id_list` - values of the `id_key` column as a list
    * `label_list` - one `{1, 1}` f32 tensor per row taken from `label_key`,
      or `nil` when the CSV has no such column
    * `inputs` - one feature tensor per row (see `to_tensor/1`)
  """
  @spec process(Kino.Input.t(), String.t(), String.t()) ::
          {list(), [Nx.Tensor.t()] | nil, [Nx.Tensor.t()]}
  def process(kino_input, id_key, label_key) do
    data_org = load_csv(kino_input)
    id_list = Series.to_list(data_org[id_key])

    # Decide once whether labels exist; this fixes both the label list and the
    # set of non-feature columns to drop.
    {label_list, non_feature_keys} =
      if label_key in DataFrame.names(data_org) do
        {label_batches(data_org[label_key]), [id_key, label_key]}
      else
        {nil, [id_key]}
      end

    inputs =
      data_org
      |> DataFrame.discard(non_feature_keys)
      |> DataFrame.discard(@dropped_columns)
      |> fill_empty(@fill_values)
      |> replace_dummy(@categorical_columns)
      |> to_tensor()

    {id_list, label_list, inputs}
  end

  # Turns a label series into a list of {1, 1} f32 tensors, one per row.
  defp label_batches(label_series) do
    label_series
    |> Series.to_tensor(backend: EXLA.Backend)
    |> Nx.as_type(:f32)
    |> Nx.new_axis(1)
    |> Nx.to_batched(1)
    |> Enum.to_list()
  end
end
# Run the preprocessing pipeline on both uploads. For a CSV without a
# "Survived" column the label element of the tuple is nil.
{train_id_list, train_label_list, train_inputs} =
  PreProcess.process(train_data_input, "PassengerId", "Survived")

{test_id_list, test_label_list, test_inputs} =
  PreProcess.process(test_data_input, "PassengerId", "Survived")
モデルの定義
# Feed-forward network: 10 input features -> dense 48 (tanh) -> dropout 0.2
# -> dense 48 (tanh) -> a single sigmoid output.
input_layer = Axon.input("input", shape: {nil, 10})

hidden_layers =
  input_layer
  |> Axon.dense(48, activation: :tanh)
  |> Axon.dropout(rate: 0.2)
  |> Axon.dense(48, activation: :tanh)

model = Axon.dense(hidden_layers, 1, activation: :sigmoid)
学習
# Pair every input tensor with its label tensor for the training loop.
train_data = Enum.zip(train_inputs, train_label_list)

# Builds a live line chart plotting `y_field` against "step".
new_line_plot = fn y_field ->
  VegaLite.new(width: 300)
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "step", type: :quantitative)
  |> VegaLite.encode_field(:y, y_field, type: :quantitative)
  |> Kino.VegaLite.new()
end

loss_plot = new_line_plot.("loss")
acc_plot = new_line_plot.("accuracy")

Kino.Layout.grid([loss_plot, acc_plot], columns: 2)

optimizer = Polaris.Optimizers.adam(learning_rate: 0.0005)

# Supervised training loop that tracks accuracy and pushes loss/accuracy to
# the two charts after every epoch.
training_loop =
  model
  |> Axon.Loop.trainer(:mean_squared_error, optimizer)
  |> Axon.Loop.metric(:accuracy, "accuracy")
  |> Axon.Loop.kino_vega_lite_plot(loss_plot, "loss", event: :epoch_completed)
  |> Axon.Loop.kino_vega_lite_plot(acc_plot, "accuracy", event: :epoch_completed)

trained_state =
  Axon.Loop.run(training_loop, train_data, Axon.ModelState.empty(), epochs: 50, compiler: EXLA)
未知データに対する推論
# Merge the per-row test tensors back into a single batch for prediction.
test_batch = Nx.concatenate(test_inputs)

# Round each sigmoid output to the nearest integer to get the predicted class.
predicted_labels =
  model
  |> Axon.predict(trained_state, test_batch)
  |> Nx.to_flat_list()
  |> Enum.map(&round/1)

results =
  DataFrame.new(%{
    "PassengerId" => test_id_list,
    "Survived" => predicted_labels
  })

Kino.DataTable.new(results)

# Serialise the predictions to CSV and offer them as a download.
results_csv = DataFrame.dump_csv!(results)

Kino.Download.new(fn -> results_csv end, filename: "result.csv")