Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us
Notesclub

Titanic classification by EXGBoost

titanic_exgboost.livemd

Titanic classification by EXGBoost

Mix.install([
  {:exgboost, "~> 0.3"},
  {:scholar, "~> 0.2"},
  {:csv, "~> 3.0"},
  {:exla, "~> 0.6"},
  {:statistics, "~> 0.6"},
  {:kino_vega_lite, "~> 0.1"}
])
:ok

Section

defmodule Pre do
  @moduledoc """
  Preprocessing helpers that turn CSV rows (maps of column atom to string
  value) into numeric `Nx` tensors.

  Rows are plain maps such as `%{age: "22", sex: "male"}`; every public
  function takes the list of rows as its first argument so the helpers can be
  chained with `|>`.
  """

  # Placeholder values written into blank :embarked, :age and :fare cells.
  @blank_defaults %{embarked: "S", age: "30", fare: "32"}

  @doc """
  Counts, per column, how many rows hold an empty string.

  Returns a map of column key to the number of blank cells; columns without
  blanks are absent from the result.
  """
  def count_missings(datas) do
    datas
    |> Enum.flat_map(fn row -> for {key, ""} <- row, do: key end)
    |> Enum.frequencies()
  end

  @doc """
  Splits rows into `{ids, labels, features}`.

    * `ids` - the value stored under `id` in every row
    * `labels` - one `1x1` tensor per row built from the value under `label`,
      or `nil` when the first row has no `label` key or `datas` is empty
    * `features` - the rows with both `id` and `label` removed
  """
  def separate(datas, id, label) do
    ids = Enum.map(datas, &Map.get(&1, id))
    features = Enum.map(datas, &Map.drop(&1, [id, label]))

    {ids, build_labels(datas, label), features}
  end

  # Labels exist only when the first row carries the label column. Matching on
  # the list shape also keeps an empty input from raising.
  defp build_labels([first | _] = datas, label) when is_map_key(first, label) do
    Enum.map(datas, fn row ->
      Nx.tensor([[parse_float!(Map.fetch!(row, label))]])
    end)
  end

  defp build_labels(_datas, _label), do: nil

  @doc """
  Removes `keys` from every row.
  """
  def drop(datas, keys) do
    Enum.map(datas, &Map.drop(&1, keys))
  end

  @doc """
  Replaces empty-string cells with a per-column default.

  `replaces_map` maps a column key to the string that should stand in for `""`.
  Every listed column must be present in every row.
  """
  def empty_replace(datas, replaces_map) do
    Enum.map(datas, fn row ->
      Enum.reduce(replaces_map, row, fn {key, replacement}, acc ->
        Map.update!(acc, key, &fill_blank(&1, replacement))
      end)
    end)
  end

  defp fill_blank("", replacement), do: replacement
  defp fill_blank(value, _replacement) when is_binary(value), do: value

  @doc """
  Builds a `value => float code` mapping for the column `key`.

  Codes are assigned in order of first appearance, starting at `0.0`.
  """
  def make_dummies(datas, key) do
    datas
    |> Enum.map(&Map.get(&1, key))
    |> Enum.uniq()
    |> Enum.with_index()
    |> Map.new(fn {value, index} -> {value, index * 1.0} end)
  end

  @doc """
  Replaces the categorical columns `keys` in `datas` with float codes.

  The codes come from `make_dummies/2` applied to `train_maps`, so the same
  category always receives the same code. A value that never occurs in
  `train_maps` is given the first code not used by any known category, which
  keeps it distinct however many categories the column has.
  """
  def to_dummies(datas, train_maps, keys) do
    # Build each column's mapping once instead of once per row.
    encoders =
      for key <- keys do
        codes = make_dummies(train_maps, key)
        {key, codes, map_size(codes) * 1.0}
      end

    Enum.map(datas, fn row ->
      Enum.reduce(encoders, row, fn {key, codes, unseen_code}, acc ->
        Map.put(acc, key, Map.get(codes, Map.get(acc, key), unseen_code))
      end)
    end)
  end

  @doc """
  Converts the numeric strings stored under `keys` into floats.

  Both integer-looking (`"22"`) and decimal (`"7.25"`) strings are accepted.
  Raises `ArgumentError` when a value is not a number.
  """
  def integer_string_to_float(datas, keys) do
    Enum.map(datas, fn row ->
      Enum.reduce(keys, row, fn key, acc -> Map.update!(acc, key, &parse_float!/1) end)
    end)
  end

  # Accepts values that are already numeric so a conversion can be re-applied.
  defp parse_float!(value) when is_number(value), do: value * 1.0

  defp parse_float!(value) when is_binary(value) do
    case Float.parse(value) do
      {number, ""} -> number
      _ -> raise ArgumentError, "cannot convert #{inspect(value)} to a float"
    end
  end

  @doc """
  Turns every row into a one-row tensor made of the row's values.

  Columns appear in the order `Map.values/1` yields them, so all rows must
  share the same key set for their columns to line up.
  """
  def map_to_tensor(datas) do
    Enum.map(datas, fn row -> Nx.tensor([Map.values(row)]) end)
  end

  @doc """
  Prepares raw rows for categorical encoding.

  Adds an `:honor` column holding the title taken from `:name` (the text
  between the last `", "` and the following `". "`), drops `:cabin`, `:name`
  and `:ticket`, and fills blank `:embarked`, `:age` and `:fare` cells with
  placeholder values.
  """
  def for_dummies(datas) do
    datas
    |> Enum.map(fn row -> Map.put(row, :honor, honorific(row.name)) end)
    |> drop([:cabin, :name, :ticket])
    |> empty_replace(@blank_defaults)
  end

  # The dot is escaped so that only the literal ". " ending the title is
  # matched; an unescaped dot would cut multi-word titles at their first space.
  defp honorific(name) do
    name
    |> String.replace(~r/^.*, /, "")
    |> String.replace(~r/\. .*/, "")
  end

  @doc """
  Runs the whole pipeline on `datas` and returns one feature tensor per row.

  `train_datas` supplies the category codes, so training and inference rows
  are encoded consistently.
  """
  def process(datas, train_datas) do
    prepared_train = for_dummies(train_datas)

    datas
    |> for_dummies()
    |> to_dummies(prepared_train, [:embarked, :sex, :honor])
    |> integer_string_to_float([:age, :fare, :parch, :pclass, :sibsp])
    |> map_to_tensor()
  end

  @doc """
  Reads the CSV file at `path` and returns `{header, rows}`.

  `header` is the first line with every column name downcased and converted to
  an atom; `rows` are the remaining lines as lists of strings.
  """
  def header_and_csv_datas(path) do
    [header_row | data_rows] =
      path
      |> File.stream!()
      |> CSV.decode!()
      |> Enum.to_list()

    # NOTE: String.to_atom/1 creates atoms that are never garbage collected;
    # only use this on CSV files whose header line is trusted.
    header =
      Enum.map(header_row, fn column ->
        column |> String.downcase() |> String.to_atom()
      end)

    {header, data_rows}
  end

  @doc """
  Loads the CSV file at `path` as a list of maps and splits it with
  `separate/3`, using `:passengerid` as the id and `:survived` as the label.
  """
  def csv_file_to_datas(path) do
    {header, rows} = header_and_csv_datas(path)

    rows
    |> Enum.map(fn row -> header |> Enum.zip(row) |> Map.new() end)
    |> separate(:passengerid, :survived)
  end
end

# Load the training CSV as ids, per-row label tensors and raw feature maps.
{train_csv_ids, train_csv_labels, train_csv_trains} = Pre.csv_file_to_datas("train.csv")

count = Enum.count(train_csv_trains)

# Rows before this index are used for training, the rest for validation.
split_at = div(count, 10) * 8

{trains, validate_trains} =
  train_csv_trains
  |> Pre.process(train_csv_trains)
  |> Enum.split(split_at)

{labels, validate_labels} = Enum.split(train_csv_labels, split_at)
{[
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       ...
     ]
   >,
   ...
 ],
 [
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [1.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       [0.0]
     ]
   >,
   #Nx.Tensor<
     f32[1][1]
     [
       ...
     ]
   >,
   ...
 ]}
alias Scholar.Metrics.Classification

# Fit the model on the training split.
model = EXGBoost.train(trains, labels)

# Threshold the predictions at 0.5 and keep the result as f32 (1.0 / 0.0).
predicts =
  EXGBoost.predict(model, validate_trains)
  |> Nx.map([type: :f32], &Nx.greater(&1, 0.5))

# Flatten the per-row label tensors into a single rank-1 tensor.
y_validate = validate_labels |> Enum.flat_map(&Nx.to_flat_list/1) |> Nx.tensor()
Classification.accuracy(predicts, y_validate)
#Nx.Tensor<
  f32
  0.8603351712226868
>
# test.csv has no label column, so the middle element of the tuple is ignored.
{test_csv_ids, _, test_csv_trains} = Pre.csv_file_to_datas("test.csv")

# Encode the test rows with the categories seen in the training data, then
# predict row by row and round each prediction to an integer class.
predictions =
  test_csv_trains
  |> Pre.process(train_csv_trains)
  |> Enum.map(fn row ->
    model |> EXGBoost.predict(row) |> Nx.to_flat_list() |> List.first() |> round()
  end)

# Pair every passenger id with its prediction and prepend the header row.
rows =
  Enum.zip_with(test_csv_ids, predictions, fn id, survived ->
    [id, Integer.to_string(survived)]
  end)

result = [["PassengerId", "Survived"] | rows]
[
  ["PassengerId", "Survived"],
  ["892", "0"],
  ["893", "0"],
  ["894", "0"],
  ["895", "0"],
  ["896", "1"],
  ["897", "0"],
  ["898", "0"],
  ["899", "0"],
  ["900", "1"],
  ["901", "0"],
  ["902", "0"],
  ["903", "0"],
  ["904", "1"],
  ["905", "0"],
  ["906", "1"],
  ["907", "1"],
  ["908", "0"],
  ["909", "0"],
  ["910", "1"],
  ["911", "0"],
  ["912", "0"],
  ["913", "0"],
  ["914", "1"],
  ["915", "1"],
  ["916", "1"],
  ["917", "0"],
  ["918", "1"],
  ["919", "0"],
  ["920", "0"],
  ["921", "0"],
  ["922", "0"],
  ["923", "0"],
  ["924", "1"],
  ["925", "0"],
  ["926", "0"],
  ["927", "0"],
  ["928", "0"],
  ["929", "0"],
  ["930", "0"],
  ["931", "1"],
  ["932", "0"],
  ["933", "1"],
  ["934", "0"],
  ["935", "1"],
  ["936", "1"],
  ["937", "0"],
  ["938", "0"],
  ["939", ...],
  [...],
  ...
]
# Encode the rows as CSV lines and write them to result.csv; the cell
# evaluates to the return value of File.write/2.
result
|> CSV.encode()
|> Enum.to_list()
|> then(&File.write("result.csv", &1))
:ok