
Introducing EXGBoost: Gradient Boosting in Elixir

exgboost-intro.livemd

Mix.install([
  {:nx, "~> 0.5"},
  {:exgboost, "~> 0.2.1"},
  {:scholar, "~> 0.1"},
  {:explorer, "~> 0.5"},
  {:vega_lite, "~> 0.1"},
  {:kino_vega_lite, "~> 0.1.7"}
])
:ok

Intro

A Livebook version of the https://dockyard.com/blog/2023/07/18/introducing-exgboost-gradient-boosting-in-elixir blog post, with some additional data analysis.

Prepare the Data

# Let's read the dataset
path =
  __DIR__
  |> Path.join("diamonds.csv")
  |> Path.expand()

df = Explorer.DataFrame.from_csv!(path)
#Explorer.DataFrame<
  Polars[53940 x 11]
   integer [1, 2, 3, 4, 5, ...]
  carat float [0.23, 0.21, 0.23, 0.29, 0.31, ...]
  cut string ["Ideal", "Premium", "Good", "Premium", "Good", ...]
  color string ["E", "E", "E", "I", "J", ...]
  clarity string ["SI2", "SI1", "VS1", "VS2", "SI2", ...]
  depth float [61.5, 59.8, 56.9, 62.4, 63.3, ...]
  table float [55.0, 61.0, 65.0, 58.0, 58.0, ...]
  price integer [326, 326, 327, 334, 335, ...]
  x float [3.95, 3.89, 4.05, 4.2, 4.34, ...]
  y float [3.98, 3.84, 4.07, 4.23, 4.35, ...]
  z float [2.43, 2.31, 2.31, 2.63, 2.75, ...]
>
require Explorer.DataFrame, as: DF

# Discard the ID column (1st column)
df = DF.discard(df, 0)
#Explorer.DataFrame<
  Polars[53940 x 10]
  carat float [0.23, 0.21, 0.23, 0.29, 0.31, ...]
  cut string ["Ideal", "Premium", "Good", "Premium", "Good", ...]
  color string ["E", "E", "E", "I", "J", ...]
  clarity string ["SI2", "SI1", "VS1", "VS2", "SI2", ...]
  depth float [61.5, 59.8, 56.9, 62.4, 63.3, ...]
  table float [55.0, 61.0, 65.0, 58.0, 58.0, ...]
  price integer [326, 326, 327, 334, 335, ...]
  x float [3.95, 3.89, 4.05, 4.2, 4.34, ...]
  y float [3.98, 3.84, 4.07, 4.23, 4.35, ...]
  z float [2.43, 2.31, 2.31, 2.63, 2.75, ...]
>
# Numerically encode the values in the string columns
# before converting the dataframe to a tensor
df =
  DF.mutate(
    df,
    for col <- across(~w[cut color clarity]) do
      {col.name, Explorer.Series.cast(col, :category)}
    end
  )
#Explorer.DataFrame<
  Polars[53940 x 10]
  carat float [0.23, 0.21, 0.23, 0.29, 0.31, ...]
  cut category ["Ideal", "Premium", "Good", "Premium", "Good", ...]
  color category ["E", "E", "E", "I", "J", ...]
  clarity category ["SI2", "SI1", "VS1", "VS2", "SI2", ...]
  depth float [61.5, 59.8, 56.9, 62.4, 63.3, ...]
  table float [55.0, 61.0, 65.0, 58.0, 58.0, ...]
  price integer [326, 326, 327, 334, 335, ...]
  x float [3.95, 3.89, 4.05, 4.2, 4.34, ...]
  y float [3.98, 3.84, 4.07, 4.23, 4.35, ...]
  z float [2.43, 2.31, 2.31, 2.63, 2.75, ...]
>
# We can see that the data type of the string columns `cut`, `color`, and `clarity`
# is now `:category`, which is represented internally as an integer.

df["cut"] |> Explorer.Series.dtype()
:category
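As a quick check that the `:category` encoding really yields integers, we can stack the `cut` column on its own with `Nx.stack/2`, the same call used below for the full feature set. This is a small sketch (not in the original post); it should produce a tensor of integer category codes.

# The category codes appear as integers once the column is stacked into a tensor
df[["cut"]] |> Nx.stack(axis: 1)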
# Shuffle data and split into training (80%) and test (20%) sets

n_rows = DF.n_rows(df)
split_at = floor(0.8 * n_rows)

df = DF.shuffle(df)
# Ranges are inclusive, so stop one row before `split_at`
# to keep the train and test sets disjoint
train_df = DF.slice(df, 0..(split_at - 1))
test_df = DF.slice(df, split_at..-1)
#Explorer.DataFrame<
  Polars[10788 x 10]
  carat float [1.02, 0.41, 2.12, 1.0, 1.02, ...]
  cut category ["Ideal", "Ideal", "Premium", "Very Good", "Premium", ...]
  color category ["E", "F", "E", "E", "F", ...]
  clarity category ["SI2", "VS2", "SI2", "SI2", "SI2", ...]
  depth float [62.9, 62.7, 58.3, 60.6, 62.5, ...]
  table float [57.0, 56.0, 59.0, 61.0, 60.0, ...]
  price integer [4798, 1107, 18120, 4312, 4541, ...]
  x float [6.44, 4.78, 8.48, 6.39, 6.43, ...]
  y float [6.38, 4.72, 8.41, 6.44, 6.36, ...]
  z float [4.03, 2.98, 4.92, 3.89, 4.0, ...]
>
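For reuse across notebooks, the shuffle-and-split step can be wrapped in a small helper. This is just a sketch (the `Split` module and `train_test_split/2` are hypothetical, not part of Explorer), built only from the calls already used above.

defmodule Split do
  alias Explorer.DataFrame, as: DF

  # Shuffle a dataframe and split it into {train, test} at the given ratio
  def train_test_split(df, train_ratio \\ 0.8) do
    n_rows = DF.n_rows(df)
    split_at = floor(train_ratio * n_rows)
    df = DF.shuffle(df)
    {DF.slice(df, 0..(split_at - 1)), DF.slice(df, split_at..-1)}
  end
end

{train_df, test_df} = Split.train_test_split(df, 0.8)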
# Convert both train and test data frames into tensors

# Select the feature columns and the target column
features = ~w(carat cut color clarity depth table x y z)
targets = ~w(price)

# Stack the column values along axis 1, so each dataframe column
# becomes a column of the resulting tensor

x_train =
  train_df[features]
  |> Nx.stack(axis: 1)

y_train =
  train_df[targets]
  |> Nx.stack(axis: 1)

x_test =
  test_df[features]
  |> Nx.stack(axis: 1)

y_test =
  test_df[targets]
  |> Nx.stack(axis: 1)
#Nx.Tensor<
  s64[10788][1]
  [
    [4798],
    [1107],
    [18120],
    [4312],
    [4541],
    [8146],
    [6098],
    [9891],
    [1817],
    [2041],
    [12119],
    [16390],
    [666],
    [881],
    [5208],
    [1192],
    [477],
    [5394],
    [7340],
    [3205],
    [4136],
    [5701],
    [928],
    [4843],
    [7781],
    [984],
    [1119],
    [4854],
    [11999],
    [9678],
    [2732],
    [755],
    [3096],
    [1578],
    [2803],
    [3246],
    [7999],
    [1175],
    [2475],
    [7140],
    [13355],
    [4751],
    [7006],
    [673],
    [2105],
    [756],
    [844],
    [1824],
    [4731],
    [6449],
    ...
  ]
>
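Before training, a quick sanity check on the shapes is worthwhile (not in the original post): the feature tensors should have 9 columns and the target tensors 1.

# Expect {rows, 9} for the features and {rows, 1} for the targets
{Nx.shape(x_train), Nx.shape(y_train), Nx.shape(x_test), Nx.shape(y_test)}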

Train the model

# The objective is the loss function the model is optimized against.
# Our task here is regression, so we use the squared-error loss.

model = EXGBoost.train(x_train, y_train, obj: :reg_squarederror)
%EXGBoost.Booster{
  ref: #Reference<0.1257637143.4252368897.206621>,
  best_iteration: nil,
  best_score: nil
}
# We can also specify evaluation sets to monitor the model during training;
# here we simply reuse the training set.

model =
  EXGBoost.train(x_train, y_train,
    obj: :reg_squarederror,
    evals: [{x_train, y_train, "train"}]
  )
Iteration 0: %{"train" => %{"rmse" => 2890.7208712453144}}
Iteration 1: %{"train" => %{"rmse" => 2134.0191851392415}}
Iteration 2: %{"train" => %{"rmse" => 1615.3131846345257}}
Iteration 3: %{"train" => %{"rmse" => 1260.5302513690365}}
Iteration 4: %{"train" => %{"rmse" => 1035.273912577993}}
Iteration 5: %{"train" => %{"rmse" => 878.4874980899773}}
Iteration 6: %{"train" => %{"rmse" => 784.2830805904829}}
Iteration 7: %{"train" => %{"rmse" => 712.8615971753834}}
Iteration 8: %{"train" => %{"rmse" => 668.4144474681958}}
Iteration 9: %{"train" => %{"rmse" => 638.9102413425762}}
%EXGBoost.Booster{
  ref: #Reference<0.1257637143.4252368897.206645>,
  best_iteration: nil,
  best_score: nil
}
# By default training stops after 10 boosting rounds, but we can change that with `:num_boost_rounds`

model =
  EXGBoost.train(x_train, y_train,
    obj: :reg_squarederror,
    evals: [{x_train, y_train, "train"}],
    num_boost_rounds: 15
  )
Iteration 0: %{"train" => %{"rmse" => 2890.7208712453144}}
Iteration 1: %{"train" => %{"rmse" => 2134.0191851392415}}
Iteration 2: %{"train" => %{"rmse" => 1615.3131846345257}}
Iteration 3: %{"train" => %{"rmse" => 1260.5302513690365}}
Iteration 4: %{"train" => %{"rmse" => 1035.273912577993}}
Iteration 5: %{"train" => %{"rmse" => 878.4874980899773}}
Iteration 6: %{"train" => %{"rmse" => 784.2830805904829}}
Iteration 7: %{"train" => %{"rmse" => 712.8615971753834}}
Iteration 8: %{"train" => %{"rmse" => 668.4144474681958}}
Iteration 9: %{"train" => %{"rmse" => 638.9102413425762}}
Iteration 10: %{"train" => %{"rmse" => 615.9900670364887}}
Iteration 11: %{"train" => %{"rmse" => 603.5636994455643}}
Iteration 12: %{"train" => %{"rmse" => 589.9516549997495}}
Iteration 13: %{"train" => %{"rmse" => 576.5182045827278}}
Iteration 14: %{"train" => %{"rmse" => 564.2201739403405}}
%EXGBoost.Booster{
  ref: #Reference<0.1257637143.4252368897.206680>,
  best_iteration: nil,
  best_score: nil
}
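To get a sense of how well the model generalizes (and not just how well it fits the training data), we can pass the test set as a second evaluation set. This is a sketch assuming `:evals` accepts more than one tuple, as the underlying XGBoost watchlist does; it otherwise reuses only the options already shown above.

# Monitor both training and test RMSE during boosting
EXGBoost.train(x_train, y_train,
  obj: :reg_squarederror,
  evals: [{x_train, y_train, "train"}, {x_test, y_test, "test"}],
  num_boost_rounds: 15
)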

Prediction

y_pred = EXGBoost.predict(model, x_test)
#Nx.Tensor<
  f32[10788]
  [4738.07568359375, 1002.7467041015625, 16921.232421875, 4359.3779296875, 4242.79638671875, 8761.689453125, 6005.74560546875, 9416.0615234375, 2289.523193359375, 1795.5733642578125, 14965.5625, 16371.611328125, 767.2431030273438, 851.5617065429688, 6894.3076171875, 1142.4537353515625, 559.9784545898438, 3998.229736328125, 7741.77880859375, 2536.752197265625, 4005.88427734375, 5831.16015625, 815.0265502929688, 4759.9404296875, 7482.3984375, 989.0425415039062, 1212.652099609375, 5602.09912109375, 11887.0732421875, 10879.662109375, 3153.29638671875, 928.280517578125, 3044.616455078125, 1689.5523681640625, 3985.532470703125, 3343.437744140625, 7342.908203125, 1276.2225341796875, 2740.32763671875, 8790.6376953125, 13003.0791015625, 4511.625, 8770.1650390625, 747.6937866210938, 1936.9466552734375, 718.6131591796875, 966.3662109375, 1207.0413818359375, 4030.13134765625, 5668.5859375, ...]
>
# We can use Scholar to evaluate these predictions

Scholar.Metrics.mean_absolute_error(Nx.squeeze(y_test), y_pred)
#Nx.Tensor<
  f32
  327.4473876953125
>
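As a cross-check against the "rmse" values logged during training, we can compute the root-mean-squared error on the test set directly with basic Nx operations (a small sketch, not from the original post).

# RMSE on the test set: square root of the mean squared difference
diff = Nx.subtract(y_pred, Nx.squeeze(y_test))

diff
|> Nx.multiply(diff)
|> Nx.mean()
|> Nx.sqrt()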
# Pairwise errors

pairwise_errors = Nx.abs(Nx.subtract(Nx.squeeze(y_test), y_pred))
#Nx.Tensor<
  f32[10788]
  [59.92431640625, 104.2532958984375, 1198.767578125, 47.3779296875, 298.20361328125, 615.689453125, 92.25439453125, 474.9384765625, 472.523193359375, 245.4266357421875, 2846.5625, 18.388671875, 101.24310302734375, 29.43829345703125, 1686.3076171875, 49.5462646484375, 82.97845458984375, 1395.770263671875, 401.77880859375, 668.247802734375, 130.11572265625, 130.16015625, 112.97344970703125, 83.0595703125, 298.6015625, 5.04254150390625, 93.652099609375, 748.09912109375, 111.9267578125, 1201.662109375, 421.29638671875, 173.280517578125, 51.383544921875, 111.5523681640625, 1182.532470703125, 97.437744140625, 656.091796875, 101.2225341796875, 265.32763671875, 1650.6376953125, 351.9208984375, 239.375, 1764.1650390625, 74.69378662109375, 168.0533447265625, 37.3868408203125, 122.3662109375, 616.9586181640625, 700.86865234375, 780.4140625, ...]
>
Nx.reduce_min(pairwise_errors) |> IO.inspect(label: "min error")
Nx.reduce_max(pairwise_errors) |> IO.inspect(label: "max error")

:ok
min error: #Nx.Tensor<
  f32
  0.01019287109375
>
max error: #Nx.Tensor<
  f32
  7115.482421875
>
:ok
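Besides the minimum and maximum, the median absolute error is a more robust summary than the mean, since it is not pulled up by a few very expensive diamonds. A small sketch using `Nx.sort`:

# Median absolute error: middle element of the sorted error tensor
sorted_errors = Nx.sort(pairwise_errors)
middle = div(Nx.size(sorted_errors), 2)
sorted_errors[middle]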
# We can also compute the distribution of the errors

# To do so, we first need to define the bin edges. Given that the
# minimum error is ~0.01 and the maximum is ~7115.5, bins of width 250
# between 0 and 7000 are a reasonable choice (anything above 7000 falls
# into the open-ended last bin)

min = 0
max = 7000
bins = Enum.to_list(min..max//250)
[0, 250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000, 3250, 3500, 3750, 4000,
 4250, 4500, 4750, 5000, 5250, 5500, 5750, 6000, 6250, 6500, 6750, 7000]
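Rather than hard-coding the upper edge, we could derive it from the observed maximum error, rounding up to the next multiple of the bin width. This is only a sketch of an alternative; we keep the fixed 0..7000 edges so the rest of the notebook matches the outputs above.

# Hypothetical alternative: compute the last bin edge from the data
bin_width = 250
max_error = pairwise_errors |> Nx.reduce_max() |> Nx.to_number()
upper_edge = bin_width * ceil(max_error / bin_width)
Enum.to_list(0..upper_edge//bin_width)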
# With our bins we can now compute the distribution of the pairwise errors

# First convert the pairwise_errors tensor to a series
# (the cast to f64 is necessary to work around this error:
# "Cannot convert binary/tensor type {:f, 32} into dtype")

errors_serie =
  pairwise_errors
  |> Nx.as_type({:f, 64})
  |> Explorer.Series.from_tensor()
#Explorer.Series<
  Polars[10788]
  float [59.92431640625, 104.2532958984375, 1198.767578125, 47.3779296875, 298.20361328125,
   615.689453125, 92.25439453125, 474.9384765625, 472.523193359375, 245.4266357421875, 2846.5625,
   18.388671875, 101.24310302734375, 29.43829345703125, 1686.3076171875, 49.5462646484375,
   82.97845458984375, 1395.770263671875, 401.77880859375, 668.247802734375, 130.11572265625,
   130.16015625, 112.97344970703125, 83.0595703125, 298.6015625, 5.04254150390625, 93.652099609375,
   748.09912109375, 111.9267578125, 1201.662109375, 421.29638671875, 173.280517578125,
   51.383544921875, 111.5523681640625, 1182.532470703125, 97.437744140625, 656.091796875,
   101.2225341796875, 265.32763671875, 1650.6376953125, 351.9208984375, 239.375, 1764.1650390625,
   74.69378662109375, 168.0533447265625, 37.3868408203125, 122.3662109375, 616.9586181640625,
   700.86865234375, 780.4140625, ...]
>
# Let's then sort the errors series and cut it into the defined bins
# using the `Explorer.Series.cut` function

histogram_df =
  errors_serie
  |> Explorer.Series.sort()
  |> Explorer.Series.cut(bins)
#Explorer.DataFrame<
  Polars[10788 x 3]
  values float [0.01019287109375, 0.01025390625, 0.01025390625, 0.040771484375, 0.15478515625, ...]
  break_point float [250.0, 250.0, 250.0, 250.0, 250.0, ...]
  category category ["(0.0, 250.0]", "(0.0, 250.0]", "(0.0, 250.0]", "(0.0, 250.0]", "(0.0, 250.0]",
   ...]
>
# The error distribution is given by the frequencies of the `category` column

distribution_df = Explorer.Series.frequencies(histogram_df["category"])

categories = Explorer.Series.to_list(distribution_df["values"])
counts = Explorer.Series.to_list(distribution_df["counts"])

data = %{categories: categories, counts: counts}
%{
  categories: ["(0.0, 250.0]", "(250.0, 500.0]", "(500.0, 750.0]", "(750.0, 1000.0]",
   "(1000.0, 1250.0]", "(1250.0, 1500.0]", "(1500.0, 1750.0]", "(2000.0, 2250.0]",
   "(1750.0, 2000.0]", "(2250.0, 2500.0]", "(2500.0, 2750.0]", "(2750.0, 3000.0]",
   "(3000.0, 3250.0]", "(3250.0, 3500.0]", "(3750.0, 4000.0]", "(3500.0, 3750.0]",
   "(4000.0, 4250.0]", "(4250.0, 4500.0]", "(4500.0, 4750.0]", "(5500.0, 5750.0]",
   "(5000.0, 5250.0]", "(7000.0, inf]"],
  counts: [7184, 1643, 737, 398, 235, 163, 125, 75, 71, 36, 33, 28, 16, 16, 8, 7, 5, 2, 2, 2, 1, 1]
}
VegaLite.new(title: "Error distribution")
|> VegaLite.data_from_values(data, only: ["categories", "counts"])
|> VegaLite.mark(:bar, tooltip: true)
# pass `:sort` so the x axis follows the order of the `categories` list
|> VegaLite.encode_field(:x, "categories", type: :nominal, sort: categories)
|> VegaLite.encode_field(:y, "counts", type: :quantitative)
|> VegaLite.encode_field(:color, "categories", type: :nominal)
{"$schema":"https://vega.github.io/schema/vega-lite/v5.json","data":{"values":[{"categories":"(0.0, 250.0]","counts":7184},{"categories":"(250.0, 500.0]","counts":1643},{"categories":"(500.0, 750.0]","counts":737},{"categories":"(750.0, 1000.0]","counts":398},{"categories":"(1000.0, 1250.0]","counts":235},{"categories":"(1250.0, 1500.0]","counts":163},{"categories":"(1500.0, 1750.0]","counts":125},{"categories":"(2000.0, 2250.0]","counts":75},{"categories":"(1750.0, 2000.0]","counts":71},{"categories":"(2250.0, 2500.0]","counts":36},{"categories":"(2500.0, 2750.0]","counts":33},{"categories":"(2750.0, 3000.0]","counts":28},{"categories":"(3000.0, 3250.0]","counts":16},{"categories":"(3250.0, 3500.0]","counts":16},{"categories":"(3750.0, 4000.0]","counts":8},{"categories":"(3500.0, 3750.0]","counts":7},{"categories":"(4000.0, 4250.0]","counts":5},{"categories":"(4250.0, 4500.0]","counts":2},{"categories":"(4500.0, 4750.0]","counts":2},{"categories":"(5500.0, 5750.0]","counts":2},{"categories":"(5000.0, 5250.0]","counts":1},{"categories":"(7000.0, inf]","counts":1}]},"encoding":{"color":{"field":"categories","type":"nominal"},"x":{"field":"categories","sort":["(0.0, 250.0]","(250.0, 500.0]","(500.0, 750.0]","(750.0, 1000.0]","(1000.0, 1250.0]","(1250.0, 1500.0]","(1500.0, 1750.0]","(2000.0, 2250.0]","(1750.0, 2000.0]","(2250.0, 2500.0]","(2500.0, 2750.0]","(2750.0, 3000.0]","(3000.0, 3250.0]","(3250.0, 3500.0]","(3750.0, 4000.0]","(3500.0, 3750.0]","(4000.0, 4250.0]","(4250.0, 4500.0]","(4500.0, 4750.0]","(5500.0, 5750.0]","(5000.0, 5250.0]","(7000.0, inf]"],"type":"nominal"},"y":{"field":"counts","type":"quantitative"}},"mark":{"tooltip":true,"type":"bar"},"title":"Error distribution"}

The graph of the error distribution does not give us much new information; knowing that the mean error is around $330 is probably enough. Nevertheless, it was a fun ride. 🎉