Ride Along Prediction Accuracy pt2
Mix.install(
[
{:kino, "~> 0.13"},
{:kino_vega_lite, "~> 0.1.13"},
{:ride_along, path: Path.join(__DIR__, ".."), env: :dev}
],
config_path: Path.join([__DIR__, "..", "config", "config.exs"]),
start_applications: false
)
Load/Group Data
require Explorer.DataFrame, as: DF
alias Explorer.{Duration, Series}
alias VegaLite, as: Vl
# Resolve the notebook attachment "data.csv" to a path on disk.
csv_path = Kino.FS.file_path("data.csv")

# Parse date columns, read empty strings as nil and load `status` as a category.
csv_opts = [
  parse_dates: true,
  nil_values: [""],
  dtypes: %{status: :category}
]

# Only rows whose `route` is greater than 0 are kept for the rest of the notebook.
df =
  csv_path
  |> DF.from_csv!(csv_opts)
  |> DF.filter(route > 0)
defmodule Support do
  @moduledoc """
  Minute-granularity rounding helpers for `DateTime` values.
  """

  @doc """
  Drops the seconds and the sub-second part of `dt`.

  Every other field is kept; the microsecond precision is reset to 0.
  """
  @spec truncate_to_minute(DateTime.t()) :: DateTime.t()
  def truncate_to_minute(%DateTime{} = dt) do
    %{dt | second: 0, microsecond: {0, 0}}
  end

  @doc """
  Rounds `dt` up to the start of the next minute.

  A value that already sits on a minute boundary (zero seconds and zero
  microseconds) is returned unchanged, and so is any argument that is not a
  `DateTime`.
  """
  @spec round_up_to_minute(DateTime.t()) :: DateTime.t()
  def round_up_to_minute(%DateTime{second: second, microsecond: {microsecond, _precision}} = dt)
      when second > 0 or microsecond > 0 do
    dt
    # `DateTime.add/4` needs a time zone database for zones other than UTC, so
    # the zone is relabelled first; the result therefore reports "Etc/UTC".
    # NOTE(review): this presumes the incoming values are already UTC - confirm.
    |> Map.put(:time_zone, "Etc/UTC")
    |> DateTime.add(1, :minute)
    # The rounded value is the result of the function; previously it was
    # computed and then thrown away in favour of the untouched input.
    |> truncate_to_minute()
  end

  def round_up_to_minute(dt), do: dt
end

:ok
# `Mix.install/2` above ran with `start_applications: false`, so Kino has to be
# started explicitly before any of its widgets are used.
Application.ensure_all_started(:kino)

alias RideAlong.EtaCalculator.Model
alias RideAlong.EtaCalculator.Training

# Recompile and reload both modules so edits made on disk are picked up.
IEx.Helpers.r(Model)
IEx.Helpers.r(Training)

arrival_times = Training.arrival_times(df)

# Commented-out inline alternative (presumably what `Training.arrival_times/1`
# replaced - confirm before deleting):
# arrival_times = df |> DF.group_by([:trip_id, :route])
# |> DF.filter(not is_nil(pickup_arrival))
# |> DF.summarise(arrival_time: Series.min(pickup_arrival))

# One row per distinct (trip, route, recorded pickup arrival) combination.
recorded_arrivals =
  df
  |> DF.select([:trip_id, :route, :pickup_arrival])
  |> DF.filter(not is_nil(pickup_arrival))
  |> DF.distinct()

# Summary statistics of the gap between the derived `arrival_time` and the
# `pickup_arrival` value found in the data.
recorded_arrivals
|> DF.join(arrival_times, on: [:trip_id, :route])
|> DF.mutate(database_arrival_diff: Training.diff_seconds(arrival_time, pickup_arrival))
# |> DF.filter(database_arrival_diff >= 0 and database_arrival_diff <= 60)
|> DF.select([:database_arrival_diff])
|> DF.describe()
|> Kino.DataTable.new()
# Training cell: enrich the data, split it into a leading training part and a
# trailing 10% validation part, then fit an EXGBoost model on the training part.
alias Explorer.Duration
# Recompile and reload both modules so edits made on disk are picked up.
IEx.Helpers.r(RideAlong.EtaCalculator.Model)
IEx.Helpers.r(RideAlong.EtaCalculator.Training)
training_fields = Model.feature_names()
# Fixed seed handed to the trainer through `opts`; the trailing comment shows
# how a fresh random one could be drawn instead.
seed = 4055183217 # Enum.random(0..(Integer.pow(2, 32) - 1))
df = DF.sort_by(df, asc: time)
df =
df
|> DF.join(arrival_times, on: [:trip_id, :route])
|> Training.populate()
# `ors_eta` is `time` plus `ors_duration` one-second steps
# (presumably `ors_duration` is in seconds - confirm).
|> DF.mutate(ors_eta: time + %Duration{value: 1_000, precision: :millisecond} * ors_duration)
size = Series.size(df[:time])
# Hold out 10% of the rows (rounded down) for validation.
validation_size = trunc(size * 0.1)
train_size = size - validation_size
# Training rows are the first `train_size` rows.
# NOTE(review): this is a chronological split only if the join/populate steps
# above keep the earlier sort order - confirm.
train_df = DF.slice(df, 0..train_size-1)
# Feature columns stacked along axis 1; targets come from `ors_to_add`.
x =
train_df
|> DF.select(training_fields)
|> Nx.stack(axis: 1)
y = DF.select(train_df, :ors_to_add) |> Nx.concatenate()
# From here on `df` is rebound to the held-out tail only; every later cell that
# reads `df` therefore works on the validation rows.
# NOTE(review): the range ends at `size`, one past the last row index; this
# relies on out-of-range ends being tolerated - confirm.
df =
df
|> DF.slice(train_size .. size)
# Validation tensors; not used by the training call below while `evals` stays
# commented out in `opts`.
validate_x =
df
|> DF.select(training_fields)
|> Nx.stack(axis: 1)
validate_y = DF.select(df, :ors_to_add) |> Nx.concatenate()
# Project defaults from `Training.training_params/0`, overridden by the seed.
# The commented entries are switched-off experiments.
opts = Keyword.merge(Training.training_params(), [
seed: seed,
#early_stopping_rounds: 5,
#verbose_eval: true,
#verbose_eval: false,
#evals: [{validate_x, validate_y, "validate"}]
#num_boost_rounds: 4000
])
IO.puts("About to train (using seed #{seed})...")
model = EXGBoost.train(x, y, opts)
IO.puts("Trained!")
# Alternative: load a previously stored model instead of training.
#model = Model.read_model()
IO.puts(inspect(model))
# Finish on :ok so the cell does not render the model as its result.
:ok
# Recompile and reload both modules so edits made on disk are picked up.
IEx.Helpers.r(RideAlong.EtaCalculator.Model)
IEx.Helpers.r(RideAlong.EtaCalculator.Training)

predicted = Training.predict_from_data_frame(model, df)

# `model` is `time` plus `predicted` one-second steps.
# NOTE(review): the parameter-tuning cell further down builds `model` as
# `time + (ors_duration + model_to_add)` seconds instead - confirm which form
# matches what `Training.predict_from_data_frame/2` returns.
df =
  df
  |> DF.mutate(model: time + %Duration{value: 1_000, precision: :millisecond} * ^predicted)
  # No `model_to_add` column is created in this cell, so this only has an
  # effect if the column is already present on `df`.
  |> DF.discard([:model_to_add])

# Size of the UBJ dump of the model, in MiB, shown as the cell result.
dumped_model = EXGBoost.dump_model(model, format: :ubj)
[model_size: byte_size(dumped_model) / 1024.0 / 1024.0]
Accuracy Analysis
# Recompile and reload the module so edits made on disk are picked up.
IEx.Helpers.r(RideAlong.EtaCalculator.Training)

# Prediction columns that get compared against `arrival_time`.
fields = [:pick, :model]
accuracy = &Training.accuracy/1

# Builds a one-row data frame for `field`: the median of the `diff` column plus
# the 25%-75% and 5%-95% quantile ranges (with their widths in parentheses) and
# the number of rows.
quantiles = fn df, field ->
  scored = Training.with_accuracy(df, :time, :arrival_time, field, accuracy)
  diffs = scored[:diff]

  # `diff` is divided by 60 and rounded to one decimal
  # (presumably seconds -> minutes; confirm against `Training.with_accuracy`).
  to_minutes = fn value -> Float.round(value / 60, 1) end

  [p05, p25, p75, p95] =
    Enum.map([0.05, 0.25, 0.75, 0.95], fn q ->
      to_minutes.(Series.quantile(diffs, q))
    end)

  DF.new(%{
    :field => ["#{field}"],
    :median => [to_minutes.(Series.median(diffs))],
    "50%" => ["#{p25} - #{p75} (#{Float.round(abs(p75 - p25), 1)})"],
    "90%" => ["#{p05} - #{p95} (#{Float.round(abs(p95 - p05), 1)})"],
    :size => [Series.size(scored[field])]
  })
end

# One summary row per field, stacked into a single table.
fields
|> Enum.map(fn field -> quantiles.(df, field) end)
|> DF.concat_rows()
|> DF.select([:field, :median, "50%", "90%", :size])
|> Kino.DataTable.new()
# Long-format frame for plotting: one row per (field, diff) pair, with `diff`
# divided by 60. Rows for each field are appended to an initially empty frame.
vl_df =
  Enum.reduce(
    fields,
    DF.new(%{field: [], diff: []}, dtypes: [field: :string, diff: {:s, 64}]),
    fn field, acc ->
      label = "#{field}"

      rows =
        df
        |> Training.with_accuracy(:time, :arrival_time, label, accuracy)
        |> DF.mutate(diff: diff / 60, field: ^label)
        |> DF.select([:field, :diff])

      DF.concat_rows(acc, rows)
    end
  )
# Horizontal box plot of `diff` per field, with whiskers spanning min to max.
Vl.new(height: 200, width: 800)
|> Vl.data_from_values(vl_df)
# `outliers` is a property of the box-plot mark, not of an encoding channel,
# so it is set here rather than on the x encoding.
|> Vl.mark(:boxplot, extent: "min-max", outliers: false)
|> Vl.encode_field(:x, "diff", type: :quantitative, scale: [zero: false])
|> Vl.encode_field(:y, "field", type: :nominal)
# One table row per field, holding the first value of the `:accuracy` column
# returned by `Training.overall_accuracy/5`.
fields
|> Enum.map(fn field ->
  overall = Training.overall_accuracy(df, :time, :arrival_time, field, accuracy)

  %{
    "field" => "#{field}",
    "accuracy" => overall[:accuracy][0]
  }
end)
|> Kino.DataTable.new(name: "Overall Accuracy %", keys: ["field", "accuracy"])
# Recompile and reload the module so edits made on disk are picked up.
IEx.Helpers.r(RideAlong.EtaCalculator.Training)

# One per-category accuracy table for each field, laid out two per row.
fields
|> Enum.map(fn field ->
  df
  |> Training.grouped_accuracy(:time, :arrival_time, field, accuracy)
  |> DF.select([:category, :accuracy, :size, :accurate_count, :early_count, :late_count])
  |> Kino.DataTable.new(name: field)
end)
|> Kino.Layout.grid(columns: 2)
# Rows where the model prediction is off by at least 2700 (in either direction)
# while the arrival/observation gap is below 1800.
# NOTE(review): presumably both thresholds are seconds (45 and 30 minutes), as
# the `diff_seconds` name suggests - confirm, including the sign convention.
large_misses =
  df
  |> DF.mutate(diff: Training.diff_seconds(arrival_time, model))
  |> DF.filter(abs(diff) >= 2700 and Training.diff_seconds(arrival_time, time) < 1800)

large_misses
|> DF.select([:time, :trip_id, :route, :status, :arrival_time, :promise, :pick, :model, :ors_eta])
|> Kino.DataTable.new()
# Recompile and reload the module so edits made on disk are picked up.
IEx.Helpers.r(RideAlong.EtaCalculator.Training)

# NOTE(review): unlike the other cells, `with_accuracy` receives `:model` where
# they pass `:arrival_time`, and `:pick` where they pass the field under
# evaluation - confirm this is the intended way to derive `category` here.
by_category =
  df
  |> Training.with_accuracy(:time, :model, :pick, accuracy)
  |> DF.group_by(:category)

# `diff` is overwritten with the arrival-vs-model difference before the median
# and standard deviation are computed per category.
by_category
|> DF.mutate(diff: Training.diff_seconds(arrival_time, model))
|> DF.summarise(median: Series.median(diff), std: Series.standard_deviation(diff))
|> DF.sort_by(asc: category)
|> Kino.DataTable.new()
Model Parameter Tuning
# Parameter sweep: train one model per (max_depth, num_boost_rounds) pair and
# record its overall accuracy and dumped size.
# `best_result` is not read again anywhere in the visible cells.
best_result = %{
opts: [],
accuracy: 0.0,
ratio: 0.0
}
# Evaluation tensors built from `df`, which the training cell rebound to the
# held-out tail of the data.
validate_x =
df
|> DF.select(training_fields)
|> Nx.stack(axis: 1)
validate_y = DF.select(df, :ors_to_add) |> Nx.concatenate()
# Each generator currently lists a single value, so exactly one model is trained;
# add values to the lists to widen the sweep.
results =
for max_depth <- [8],
num_boost_rounds <- [50] do
new_opts = [
max_depth: max_depth,
num_boost_rounds: num_boost_rounds,
seed: seed,
early_stopping_rounds: 20,
evals: [{validate_x, validate_y, "validate"}]
]
# `opts`, `model`, `predicted` and `df` bound inside the comprehension are local
# to it; the outer bindings of the same names are left untouched.
opts = Keyword.merge(Training.training_params(), new_opts)
model = EXGBoost.train(x, y, opts)
predicted = Training.predict_from_data_frame(model, df)
# NOTE(review): here `model` is `time + (ors_duration + model_to_add)` seconds,
# whereas the prediction cell above uses `time + predicted` seconds - confirm
# which form matches what `Training.predict_from_data_frame/2` returns.
df =
df
|> DF.put(:model_to_add, predicted)
|> DF.mutate(
model:
time +
%Duration{value: 1_000, precision: :millisecond} * (ors_duration + model_to_add)
)
|> DF.discard([:model_to_add])
overall =
Training.overall_accuracy(df, :time, :arrival_time, :model, accuracy)[:accuracy][0]
# Size of the UBJ dump of the model, in MiB.
size_mb = byte_size(EXGBoost.dump_model(model, format: :ubj)) / 1024.0 / 1024.0
# Printed as it is produced and also collected into `results`.
%{
#opts: Jason.encode!(Map.new(new_opts)),
max_depth: max_depth,
num_boost_rounds: num_boost_rounds,
accuracy: overall,
model_size: size_mb,
} |> IO.inspect()
end
# Finish on :ok so the cell does not render `results` as its output.
:ok
# Scatter plot of the sweep results: model size on x, accuracy on y, coloured
# by `max_depth`; the tooltip shows every field of the hovered row.
points =
  Vl.new()
  |> Vl.data_from_values(results)
  |> Vl.mark(:point, tooltip: [content: :data])

points
|> Vl.encode_field(:color, "max_depth", type: :nominal)
|> Vl.encode_field(:y, "accuracy", type: :quantitative, scale: [zero: false])
|> Vl.encode_field(:x, "model_size", type: :quantitative)