Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

RideAlong Prediction Accuracy pt2

livebook/prdiction_accuracy_2.livemd

RideAlong Prediction Accuracy pt2

# Livebook dependency setup: Kino for UI widgets, the VegaLite bindings for
# charts, and the RideAlong application itself loaded from the repository
# root (one directory above this livebook).
Mix.install(
  [
    {:kino, "~> 0.13"},
    {:kino_vega_lite, "~> 0.1.13"},
    # Load the project from disk so its modules are callable in cells below.
    {:ride_along, path: Path.join(__DIR__, ".."), env: :dev}
  ],
  # Reuse the application's own config file for consistent settings.
  config_path: Path.join([__DIR__, "..", "config", "config.exs"]),
  # Do not auto-start applications; cells below start what they need.
  start_applications: false
)

Load/Group Data

# Load the exported trip data from CSV into an Explorer DataFrame.
# `require` (not `alias`) is needed so DF's filter/mutate macros expand.
require Explorer.DataFrame, as: DF
alias Explorer.{Duration, Series}
alias VegaLite, as: Vl

df =
  "data.csv"
  |> Kino.FS.file_path()
  |> DF.from_csv!(
    parse_dates: true,
    # Treat empty strings as missing values rather than literal "".
    nil_values: [""],
    dtypes: %{status: :category}
  )
  # Drop rows without a positive route number — presumably placeholder or
  # invalid trips; TODO confirm against the exporter.
  |> DF.filter(route > 0)

defmodule Support do
  @moduledoc """
  DateTime helpers for snapping timestamps to minute boundaries.
  """

  @doc "Floors `dt` to the minute by zeroing its seconds and microseconds."
  def truncate_to_minute(%DateTime{} = dt) do
    Map.merge(dt, %{second: 0, microsecond: {0, 0}})
  end

  @doc """
  Rounds `dt` up to the next minute boundary; a DateTime already on a minute
  boundary is returned unchanged. The zone is forced to "Etc/UTC" so
  `DateTime.add/3` does not need a time zone database.
  """
  def round_up_to_minute(%DateTime{second: second, microsecond: {microsecond, _precision}} = dt)
      when second > 0 or microsecond > 0 do
    # Bug fix: the previous version computed this pipeline, discarded the
    # result, and returned the original `dt`. Return the rounded value.
    dt
    |> Map.put(:time_zone, "Etc/UTC")
    |> DateTime.add(1, :minute)
    |> Map.merge(%{second: 0, microsecond: {0, 0}})
  end

  def round_up_to_minute(dt) do
    dt
  end
end
  
:ok
# Start Kino and hot-reload the project's model/training modules so edits to
# their source files are picked up without restarting the Livebook runtime.
Application.ensure_all_started(:kino)
IEx.Helpers.r(RideAlong.EtaCalculator.Model)
IEx.Helpers.r(RideAlong.EtaCalculator.Training)
alias RideAlong.EtaCalculator.Model
alias RideAlong.EtaCalculator.Training

# Ground-truth arrival time per {trip_id, route} pair.
arrival_times = Training.arrival_times(df)
# Earlier inline version of the same computation, kept for reference:
#arrival_times = df |> DF.group_by([:trip_id, :route])
#|> DF.filter(not is_nil(pickup_arrival))
#|> DF.summarise(arrival_time: Series.min(pickup_arrival))

# Sanity check: summary statistics of how far (seconds) the database's
# pickup_arrival differs from the computed arrival_time.
df
|> DF.select([:trip_id, :route, :pickup_arrival])
|> DF.filter(not is_nil(pickup_arrival))
|> DF.distinct()
|> DF.join(arrival_times, on: [:trip_id, :route])
|> DF.mutate(database_arrival_diff: Training.diff_seconds(arrival_time, pickup_arrival))
#|> DF.filter(database_arrival_diff >= 0 and database_arrival_diff <= 60)
|> DF.select([:database_arrival_diff])
|> DF.describe()
|> Kino.DataTable.new()
# Duplicate alias (already aliased above); harmless in a Livebook cell.
alias Explorer.Duration
IEx.Helpers.r(RideAlong.EtaCalculator.Model)
IEx.Helpers.r(RideAlong.EtaCalculator.Training)

# Feature columns the model consumes.
training_fields = Model.feature_names()

# Fixed seed so splits and training runs are reproducible; uncomment the
# expression in the trailing comment to randomize.
seed = 4055183217 # Enum.random(0..(Integer.pow(2, 32) - 1))

# Sort chronologically so the slice-based split below is a time-based split.
df = DF.sort_by(df, asc: time)
df =
  df
  |> DF.join(arrival_times, on: [:trip_id, :route])
  |> Training.populate()
  # ors_eta = prediction time + ors_duration seconds (1_000 ms per unit);
  # "ors" is presumably OpenRouteService — TODO confirm.
  |> DF.mutate(ors_eta: time + %Duration{value: 1_000, precision: :millisecond} * ors_duration)

# Hold out the most recent 10% of rows for validation.
size = Series.size(df[:time])
validation_size = trunc(size * 0.1)
train_size = size - validation_size

# Oldest train_size rows become the training set.
train_df = DF.slice(df, 0..train_size-1)

# Feature matrix: one row per observation, one column per feature.
x =
  train_df
  |> DF.select(training_fields)
  |> Nx.stack(axis: 1)

# Training label (name suggests: seconds to add on top of ors_duration).
y = DF.select(train_df, :ors_to_add) |> Nx.concatenate()

# From here on, `df` holds only the held-out validation tail.
# NOTE(review): `train_size..size` includes index `size`, one past the last
# valid row index (0..size-1); `train_size..(size - 1)` looks intended.
# Verify whether Explorer clamps out-of-range slice endpoints.
df =
  df
  |> DF.slice(train_size .. size)

validate_x =
  df
  |> DF.select(training_fields)
  |> Nx.stack(axis: 1)

validate_y = DF.select(df, :ors_to_add) |> Nx.concatenate()

# Training options: the project's defaults plus a fixed seed for
# reproducibility. Fix: removed the trailing comma after `seed: seed` —
# with only comments between it and `]`, Elixir rejects the list as a
# syntax error (trailing commas are not allowed).
opts = Keyword.merge(Training.training_params(), [
  seed: seed
  # Knobs kept around for interactive experimentation:
  #early_stopping_rounds: 5,
  #verbose_eval: true,
  #verbose_eval: false,
  #evals: [{validate_x, validate_y, "validate"}]
  #num_boost_rounds: 4000
])

IO.puts("About to train (using seed #{seed})...")
model = EXGBoost.train(x, y, opts)
IO.puts("Trained!")

# Alternative: load the previously persisted model instead of retraining.
#model = Model.read_model()

IO.puts(inspect(model))
:ok
IEx.Helpers.r(RideAlong.EtaCalculator.Model)
IEx.Helpers.r(RideAlong.EtaCalculator.Training)

# Per-row predicted offset (seconds) for the validation frame.
predicted = Training.predict_from_data_frame(model, df)

df = df
  |> DF.mutate(
    # model ETA = prediction time + predicted seconds (1_000 ms per unit).
    model: time + %Duration{value: 1_000, precision: :millisecond} * ^predicted
  )
  # NOTE(review): :model_to_add is not created in this cell — confirm that
  # Training.populate/1 adds it upstream; otherwise this discard may raise.
  |> DF.discard([:model_to_add])

# Serialized model size in MiB (Universal Binary JSON dump).
[model_size: byte_size(EXGBoost.dump_model(model, format: :ubj)) / 1024.0 / 1024.0]

Accuracy Analysis

IEx.Helpers.r(RideAlong.EtaCalculator.Training)

# ETA columns to compare: the vendor's `pick` estimate vs our `model`.
fields = [:pick, :model]
# Fix: the scraped source had the HTML entity `&amp;` here and in the
# Enum.map below; restored the capture operator `&` so the cell compiles.
accuracy = &Training.accuracy/1

# Builds a one-row summary frame for `field`: median error in minutes plus
# the 25–75% and 5–95% quantile bands of the arrival-time error.
quantiles = fn df, field ->
  df = Training.with_accuracy(df, :time, :arrival_time, field, accuracy)

  [five, twenty_five, seventy_five, ninety_five] =
    for q <- [0.05, 0.25, 0.75, 0.95] do
      (Series.quantile(df[:diff], q) / 60) |> Float.round(1)
    end

  DF.new(%{
    :field => ["#{field}"],
    :median => [Float.round(Series.median(df[:diff]) / 60, 1)],
    "50%" => ["#{twenty_five} - #{seventy_five} (#{Float.round(abs(seventy_five - twenty_five), 1)})"],
    "90%" => ["#{five} - #{ninety_five} (#{Float.round(abs(ninety_five - five), 1)})"],
    :size => [Series.size(df[field])]
  })
end

# One summary row per compared field, rendered as a table.
fields
|> Enum.map(&quantiles.(df, &1))
|> DF.concat_rows()
|> DF.select([:field, :median, "50%", "90%", :size])
|> Kino.DataTable.new()
# Long-format frame for plotting: one (field, diff-in-minutes) row per trip,
# accumulated across all compared fields.
vl_df = for field <- fields, reduce: DF.new(%{field: [], diff: []}, dtypes: [field: :string, diff: {:s, 64}]) do
  acc ->
    field = "#{field}"
    df = Training.with_accuracy(df, :time, :arrival_time, field, accuracy)
    |> DF.mutate(diff: diff / 60, field: ^field)
    DF.concat_rows(acc, DF.select(df, [:field, :diff]))
end

# Box plot of the error distribution (minutes) per prediction source.
Vl.new(height: 200, width: 800)
|> Vl.data_from_values(vl_df)
|> Vl.mark(:boxplot, extent: "min-max")
|> Vl.encode_field(:x, "diff", type: :quantitative, scale: [zero: false], outliers: false)
|> Vl.encode_field(:y, "field", type: :nominal)

# Single overall-accuracy number for each prediction source.
for field <- fields do
  %{
    "field" => "#{field}",
    # overall_accuracy returns a frame; take row 0 of its :accuracy column.
    "accuracy" => Training.overall_accuracy(df, :time, :arrival_time, field, accuracy)[:accuracy][0]
  }
end
|> Kino.DataTable.new(name: "Overall Accuracy %", keys: ["field", "accuracy"])
IEx.Helpers.r(RideAlong.EtaCalculator.Training)
# Per-category accuracy breakdown, one table per prediction source,
# arranged side by side in a two-column grid.
for field <- fields do
  df
  |> Training.grouped_accuracy(:time, :arrival_time, field, accuracy) 
  |> DF.select([:category, :accuracy, :size, :accurate_count, :early_count, :late_count])
  |> Kino.DataTable.new(name: field)
end
|> Kino.Layout.grid(columns: 2)
# Worst misses: rows where the model ETA was off by 45+ minutes (2700 s)
# even though the prediction was made within 30 minutes (1800 s) of the
# actual arrival.
df
|> DF.mutate(diff: Training.diff_seconds(arrival_time, model))
|> DF.filter(abs(diff) >= 2700 and Training.diff_seconds(arrival_time, time) < 1800)
|> DF.select([:time, :trip_id, :route, :status, :arrival_time, :promise, :pick, :model, :ors_eta])
|> Kino.DataTable.new()

Model Parameter Tuning

# Grid search over hyperparameters, each candidate evaluated on the
# held-out validation frame.
# NOTE(review): `best_result` is never read or updated below — dead code,
# kept in case a later (unseen) cell references it.
best_result = %{
  opts: [],
  accuracy: 0.0,
  ratio: 0.0
}

validate_x =
  df
  |> DF.select(training_fields)
  |> Nx.stack(axis: 1)

validate_y = DF.select(df, :ors_to_add) |> Nx.concatenate()

results =
  for max_depth <- [8],
      num_boost_rounds <- [50] do
    new_opts = [
      max_depth: max_depth,
      num_boost_rounds: num_boost_rounds,
      seed: seed,
      early_stopping_rounds: 20,
      evals: [{validate_x, validate_y, "validate"}]
    ]

    opts = Keyword.merge(Training.training_params(), new_opts)
    model = EXGBoost.train(x, y, opts)

    predicted = Training.predict_from_data_frame(model, df)

    # Rebind df locally with this candidate's ETA column; the outer df is
    # untouched after the comprehension.
    df =
      df
      |> DF.put(:model_to_add, predicted)
      |> DF.mutate(
        model:
          time +
            %Duration{value: 1_000, precision: :millisecond} * (ors_duration + model_to_add)
      )
      |> DF.discard([:model_to_add])

    overall =
      Training.overall_accuracy(df, :time, :arrival_time, :model, accuracy)[:accuracy][0]

    size_mb = byte_size(EXGBoost.dump_model(model, format: :ubj)) / 1024.0 / 1024.0

    # Fix: removed the trailing comma after `model_size: size_mb` — Elixir
    # rejects a trailing comma before `}` as a syntax error.
    %{
      #opts: Jason.encode!(Map.new(new_opts)),
      max_depth: max_depth,
      num_boost_rounds: num_boost_rounds,
      accuracy: overall,
      model_size: size_mb
    } |> IO.inspect()
  end

:ok
# Scatter of accuracy vs serialized model size, colored by max_depth,
# one point per grid-search combination.
Vl.new()
|> Vl.data_from_values(results)
|> Vl.mark(:point, tooltip: [content: :data])
|> Vl.encode_field(:color, "max_depth", type: :nominal)
|> Vl.encode_field(:y, "accuracy", type: :quantitative, scale: [zero: false])
|> Vl.encode_field(:x, "model_size", type: :quantitative)