# Benchmarking Eager Summary
Mix.install([
:dux,
:explorer,
:benchee,
:kino_benchee,
:kino
])
## Helpers
require Explorer.DataFrame
require Dux
alias Explorer.DataFrame
## Data Generator
defmodule Data do
  @moduledoc """
  Generates synthetic sales rows used as input for the benchmarks.
  """

  @regions ~w(North South East West Central)
  @products ~w(Widget Gadget Doohickey Thingamajig Gizmo)

  @doc """
  Returns a list of `num_rows` randomly generated sales rows.

  Each row is a map with the keys:

    * `:region`   - one of the entries in `@regions`
    * `:product`  - one of the entries in `@products`
    * `:quantity` - integer in `1..100`
    * `:price`    - float in `0.0..500.0`, rounded to two decimals
    * `:date`     - a `Date` in 2025, drawn independently for every row

  Returns `[]` when `num_rows` is zero or negative.
  """
  @spec gen(integer()) :: [map()]
  def gen(num_rows) do
    # The explicit `//1` step keeps the range empty for `num_rows <= 0`;
    # a bare `1..0` counts downwards and would produce two rows.
    for _ <- 1..num_rows//1, do: row()
  end

  # Builds a single random row. The date is drawn here (per row) rather than
  # once per `gen/1` call, so a data set does not end up with one repeated date.
  defp row do
    %{
      region: Enum.random(@regions),
      product: Enum.random(@products),
      quantity: :rand.uniform(100),
      price: Float.round(:rand.uniform() * 500, 2),
      # Days are capped at 28 so the date is valid in every month.
      date: Date.new!(2025, :rand.uniform(12), :rand.uniform(28))
    }
  end
end
## Initialize test data
# Generate one data set per size tier (1K, 100K, 1M and 10M rows), smallest first.
[sm_data_set, md_data_set, lg_data_set, xl_data_set] =
  Enum.map([1_000, 100_000, 1_000_000, 10_000_000], &Data.gen/1)

# Finish on a small value; presumably this keeps the notebook from rendering
# the generated lists as the cell result.
:ok
IO.puts("Generating Explorer DataFrames")

# Convert every data set into an Explorer DataFrame and report how long each
# conversion took.
[sm_df, md_df, lg_df, xl_df] =
  for rows <- [sm_data_set, md_data_set, lg_data_set, xl_data_set] do
    # :timer.tc/1 returns {elapsed_microseconds, result}.
    {elapsed_us, frame} = :timer.tc(fn -> DataFrame.new(rows) end)
    elapsed_ms = elapsed_us / 1_000.0

    IO.puts("Time to process #{length(rows)} entries: #{elapsed_ms}ms")
    frame
  end
IO.puts("Generating Dux")

# Load every data set into Dux and report how long each load took. The timed
# section covers both `Dux.from_list/1` and `Dux.compute/1`.
[sm_dux, md_dux, lg_dux, xl_dux] =
  for rows <- [sm_data_set, md_data_set, lg_data_set, xl_data_set] do
    # :timer.tc/1 returns {elapsed_microseconds, result}.
    {elapsed_us, computed} = :timer.tc(fn -> Dux.compute(Dux.from_list(rows)) end)
    elapsed_ms = elapsed_us / 1_000.0

    IO.puts("Time to process #{length(rows)} entries: #{elapsed_ms}ms")
    computed
  end
## Benchmark
# One-off run of the Explorer summary on the extra-large frame, outside Benchee:
# total quantity and mean price per region.
DataFrame.summarise(
  DataFrame.group_by(xl_df, :region),
  total: sum(quantity),
  avg_price: mean(price)
)
# One-off run of the Dux summary on the extra-large data set, outside Benchee:
# total quantity and average price per region.
Dux.compute(
  Dux.summarise(
    Dux.group_by(xl_dux, :region),
    total: sum(quantity),
    avg_price: avg(price)
  )
)
# Every Benchee input is a `{explorer_df, dux}` tuple; each job picks its own
# element and ignores the other.
explorer_job = fn {df, _dux} ->
  DataFrame.summarise(
    DataFrame.group_by(df, :region),
    total: sum(quantity),
    avg_price: mean(price)
  )
end

dux_job = fn {_df, dux} ->
  Dux.compute(
    Dux.summarise(
      Dux.group_by(dux, :region),
      total: sum(quantity),
      avg_price: avg(price)
    )
  )
end

benchmark_inputs = %{
  # "Small data set" => {sm_df, sm_dux},
  # "Medium data set" => {md_df, md_dux},
  # "Large data set" => {lg_df, lg_dux},
  "Extra-large data set" => {xl_df, xl_dux}
}

# Benchee durations are given in seconds.
Benchee.run(
  %{"Explorer summary" => explorer_job, "Dux summary" => dux_job},
  inputs: benchmark_inputs,
  warmup: 2,
  time: 2,
  memory_time: 2
)