Median of medians
Section
########################
# Median-of-Medians (BFPRT) selection algorithm
#
# This implementation exposes a public `select/2` function that returns the
# *k-th* smallest element (0-based index) of a list in deterministic linear
# time. Internally it uses the "median of medians" strategy to pick a good
# pivot and guarantee worst-case O(n) complexity.
#
########################
defmodule MedianOfMedians do
@moduledoc """
Deterministic *O(n)* selection (**BFPRT**) with low overhead.
Public API:
* `select(list, k)` – element that would sit at 0‑based index *k* if the
list were sorted.
"""
@doc """
Return the k‑th smallest element (0‑based) of `list`.
Raises `ArgumentError` when `k` is out of range.
"""
@spec select(list(number()), non_neg_integer()) :: number()
def select(list, k) when is_list(list) and is_integer(k) and k >= 0 do
n = length(list)
if k >= n, do: raise(ArgumentError, "k out of range (#{k} ≥ #{n})")
do_select(list, k)
end
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
# For small lists sort is cheapest – empirically 40 is a sweet‑spot on BEAM.
@small_cutoff 100
defp do_select(list, k) when length(list) <= @small_cutoff do
Enum.sort(list) |> Enum.at(k)
end
defp do_select(list, k) do
pivot = choose_pivot(list)
{smaller, larger, n_small, n_equal} = partition(list, pivot)
cond do
k < n_small -> do_select(smaller, k)
k < n_small + n_equal -> pivot
true -> do_select(larger, k - n_small - n_equal)
end
end
# Choose pivot via median‑of‑medians (groups of 5)
defp choose_pivot(list) do
list
|> Enum.chunk_every(5)
|> Enum.map(&median_of_group/1)
|> case do
[single] -> single
medians -> do_select(medians, div(length(medians), 2))
end
end
# Median of ≤5 elements using sort
defp median_of_group(group) do
Enum.sort(group) |> Enum.at(div(length(group) - 1, 2))
end
# ------------------------------------------------------------------
# Tight tail‑recursive partition counting sizes *while* building lists
# ------------------------------------------------------------------
@spec partition(list(), number()) :: {list(), list(), non_neg_integer(), non_neg_integer()}
defp partition(list, pivot), do: partition(list, pivot, [], [], 0, 0)
defp partition([], _pivot, small, large, n_small, n_equal),
do: {small, large, n_small, n_equal}
defp partition([h | t], pivot, small, large, n_small, n_equal) do
cond do
h < pivot -> partition(t, pivot, [h | small], large, n_small + 1, n_equal)
h > pivot -> partition(t, pivot, small, [h | large], n_small, n_equal)
true -> partition(t, pivot, small, large, n_small, n_equal + 1)
end
end
end
############################################################################
# Test‑suite
############################################################################
ExUnit.start()
defmodule MedianOfMediansTest do
use ExUnit.Case, async: true
alias MedianOfMedians, as: MOM
@tag :basic
test "select on small static lists" do
assert MOM.select([2], 0) == 2
assert MOM.select([3, 1], 0) == 1
assert MOM.select([3, 1], 1) == 3
assert MOM.select([3, 1, 4, 1, 5], 2) == 3
end
@tag :random
test "select matches Enum.sort/2 for random samples" do
# Seed once for reproducibility; :exsplus is available since OTP 24
:rand.seed(:exsplus, {101, 102, 103})
for n <- 1..200 do
list = Enum.to_list(1..n) |> Enum.shuffle()
k = :rand.uniform(n) - 1
assert MOM.select(list, k) == Enum.sort(list) |> Enum.at(k)
end
end
@tag :edge
test "errors on k out of range" do
assert_raise ArgumentError, fn -> MOM.select([], 0) end
assert_raise ArgumentError, fn -> MOM.select([1, 2, 3], 3) end
end
end
ExUnit.run()
Running ExUnit with seed: 557566, max_cases: 22
...
Finished in 0.00 seconds (0.00s async, 0.00s sync)
3 tests, 0 failures
%{total: 3, failures: 0, excluded: 0, skipped: 0}
############################################################################
# Benchmark (Benchee)
# -------------------------------------------------------------------------
# The following optional section lets you benchmark the deterministic
# BFPRT selection against the naïve "sort then pick" approach.
#
# There are **two** ways to run it:
# 1. **One‑off script** – run only the benchmark:
# $ elixir median_of_medians.exs --bench
# 2. **Environment flag** – set BENCH=1 to combine with the tests:
# $ BENCH=1 elixir median_of_medians.exs
#
# The code will `Mix.install/1` Benchee automatically if it isn’t available.
############################################################################
{opts, _argv, _} = OptionParser.parse(System.argv(), switches: [bench: :boolean])
run_bench? = Keyword.get(opts, :bench, false) or System.get_env("BENCH") in ["1", "true"]
# Install Benchee dynamically when running as a loose script
if not Code.ensure_loaded?(Benchee) do
Mix.install([
{:benchee, "~> 1.2"}
])
end
alias MedianOfMedians, as: MOM
# Generate input lists of increasing size and target k (middle element)
sizes = [10_000_000]
inputs =
for n <- sizes, into: %{} do
list = Enum.to_list(1..n) |> Enum.shuffle()
k = div(n, 2)
{"#{n} elements (k=#{k})", {list, k}}
end
Benchee.run(
%{
"MOM.select" => fn {lst, idx} -> MOM.select(lst, idx) end,
"Enum.sort + Enum.at" => fn {lst, idx} -> Enum.sort(lst) |> Enum.at(idx) end
},
inputs: inputs,
# seconds for each scenario
time: 3,
# warm‑up time (seconds)
warmup: 1,
# memory measurements (seconds)
memory_time: 0.5
)
Error trying to determine erlang version enoent, falling back to overall OTP version
Warning: the benchmark Enum.sort + Enum.at is using an evaluated function.
Evaluated functions perform slower than compiled functions.
You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs
Warning: the benchmark MOM.select is using an evaluated function.
Evaluated functions perform slower than compiled functions.
You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs
Operating System: macOS
CPU Information: Apple M3 Pro
Number of Available Cores: 11
Available memory: 18 GB
Elixir 1.17.2
Erlang 27
JIT enabled: true
Benchmark suite executing with the following configuration:
warmup: 1 s
time: 3 s
memory time: 500 ms
reduction time: 0 ns
parallel: 1
inputs: 10000000 elements (k=5000000)
Estimated total run time: 9 s
Benchmarking Enum.sort + Enum.at with input 10000000 elements (k=5000000) ...
Benchmarking MOM.select with input 10000000 elements (k=5000000) ...
Calculating statistics...
Formatting results...
##### With input 10000000 elements (k=5000000) #####
Name ips average deviation median 99th %
Enum.sort + Enum.at 0.66 1.51 s ±3.97% 1.51 s 1.55 s
MOM.select 0.51 1.97 s ±1.43% 1.97 s 1.99 s
Comparison:
Enum.sort + Enum.at 0.66
MOM.select 0.51 - 1.31x slower +0.46 s
Memory usage statistics:
Name Memory usage
Enum.sort + Enum.at 2.43 GB
MOM.select 6.30 GB - 2.59x memory usage +3.87 GB
**All measurements for memory usage were the same**
%Benchee.Suite{
system: %Benchee.System{
elixir: "1.17.2",
erlang: "27",
jit_enabled?: true,
num_cores: 11,
os: :macOS,
available_memory: "18 GB",
cpu_speed: "Apple M3 Pro"
},
configuration: %Benchee.Configuration{
parallel: 1,
time: 3000000000.0,
warmup: 1000000000.0,
memory_time: 500000000.0,
reduction_time: 0.0,
pre_check: false,
formatters: [Benchee.Formatters.Console],
percentiles: ~c"2c",
print: %{configuration: true, fast_warning: true, benchmarking: true},
inputs: [
{"10000000 elements (k=5000000)",
{[5815768, 7057140, 2471541, 1606788, 7417519, 7701659, 5482706, 4042559, 7083056, 3626123,
1232352, 4565989, 3963697, 4263532, 6455684, 9110627, 8847371, 4117920, 9306534, 639294,
8094874, 4384479, 8343965, 2435139, 5263360, 2577560, 6393355, 8138325, 4016326, 4569113,
547683, 2871411, 7263711, 6074463, ...], 5000000}}
],
input_names: ["10000000 elements (k=5000000)"],
save: false,
load: false,
unit_scaling: :best,
assigns: %{},
before_each: nil,
after_each: nil,
before_scenario: nil,
after_scenario: nil,
measure_function_call_overhead: false,
title: nil,
profile_after: false
},
scenarios: [
%Benchee.Scenario{
name: "Enum.sort + Enum.at",
job_name: "Enum.sort + Enum.at",
function: #Function<42.39164016/1 in :erl_eval.expr/6>,
input_name: "10000000 elements (k=5000000)",
input: {[5815768, 7057140, 2471541, 1606788, 7417519, 7701659, 5482706, 4042559, 7083056,
3626123, 1232352, 4565989, 3963697, 4263532, 6455684, 9110627, 8847371, 4117920, 9306534,
639294, 8094874, 4384479, 8343965, 2435139, 5263360, 2577560, 6393355, 8138325, 4016326,
4569113, 547683, 2871411, 7263711, 6074463, 7146071, 3732249, 8346894, 2919740, 2954675,
5227775, ...], 5000000},
before_each: nil,
after_each: nil,
before_scenario: nil,
after_scenario: nil,
tag: nil,
run_time_data: %Benchee.CollectionData{
statistics: %Benchee.Statistics{
average: 1506668262.5,
ips: 0.6637161111635217,
std_dev: 59792849.71507831,
std_dev_ratio: 0.03968547768827666,
std_dev_ips: 0.02633989092092969,
median: 1506668262.5,
percentiles: %{50 => 1506668262.5, 99 => 1548948192.0},
mode: nil,
minimum: 1464388333,
maximum: 1548948192,
relative_more: nil,
relative_less: nil,
absolute_difference: nil,
sample_size: 2
},
samples: [1464388333, 1548948192]
},
memory_usage_data: %Benchee.CollectionData{
statistics: %Benchee.Statistics{
average: 2607031144.0,
ips: nil,
std_dev: 0.0,
std_dev_ratio: 0.0,
std_dev_ips: nil,
median: 2607031144.0,
percentiles: %{50 => 2607031144.0, 99 => 2607031144.0},
mode: nil,
minimum: 2607031144,
maximum: 2607031144,
relative_more: nil,
relative_less: nil,
absolute_difference: nil,
sample_size: 1
},
samples: [2607031144]
},
reductions_data: %Benchee.CollectionData{
statistics: %Benchee.Statistics{
average: nil,
ips: nil,
std_dev: nil,
std_dev_ratio: nil,
std_dev_ips: nil,
median: nil,
percentiles: nil,
mode: nil,
minimum: nil,
maximum: nil,
relative_more: nil,
relative_less: nil,
absolute_difference: nil,
sample_size: 0
},
samples: []
}
},
%Benchee.Scenario{
name: "MOM.select",
job_name: "MOM.select",
function: #Function<42.39164016/1 in :erl_eval.expr/6>,
input_name: "10000000 elements (k=5000000)",
input: {[5815768, 7057140, 2471541, 1606788, 7417519, 7701659, 5482706, 4042559, 7083056,
3626123, 1232352, 4565989, 3963697, 4263532, 6455684, 9110627, 8847371, 4117920, 9306534,
639294, 8094874, 4384479, 8343965, 2435139, 5263360, 2577560, 6393355, 8138325, 4016326,
4569113, 547683, 2871411, 7263711, 6074463, 7146071, 3732249, 8346894, 2919740, 2954675,
...], 5000000},
before_each: nil,
after_each: nil,
before_scenario: nil,
after_scenario: nil,
tag: nil,
run_time_data: %Benchee.CollectionData{
statistics: %Benchee.Statistics{
average: 1968655822.0,
ips: 0.5079608069754308,
std_dev: 28239703.721257277,
std_dev_ratio: 0.014344662690996922,
std_dev_ips: 0.007286526436309151,
median: 1968655822.0,
percentiles: %{50 => 1968655822.0, 99 => 1988624308.0},
mode: nil,
minimum: 1948687336,
maximum: 1988624308,
relative_more: 1.306628586397266,
relative_less: 0.7653284264637702,
absolute_difference: 461987559.5,
sample_size: 2
},
samples: [1948687336, 1988624308]
},
memory_usage_data: %Benchee.CollectionData{
statistics: %Benchee.Statistics{
average: 6759718032.0,
ips: nil,
std_dev: 0.0,
std_dev_ratio: 0.0,
std_dev_ips: nil,
median: 6759718032.0,
percentiles: %{50 => 6759718032.0, 99 => 6759718032.0},
mode: nil,
minimum: 6759718032,
maximum: 6759718032,
relative_more: 2.5928796622001538,
relative_less: 0.3856715815154581,
absolute_difference: 4152686888.0,
sample_size: 1
},
samples: [6759718032]
},
reductions_data: %Benchee.CollectionData{
statistics: %Benchee.Statistics{
average: nil,
ips: nil,
std_dev: nil,
std_dev_ratio: nil,
std_dev_ips: nil,
median: nil,
percentiles: nil,
mode: nil,
minimum: nil,
maximum: nil,
relative_more: nil,
relative_less: nil,
absolute_difference: nil,
sample_size: 0
},
samples: []
}
}
]
}
I wonder why this particular median of medians implementation was slower than sorting the array and picking kth element manually?