Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Median of medians

median_of_medians.livemd

Median of medians

Section

########################
# Median-of-Medians (BFPRT) selection algorithm
#
# This implementation exposes a public `select/2` function that returns the
# *k-th* smallest element (0-based index) of a list in deterministic linear
# time.  Internally it uses the "median of medians" strategy to pick a good
# pivot and guarantee worst-case O(n) complexity.
#
########################


defmodule MedianOfMedians do
  @moduledoc """
  Deterministic *O(n)* selection (**BFPRT**) with low overhead.

  Public API:

    * `select(list, k)` – element that would sit at 0‑based index *k* if the
      list were sorted.
  """

  @doc """
  Return the k‑th smallest element (0‑based) of `list`.
  Raises `ArgumentError` when `k` is out of range.
  """
  @spec select(list(number()), non_neg_integer()) :: number()
  def select(list, k) when is_list(list) and is_integer(k) and k >= 0 do
    n = length(list)
    if k >= n, do: raise(ArgumentError, "k out of range (#{k}#{n})")
    do_select(list, k)
  end

  # ------------------------------------------------------------------
  # Internal helpers
  # ------------------------------------------------------------------

  # For small lists sort is cheapest –  empirically 40 is a sweet‑spot on BEAM.
  @small_cutoff 100
  defp do_select(list, k) when length(list) <=  @small_cutoff do
    Enum.sort(list) |> Enum.at(k)
  end

  defp do_select(list, k) do
    pivot = choose_pivot(list)

    {smaller, larger, n_small, n_equal} = partition(list, pivot)

    cond do
      k < n_small -> do_select(smaller, k)
      k < n_small + n_equal -> pivot
      true -> do_select(larger, k - n_small - n_equal)
    end
  end

  # Choose pivot via median‑of‑medians (groups of 5)
  defp choose_pivot(list) do
    list
    |> Enum.chunk_every(5)
    |> Enum.map(&amp;median_of_group/1)
    |> case do
      [single] -> single
      medians -> do_select(medians, div(length(medians), 2))
    end
  end

  # Median of ≤5 elements using sort
  defp median_of_group(group) do
    Enum.sort(group) |> Enum.at(div(length(group) - 1, 2))
  end

  # ------------------------------------------------------------------
  # Tight tail‑recursive partition counting sizes *while* building lists
  # ------------------------------------------------------------------

  @spec partition(list(), number()) :: {list(), list(), non_neg_integer(), non_neg_integer()}
  defp partition(list, pivot), do: partition(list, pivot, [], [], 0, 0)

  defp partition([], _pivot, small, large, n_small, n_equal),
    do: {small, large, n_small, n_equal}

  defp partition([h | t], pivot, small, large, n_small, n_equal) do
    cond do
      h < pivot -> partition(t, pivot, [h | small], large, n_small + 1, n_equal)
      h > pivot -> partition(t, pivot, small, [h | large], n_small, n_equal)
      true -> partition(t, pivot, small, large, n_small, n_equal + 1)
    end
  end
end

############################################################################
# Test‑suite
############################################################################

ExUnit.start()

defmodule MedianOfMediansTest do
  use ExUnit.Case, async: true

  alias MedianOfMedians, as: MOM

  @tag :basic
  test "select on small static lists" do
    assert MOM.select([2], 0) == 2
    assert MOM.select([3, 1], 0) == 1
    assert MOM.select([3, 1], 1) == 3
    assert MOM.select([3, 1, 4, 1, 5], 2) == 3
  end

  @tag :random
  test "select matches Enum.sort/2 for random samples" do
    # Seed once for reproducibility; :exsplus is available since OTP 24
    :rand.seed(:exsplus, {101, 102, 103})

    for n <- 1..200 do
      list = Enum.to_list(1..n) |> Enum.shuffle()
      k    = :rand.uniform(n) - 1
      assert MOM.select(list, k) == Enum.sort(list) |> Enum.at(k)
    end
  end

  @tag :edge
  test "errors on k out of range" do
    assert_raise ArgumentError, fn -> MOM.select([], 0) end
    assert_raise ArgumentError, fn -> MOM.select([1, 2, 3], 3) end
  end
end


ExUnit.run()
Running ExUnit with seed: 557566, max_cases: 22

...
Finished in 0.00 seconds (0.00s async, 0.00s sync)
3 tests, 0 failures
%{total: 3, failures: 0, excluded: 0, skipped: 0}
############################################################################
# Benchmark (Benchee)
# -------------------------------------------------------------------------
# The following optional section lets you benchmark the deterministic
# BFPRT selection against the naïve "sort then pick" approach.
#
# There are **two** ways to run it:
# 1. **One‑off script** – run only the benchmark:
#       $ elixir median_of_medians.exs --bench
# 2. **Environment flag** – set BENCH=1 to combine with the tests:
#       $ BENCH=1 elixir median_of_medians.exs
#
# The code will `Mix.install/1` Benchee automatically if it isn’t available.
############################################################################

{opts, _argv, _} = OptionParser.parse(System.argv(), switches: [bench: :boolean])
run_bench? = Keyword.get(opts, :bench, false) or System.get_env("BENCH") in ["1", "true"]

# Install Benchee dynamically when running as a loose script
if not Code.ensure_loaded?(Benchee) do
  Mix.install([
    {:benchee, "~> 1.2"}
  ])
end

alias MedianOfMedians, as: MOM

# Generate input lists of increasing size and target k (middle element)
sizes = [10_000_000]

inputs =
  for n <- sizes, into: %{} do
    list = Enum.to_list(1..n) |> Enum.shuffle()
    k = div(n, 2)
    {"#{n} elements (k=#{k})", {list, k}}
  end

Benchee.run(
  %{
    "MOM.select" => fn {lst, idx} -> MOM.select(lst, idx) end,
    "Enum.sort + Enum.at" => fn {lst, idx} -> Enum.sort(lst) |> Enum.at(idx) end
  },
  inputs: inputs,
  # seconds for each scenario
  time: 3,
  # warm‑up time (seconds)
  warmup: 1,
  # memory measurements (seconds)
  memory_time: 0.5
)
Error trying to determine erlang version enoent, falling back to overall OTP version
Warning: the benchmark Enum.sort + Enum.at is using an evaluated function.
  Evaluated functions perform slower than compiled functions.
  You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
  Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs

Warning: the benchmark MOM.select is using an evaluated function.
  Evaluated functions perform slower than compiled functions.
  You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
  Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs

Operating System: macOS
CPU Information: Apple M3 Pro
Number of Available Cores: 11
Available memory: 18 GB
Elixir 1.17.2
Erlang 27
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 1 s
time: 3 s
memory time: 500 ms
reduction time: 0 ns
parallel: 1
inputs: 10000000 elements (k=5000000)
Estimated total run time: 9 s

Benchmarking Enum.sort + Enum.at with input 10000000 elements (k=5000000) ...
Benchmarking MOM.select with input 10000000 elements (k=5000000) ...
Calculating statistics...
Formatting results...

##### With input 10000000 elements (k=5000000) #####
Name                          ips        average  deviation         median         99th %
Enum.sort + Enum.at          0.66         1.51 s     ±3.97%         1.51 s         1.55 s
MOM.select                   0.51         1.97 s     ±1.43%         1.97 s         1.99 s

Comparison: 
Enum.sort + Enum.at          0.66
MOM.select                   0.51 - 1.31x slower +0.46 s

Memory usage statistics:

Name                   Memory usage
Enum.sort + Enum.at         2.43 GB
MOM.select                  6.30 GB - 2.59x memory usage +3.87 GB

**All measurements for memory usage were the same**
%Benchee.Suite{
  system: %Benchee.System{
    elixir: "1.17.2",
    erlang: "27",
    jit_enabled?: true,
    num_cores: 11,
    os: :macOS,
    available_memory: "18 GB",
    cpu_speed: "Apple M3 Pro"
  },
  configuration: %Benchee.Configuration{
    parallel: 1,
    time: 3000000000.0,
    warmup: 1000000000.0,
    memory_time: 500000000.0,
    reduction_time: 0.0,
    pre_check: false,
    formatters: [Benchee.Formatters.Console],
    percentiles: ~c"2c",
    print: %{configuration: true, fast_warning: true, benchmarking: true},
    inputs: [
      {"10000000 elements (k=5000000)",
       {[5815768, 7057140, 2471541, 1606788, 7417519, 7701659, 5482706, 4042559, 7083056, 3626123,
         1232352, 4565989, 3963697, 4263532, 6455684, 9110627, 8847371, 4117920, 9306534, 639294,
         8094874, 4384479, 8343965, 2435139, 5263360, 2577560, 6393355, 8138325, 4016326, 4569113,
         547683, 2871411, 7263711, 6074463, ...], 5000000}}
    ],
    input_names: ["10000000 elements (k=5000000)"],
    save: false,
    load: false,
    unit_scaling: :best,
    assigns: %{},
    before_each: nil,
    after_each: nil,
    before_scenario: nil,
    after_scenario: nil,
    measure_function_call_overhead: false,
    title: nil,
    profile_after: false
  },
  scenarios: [
    %Benchee.Scenario{
      name: "Enum.sort + Enum.at",
      job_name: "Enum.sort + Enum.at",
      function: #Function<42.39164016/1 in :erl_eval.expr/6>,
      input_name: "10000000 elements (k=5000000)",
      input: {[5815768, 7057140, 2471541, 1606788, 7417519, 7701659, 5482706, 4042559, 7083056,
        3626123, 1232352, 4565989, 3963697, 4263532, 6455684, 9110627, 8847371, 4117920, 9306534,
        639294, 8094874, 4384479, 8343965, 2435139, 5263360, 2577560, 6393355, 8138325, 4016326,
        4569113, 547683, 2871411, 7263711, 6074463, 7146071, 3732249, 8346894, 2919740, 2954675,
        5227775, ...], 5000000},
      before_each: nil,
      after_each: nil,
      before_scenario: nil,
      after_scenario: nil,
      tag: nil,
      run_time_data: %Benchee.CollectionData{
        statistics: %Benchee.Statistics{
          average: 1506668262.5,
          ips: 0.6637161111635217,
          std_dev: 59792849.71507831,
          std_dev_ratio: 0.03968547768827666,
          std_dev_ips: 0.02633989092092969,
          median: 1506668262.5,
          percentiles: %{50 => 1506668262.5, 99 => 1548948192.0},
          mode: nil,
          minimum: 1464388333,
          maximum: 1548948192,
          relative_more: nil,
          relative_less: nil,
          absolute_difference: nil,
          sample_size: 2
        },
        samples: [1464388333, 1548948192]
      },
      memory_usage_data: %Benchee.CollectionData{
        statistics: %Benchee.Statistics{
          average: 2607031144.0,
          ips: nil,
          std_dev: 0.0,
          std_dev_ratio: 0.0,
          std_dev_ips: nil,
          median: 2607031144.0,
          percentiles: %{50 => 2607031144.0, 99 => 2607031144.0},
          mode: nil,
          minimum: 2607031144,
          maximum: 2607031144,
          relative_more: nil,
          relative_less: nil,
          absolute_difference: nil,
          sample_size: 1
        },
        samples: [2607031144]
      },
      reductions_data: %Benchee.CollectionData{
        statistics: %Benchee.Statistics{
          average: nil,
          ips: nil,
          std_dev: nil,
          std_dev_ratio: nil,
          std_dev_ips: nil,
          median: nil,
          percentiles: nil,
          mode: nil,
          minimum: nil,
          maximum: nil,
          relative_more: nil,
          relative_less: nil,
          absolute_difference: nil,
          sample_size: 0
        },
        samples: []
      }
    },
    %Benchee.Scenario{
      name: "MOM.select",
      job_name: "MOM.select",
      function: #Function<42.39164016/1 in :erl_eval.expr/6>,
      input_name: "10000000 elements (k=5000000)",
      input: {[5815768, 7057140, 2471541, 1606788, 7417519, 7701659, 5482706, 4042559, 7083056,
        3626123, 1232352, 4565989, 3963697, 4263532, 6455684, 9110627, 8847371, 4117920, 9306534,
        639294, 8094874, 4384479, 8343965, 2435139, 5263360, 2577560, 6393355, 8138325, 4016326,
        4569113, 547683, 2871411, 7263711, 6074463, 7146071, 3732249, 8346894, 2919740, 2954675,
        ...], 5000000},
      before_each: nil,
      after_each: nil,
      before_scenario: nil,
      after_scenario: nil,
      tag: nil,
      run_time_data: %Benchee.CollectionData{
        statistics: %Benchee.Statistics{
          average: 1968655822.0,
          ips: 0.5079608069754308,
          std_dev: 28239703.721257277,
          std_dev_ratio: 0.014344662690996922,
          std_dev_ips: 0.007286526436309151,
          median: 1968655822.0,
          percentiles: %{50 => 1968655822.0, 99 => 1988624308.0},
          mode: nil,
          minimum: 1948687336,
          maximum: 1988624308,
          relative_more: 1.306628586397266,
          relative_less: 0.7653284264637702,
          absolute_difference: 461987559.5,
          sample_size: 2
        },
        samples: [1948687336, 1988624308]
      },
      memory_usage_data: %Benchee.CollectionData{
        statistics: %Benchee.Statistics{
          average: 6759718032.0,
          ips: nil,
          std_dev: 0.0,
          std_dev_ratio: 0.0,
          std_dev_ips: nil,
          median: 6759718032.0,
          percentiles: %{50 => 6759718032.0, 99 => 6759718032.0},
          mode: nil,
          minimum: 6759718032,
          maximum: 6759718032,
          relative_more: 2.5928796622001538,
          relative_less: 0.3856715815154581,
          absolute_difference: 4152686888.0,
          sample_size: 1
        },
        samples: [6759718032]
      },
      reductions_data: %Benchee.CollectionData{
        statistics: %Benchee.Statistics{
          average: nil,
          ips: nil,
          std_dev: nil,
          std_dev_ratio: nil,
          std_dev_ips: nil,
          median: nil,
          percentiles: nil,
          mode: nil,
          minimum: nil,
          maximum: nil,
          relative_more: nil,
          relative_less: nil,
          absolute_difference: nil,
          sample_size: 0
        },
        samples: []
      }
    }
  ]
}

I wonder why this particular median of medians implementation was slower than sorting the array and picking kth element manually?