Powered by AppSignal & Oban Pro

Codec Comparison

livebooks/03_codec_comparison.livemd

Codec Comparison

Mix.install([
  {:ex_codecs, path: Path.join(__DIR__, "..")},
  {:kino, "~> 0.14"},
  {:vega_lite, "~> 0.1"}
])

Setup: Test Datasets

# Codecs compared throughout this notebook.
codecs = [:lz4, :snappy, :zstd, :bzip2, :blosc2]

# 65536 cryptographically random bytes.
random_data = :crypto.strong_rand_bytes(65536)

# One 45-byte sentence repeated 2000 times.
repetitive_text = String.duplicate("The quick brown fox jumps over the lazy dog. ", 2000)

# Seed the PRNG so the "score" fields below (and therefore the compressed
# sizes derived from this dataset) are identical on every evaluation.
:rand.seed(:exsss, {1, 2, 3})

# 5000 query-string style records joined with "&".
semi_structured =
  Enum.map_join(1..5000, "&", fn i ->
    "id=#{i}&name=user#{rem(i, 100)}&score=#{:rand.uniform(1000)}&active=#{rem(i, 2) == 0}"
  end)

# 8192 little-endian float64 values: 0.125, 0.25, ..., 1024.0.
float_array = for i <- 1..8192, into: <<>>, do: <<i * 0.125::float-size(64)-little>>

# Display name => binary payload for every benchmark input.
datasets = %{
  "Random bytes" => random_data,
  "Repetitive text" => repetitive_text,
  "Semi-structured" => semi_structured,
  "Float64 array" => float_array
}

# Display name => payload size in bytes.
dataset_sizes = Map.new(datasets, fn {name, data} -> {name, byte_size(data)} end)

IO.puts("Dataset sizes:")

# Enum.each/2 is used for the side effect only, so the cell result is `:ok`
# rather than a list of `:ok` values.
Enum.each(dataset_sizes, fn {name, size} ->
  IO.puts("  #{String.pad_trailing(name, 20)} #{size} bytes")
end)

Compression Ratio Benchmarks

# Encode every dataset with every codec and record the size reduction.
# Each row holds the dataset name, the inspected codec atom, both byte sizes,
# the compressed size as a percentage of the original, and the percentage saved.
compression_results =
  for {dataset_name, payload} <- datasets, codec <- codecs do
    # Only :blosc2 is given explicit options; every other codec gets none.
    encode_opts =
      case codec do
        :blosc2 -> [cname: :zstd, clevel: 5, shuffle: :byte]
        _other -> []
      end

    {:ok, compressed} = ExCodecs.encode(codec, payload, encode_opts)
    original_size = byte_size(payload)
    compressed_size = byte_size(compressed)

    %{
      dataset: dataset_name,
      codec: inspect(codec),
      original: original_size,
      compressed: compressed_size,
      ratio_pct: Float.round(100 * compressed_size / original_size, 1),
      savings_pct: Float.round(100 * (1 - compressed_size / original_size), 1)
    }
  end

Kino.DataTable.new(compression_results)

Compression Ratio Chart

# Bar chart of compressed size (% of original) per codec, one column facet per dataset.
# The rows in `compression_results` store the codec as `inspect(codec)` (e.g. ":lz4"),
# so the explicit x-axis order must use the same colon-prefixed labels; the bare
# names used previously matched none of the values.
VegaLite.new(width: 700, height: 350)
|> VegaLite.data_from_values(compression_results)
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "codec",
  type: :nominal,
  sort: [":lz4", ":snappy", ":zstd", ":bzip2", ":blosc2"]
)
|> VegaLite.encode_field(:y, "ratio_pct",
  type: :quantitative,
  title: "Compressed Size (%)",
  scale: [domain: [0, 110]]
)
|> VegaLite.encode_field(:color, "codec", type: :nominal)
|> VegaLite.encode_field(:column, "dataset", type: :nominal)
|> Kino.VegaLite.new()

Space Savings Chart

# Bar chart of space saved (%) per codec, one column facet per dataset.
# The rows in `compression_results` store the codec as `inspect(codec)` (e.g. ":lz4"),
# so the explicit x-axis order must use the same colon-prefixed labels; the bare
# names used previously matched none of the values.
VegaLite.new(width: 700, height: 350)
|> VegaLite.data_from_values(compression_results)
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "codec",
  type: :nominal,
  sort: [":lz4", ":snappy", ":zstd", ":bzip2", ":blosc2"]
)
|> VegaLite.encode_field(:y, "savings_pct", type: :quantitative, title: "Space Saved (%)")
|> VegaLite.encode_field(:color, "codec", type: :nominal)
|> VegaLite.encode_field(:column, "dataset", type: :nominal)
|> Kino.VegaLite.new()

Speed Benchmarks

# Number of timed encode and decode calls per {dataset, codec} pair.
iterations = 20

# Time `iterations` encodes and decodes for every dataset/codec combination.
# Each row reports the mean time per call in microseconds and the throughput
# in bytes per microsecond, which is numerically equal to 10^6 bytes per second.
speed_results =
  for {dname, data} <- datasets, codec <- codecs do
    opts = if codec == :blosc2, do: [cname: :zstd, clevel: 5, shuffle: :byte], else: []

    # Untimed first call: supplies the payload for the decode loop.
    {:ok, enc} = ExCodecs.encode(codec, data, opts)

    {enc_time, _} =
      :timer.tc(fn ->
        for _ <- 1..iterations, do: ExCodecs.encode(codec, data, opts)
      end)

    {dec_time, _} =
      :timer.tc(fn ->
        for _ <- 1..iterations, do: ExCodecs.decode(codec, enc)
      end)

    total_bytes = byte_size(data) * iterations

    # :timer.tc/1 reports whole microseconds; clamp to 1 so a measurement that
    # rounds down to 0 cannot raise ArithmeticError in the division below.
    enc_throughput = Float.round(total_bytes / max(enc_time, 1), 1)
    dec_throughput = Float.round(total_bytes / max(dec_time, 1), 1)

    %{
      dataset: dname,
      codec: inspect(codec),
      encode_time_us: div(enc_time, iterations),
      decode_time_us: div(dec_time, iterations),
      encode_mbps: enc_throughput,
      decode_mbps: dec_throughput
    }
  end

Kino.DataTable.new(speed_results)

Encode Speed Chart

# Bar chart of mean encode time (µs) per codec, one column facet per dataset.
# The rows in `speed_results` store the codec as `inspect(codec)` (e.g. ":lz4"),
# so the explicit x-axis order must use the same colon-prefixed labels; the bare
# names used previously matched none of the values.
VegaLite.new(width: 700, height: 350)
|> VegaLite.data_from_values(speed_results)
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "codec",
  type: :nominal,
  sort: [":lz4", ":snappy", ":zstd", ":bzip2", ":blosc2"]
)
|> VegaLite.encode_field(:y, "encode_time_us", type: :quantitative, title: "Encode Time (µs)")
|> VegaLite.encode_field(:color, "codec", type: :nominal)
|> VegaLite.encode_field(:column, "dataset", type: :nominal)
|> Kino.VegaLite.new()

Decode Speed Chart

# Bar chart of mean decode time (µs) per codec, one column facet per dataset.
# The rows in `speed_results` store the codec as `inspect(codec)` (e.g. ":lz4"),
# so the explicit x-axis order must use the same colon-prefixed labels; the bare
# names used previously matched none of the values.
VegaLite.new(width: 700, height: 350)
|> VegaLite.data_from_values(speed_results)
|> VegaLite.mark(:bar)
|> VegaLite.encode_field(:x, "codec",
  type: :nominal,
  sort: [":lz4", ":snappy", ":zstd", ":bzip2", ":blosc2"]
)
|> VegaLite.encode_field(:y, "decode_time_us", type: :quantitative, title: "Decode Time (µs)")
|> VegaLite.encode_field(:color, "codec", type: :nominal)
|> VegaLite.encode_field(:column, "dataset", type: :nominal)
|> Kino.VegaLite.new()

Memory Usage

# Rough memory probe for one encode/decode round trip of `float_array` per codec.
#
# `heap_growth_words` is the change in this process's `:heap_size`, in words.
# Binaries larger than 64 bytes are reference-counted and stored outside the
# process heap, so the payloads themselves hardly show up in that figure.
# `binary_growth_bytes` adds the change in `:erlang.memory(:binary)` to capture
# them. That counter is VM-wide, so concurrent activity in other processes adds
# noise, and both figures can be negative if a garbage collection runs in between.
# NOTE(review): any native allocations the codecs make internally are presumably
# invisible to both figures; confirm against the ExCodecs implementation.
memory_results =
  for codec <- codecs do
    opts = if codec == :blosc2, do: [cname: :zstd, clevel: 5, shuffle: :byte], else: []
    {:ok, info} = ExCodecs.codec_info(codec)

    # The binary counter is sampled outside the heap samples so the heap
    # measurement window is the same as before this column was added.
    binary_before = :erlang.memory(:binary)
    {:heap_size, heap_before} = Process.info(self(), :heap_size)

    {:ok, enc} = ExCodecs.encode(codec, float_array, opts)
    {:ok, _dec} = ExCodecs.decode(codec, enc)

    {:heap_size, heap_after} = Process.info(self(), :heap_size)
    binary_after = :erlang.memory(:binary)

    %{
      codec: inspect(codec),
      category: info.category,
      configurable: info.configurable?,
      streaming: info.streaming?,
      heap_growth_words: heap_after - heap_before,
      binary_growth_bytes: binary_after - binary_before
    }
  end

Kino.DataTable.new(memory_results)

Codec Profiles

# Profile name => the codecs recommended for it (`:best`) and a one-line rationale (`:why`).
profile_data = %{
  "Speed King" => %{
    why: "Fastest encode/decode, ideal for hot paths, caching, and real-time systems",
    best: [:lz4, :snappy]
  },
  "Balanced" => %{
    why: "Good ratio with fast decompression. Default choice for most workloads",
    best: [:zstd]
  },
  "Maximum Ratio" => %{
    why: "Smallest output. Accept slower speed for archival and cold storage",
    best: [:bzip2]
  },
  "Numeric Arrays" => %{
    why: "Shuffle+compress slashes size of typed data. Threaded for large arrays",
    best: [:blosc2]
  }
}

# Print one three-line section per profile, followed by a blank line.
Enum.map(profile_data, fn {title, details} ->
  IO.puts("## #{title}\n  Codecs: #{inspect(details.best)}\n  #{details.why}\n")
end)

Interactive Codec Selector

# Label => value pairs offered by the "use case" drop-down.
use_case_options = [
  "Real-time / low latency" => :speed,
  "General purpose" => :balanced,
  "Maximum compression / archival" => :ratio,
  "Numerical / scientific data" => :numeric,
  "Small payloads / low overhead" => :tiny
]

# Label => value pairs offered by the "data type" drop-down.
data_type_options = [
  "Text / JSON" => :text,
  "Binary blobs" => :binary,
  "Typed arrays (floats, ints)" => :array,
  "Mixed" => :mixed
]

use_case = Kino.Input.select("Your use case:", use_case_options)
data_type = Kino.Input.select("Data type:", data_type_options)

# Render both selectors side by side.
Kino.Layout.grid([use_case, data_type], columns: 2)
# Read the current selections from the two inputs.
use_case_val = Kino.Input.read(use_case)
data_type_val = Kino.Input.read(data_type)

# Map the {use case, data type} pair to a {codec, explanation} tuple.
# For :ratio and :balanced the :array clause must stay ahead of the catch-all.
recommendation =
  case {use_case_val, data_type_val} do
    {:speed, _} ->
      {:lz4, "Fastest compression/decompression. Minimal latency overhead."}

    {:tiny, _} ->
      {:snappy, "Low overhead even on very small payloads. No configuration needed."}

    {:numeric, _} ->
      {:blosc2, "Purpose-built for numerical data with shuffle filters and threading."}

    {:ratio, :array} ->
      {:blosc2, "Shuffle+compress gives best ratios on typed arrays."}

    {:ratio, _} ->
      {:bzip2, "Highest compression ratio for general data. Slow but compact."}

    {:balanced, :array} ->
      {:blosc2, "Good ratio on typed data with decent speed."}

    {:balanced, _} ->
      {:zstd, "Best all-around codec. Configurable from fast (level 1) to compact (level 22)."}
  end

{codec, reason} = recommendation
{:ok, info} = ExCodecs.codec_info(codec)

IO.puts("Recommended codec: " <> inspect(codec))
IO.puts("Reason: " <> reason)
IO.puts("Configurable: #{info.configurable?}")
IO.puts("Streaming: #{info.streaming?}")

# Starting options suggested for the recommended codec.
default_opts =
  case codec do
    :snappy -> []
    :lz4 -> [level: 1]
    :zstd -> [level: 3]
    :bzip2 -> [block_size: 9]
    :blosc2 -> [cname: :zstd, clevel: 5, shuffle: :byte]
  end

IO.puts("Suggested options: " <> inspect(default_opts))

Decision Flowchart

# Plain-text decision path for choosing a codec. The text contains no
# interpolation or escape sequences, so a non-interpolating heredoc sigil is used.
flowchart = ~S"""
When choosing a codec, follow this decision path:

1. Is your data typed numerical arrays?
   YES → Use Blosc2 (with appropriate shuffle and typesize)
   NO  → Continue

2. Is latency critical (hot path, real-time)?
   YES → Use LZ4 (fastest) or Snappy (low overhead)
   NO  → Continue

3. Is storage cost the primary concern?
   YES → Use Bzip2 (best ratio) or Zstd with high level
   NO  → Continue

4. Default choice:
   → Use Zstd (level 3)
   → Good ratio, fast decompression, configurable
"""

IO.puts(flowchart)

Codec Feature Matrix

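| Feature | LZ4 | Snappy | Zstd | Bzip2 | Blosc2 |
| --- | --- | --- | --- | --- | --- |
| Speed | Very Fast | Very Fast | Fast | Slow | Medium |
| Ratio | Low | Low | High | Very High | High (arrays) |
| Configurable | Level 1–16 | No | Level 1–22 | Block 1–9 | Many options |
| Streaming | No | No | Yes | No | Yes |
| Best For | Hot paths | Short data | General | Archival | Arrays |
| Shuffle | — | — | — | — | Byte/Bit |
| Multi-thread | No | No | No | No | Yes |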

Next Steps