Zarr-Style Workloads
Mix.install([
{:ex_codecs, path: ".."}
])
Scientific Dataset Compression
ExCodecs is designed to serve as the codec foundation for scientific computing libraries like ExZarr and ExArrow. This notebook demonstrates compression patterns optimized for numerical and scientific data.
alias ExCodecs.Compression
Array Data Layout
Scientific datasets are typically stored as multi-dimensional arrays with strong regularity:
- Numeric types (integers, floats) with fixed element sizes
- Regular dimension strides
- Column-oriented or row-oriented access patterns
Blosc2 was specifically designed for this workload:
# Generate a synthetic dataset: a single 64-bit float repeated n_elements times.
n_elements = 100_000
data = :binary.copy(<<3.14159265::float-64>>, n_elements)

# Deliberately not named `byte_size`, so the binding cannot be confused with
# Kernel.byte_size/1 in the ratio computation below.
original_size = byte_size(data)

# Compare codecs on regular numeric data
codecs = [:zstd, :lz4, :snappy, :bzip2, :blosc2]

# One entry per codec, shaped as
# {codec, [byte_size: compressed_bytes, ratio: float, time_us: microseconds]}.
results =
  for codec <- codecs do
    # :timer.tc/1 returns {elapsed_microseconds, result}. The assertive match
    # makes a failing codec raise instead of producing a bogus table row.
    {time_us, {:ok, compressed}} = :timer.tc(fn -> ExCodecs.encode(codec, data) end)
    compressed_size = byte_size(compressed)
    ratio = Float.round(original_size / compressed_size, 2)
    {codec, byte_size: compressed_size, ratio: ratio, time_us: time_us}
  end

# Header, separator and data rows all go through the same formatter, so the
# columns stay aligned whatever the values are.
column_widths = [8, 10, 10, 9]

format_row = fn cells ->
  cells
  |> Enum.zip(column_widths)
  |> Enum.map_join(" | ", fn {cell, width} -> String.pad_trailing(to_string(cell), width) end)
end

IO.puts(format_row.(["Codec", "Compressed", "Ratio", "Time (us)"]))
IO.puts(format_row.(Enum.map(column_widths, &String.duplicate("-", &1))))

# Enum.each/2 (not `for`) because only the printing side effect matters here.
Enum.each(results, fn {codec, stats} ->
  IO.puts(
    format_row.([
      codec,
      Keyword.fetch!(stats, :byte_size),
      Keyword.fetch!(stats, :ratio),
      Keyword.fetch!(stats, :time_us)
    ])
  )
end)
Blosc2 Shuffle for Numerical Data
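The shuffle filter is the key to Blosc2’s effectiveness on array data. Before compressing, it rearranges the bytes (or bits) of fixed-size elements so that those of equal significance are stored next to each other, which gives the compressor longer runs of similar values: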
# Two back-to-back runs of identical 64-bit floats: 50_000 x 1.0, then 50_000 x 2.0.
data = for value <- [1.0, 2.0], into: <<>>, do: :binary.copy(<<value::float-64>>, 50_000)

# Encodes `data` with Blosc2 under the given shuffle mode and unwraps the
# {:ok, binary} result; anything else raises a MatchError.
blosc2_with = fn mode ->
  {:ok, encoded} = ExCodecs.encode(:blosc2, data, shuffle: mode)
  encoded
end

# Baseline with the shuffle filter turned off
c_none = blosc2_with.(:none)
# Byte shuffle: bytes are reordered before compression
c_byte = blosc2_with.(:byte)
# Bit shuffle: bits are reordered instead, which can help further on some data
c_bit = blosc2_with.(:bit)

# Report the encoded size for each mode, in the order they were produced.
[{"No shuffle", c_none}, {"Byte shuffle", c_byte}, {"Bit shuffle", c_bit}]
|> Enum.each(fn {label, encoded} ->
  IO.puts("#{label}: #{byte_size(encoded)} bytes")
end)
Chunk-Based Compression
Large datasets are typically split into chunks, each compressed independently:
chunk_size = 1024 * 8 # 8 KiB chunks

# NOTE: cryptographically random bytes are essentially incompressible, so the
# ratio printed below stays close to 1.0. This cell demonstrates the chunking
# mechanics, not a realistic compression ratio.
large_data = :crypto.strong_rand_bytes(1024 * 1024) # 1 MiB

# Split the binary into chunk_size pieces. Unlike a
# `for <<chunk::binary-size(n) <- data>>` comprehension, this keeps a trailing
# partial chunk when the input size is not a multiple of chunk_size, so no
# bytes are silently dropped.
chunks =
  large_data
  |> Stream.unfold(fn
    <<>> -> nil
    <<chunk::binary-size(chunk_size), rest::binary>> -> {chunk, rest}
    tail -> {tail, <<>>}
  end)
  |> Enum.to_list()

# Every chunk is encoded on its own, so it can later be decoded on its own.
compressed_chunks =
  Enum.map(chunks, fn chunk ->
    {:ok, compressed} = ExCodecs.encode(:zstd, chunk, level: 3)
    compressed
  end)

total_original = byte_size(large_data)
total_compressed = compressed_chunks |> Enum.map(&byte_size/1) |> Enum.sum()

IO.puts("Original: #{total_original} bytes")
IO.puts("Compressed: #{total_compressed} bytes")
IO.puts("Ratio: #{Float.round(total_original / total_compressed, 2)}x")
Decompressing Individual Chunks
Because every chunk is compressed independently, only the chunks you actually read have to be decompressed:
# Restore a single chunk (index 42); every other chunk stays compressed.
chunk_index = 42
target = Enum.at(compressed_chunks, chunk_index)
{:ok, decompressed} = ExCodecs.decode(:zstd, target)

# Compare against the uncompressed chunk at the same position.
restored_size = byte_size(decompressed)
round_trip_ok? = decompressed == Enum.at(chunks, chunk_index)

IO.puts("Decompressed chunk size: #{restored_size} bytes")
IO.puts("Matches original: #{round_trip_ok?}")
Choosing the Right Codec for Your Data
# Sample inputs keyed by a display label. Each value is a raw binary.
data_patterns = %{
  "Repetitive" => String.duplicate("AAAA", 100_000),
  "Numeric array" => :binary.copy(<<3.14159::float-64>>, 100_000),
  # 500 random bytes followed by 500 "X" bytes, repeated 100 times. The blob is
  # not valid UTF-8, so it is copied as raw bytes rather than via String.
  "Mixed" => :binary.copy(:crypto.strong_rand_bytes(500) <> String.duplicate("X", 500), 100),
  "Random" => :crypto.strong_rand_bytes(100_000)
}

codecs = [:zstd, :lz4, :snappy, :bzip2, :blosc2]

# Header cells and ratio cells share one width so the columns line up. It is
# wide enough for ratios in the tens of thousands (e.g. "13333.33").
label_width = 15
cell_width = 10

header_cells = Enum.map_join(codecs, " | ", &String.pad_trailing(to_string(&1), cell_width))
IO.puts(String.pad_trailing("Pattern", label_width) <> " | " <> header_cells)

# One table row per pattern: the compression ratio achieved by each codec.
# Enum.each/2 (not `for`) because only the printing side effect matters here.
Enum.each(data_patterns, fn {name, data} ->
  row_cells =
    Enum.map_join(codecs, " | ", fn codec ->
      {:ok, compressed} = ExCodecs.encode(codec, data)
      ratio = Float.round(byte_size(data) / byte_size(compressed), 2)
      String.pad_trailing(to_string(ratio), cell_width)
    end)

  IO.puts(String.pad_trailing(name, label_width) <> " | " <> row_cells)
end)
Key Takeaways
- Blosc2 with byte shuffle excels on regular numeric arrays
- Zstd is the best general-purpose codec
- LZ4 and Snappy are fastest for real-time compression
- Bzip2 gives the best ratios for archival
- Chunk-based compression enables random access to large datasets