Data IO
# Fetch and compile the Dux dependency for this standalone session.
Mix.install(dux: "~> 0.2.0")
Setup
# Required so Dux macros can be expanded in this session.
require Dux
# Scratch directory used for every file this guide writes; cleaned up at the end.
tmp_dir = System.tmp_dir!()
Reading CSV
# Load the bundled penguins dataset and peek at the first three rows.
penguins = Dux.Datasets.penguins()

penguins
|> Dux.head(3)
|> Dux.compute()
> #### CSV options {: .info}
>
> - `:delimiter` — field delimiter (default `","`)
> - `:header` — whether the file has a header row (default `true`)
> - `:nullstr` — string to treat as NULL (e.g., `"NA"`)
> - `:skip` — number of rows to skip at the start
> - `:null_padding` — pad missing columns with NULL
Reading Parquet
Parquet is a columnar format — fast and compact. DuckDB reads it natively:
path = Path.join(tmp_dir, "demo.parquet")

# 1,000 rows of synthetic data: sequential ids with random values.
demo_rows = for i <- 1..1000, do: %{id: i, value: :rand.uniform(100)}

demo_rows
|> Dux.from_list()
|> Dux.to_parquet(path)

# Read the file back and aggregate it in one pass.
path
|> Dux.from_parquet()
|> Dux.summarise_with(n: "COUNT(*)", avg_val: "AVG(value)")
|> Dux.compute()
Glob Patterns
Read multiple files at once. DuckDB handles the UNION:
# Write three partitioned Parquet files: part_1 .. part_3.
for part <- 1..3 do
  part_rows = Enum.map(1..100, fn j -> %{part: part, value: (part - 1) * 100 + j} end)
  part_path = Path.join(tmp_dir, "part_#{part}.parquet")

  part_rows
  |> Dux.from_list()
  |> Dux.to_parquet(part_path)
end

# A glob pattern reads every matching file as a single relation.
tmp_dir
|> Path.join("part_*.parquet")
|> Dux.from_parquet()
|> Dux.summarise_with(n: "COUNT(*)", total: "SUM(value)")
|> Dux.compute()
Reading NDJSON
ndjson_path = Path.join(tmp_dir, "events.ndjson")

# One JSON object per line — newline-delimited JSON.
event_lines = [
  ~s({"event":"click","ts":"2024-01-01","user":1}),
  ~s({"event":"view","ts":"2024-01-01","user":2}),
  ~s({"event":"click","ts":"2024-01-02","user":1})
]

File.write!(ndjson_path, Enum.join(event_lines, "\n") <> "\n")

ndjson_path
|> Dux.from_ndjson()
|> Dux.compute()
Writing Files
CSV
csv_path = Path.join(tmp_dir, "output.csv")

# Export only the Gentoo rows, keeping three columns.
gentoo =
  Dux.Datasets.penguins()
  |> Dux.filter_with("species = 'Gentoo'")
  |> Dux.select([:species, :island, :body_mass_g])

Dux.to_csv(gentoo, csv_path)

# Peek at the header line plus the first two data lines.
csv_path
|> File.read!()
|> String.split("\n")
|> Enum.take(3)
Parquet with Compression
parquet_path = Path.join(tmp_dir, "compressed.parquet")

# Write with zstd compression instead of the default codec.
Dux.Datasets.penguins()
|> Dux.to_parquet(parquet_path, compression: :zstd)

# Report the on-disk size in whole kilobytes.
size_kb = div(File.stat!(parquet_path).size, 1024)
"#{size_kb} KB"
Reading Excel
DuckDB 1.5+ reads `.xlsx` files natively. Dux defaults to `ignore_errors: true`
and `empty_as_varchar: true` for safe handling of messy spreadsheets:
# xlsx_path = "sales.xlsx"
# Dux.from_excel(xlsx_path) |> Dux.compute()
#
# # With options
# Dux.from_excel("data.xlsx", sheet: "Q1 2024", range: "A1:F100")
#
# # For messy spreadsheets with mixed types
# Dux.from_excel("messy.xlsx", all_varchar: true)
Writing Excel
# excel_out = Path.join(tmp_dir, "output.xlsx")
# Dux.Datasets.penguins()
# |> Dux.filter_with("species = 'Gentoo'")
# |> Dux.to_excel(excel_out)
Database Tables: insert_into
Write pipeline results to a table — local DuckDB or an attached database:
# Create a local table from a pipeline
Dux.from_query("SELECT * FROM range(100) t(x)")
|> Dux.insert_into("my_table", create: true)
# Read it back
Dux.from_query("SELECT * FROM my_table") |> Dux.n_rows()
# Cleanup
conn = Dux.Connection.get_conn()
Adbc.Connection.query(conn, "DROP TABLE IF EXISTS my_table")
> #### Attached databases {: .info}
>
> insert_into works with attached databases too:
> ```elixir
> Dux.attach(:pg, "host=... dbname=analytics", type: :postgres, read_only: false)
>
> Dux.from_parquet("data.parquet")
> |> Dux.insert_into("pg.public.events", create: true)
> ```
> See the Distributed Execution guide for parallel writes.
The SQL Escape Hatch
`from_query/1` lets you write raw DuckDB SQL for anything the verbs don't cover:
# Raw SQL as the pipeline source: squares of 1..10.
"SELECT range AS n, n * n AS square FROM range(1, 11)"
|> Dux.from_query()
|> Dux.compute()

# A synthetic sales table: 100 rows cycling through three regions.
sales_sql = """
SELECT
i AS id,
['US', 'EU', 'APAC'][1 + (i % 3)] AS region,
round(random() * 1000, 2) AS amount
FROM range(1, 101) t(i)
"""

sales_sql
|> Dux.from_query()
|> Dux.group_by(:region)
|> Dux.summarise_with(total: "SUM(amount)", n: "COUNT(*)")
|> Dux.compute()
Cleanup
# Remove every file this guide wrote into the temp directory.
tmp_dir
|> Path.join("*.{csv,parquet,ndjson}")
|> Path.wildcard()
|> Enum.each(&File.rm/1)

:ok
What’s Next
- Getting Started — core concepts
- Transformations — filter, mutate, window functions
- Joins & Reshape — star schema joins