Powered by AppSignal & Oban Pro

Transformations

guides/transformations.livemd

Transformations

Mix.install([{:dux, "~> 0.2.0"}])

Setup

require Dux

penguins = Dux.Datasets.penguins()

Filter: Selecting Rows

Expression Syntax

Bare identifiers are column names. Standard comparison and boolean operators work:

penguins
|> Dux.filter(species == "Adelie" and island == "Torgersen")
|> Dux.head(5)
|> Dux.compute()

Interpolation with ^

Use ^ to inject Elixir values safely (parameter bindings, not string interpolation):

target_species = "Gentoo"
min_mass = 5000

penguins
|> Dux.filter(species == ^target_species and body_mass_g > ^min_mass)
|> Dux.head(3)
|> Dux.compute()

Raw SQL with filter_with

For complex DuckDB expressions:

penguins
|> Dux.filter_with("bill_length_mm BETWEEN 40 AND 45")
|> Dux.head(5)
|> Dux.compute()

Mutate: Computing New Columns

Expression Syntax

penguins
|> Dux.drop_nil([:bill_length_mm, :bill_depth_mm, :body_mass_g])
|> Dux.mutate(
  bill_ratio: bill_length_mm / bill_depth_mm,
  mass_kg: body_mass_g / 1000.0
)
|> Dux.select([:species, :bill_ratio, :mass_kg])
|> Dux.head(5)
|> Dux.compute()

Raw SQL with mutate_with

Access DuckDB string functions, CASE expressions, and more:

penguins
|> Dux.mutate_with(
  species_upper: "UPPER(species)",
  size_category: "CASE WHEN body_mass_g > 4500 THEN 'large' ELSE 'small' END"
)
|> Dux.select([:species_upper, :size_category, :body_mass_g])
|> Dux.head(5)
|> Dux.compute()

Window Functions

Use mutate_with for window functions — they compute values relative to other rows:

penguins
|> Dux.drop_nil([:body_mass_g])
|> Dux.mutate_with(
  rank_in_species: "ROW_NUMBER() OVER (PARTITION BY species ORDER BY body_mass_g DESC)",
  species_avg: "AVG(body_mass_g) OVER (PARTITION BY species)"
)
|> Dux.select([:species, :body_mass_g, :rank_in_species, :species_avg])
|> Dux.sort_by([:species, :rank_in_species])
|> Dux.head(6)
|> Dux.compute()

Select & Discard

# Keep specific columns
penguins
|> Dux.select([:species, :island, :body_mass_g])
|> Dux.head(3)
|> Dux.compute()
# Drop specific columns
penguins
|> Dux.discard([:year])
|> Dux.head(3)
|> Dux.compute()

Rename

penguins
|> Dux.rename(body_mass_g: :weight, bill_length_mm: :bill_length)
|> Dux.select([:species, :weight, :bill_length])
|> Dux.head(3)
|> Dux.compute()

Drop Nil: Null Handling

Remove rows where specific columns are nil:

IO.puts("All rows: #{Dux.n_rows(penguins)}")

clean = penguins |> Dux.drop_nil([:sex, :body_mass_g])
IO.puts("After drop_nil: #{Dux.n_rows(clean)}")

Sorting

# Ascending (default)
penguins |> Dux.sort_by(:body_mass_g) |> Dux.head(3) |> Dux.compute()
# Descending
penguins |> Dux.sort_by(desc: :body_mass_g) |> Dux.head(3) |> Dux.compute()
# Multi-column
penguins
|> Dux.drop_nil([:body_mass_g])
|> Dux.sort_by(asc: :species, desc: :body_mass_g)
|> Dux.head(6)
|> Dux.compute()

Head, Slice, Distinct

# First N rows (default 10)
penguins |> Dux.head(3) |> Dux.compute()
# Offset + limit
penguins |> Dux.slice(5, 3) |> Dux.compute()
# Unique combinations
penguins
|> Dux.select([:species, :island])
|> Dux.distinct()
|> Dux.sort_by([:species, :island])
|> Dux.compute()

A Complex Pipeline

penguins
|> Dux.drop_nil([:body_mass_g, :bill_length_mm, :sex])
|> Dux.filter(sex == "female")
|> Dux.mutate(bill_ratio: bill_length_mm / bill_depth_mm)
|> Dux.group_by(:species)
|> Dux.summarise(
  n: count(species),
  avg_mass: avg(body_mass_g),
  avg_bill_ratio: avg(bill_ratio)
)
|> Dux.sort_by(desc: :avg_mass)
|> Dux.compute()

> #### See the SQL {: .tip} > > Replace to_rows() with sql_preview(pretty: true) |> IO.puts() to see > the SQL DuckDB will execute for any pipeline.

What’s Next