Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Exploratory data analysis in Elixir

txhousing.livemd

Exploratory data analysis in Elixir

Mix.install([
  {:kino, "~> 0.9.4"},
  {:kino_explorer, "~> 0.1.8"},
  {:kino_vega_lite, "~> 0.1.8"},
  {:req, "~> 0.3.10"}
])

Setup

Blog post that accompanies this notebook is here.

require Explorer.DataFrame, as: DF
require Explorer.Series, as: Series
txhousing =
  Req.get!("https://www.travishinkelman.com/data/txhousing.csv").body
  |> DF.load_csv!()

Summarize and Plot

df_agg_year =
  txhousing
  |> DF.group_by("year")
  |> DF.summarise(
    avg_sales: mean(sales),
    avg_volume: mean(volume),
    avg_median: mean(median)
  )
VegaLite.new()
|> VegaLite.data_from_values(df_agg_year, only: ["year", "avg_median"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "year", type: :quantitative)
|> VegaLite.encode_field(:y, "avg_median", type: :quantitative)
df_agg_city_year =
  txhousing
  |> DF.filter(
    Series.in(city, ["Austin", "Dallas", "El Paso", "Houston", "Lubbock", "San Antonio"])
  )
  |> DF.group_by(["city", "year"])
  |> DF.summarise(
    avg_sales: mean(sales),
    avg_volume: mean(volume),
    avg_median: mean(median)
  )
  |> DF.concat_rows(DF.mutate(df_agg_year, city: "All"))
VegaLite.new()
|> VegaLite.data_from_values(df_agg_city_year, only: ["year", "avg_median", "city"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "year", type: :quantitative)
|> VegaLite.encode_field(:y, "avg_median", type: :quantitative)
|> VegaLite.encode_field(:color, "city", type: :nominal)
df_agg_month =
  txhousing
  |> DF.group_by(:month)
  |> DF.summarise(
    avg_sales: mean(sales),
    avg_volume: mean(volume),
    avg_median: mean(median)
  )
  |> DF.arrange(month)
VegaLite.new()
|> VegaLite.data_from_values(df_agg_month, only: ["month", "avg_median"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "month", type: :quantitative)
|> VegaLite.encode_field(:y, "avg_median", type: :quantitative)
df_agg_city_month =
  txhousing
  |> DF.filter(
    Series.in(city, ["Austin", "Dallas", "El Paso", "Houston", "Lubbock", "San Antonio"])
  )
  |> DF.group_by(["city", "month"])
  |> DF.summarise(
    avg_sales: mean(sales),
    avg_volume: mean(volume),
    avg_median: mean(median)
  )
  |> DF.arrange(month)
  |> DF.concat_rows(DF.mutate(df_agg_month, city: "All"))
VegaLite.new()
|> VegaLite.data_from_values(df_agg_city_month, only: ["month", "avg_median", "city"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "month", type: :quantitative)
|> VegaLite.encode_field(:y, "avg_median", type: :quantitative)
|> VegaLite.encode_field(:color, "city", type: :nominal)
txhousing
|> DF.group_by(["city", "year"])
|> DF.mutate(
  total_sales: sum(sales),
  total_volume: sum(volume)
)
# need a new mutate when working with newly calculated column
|> DF.mutate(prop_sales: sales / total_sales)