Trying to load the large Parquet data set from Yoad

read-parqued.livemd

Dr. Christian Geuer-Pollmann

@chgeuer

Elixir-LiveBook-experimen...

Share to X

Share to Bluesky

More notebooks

Trying to load the large Parquet data set from Yoad

# Install the notebook's dependencies. Only kino_explorer is declared; the
# cells below also call Explorer.DataFrame / Explorer.Series, so Explorer
# presumably arrives as a transitive dependency (TODO confirm).
Mix.install([
  # Versions the author recorded alongside this requirement: Kino 0.9.4, Explorer 0.5.7
  {:kino_explorer, "~> 0.1.7"}
])

Section

require Explorer.DataFrame, as: DF
alias Explorer.Series

# File names of the two inventory exports as downloaded ...
raw = "cs-rawdata-northeurope-production-inventory.parquet"
analytics = "cs-analytics-lr1-northeurope-production-inventory.parquet"

# ... and the names used for their "-fixed" counterparts.
fixedRaw = "cs-rawdata-northeurope-production-inventory-fixed.parquet"
fixedAnalytics = "cs-analytics-lr1-northeurope-production-inventory-fixed.parquet"

# Keep the value of this expression sequence a 4-tuple of the names.
{raw, analytics, fixedRaw, fixedAnalytics}

defmodule ParquetCleaner do
  @moduledoc """
  Helpers for preparing the inventory Parquet files used in this notebook.

  `read_and_fix/3` reads a Parquet file with a chosen set of columns, casts the
  three timestamp columns to `:datetime`, and writes the result to a new
  Parquet file.

  The module relies on the `DF` alias for `Explorer.DataFrame` that is
  required at the top of the notebook.
  """

  # Columns read by default.
  @columns_of_interest ~w(Name Content-Length Creation-Time Last-Modified LastAccessTime AccessTier AccessTierChangeTime AccessTierInferred ArchiveStatus)

  # Narrower selection without the tier-change / archive columns.
  # NOTE(review): presumably meant for the analytics export - confirm against
  # that file's schema.
  @analytics_columns_of_interest ~w(Name Content-Length Creation-Time Last-Modified LastAccessTime AccessTier)

  @doc """
  Returns the default column selection used by `read_and_fix/3`.
  """
  @spec columns_of_interest() :: [String.t()]
  def columns_of_interest, do: @columns_of_interest

  @doc """
  Returns the narrower column selection, suitable as the third argument of
  `read_and_fix/3`.
  """
  @spec analytics_columns_of_interest() :: [String.t()]
  def analytics_columns_of_interest, do: @analytics_columns_of_interest

  @doc """
  Returns the path of `filename` inside the author's downloads directory.

  ## Examples

      iex> ParquetCleaner.downloaded_file("data.parquet")
      "C:/Users/chgeuer/Downloads/data.parquet"
  """
  @spec downloaded_file(String.t()) :: String.t()
  def downloaded_file(filename), do: "C:/Users/chgeuer/Downloads/#{filename}"

  @doc """
  Reads `input_filename`, converts the timestamp columns, and writes the
  result to `output_filename`.

  ## Parameters

    * `input_filename` - path of the Parquet file to read.
    * `output_filename` - path the converted Parquet file is written to.
    * `columns` - names of the columns to read. Defaults to
      `columns_of_interest/0`. The selection must contain `Creation-Time`,
      `Last-Modified` and `LastAccessTime`, because those are the columns
      being converted.

  Both the read and the write use the raising (`!`) variants, so failures
  surface as exceptions.
  """
  def read_and_fix(input_filename, output_filename, columns \\ @columns_of_interest) do
    input_filename
    |> DF.from_parquet!(columns: columns)
    |> DF.mutate(
      # Each timestamp column is scaled by 1000 and then cast to :datetime.
      # NOTE(review): presumably the stored values are epoch milliseconds and
      # the :datetime cast expects microseconds - confirm.
      for column <- across(~w(Creation-Time Last-Modified LastAccessTime)) do
        {column.name, cast(column * 1000, :datetime)}
      end
    )
    |> DF.to_parquet!(output_filename)
  end
end

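# One-off conversions: uncomment to read the downloaded exports and write
# their "-fixed" copies. The analytics copy is the file loaded further below.

# ParquetCleaner.read_and_fix(
#   ParquetCleaner.downloaded_file(raw),
#   ParquetCleaner.downloaded_file(fixedRaw)
# )

# ParquetCleaner.read_and_fix(
#   ParquetCleaner.downloaded_file(analytics),
#   ParquetCleaner.downloaded_file(fixedAnalytics)
# )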

# Load the "-fixed" analytics inventory into a data frame
analytics_path = ParquetCleaner.downloaded_file(fixedAnalytics)
df = DF.from_parquet!(analytics_path)

# Summary of the data frame's columns
DF.describe(df)

# Column names
DF.names(df)

# First five rows: once through DF.table/1, once as a data frame value
first_rows = DF.head(df, 5)
DF.table(first_rows)

first_rows

# Summarize the amount of content: total of the "Content-Length" column,
# expressed in successive 1024-based units.

# DF.pull/2 fetches the single column directly; an unknown column name fails
# right here instead of passing nil on to Series.sum/1.
bytes =
  df
  |> DF.pull("Content-Length")
  |> Series.sum()

# Each unit is the previous one divided by 1024 (float division).
kilobytes = bytes / 1024
megabytes = kilobytes / 1024
gigabytes = megabytes / 1024
terabytes = gigabytes / 1024

# Rounded to whole units for display.
[
  byte: round(bytes),
  kilo: round(kilobytes),
  mega: round(megabytes),
  giga: round(gigabytes),
  tera: round(terabytes)
]

Other notebooks:

Jeremy Brayton
@w0rd-driven

livebook_notebooks

larajobs.com

larajobs.livemd

tutorial advanced data-science kino req floki nimble_csv timex explorer kino_explorer

2023-4-17
José Valim
@josevalim

livebooks

Bringing Elixir to life

04-elixir-conf.livemd

tutorial advanced gen-server otp kino evision req kino_vega_lite kino_bumblebee exla kino_explorer

2023-4-24
Jeremy Brayton
@w0rd-driven

livebook_notebooks

Elixir Companies

elixir-companies.livemd

tutorial advanced kino spider_man floki nimble_csv explorer kino_explorer

2023-5-22
José Valim
@josevalim

livebooks

Meta-programmable functional notebooks

06-lambda-days.livemd

tutorial advanced gen-server otp kino evision req kino_vega_lite kino_bumblebee exla kino_explorer

2023-6-5
@instancer-kirik

resolvinator

Untitled notebook

ganache_contract_test.livemd

tutorial advanced ethers ex_abi jason

2024-11-26
Aleksandar
@isavita

log_sight

Grafana Traffic Anomalies

grafana_traffic_anomalies.livemd

tutorial data-science intermediate req jason kino

2025-10-11
@DockYard-Academy

curriculum

Blog: Tags

blog_tags.livemd

tutorial advanced intermediate sql jason kino youtube hidden_cell

2023-3-21

Back