Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us
Notesclub

Trying to load the large Parquet data set from Yoad

read-parqued.livemd

Trying to load the large Parquet data set from Yoad

Mix.install([
  # Kino 0.9.4, Explorer 0.5.7
  {:kino_explorer, "~> 0.1.7"}
])

Section

require Explorer.DataFrame, as: DF
alias Explorer.Series
# File names of the two downloaded inventory exports ...
raw = "cs-rawdata-northeurope-production-inventory.parquet"
analytics = "cs-analytics-lr1-northeurope-production-inventory.parquet"

# ... and the names under which their cleaned-up copies are stored.
fixedRaw = "cs-rawdata-northeurope-production-inventory-fixed.parquet"
fixedAnalytics = "cs-analytics-lr1-northeurope-production-inventory-fixed.parquet"

# Keep the four names as the value of this expression group.
{raw, analytics, fixedRaw, fixedAnalytics}
defmodule ParquetCleaner do
  @moduledoc """
  Helpers for rewriting the downloaded inventory Parquet files into "fixed"
  copies whose timestamp columns are cast to datetimes.
  """

  # Required inside the module so it compiles on its own; `DF.mutate/2` is a
  # macro and needs the `require`.
  require Explorer.DataFrame, as: DF

  # Columns read from the input file by default.
  @columns_of_interest ~w(
    Name Content-Length Creation-Time Last-Modified LastAccessTime
    AccessTier AccessTierChangeTime AccessTierInferred ArchiveStatus
  )

  # Narrower selection; presumably meant for the analytics inventory file,
  # which may not carry the extra access-tier columns (confirm against the data).
  @analytics_columns_of_interest ~w(
    Name Content-Length Creation-Time Last-Modified LastAccessTime AccessTier
  )

  @doc """
  Returns the column names that `read_and_fix/3` reads when no explicit
  selection is given.
  """
  @spec columns_of_interest() :: [String.t()]
  def columns_of_interest, do: @columns_of_interest

  @doc """
  Returns the reduced column selection, suitable as the third argument of
  `read_and_fix/3`.
  """
  @spec analytics_columns_of_interest() :: [String.t()]
  def analytics_columns_of_interest, do: @analytics_columns_of_interest

  @doc """
  Builds the full path of `filename` inside the (hard-coded) download folder.
  """
  @spec downloaded_file(String.t()) :: String.t()
  def downloaded_file(filename), do: "C:/Users/chgeuer/Downloads/#{filename}"

  @doc """
  Reads `input_filename`, converts the epoch columns to datetimes and writes
  the result to `output_filename`.

  ## Parameters

    * `input_filename` - path of the Parquet file to read.
    * `output_filename` - path the converted Parquet file is written to.
    * `columns` - names of the columns to load; defaults to
      `columns_of_interest/0`. The selection must include `Creation-Time`,
      `Last-Modified` and `LastAccessTime`, because those are the columns
      being converted.

  Returns whatever `Explorer.DataFrame.to_parquet!/2` returns and raises if
  reading or writing fails.
  """
  def read_and_fix(input_filename, output_filename, columns \\ @columns_of_interest) do
    input_filename
    |> DF.from_parquet!(columns: columns)
    |> DF.mutate(
      for column <- across(~w(Creation-Time Last-Modified LastAccessTime)) do
        # NOTE(review): the factor 1000 presumably scales epoch milliseconds to
        # the microseconds the :datetime cast expects - confirm against the data.
        {column.name, cast(column * 1000, :datetime)}
      end
    )
    |> DF.to_parquet!(output_filename)
  end
end
# ParquetCleaner.read_and_fix(
#   ParquetCleaner.downloaded_file(raw),
#   ParquetCleaner.downloaded_file(fixedRaw)
# )

# ParquetCleaner.read_and_fix(
#   ParquetCleaner.downloaded_file(analytics),
#   ParquetCleaner.downloaded_file(fixedAnalytics)
# )
# Load the cleaned analytics inventory back from disk
df = DF.from_parquet!(ParquetCleaner.downloaded_file(fixedAnalytics))

# Summary of the data frame's columns
DF.describe(df)

# Column names
DF.names(df)

# First five rows, once through DF.table/1 and once as a plain data frame
DF.table(DF.head(df, 5))
DF.head(df, 5)
# Summarize the amount of content

# Pull only the column that is needed. `DF.pull/2` raises a descriptive error
# when the column is missing, instead of passing `nil` on to `Series.sum/1`.
bytes =
  df
  |> DF.pull("Content-Length")
  |> Series.sum()

# Step through the units in factors of 1024 (binary units).
kiloByte = bytes / 1024
megaByte = kiloByte / 1024
gigaByte = megaByte / 1024
teraByte = gigaByte / 1024

# Total size per unit, rounded to whole numbers for display.
[
  byte: round(bytes),
  kilo: round(kiloByte),
  mega: round(megaByte),
  giga: round(gigaByte),
  tera: round(teraByte)
]