Trying to load the large Parquet data set from Yoad
Mix.install([
  # Kino 0.9.4, Explorer 0.5.7
  {:kino_explorer, "~> 0.1.7"}
])
require Explorer.DataFrame, as: DF
alias Explorer.Series
{raw, analytics, fixed_raw, fixed_analytics} = {
  "cs-rawdata-northeurope-production-inventory.parquet",
  "cs-analytics-lr1-northeurope-production-inventory.parquet",
  "cs-rawdata-northeurope-production-inventory-fixed.parquet",
  "cs-analytics-lr1-northeurope-production-inventory-fixed.parquet"
}
defmodule ParquetCleaner do
  @columns_of_interest ~w(Name Content-Length Creation-Time Last-Modified LastAccessTime AccessTier AccessTierChangeTime AccessTierInferred ArchiveStatus)

  # Subset of the columns above that the analytics inventory carries
  # (currently unused; kept for reference).
  @analytics_columns_of_interest ~w(Name Content-Length Creation-Time Last-Modified LastAccessTime AccessTier)

  def downloaded_file(filename), do: "C:/Users/chgeuer/Downloads/#{filename}"

  def read_and_fix(input_filename, output_filename) do
    # Read the raw Parquet and convert the epoch values to proper DateTime.
    # The timestamp columns hold epoch milliseconds; multiplying by 1000
    # yields the microseconds that Explorer's :datetime cast expects.
    input_filename
    |> DF.from_parquet!(columns: @columns_of_interest)
    |> DF.mutate(
      for column <- across(~w(Creation-Time Last-Modified LastAccessTime)) do
        {column.name, cast(column * 1000, :datetime)}
      end
    )
    |> DF.to_parquet!(output_filename)
  end
end
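A quick sanity check for the millisecond-to-microsecond conversion above, assuming Explorer interprets integers as microseconds since the Unix epoch when casting to :datetime (the timestamp value below is made up for illustration):

ms = 1_700_000_000_000

# Casting the scaled value should land on 2023-11-14 22:13:20 UTC
[ms * 1000]
|> Series.from_list()
|> Series.cast(:datetime)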
# ParquetCleaner.read_and_fix(
#   ParquetCleaner.downloaded_file(raw),
#   ParquetCleaner.downloaded_file(fixed_raw)
# )

# ParquetCleaner.read_and_fix(
#   ParquetCleaner.downloaded_file(analytics),
#   ParquetCleaner.downloaded_file(fixed_analytics)
# )
# Read the fixed analytics dataset from disk
df =
  fixed_analytics
  |> ParquetCleaner.downloaded_file()
  |> DF.from_parquet!()
df |> DF.describe()

df |> DF.names()

df |> DF.head(5) |> DF.table()

df |> DF.head(5)
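To verify that the three timestamp columns now carry a datetime dtype rather than raw integers (assuming DF.dtypes/1 returns a map of column name to dtype, as in recent Explorer versions):

df
|> DF.dtypes()
|> Map.take(~w(Creation-Time Last-Modified LastAccessTime))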
# Summarize the total content size in bytes
bytes =
  df["Content-Length"]
  |> Series.sum()
# Derived sizes use base-1024 (binary) units
kilobytes = bytes / 1024
megabytes = kilobytes / 1024
gigabytes = megabytes / 1024
terabytes = gigabytes / 1024

[
  byte: round(bytes),
  kilo: round(kilobytes),
  mega: round(megabytes),
  giga: round(gigabytes),
  tera: round(terabytes)
]
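The same sum can be broken down by access tier; a minimal sketch using DF.summarise_with/2, where the :total_bytes column name is my own choice:

df
|> DF.group_by("AccessTier")
|> DF.summarise_with(fn ldf ->
  [total_bytes: Series.sum(ldf["Content-Length"])]
end)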