Powered by AppSignal & Oban Pro

Explorer

livebooks/explorer/basics.livemd

Explorer

Mix.install([
  {:explorer, "~> 0.9"},
  {:kino, "~> 0.15"}
])

エイリアス

alias Explorer.DataFrame
alias Explorer.Series
require Explorer.DataFrame

データフレーム作成

sample_df =
  %{
    "labels" => ["a", "b", "c", "c", "c"],
    "values" => [1, 2, 3, 2, 1]
  }
  |> DataFrame.new()
Kino.DataTable.new(sample_df)
sample_df
|> DataFrame.describe()
|> Kino.DataTable.new()

計算

四則演算、累乗

sample_df
|> DataFrame.mutate(
  add: values + 2,
  subtract: values - 2,
  multiply: values * 2,
  divide: values / 2,
  pow: values ** 2
)
|> Kino.DataTable.new()

商、余り

sample_df
|> DataFrame.mutate(
  quotient: quotient(values, 2),
  remainder: remainder(values, 2)
)
|> Kino.DataTable.new()

比較

sample_df
|> DataFrame.mutate(
  equal: values == 2,
  not_equal: values != 2,
  greater: values > 2,
  greater_equal: values >= 2,
  less: values < 2,
  less_equal: values <= 2
)
|> Kino.DataTable.new()

論理演算

sample_df
|> DataFrame.mutate(
  "not equal": Series.not(values == 2),
  "greater or equal": Series.or(values > 2, values == 2),
  "greater and equal": Series.and(values > 2, values == 2)
)
|> Kino.DataTable.new()
Series.all_equal(sample_df["labels"], sample_df["labels"])
Series.all_equal(sample_df["labels"], sample_df["values"])

フィルター

threshold = 1

sample_df
|> DataFrame.filter(values > ^threshold)
|> Kino.DataTable.new()

欠損値

nil_df =
  %{
    "labels" => ["a", nil, "c", "c", "c"],
    "values" => [1, 2, 3, nil, 1]
  }
  |> DataFrame.new()

Kino.DataTable.new(nil_df)
nil_df
|> DataFrame.mutate(
  is_nil: is_nil(values),
  is_not_nil: is_not_nil(values)
)
|> Kino.DataTable.new()
fill_label =
  ""
  |> List.duplicate(5)
  |> Series.from_list()
fill_value =
  0
  |> List.duplicate(5)
  |> Series.from_list()
nil_df
|> DataFrame.mutate(
  labels_filled: coalesce(labels, ^fill_label),
  values_filled: coalesce(values, ^fill_value)
)
|> Kino.DataTable.new()

集計

sample_df
|> DataFrame.group_by("labels")
|> DataFrame.summarise(
  count: count(values),
  n_distinct: n_distinct(values),
  min: min(values),
  max: max(values),
  sum: sum(values),
  mean: mean(values),
  median: median(values),
  "quantile 1/4": quantile(values, 0.25),
  "quantile 3/4": quantile(values, 0.75),
  variance: variance(values),
  standard_deviation: standard_deviation(values)
)
|> Kino.DataTable.new()
sample_df["values"]
|> Series.frequencies()
|> Kino.DataTable.new()

並べ替え

sample_df
|> DataFrame.sort_by(asc: values)
|> Kino.DataTable.new()

CSV 読み込み

population_df =
  "/home/livebook/explorer/population_20211001.csv"
  |> DataFrame.from_csv!()
  |> DataFrame.rename("人口(千人)": "人口_千人")

テーブル表示

DataFrame.print(population_df)
DataFrame.print(population_df, limit: :infinity)
Kino.DataTable.new(population_df)
population_df
|> DataFrame.mutate(人口_千人: cast(人口_千人, :integer))
|> DataFrame.describe()
|> Kino.DataTable.new()

データ分析

population_df
|> DataFrame.distinct(["年齢層"])
|> DataFrame.pull("年齢層")
|> Series.to_list()
population_df
|> DataFrame.mutate(人口_千人: cast(人口_千人, :float))
|> DataFrame.filter(都道府県 == "東京都")
|> DataFrame.select(["年齢層", "性別", "人口_千人"])
|> Kino.DataTable.new()
trimmed =
  population_df["人口_千人"]
  |> Series.transform(fn input ->
    String.replace(input, ",", "")
  end)
  |> Series.cast(:float)
population_df = DataFrame.put(population_df, "人口_千人", trimmed)
population_df
|> DataFrame.filter(都道府県 == "東京都")
|> DataFrame.select(["年齢層", "性別", "人口_千人"])
|> Kino.DataTable.new()
population_df =
  population_df
  |> DataFrame.mutate(人口_千人: 人口_千人 * 1000)
  |> DataFrame.mutate(人口_千人: cast(人口_千人, :integer))
  |> DataFrame.rename(人口_千人: "人口")

population_df
|> DataFrame.filter(都道府県 == "東京都")
|> DataFrame.select(["年齢層", "性別", "人口"])
|> Kino.DataTable.new()
population_df
|> DataFrame.filter(年齢層 == "15歳未満")
|> DataFrame.filter(性別 == "男性")
|> DataFrame.filter(人口 > 300_000)
|> DataFrame.select(["都道府県", "人口"])
|> DataFrame.sort_by(desc: 人口)
|> Kino.DataTable.new()
population_df
|> DataFrame.pivot_wider("年齢層", "人口")
|> Kino.DataTable.new()
sum_df =
  population_df
  |> DataFrame.group_by(["都道府県"])
  |> DataFrame.summarise(人口_千人: sum(人口 / 1_000))
  |> DataFrame.mutate(人口_千人: cast(人口_千人, :float))

sum_df
|> DataFrame.sort_by(desc: 人口_千人)
|> Kino.DataTable.new()
sex_ratio_df =
  population_df
  |> DataFrame.group_by(["都道府県", "性別"])
  |> DataFrame.summarise(人口: sum(人口))
  |> DataFrame.mutate(人口: cast(人口, :float))
  |> DataFrame.pivot_wider("性別", "人口")
  |> DataFrame.mutate(合計: 男性 + 女性)
  |> DataFrame.mutate(男性率: 男性 / 合計)
  |> DataFrame.mutate(女性率: 女性 / 合計)
  |> DataFrame.select(["都道府県", "男性率", "女性率"])

sex_ratio_df
|> DataFrame.sort_by(desc: 男性率)
|> Kino.DataTable.new()
elderly_rate_df =
  population_df
  |> DataFrame.group_by(["都道府県", "年齢層"])
  |> DataFrame.summarise(人口: sum(人口))
  |> DataFrame.mutate(人口: cast(人口, :float))
  |> DataFrame.pivot_wider("年齢層", "人口")
  |> DataFrame.rename("15歳未満": "young")
  |> DataFrame.rename("15~64歳": "middle")
  |> DataFrame.rename("65歳以上": "elder")
  |> DataFrame.mutate(合計: young + middle + elder)
  |> DataFrame.mutate(高齢者率: elder / 合計)
  |> DataFrame.select(["都道府県", "高齢者率"])

elderly_rate_df
|> DataFrame.sort_by(desc: 高齢者率)
|> Kino.DataTable.new(sorting_enabled: true)
assets_df = DataFrame.from_csv!("/home/livebook/explorer/assets_20151216.csv")

Kino.DataTable.new(assets_df)
parse_float = fn df, col ->
  df
  |> DataFrame.put(
    col,
    df[col]
    |> Series.transform(fn input ->
      String.replace(input, ",", "")
    end)
    |> Series.cast(:float)
  )
end

assets_df =
  assets_df
  |> parse_float.("年間収入")
  |> parse_float.("貯蓄現在高")
  |> parse_float.("負債現在高")

Kino.DataTable.new(assets_df)
joined_df =
  sum_df
  |> DataFrame.join(sex_ratio_df, how: :left)
  |> DataFrame.join(elderly_rate_df, how: :left)
  |> DataFrame.join(assets_df, how: :left)

Kino.DataTable.new(joined_df)
DataFrame.to_csv(joined_df, "/tmp/joined.csv")
cols =
  joined_df
  |> DataFrame.names()
  |> Enum.filter(&amp;(&amp;1 != "都道府県"))
std_map =
  joined_df
  |> DataFrame.select(cols)
  |> DataFrame.to_series()
  |> Enum.map(fn {key, value} -> {key, Series.standard_deviation(value)} end)
  |> Enum.into(%{})
get_mul = fn df, col_1, col_2 ->
  DataFrame.put(
    df,
    "#{col_1}*#{col_2}",
    Series.multiply(df[col_1], df[col_2])
  )
end

covariance_df =
  cols
  |> Enum.reduce(joined_df, fn col_1, sub_df_1 ->
    cols
    |> Enum.reduce(sub_df_1, fn col_2, sub_df_2 ->
      get_mul.(sub_df_2, col_1, col_2)
    end)
  end)
select_cols =
  covariance_df
  |> DataFrame.names()
  |> Enum.filter(&amp;(&amp;1 != "都道府県"))

mean_map =
  covariance_df
  |> DataFrame.select(select_cols)
  |> DataFrame.to_series()
  |> Enum.map(fn {key, value} -> {key, Series.mean(value)} end)
  |> Enum.into(%{})
get_covariance = fn col_1, col_2 ->
  Map.get(mean_map, col_1 <> "*" <> col_2) - Map.get(mean_map, col_1) * Map.get(mean_map, col_2)
end

covariance_map =
  cols
  |> Enum.map(fn col_1 ->
    cols
    |> Enum.map(fn col_2 ->
      {col_1 <> "*" <> col_2, get_covariance.(col_1, col_2)}
    end)
    |> Enum.into(%{})
  end)
  |> Enum.reduce(fn map, merged_map ->
    Map.merge(merged_map, map)
  end)
get_correlation = fn col_1, col_2 ->
  cond do
    col_1 == col_2 ->
      1

    true ->
      Map.get(covariance_map, col_1 <> "*" <> col_2) /
        (Map.get(std_map, col_1) * Map.get(std_map, col_2))
  end
end

correlation_map =
  cols
  |> Enum.map(fn col_1 ->
    cols
    |> Enum.map(fn col_2 ->
      {col_1 <> "*" <> col_2, get_correlation.(col_1, col_2)}
    end)
    |> Enum.into(%{})
  end)
  |> Enum.reduce(fn map, merged_map ->
    Map.merge(merged_map, map)
  end)
cols
|> Enum.map(fn col_1 ->
  %{
    col_1 =>
      cols
      |> Enum.map(fn col_2 ->
        Map.get(correlation_map, col_1 <> "*" <> col_2)
      end)
  }
end)
|> Enum.reduce(fn map, merged_map ->
  Map.merge(merged_map, map)
end)
|> Map.merge(%{"x" => cols})
|> DataFrame.new()
|> DataFrame.select(["x" | cols])
|> Kino.DataTable.new(keys: ["x" | cols], sorting_enabled: true)