Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us
Notesclub

csv

02_spam/nbs/csv.livemd

csv

Mix.install([
  {:explorer, "~> 0.5.0"}
])

Section

alias Explorer.DataFrame, as: DF
alias Explorer.Series
# Source datasets: three CSV files under `path_base`, identified by base name.
path_base = "data/archive"
name1 = "completeSpamAssassin"
name2 = "enronSpamSubset"
name3 = "lingSpam"

# Destination directory for the train/test CSVs written at the end of the notebook.
path_out = "data/combined"

# Reads "<path_base>/<name>.csv" and keeps only the "Body" and "Label" columns.
load_body_label = fn name ->
  "#{path_base}/#{name}.csv"
  |> DF.from_csv!()
  |> DF.select(["Body", "Label"])
end

# Load each dataset, then tally how often every value of "Label" occurs in it.
df1 = load_body_label.(name1)
Enum.frequencies(Series.to_enum(df1["Label"]))

df2 = load_body_label.(name2)
Enum.frequencies(Series.to_enum(df2["Label"]))

df3 = load_body_label.(name3)
Enum.frequencies(Series.to_enum(df3["Label"]))

# Column names of each frame, inspected before the frames are concatenated.
DF.names(df1)
DF.names(df2)
DF.names(df3)
# Stack the three frames row-wise, then put the rows in random order.
df =
  [df1, df2, df3]
  |> DF.concat_rows()
  |> DF.shuffle()

# Label counts across the combined data.
Enum.frequencies(Series.to_enum(df["Label"]))
# Hold out `perc_test` of the (already shuffled) rows for testing; the remainder
# is the training set.
n_rows = DF.n_rows(df)
perc_test = 0.3
test_rows = round(n_rows * perc_test)

# Fix: the first `test_rows` rows are the TEST split and everything after them is
# the TRAIN split. Previously rows `0..test_rows` were assigned to `df_train`,
# which left training with only ~30% of the data (plus one row) and testing with
# the other ~70% -- the opposite of what `perc_test` expresses.
# Explicit offset/length slices are used instead of a `first..-1` range so the
# split does not depend on how a descending range with a negative end is read.
df_test = DF.slice(df, 0, test_rows)
df_train = DF.slice(df, test_rows, n_rows - test_rows)
# Make sure the output directory exists before the CSVs are written into it.
# The earlier (disabled) version called `File.mkdir!/1` only when the path already
# existed, which is the inverted condition and would raise. `File.mkdir_p!/1`
# creates the directory and any missing parents, and succeeds if it is already there.
File.mkdir_p!(path_out)
# Prints, for every column of the given frame, its name (labelled "name") followed
# by its nil count (labelled "nil"). Returns the list of nil counts in column order.
check = fn frame ->
  for column <- DF.names(frame) do
    IO.inspect(column, label: "name")
    nil_total = Series.nil_count(frame[column])
    IO.inspect(nil_total, label: "nil")
  end
end
# Report the nil count of every column for both splits before cleaning.
check.(df_train)
check.(df_test)

# Drop rows that contain nil from BOTH splits. The rows were shuffled randomly, so
# nil rows can land in either split; cleaning only the test split would let them
# through into train.csv.
df_train = DF.drop_nil(df_train)
df_test = DF.drop_nil(df_test)

# Re-check: every nil count should now be 0.
check.(df_train)
check.(df_test)

# Persist the cleaned splits under `path_out`.
DF.to_csv!(df_train, path_out <> "/train.csv")
DF.to_csv!(df_test, path_out <> "/test.csv")