Data Processing
Dependencies
Mix.install([
  {:exla, "~> 0.1.0-dev", github: "elixir-nx/nx", sparse: "exla", override: true},
  {:nx, "~> 0.1.0-dev", github: "elixir-nx/nx", sparse: "nx", override: true},
  {:scholar, "~> 0.1.0", github: "elixir-nx/scholar", branch: "main"},
  {:explorer, "~> 0.1.0-dev", github: "elixir-nx/explorer", branch: "main"},
  {:vega_lite, "~> 0.1.3"},
  {:kino, "~> 0.5.2"}
])
alias Explorer.DataFrame
alias Explorer.Series
alias VegaLite, as: Vl
Data Frame
The NSL-KDD dataset is made up of 41 feature columns and 2 label columns.
Of all the columns, protocol_type, service, flag, and class are nominal. land, logged_in, is_host_login, and is_guest_login are binary. The rest are either continuous or ratio.
columns = ~w(
  duration
  protocol_type
  service
  flag
  src_bytes
  dst_bytes
  land
  wrong_fragment
  urgent
  hot
  num_failed_logins
  logged_in
  num_compromised
  root_shell
  su_attempted
  num_root
  num_file_creations
  num_shells
  num_access_files
  num_outbound_cmds
  is_host_login
  is_guest_login
  count
  srv_count
  serror_rate
  srv_serror_rate
  rerror_rate
  srv_rerror_rate
  same_srv_rate
  diff_srv_rate
  srv_diff_host_rate
  dst_host_count
  dst_host_srv_count
  dst_host_same_srv_rate
  dst_host_diff_srv_rate
  dst_host_same_src_port_rate
  dst_host_srv_diff_host_rate
  dst_host_serror_rate
  dst_host_srv_serror_rate
  dst_host_rerror_rate
  dst_host_srv_rerror_rate
  class
  difficulty_level
)
train_df = DataFrame.read_csv!("KDDTrain+.csv", header?: false, names: columns)
test_df = DataFrame.read_csv!("KDDTest+.csv", header?: false, names: columns)
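Before going further, it can't hurt to sanity-check that both files loaded with the expected shape. A minimal sketch, assuming this Explorer version exposes DataFrame.n_rows/1 and DataFrame.head/2:
# Row counts and a peek at the first rows (DataFrame.n_rows/1 and
# DataFrame.head/2 are assumed to exist in this Explorer version).
IO.inspect(DataFrame.n_rows(train_df), label: "train rows")
IO.inspect(DataFrame.n_rows(test_df), label: "test rows")
DataFrame.head(train_df, 5)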
Since we have to apply the same manipulations to both the training and testing data, we concatenate them so that we only have to process the data once.
df = DataFrame.concat_rows([train_df, test_df])
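Concatenating loses the train/test boundary, so it is worth remembering how many training rows we started with; that way the processed frame can be split back apart at the very end. A small sketch, assuming DataFrame.n_rows/1 is available:
# Remember the boundary so the processed frame can be split again later.
n_train = DataFrame.n_rows(train_df)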
With the data loaded in, we can now visualize some metrics about the dataset we are working with.
We start off by figuring out how many unique values each of the nominal columns contains.
protocol_type_count = Series.count(df["protocol_type"])
Vl.new(width: 800, height: 600)
|> Vl.mark(:bar)
|> Vl.data_from_series(DataFrame.to_map(protocol_type_count))
|> Vl.encode_field(:x, "values", type: :nominal, title: "Protocol Types")
|> Vl.encode_field(:y, "counts", type: :quantitative, title: "Count")
service_count = Series.count(df["service"])
Vl.new(width: 800, height: 600)
|> Vl.mark(:bar)
|> Vl.data_from_series(DataFrame.to_map(service_count))
|> Vl.encode_field(:x, "values", type: :nominal, title: "Services")
|> Vl.encode_field(:y, "counts", type: :quantitative, title: "Count")
flag_count = Series.count(df["flag"])
Vl.new(width: 800, height: 600)
|> Vl.mark(:bar)
|> Vl.data_from_series(DataFrame.to_map(flag_count))
|> Vl.encode_field(:x, "values", type: :nominal, title: "Flags")
|> Vl.encode_field(:y, "counts", type: :quantitative, title: "Count")
class_count = Series.count(df["class"])
Vl.new(width: 800, height: 600)
|> Vl.mark(:bar)
|> Vl.data_from_series(DataFrame.to_map(class_count))
|> Vl.encode_field(:x, "values", type: :nominal, title: "Classes")
|> Vl.encode_field(:y, "counts", type: :quantitative, title: "Count")
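The four bar charts above differ only in the column being counted, so the recipe could be factored into a small helper. A sketch that uses only the calls already shown; the plot_value_counts name is ours, not part of any library:
# Hypothetical helper wrapping the repeated bar-chart recipe above.
plot_value_counts = fn df, column, title ->
  counts = Series.count(df[column])

  Vl.new(width: 800, height: 600)
  |> Vl.mark(:bar)
  |> Vl.data_from_series(DataFrame.to_map(counts))
  |> Vl.encode_field(:x, "values", type: :nominal, title: title)
  |> Vl.encode_field(:y, "counts", type: :quantitative, title: "Count")
end

plot_value_counts.(df, "flag", "Flags")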
Now that we have analyzed the nominal values of the dataset, we create a new column indicating whether the row is an anomalous or benign packet. We then drop the class and difficulty_level columns, as we have no further need for either of these labels.
df =
  df
  |> DataFrame.mutate(anomalous: &Series.not_equal(&1["class"], "normal"))
  |> DataFrame.select(["class", "difficulty_level"], :drop)
df2 =
  df
  |> DataFrame.select(["protocol_type", "anomalous"])
  |> DataFrame.mutate(
    anomalous:
      &Series.transform(&1["anomalous"], fn
        true -> "anomalous"
        false -> "normal"
      end)
  )
Vl.new(width: 800, height: 600)
|> Vl.mark(:bar)
|> Vl.data_from_series(DataFrame.to_map(df2))
|> Vl.encode_field(:x, "protocol_type", type: :nominal, title: "Protocol Types")
|> Vl.encode_field(:y, "anomalous", type: :quantitative, aggregate: :count, title: "Count")
|> Vl.encode_field(:color, "anomalous", type: :nominal, title: "Class")
|> Vl.encode_field(:x_offset, "anomalous", type: :nominal)
Now that the anomalous label identifies normal and anomalous packets, let's use it to analyze the dataset a bit more.
anomalous_count = Series.count(df["anomalous"])
Vl.new(width: 800, height: 600)
|> Vl.mark(:arc)
|> Vl.data_from_series(%{
  counts: Series.to_list(anomalous_count["counts"]),
  values: ["normal", "anomalous"]
})
|> Vl.encode_field(:theta, "counts", type: :quantitative, title: "Count")
|> Vl.encode_field(:color, "values", type: :nominal, title: "Class")
We also drop the num_outbound_cmds column, since the only value present in this feature is 0.
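We can double-check that claim with the same value-count helper used above; the result should contain a single row whose value is 0:
Series.count(df["num_outbound_cmds"])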
df = DataFrame.select(df, ["num_outbound_cmds"], :drop)
Now, all that is left to do is process the features of the data frame. We one-hot encode the nominal columns and standardize the continuous ones with a z-score. As a quick preview, calling DataFrame.dummies/2 on a single column shows what the one-hot encoding looks like.
DataFrame.dummies(df, ["service"])
dummy_columns = ~w(
  protocol_type
  service
  flag
)
zscore_columns = ~w(
  duration
  src_bytes
  dst_bytes
  wrong_fragment
  urgent
  hot
  num_failed_logins
  num_compromised
  root_shell
  su_attempted
  num_root
  num_file_creations
  num_shells
  num_access_files
  count
  srv_count
  serror_rate
  srv_serror_rate
  rerror_rate
  srv_rerror_rate
  same_srv_rate
  diff_srv_rate
  srv_diff_host_rate
  dst_host_count
  dst_host_srv_count
  dst_host_same_srv_rate
  dst_host_diff_srv_rate
  dst_host_same_src_port_rate
  dst_host_srv_diff_host_rate
  dst_host_serror_rate
  dst_host_srv_serror_rate
  dst_host_rerror_rate
  dst_host_srv_rerror_rate
)
dummies = DataFrame.dummies(df, dummy_columns)
df =
  df
  |> DataFrame.mutate(
    # Add each generated one-hot column to the frame...
    for column <- DataFrame.names(dummies), into: %{} do
      {column, dummies[column]}
    end
  )
  # ...and drop the original nominal columns they replace.
  |> DataFrame.select(dummy_columns, :drop)
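To confirm what the encoding added, we can peek at a few of the generated column names (the exact naming scheme depends on the Explorer version):
# A few of the freshly added one-hot columns.
dummies
|> DataFrame.names()
|> Enum.take(10)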
df =
  df
  |> DataFrame.mutate(
    for column <- zscore_columns, into: %{} do
      # Standardize each continuous column: (x - mean(x)) / std(x).
      {column,
       fn df ->
         series = df[column]
         mean = Series.mean(series)
         std = Series.std(series)

         series
         |> Series.subtract(mean)
         |> Series.divide(std)
       end}
    end
  )
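As a quick sanity check, a standardized column should now have a mean of roughly 0 and a standard deviation of roughly 1, up to floating-point error:
# Should print values close to 0.0 and 1.0.
IO.inspect(Series.mean(df["duration"]), label: "mean")
IO.inspect(Series.std(df["duration"]), label: "std")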
df = DataFrame.mutate(df, anomalous: &Series.cast(&1["anomalous"], :integer))
Now that we have some good-looking data, we store it in a CSV file for use by our neural networks.
DataFrame.write_csv!(df, "KDD.csv")
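Because the processed frame still holds the training rows first and the test rows after them, the original split can be recovered from the row count we recorded earlier. A sketch, assuming DataFrame.slice/3 takes an offset and a length:
# Recover the original train/test split (n_train was recorded
# right before concatenation).
processed = DataFrame.read_csv!("KDD.csv")
train = DataFrame.slice(processed, 0, n_train)
test = DataFrame.slice(processed, n_train, DataFrame.n_rows(processed) - n_train)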