kNN
Mix.install([
{:scholar, "~> 0.2"},
{:explorer, "~> 0.7"},
{:exla, "~> 0.6"},
{:nx, "~> 0.6", override: true},
{:req, "~> 0.4"},
{:kino_vega_lite, "~> 0.1"},
{:kino, "~> 0.12"},
{:scidata, "~> 0.1"},
{:kino_explorer, "~> 0.1"}
])
Setup
require Explorer.DataFrame, as: DF
require Explorer.Series, as: S
require Explorer.Query, as: Q
alias Scholar.Neighbors.KNearestNeighbors, as: KNN
alias Scholar.Metrics.Classification
alias Scholar.Metrics.Regression
alias VegaLite, as: VL
VegaLite
# Single seed reused by the dataset shuffle and the random key below.
seed = 24

# Make EXLA the default backend for tensors created from here on.
Nx.global_default_backend(EXLA.Backend)

# Last expression of the cell: a PRNG key derived from the seed.
key = Nx.Random.key(seed)
13:23:47.778 [info] TfrtCpuClient created.
#Nx.Tensor<
u32[2]
EXLA.Backend
[0, 24]
>
Data Exploration - Classification
# Shuffle the iris rows once (seeded), then keep the first 80% for training
# and the remaining rows for testing.
df = DF.shuffle(Explorer.Datasets.iris(), seed: seed)

total_rows = DF.n_rows(df)
train_portion = ceil(total_rows * 0.8)

# Data-frame partitions, used by the summary table and the charts.
df_train = DF.head(df, train_portion)
df_test = DF.tail(df, total_rows - train_portion)

# Feature tensor: every column except the label, stacked so each row is one sample.
features = DF.discard(df, "species")
x = Nx.stack(features, axis: 1)

# Label tensor: one-hot encode the species column, then take the index of the
# largest entry in each row to get an integer class index.
one_hot = DF.dummies(df[["species"]], ["species"])
y = one_hot |> Nx.stack(axis: 1) |> Nx.argmax(axis: 1)

# Split both tensors at the same row boundary used for df_train / df_test.
{train_x, test_x} = Nx.split(x, train_portion)
{train_y, test_y} = Nx.split(y, train_portion)
{#Nx.Tensor<
s64[120]
EXLA.Backend
[0, 1, 1, 2, 1, 1, 2, 2, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1, 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, ...]
>,
#Nx.Tensor<
s64[30]
EXLA.Backend
[2, 1, 1, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 2, 0, 0, 1, 2, 1, 0, 0, 2, 0]
>}
# Mean of each measurement per species, computed on the training partition only.
by_species = DF.group_by(df_train, "species")

DF.summarise(by_species,
  petal_length: mean(petal_length),
  petal_width: mean(petal_width),
  sepal_length: mean(sepal_length),
  sepal_width: mean(sepal_width)
)
# Scatter plot of petal width (x) against petal length (y) for the training rows.
# Species is encoded twice, as colour and as point shape.
#
# `anchor` is a property of the title object rather than of the top-level spec,
# so it is set inside the `title:` keyword list instead of next to height/width.
VL.new(title: [text: "Petal Size Comparisons", anchor: :middle], height: 600, width: 600)
|> VL.data_from_values(df_train)
|> VL.mark(:point)
|> VL.encode_field(:x, "petal_width", type: :quantitative, title: "Petal Width")
|> VL.encode_field(:y, "petal_length", type: :quantitative, title: "Petal Length")
|> VL.encode_field(:color, "species", type: :nominal)
|> VL.encode_field(:shape, "species", type: :nominal)
Classification
# Fit a k-nearest-neighbours classifier on the training tensors: 3 neighbours
# per query, 3 classes (the integer class indices built from the species column).
classifier_opts = [num_classes: 3, num_neighbors: 3]
model = KNN.fit(train_x, train_y, classifier_opts)
%Scholar.Neighbors.KNearestNeighbors{
data: #Nx.Tensor<
f64[120][4]
EXLA.Backend
[
[5.8, 4.0, 1.2, 0.2],
[5.8, 2.8, 5.1, 2.4],
[4.9, 2.5, 4.5, 1.7],
[6.2, 2.2, 4.5, 1.5],
[7.2, 3.2, 6.0, 1.8],
[6.5, 3.0, 5.2, 2.0],
[6.3, 2.5, 4.9, 1.5],
[5.5, 2.4, 3.8, 1.1],
[5.2, 4.1, 1.5, 0.1],
[6.3, 2.3, 4.4, 1.3],
[6.8, 2.8, 4.8, 1.4],
[5.5, 2.6, 4.4, 1.2],
[6.4, ...],
...
]
>,
labels: #Nx.Tensor<
s64[120]
EXLA.Backend
[0, 1, 1, 2, 1, 1, 2, 2, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1, 0, 2, 0, 2, 0, 1, 2, 0, 2, ...]
>,
default_num_neighbors: 3,
weights: :uniform,
num_classes: 3,
task: :classification,
metric: {:minkowski, 2}
}
# Predict a class index for every held-out row in test_x with the fitted model.
predictions = KNN.predict(model, test_x)
#Nx.Tensor<
s64[30]
EXLA.Backend
[2, 1, 1, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 2, 1, 0, 0, 2, 0]
>
Evaluation - Classification
# Attach the predicted class index to each held-out row so that predictions and
# true species can be plotted together.
df_pred = DF.put(df_test, "prediction", predictions)

# Scatter plot of the test rows: colour shows the predicted class index, shape
# shows the true species, so a misclassified point has a colour that does not
# match the other points of its shape.
#
# `anchor` is a property of the title object rather than of the top-level spec,
# so it is set inside the `title:` keyword list instead of next to height/width.
VL.new(title: [text: "Prediction Verification", anchor: :middle], height: 600, width: 600)
|> VL.data_from_values(df_pred)
|> VL.mark(:point)
|> VL.encode_field(:x, "petal_width", type: :quantitative, title: "Petal Width")
|> VL.encode_field(:y, "petal_length", type: :quantitative, title: "Petal Length")
|> VL.encode_field(:color, "prediction", type: :nominal)
|> VL.encode_field(:shape, "species", type: :nominal)
# Accuracy of the predicted class indices against the true test labels.
Classification.accuracy(test_y, predictions)
#Nx.Tensor<
f32
EXLA.Backend
0.9666666388511658
>
Data Exploration - Regression
# File-upload input labelled "Choose CSV"; the data-loading code reads the
# uploaded file from this input.
file_input = Kino.Input.file("Choose CSV")
# Resolve the uploaded CSV to a path on disk.
csv_path =
  file_input
  |> Kino.Input.read()
  |> Map.get(:file_ref)
  |> Kino.Input.file_path()

# Load the CSV and normalise the column names to snake_case.
df =
  csv_path
  |> DF.from_csv!()
  |> DF.rename(
    SquareFeet: "square_feet",
    Bedrooms: "bedrooms",
    Bathrooms: "bathrooms",
    Neighborhood: "neighborhood",
    YearBuilt: "year_built",
    Price: "price"
  )

# Split into train and test datasets: the first 80% of rows, in file order
# (no shuffle is applied here), are used for training and the rest for testing.
total_rows = DF.n_rows(df)
train_portion = ceil(total_rows * 0.8)

# These frame partitions are taken before the dummy columns are added below,
# so they keep the original "neighborhood" column only.
df_train = DF.head(df, train_portion)
df_test = DF.tail(df, total_rows - train_portion)

# One-hot encode the neighbourhood and append the resulting columns to the frame.
neighborhoods = DF.dummies(df, "neighborhood")
df = DF.concat_columns([df, neighborhoods])

# Features: every column except the target and the raw neighbourhood column.
x = Nx.stack(DF.discard(df, ~w(price neighborhood)), axis: 1)

# Target: the price column, stacked into a single-column tensor.
y = df[["price"]] |> Nx.stack(axis: 1)

# Split both tensors at the same row boundary used for df_train / df_test.
{train_x, test_x} = Nx.split(x, train_portion)
{train_y, test_y} = Nx.split(y, train_portion)
# Evaluate the full frame on its own so the notebook can render it.
df

# Per-build-year summary of the training partition: mean bathrooms, bedrooms
# and square feet, and the median price.
by_year = DF.group_by(df_train, "year_built")

DF.summarise(by_year,
  bathrooms: mean(bathrooms),
  bedrooms: mean(bedrooms),
  square_feet: mean(square_feet),
  price: median(price)
)
Regression
# Fit a k-nearest-neighbours regressor (3 neighbours per query) on the housing
# training tensors.
# NOTE(review): `num_classes: 3` looks irrelevant when `task: :regression` and
# may be a leftover from the classification cell - confirm whether the Scholar
# version in use allows omitting it before removing it.
regression_opts = [num_classes: 3, num_neighbors: 3, task: :regression]
model = KNN.fit(train_x, train_y, regression_opts)
%Scholar.Neighbors.KNearestNeighbors{
data: #Nx.Tensor<
s64[40000][7]
EXLA.Backend
[
[2126, 4, 1, 1969, 1, 0, 0],
[2459, 3, 2, 1980, 1, 0, 0],
[1860, 2, 1, 1970, 0, 1, 0],
[2294, 2, 1, 1996, 0, 0, 1],
[2130, 5, 2, 2001, 0, 1, 0],
[2095, 2, 3, 2020, 0, 1, 0],
[2724, 2, 1, 1993, 0, 1, 0],
...
]
>,
labels: #Nx.Tensor<
f64[40000][1]
EXLA.Backend
[
[215355.28361820139],
[195014.22162584803],
[306891.0120763329],
[206786.78715332696],
[272436.239065061],
[198208.80390657106],
[343429.3191099182],
[184992.321268412],
[377998.58815204125],
[95961.92601406391],
[191113.76867886042],
[253358.64500167352],
[132172.3926169813],
[231157.02767588635],
[118393.82316264397],
[267377.3996858498],
[190773.14856300573],
[172989.80490101498],
[239222.66779695038],
[143050.20178240185],
[405523.82831733953],
[263954.15406277135],
[148310.62016790514],
[151733.92248999208],
[307961.10738239513],
[276162.86180465267],
[243985.2054715822],
[88030.54185271678],
[282908.98169371625],
[240976.55176671062],
[104747.33458904951],
[347207.38956128363],
[77493.9314389322],
[331851.0816694101],
[110408.67080143407],
[127932.75677961827],
[228683.22699618756],
[124711.70785494654],
[415850.77083678055],
[184819.96119001514],
[164855.98777549056],
[156928.01471681413],
[282457.8613741379],
[287591.6728221959],
[156313.5941951282],
[279764.3710402287],
[366494.48060208897],
[244539.28168903317],
...
]
>,
default_num_neighbors: 3,
weights: :uniform,
num_classes: 3,
task: :regression,
metric: {:minkowski, 2}
}
# Predict a price for every held-out row in test_x with the fitted regressor.
predictions = KNN.predict(model, test_x)
#Nx.Tensor<
f64[10000][1]
EXLA.Backend
[
[226373.9807993843],
[183915.94593704157],
[183180.09138104858],
[212875.8215894591],
[190173.12866427167],
[236196.48192151205],
[191265.73326971615],
[211868.1438984828],
[285944.72421994776],
[292179.7714167268],
[135850.84540173528],
[170355.0694662646],
[245781.42715731516],
[332762.16565358033],
[218778.89671249516],
[278597.5767179295],
[137111.53295960717],
[222939.45174795808],
[346537.8395541983],
[274461.096041744],
[308123.83451235044],
[154487.07154281452],
[230564.8925299188],
[209643.65583746892],
[171856.3829574409],
[222375.53700887578],
[184989.23650707747],
[257232.0068392614],
[266728.27458831324],
[135307.8737726426],
[286752.6754042516],
[278532.3477017124],
[146663.38599155258],
[143760.42672523172],
[217642.40187830463],
[239256.34190318538],
[214894.75755024832],
[143360.16418437145],
[290998.1990680697],
[166314.0409957024],
[348214.14531584276],
[329513.4915151199],
[298984.1640842976],
[301549.31636405794],
[193079.5165821925],
[173499.1817696606],
[231942.86135317638],
[94320.87884601638],
[250972.33804283806],
[169602.52992067536],
...
]
>
Evaluation - Regression
# Mean squared error between the held-out prices and the predicted prices.
Regression.mean_square_error(test_y, predictions)
#Nx.Tensor<
f64
EXLA.Backend
3353824360.183254
>
# Mean absolute percentage error between the held-out prices and the predictions.
Regression.mean_absolute_percentage_error(test_y, predictions)
#Nx.Tensor<
f64
EXLA.Backend
0.33527562741283073
>