Sponsored by AppSignal
Would you like to see your link here? Contact us
Notesclub

kNN

knn.livemd

kNN

Mix.install([
  {:scholar, "~> 0.2"},
  {:explorer, "~> 0.7"},
  {:exla, "~> 0.6"},
  {:nx, "~> 0.6", override: true},
  {:req, "~> 0.4"},
  {:kino_vega_lite, "~> 0.1"},
  {:kino, "~> 0.12"},
  {:scidata, "~> 0.1"},
  {:kino_explorer, "~> 0.1"}
])

Setup

require Explorer.DataFrame, as: DF
require Explorer.Series, as: S
require Explorer.Query, as: Q
alias Scholar.Neighbors.KNearestNeighbors, as: KNN
alias Scholar.Metrics.Classification
alias Scholar.Metrics.Regression
alias VegaLite, as: VL
VegaLite
# Route Nx tensor operations through the EXLA backend for the rest of the notebook.
Nx.global_default_backend(EXLA.Backend)
# Fixed seed, reused below when shuffling the iris rows, so the split is reproducible.
seed = 24
# PRNG key derived from the seed (not referenced again in the visible cells).
key = Nx.Random.key(seed)

13:23:47.778 [info] TfrtCpuClient created.
#Nx.Tensor<
  u32[2]
  EXLA.Backend
  [0, 24]
>

Data Exploration - Classification

# Load iris and shuffle it once with the fixed seed; the first 80% of the
# shuffled rows become the training set, the remainder the test set.
df = DF.shuffle(Explorer.Datasets.iris(), seed: seed)

n_rows = DF.n_rows(df)
train_portion = ceil(n_rows * 0.8)
df_train = DF.head(df, train_portion)
df_test = DF.tail(df, n_rows - train_portion)

# Feature matrix: every column except the label, stacked column-wise.
features = DF.discard(df, "species")
x = Nx.stack(features, axis: 1)

# Labels: one-hot encode the species, then collapse each row to the index of
# its hot column to obtain an integer class id.
one_hot = DF.dummies(df[["species"]], ["species"])
y = one_hot |> Nx.stack(axis: 1) |> Nx.argmax(axis: 1)

# Split the tensors at the same row boundary used for df_train / df_test.
{train_x, test_x} = Nx.split(x, train_portion)
{train_y, test_y} = Nx.split(y, train_portion)
{#Nx.Tensor<
   s64[120]
   EXLA.Backend
   [0, 1, 1, 2, 1, 1, 2, 2, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1, 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, ...]
 >,
 #Nx.Tensor<
   s64[30]
   EXLA.Backend
   [2, 1, 1, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 2, 0, 0, 1, 2, 1, 0, 0, 2, 0]
 >}
# Mean of each measurement per species, computed on the training rows only.
df_train
|> DF.group_by("species")
|> DF.summarise_with(fn group ->
  [
    petal_length: S.mean(group["petal_length"]),
    petal_width: S.mean(group["petal_width"]),
    sepal_length: S.mean(group["sepal_length"]),
    sepal_width: S.mean(group["sepal_width"])
  ]
end)
# Scatter plot of petal width against petal length for the training rows.
# Species is encoded twice (colour and point shape) so the classes stay
# distinguishable without relying on colour alone.
#
# `anchor` is a property of the title, not of the top-level spec, so it is
# set inside the `title:` options.
VL.new(
  title: [text: "Petal Size Comparisons", anchor: :middle],
  height: 600,
  width: 600
)
# Only ship the columns the encodings below actually reference.
|> VL.data_from_values(df_train, only: ["petal_width", "petal_length", "species"])
|> VL.mark(:point)
|> VL.encode_field(:x, "petal_width", type: :quantitative, title: "Petal Width")
|> VL.encode_field(:y, "petal_length", type: :quantitative, title: "Petal Length")
|> VL.encode_field(:color, "species", type: :nominal)
|> VL.encode_field(:shape, "species", type: :nominal)

Classification

# Fit a k-nearest-neighbours classifier on the training split: 3 neighbours
# vote over the 3 integer class ids held in train_y.
model = KNN.fit(train_x, train_y, num_classes: 3, num_neighbors: 3)
%Scholar.Neighbors.KNearestNeighbors{
  data: #Nx.Tensor<
    f64[120][4]
    EXLA.Backend
    [
      [5.8, 4.0, 1.2, 0.2],
      [5.8, 2.8, 5.1, 2.4],
      [4.9, 2.5, 4.5, 1.7],
      [6.2, 2.2, 4.5, 1.5],
      [7.2, 3.2, 6.0, 1.8],
      [6.5, 3.0, 5.2, 2.0],
      [6.3, 2.5, 4.9, 1.5],
      [5.5, 2.4, 3.8, 1.1],
      [5.2, 4.1, 1.5, 0.1],
      [6.3, 2.3, 4.4, 1.3],
      [6.8, 2.8, 4.8, 1.4],
      [5.5, 2.6, 4.4, 1.2],
      [6.4, ...],
      ...
    ]
  >,
  labels: #Nx.Tensor<
    s64[120]
    EXLA.Backend
    [0, 1, 1, 2, 1, 1, 2, 2, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1, 0, 2, 0, 2, 0, 1, 2, 0, 2, ...]
  >,
  default_num_neighbors: 3,
  weights: :uniform,
  num_classes: 3,
  task: :classification,
  metric: {:minkowski, 2}
}
# Predict an integer class id for every row of the held-out test features.
predictions = KNN.predict(model, test_x)
#Nx.Tensor<
  s64[30]
  EXLA.Backend
  [2, 1, 1, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 2, 1, 0, 0, 2, 0]
>

Evaluation - Classification

# Attach the predicted class ids to the test rows so both the prediction and
# the true species are available to the chart.
df_pred = DF.put(df_test, "prediction", predictions)

# Same petal scatter plot as for the training data, but colour shows the
# predicted class id while shape shows the true species; a point whose colour
# disagrees with the other points of its shape is a misclassification.
#
# `anchor` is a property of the title, not of the top-level spec, so it is
# set inside the `title:` options.
VL.new(
  title: [text: "Prediction Verification", anchor: :middle],
  height: 600,
  width: 600
)
# Only ship the columns the encodings below actually reference.
|> VL.data_from_values(df_pred,
  only: ["petal_width", "petal_length", "prediction", "species"]
)
|> VL.mark(:point)
|> VL.encode_field(:x, "petal_width", type: :quantitative, title: "Petal Width")
|> VL.encode_field(:y, "petal_length", type: :quantitative, title: "Petal Length")
|> VL.encode_field(:color, "prediction", type: :nominal)
|> VL.encode_field(:shape, "species", type: :nominal)
# Score the classifier: compare the held-out labels with the predicted class ids.
Classification.accuracy(test_y, predictions)
#Nx.Tensor<
  f32
  EXLA.Backend
  0.9666666388511658
>

Data Exploration - Regression

# File picker for the housing CSV; its value is read when the data is loaded below.
file_input = Kino.Input.file("Choose CSV")
# Resolve the uploaded CSV to a path on disk. Reading the input yields no file
# map until a file has been chosen, so fail with a readable message instead of
# a BadMapError from calling Map.get/2 on nil.
csv_path =
  case Kino.Input.read(file_input) do
    %{file_ref: file_ref} ->
      Kino.Input.file_path(file_ref)

    _ ->
      raise ArgumentError,
            ~s(no CSV selected; upload one with the "Choose CSV" input and re-run this cell)
  end

df = DF.from_csv!(csv_path)

# Normalise the CSV headers to snake_case column names.
df =
  DF.rename(df,
    SquareFeet: "square_feet",
    Bedrooms: "bedrooms",
    Bathrooms: "bathrooms",
    Neighborhood: "neighborhood",
    YearBuilt: "year_built",
    Price: "price"
  )

# Split into train and test datasets: the first 80% of rows train the model.
# NOTE(review): unlike the iris data, the rows are not shuffled first, so this
# assumes the CSV is not ordered in any meaningful way — confirm.
train_portion = ceil(DF.n_rows(df) * 0.8)
df_train = DF.head(df, train_portion)
df_test = DF.tail(df, DF.n_rows(df) - train_portion)

# One-hot encode the neighbourhood and append the indicator columns.
neighborhoods = DF.dummies(df, "neighborhood")
df = DF.concat_columns([df, neighborhoods])

# Feature matrix: everything except the target and the raw (string)
# neighbourhood column, which is represented by its indicator columns.
# NOTE(review): features are used unscaled; with a distance-based model the
# large-magnitude columns (e.g. square_feet) will presumably dominate —
# consider standardising using statistics from the training rows.
x =
  df
  |> DF.discard(~w(price neighborhood))
  |> Nx.stack(axis: 1)

# Target: price as a single-column tensor.
y = Nx.stack(df[["price"]], axis: 1)

# Split the tensors at the same row boundary used for df_train / df_test.
{train_x, test_x} = Nx.split(x, train_portion)
{train_y, test_y} = Nx.split(y, train_portion)

# Display the prepared data frame as the cell result.
df
# Per construction year (training rows only): average room counts and floor
# area, plus the median sale price.
df_train
|> DF.group_by("year_built")
|> DF.summarise_with(fn group ->
  [
    bathrooms: S.mean(group["bathrooms"]),
    bedrooms: S.mean(group["bedrooms"]),
    square_feet: S.mean(group["square_feet"]),
    price: S.median(group["price"])
  ]
end)

Regression

# Fit a 3-nearest-neighbours regressor on the housing training split.
# NOTE(review): num_classes is passed even though task is :regression —
# presumably the option is mandatory in this Scholar version; confirm before removing.
model = KNN.fit(train_x, train_y, num_classes: 3, num_neighbors: 3, task: :regression)
%Scholar.Neighbors.KNearestNeighbors{
  data: #Nx.Tensor<
    s64[40000][7]
    EXLA.Backend
    [
      [2126, 4, 1, 1969, 1, 0, 0],
      [2459, 3, 2, 1980, 1, 0, 0],
      [1860, 2, 1, 1970, 0, 1, 0],
      [2294, 2, 1, 1996, 0, 0, 1],
      [2130, 5, 2, 2001, 0, 1, 0],
      [2095, 2, 3, 2020, 0, 1, 0],
      [2724, 2, 1, 1993, 0, 1, 0],
      ...
    ]
  >,
  labels: #Nx.Tensor<
    f64[40000][1]
    EXLA.Backend
    [
      [215355.28361820139],
      [195014.22162584803],
      [306891.0120763329],
      [206786.78715332696],
      [272436.239065061],
      [198208.80390657106],
      [343429.3191099182],
      [184992.321268412],
      [377998.58815204125],
      [95961.92601406391],
      [191113.76867886042],
      [253358.64500167352],
      [132172.3926169813],
      [231157.02767588635],
      [118393.82316264397],
      [267377.3996858498],
      [190773.14856300573],
      [172989.80490101498],
      [239222.66779695038],
      [143050.20178240185],
      [405523.82831733953],
      [263954.15406277135],
      [148310.62016790514],
      [151733.92248999208],
      [307961.10738239513],
      [276162.86180465267],
      [243985.2054715822],
      [88030.54185271678],
      [282908.98169371625],
      [240976.55176671062],
      [104747.33458904951],
      [347207.38956128363],
      [77493.9314389322],
      [331851.0816694101],
      [110408.67080143407],
      [127932.75677961827],
      [228683.22699618756],
      [124711.70785494654],
      [415850.77083678055],
      [184819.96119001514],
      [164855.98777549056],
      [156928.01471681413],
      [282457.8613741379],
      [287591.6728221959],
      [156313.5941951282],
      [279764.3710402287],
      [366494.48060208897],
      [244539.28168903317],
      ...
    ]
  >,
  default_num_neighbors: 3,
  weights: :uniform,
  num_classes: 3,
  task: :regression,
  metric: {:minkowski, 2}
}
# Predict a price for every row of the held-out test features.
predictions = KNN.predict(model, test_x)
#Nx.Tensor<
  f64[10000][1]
  EXLA.Backend
  [
    [226373.9807993843],
    [183915.94593704157],
    [183180.09138104858],
    [212875.8215894591],
    [190173.12866427167],
    [236196.48192151205],
    [191265.73326971615],
    [211868.1438984828],
    [285944.72421994776],
    [292179.7714167268],
    [135850.84540173528],
    [170355.0694662646],
    [245781.42715731516],
    [332762.16565358033],
    [218778.89671249516],
    [278597.5767179295],
    [137111.53295960717],
    [222939.45174795808],
    [346537.8395541983],
    [274461.096041744],
    [308123.83451235044],
    [154487.07154281452],
    [230564.8925299188],
    [209643.65583746892],
    [171856.3829574409],
    [222375.53700887578],
    [184989.23650707747],
    [257232.0068392614],
    [266728.27458831324],
    [135307.8737726426],
    [286752.6754042516],
    [278532.3477017124],
    [146663.38599155258],
    [143760.42672523172],
    [217642.40187830463],
    [239256.34190318538],
    [214894.75755024832],
    [143360.16418437145],
    [290998.1990680697],
    [166314.0409957024],
    [348214.14531584276],
    [329513.4915151199],
    [298984.1640842976],
    [301549.31636405794],
    [193079.5165821925],
    [173499.1817696606],
    [231942.86135317638],
    [94320.87884601638],
    [250972.33804283806],
    [169602.52992067536],
    ...
  ]
>

Evaluation - Regression

# Mean squared error between the held-out prices and the predictions
# (expressed in squared price units, hence the large magnitude).
Regression.mean_square_error(test_y, predictions)
#Nx.Tensor<
  f64
  EXLA.Backend
  3353824360.183254
>
# Mean absolute percentage error between the held-out prices and the predictions.
# NOTE(review): the result appears to be a fraction rather than a percentage — confirm.
Regression.mean_absolute_percentage_error(test_y, predictions)
#Nx.Tensor<
  f64
  EXLA.Backend
  0.33527562741283073
>