K-Means

k_means/lib/k_means.livemd

Caio de Almeida Araujo

@caioalmeida12

elixir

Share to X

Share to Bluesky

More notebooks

K-Means

# Notebook dependencies: `csv` is used by KMeans.read_csv/2 (CSV.decode),
# `kino` provides Kino.FS.file_path/1 for the attached dataset, and
# `kino_vega_lite` is installed for the VegaLite chart at the end of the notebook.
Mix.install([
  {:csv, "~> 3.2"},
  {:kino, "~> 0.14.1"},
  {:kino_vega_lite, "~> 0.1.11"}
])

Módulo K-Means

defmodule KMeans do
  @moduledoc """
  A small K-Means clustering implementation over lists of numeric points.

  A *point* is a list of numbers (one per dimension). Clustering results are
  returned as a list of maps with the keys `:dimensions`, `:centroid` (the
  zero-based index of the nearest centroid) and `:distance_to_centroid`.
  """

  @type point :: [number()]
  @type labeled_point :: %{
          dimensions: point(),
          centroid: non_neg_integer(),
          distance_to_centroid: float()
        }

  @doc """
  Reads `columns_to_read` from the CSV at `file_path` and min-max normalizes
  the result (see `normalize/1`).

  Returns a list of points (lists of floats), one per CSV row.
  """
  @spec read_csv(Path.t(), [String.t()], :normalize) :: [point()]
  def read_csv(file_path, columns_to_read, :normalize) when is_list(columns_to_read),
    do: read_csv(file_path, columns_to_read) |> normalize()

  @doc """
  Reads `columns_to_read` from the CSV at `file_path`.

  The file must have a header row. `file_path` is expanded relative to this
  file's directory. Returns one map per row, keyed by column name, with every
  value parsed as a float.

  Raises `KeyError` if a requested column is missing from a row and
  `ArgumentError` if a cell is not numeric. A row that the CSV decoder reports
  as an error does not match `{:ok, row}` and raises.
  """
  @spec read_csv(Path.t(), [String.t()]) :: [%{String.t() => float()}]
  def read_csv(file_path, columns_to_read) when is_list(columns_to_read) do
    file_path
    |> Path.expand(__DIR__)
    |> File.stream!()
    |> CSV.decode(headers: true)
    |> Enum.map(fn {:ok, row} ->
      Map.new(columns_to_read, &{&1, parse_number(Map.fetch!(row, &1))})
    end)
  end

  @doc """
  Min-max normalizes every dimension of `list_of_maps` to the `0.0..1.0` range.

  The keys of the first map define the dimensions. Each map is turned into a
  list of scaled values, ordered by the map's own enumeration order (not by
  the order in which the columns were requested).

  A dimension whose minimum equals its maximum carries no information and is
  mapped to `0.0` instead of dividing by zero. An empty list returns `[]`.
  """
  @spec normalize([map()]) :: [point()]
  def normalize([]), do: []

  def normalize([first | _rest] = list_of_maps) do
    bounds =
      first
      |> Map.keys()
      |> Map.new(fn key ->
        {key, list_of_maps |> Enum.map(&Map.fetch!(&1, key)) |> Enum.min_max()}
      end)

    Enum.map(list_of_maps, fn dimensions ->
      Enum.map(dimensions, fn {key, value} ->
        {min, max} = Map.fetch!(bounds, key)
        scale(value, min, max)
      end)
    end)
  end

  @doc """
  Returns the Euclidean distance between two points.

  The points are zipped together, so if they differ in length the extra
  dimensions of the longer one are ignored.
  """
  @spec distance(point(), point()) :: float()
  def distance(reference, destination) when is_list(reference) and is_list(destination) do
    reference
    |> Enum.zip(destination)
    |> Enum.reduce(0, fn {dim_ref, dim_dest}, sum_of_squares ->
      sum_of_squares + :math.pow(dim_ref - dim_dest, 2)
    end)
    |> :math.sqrt()
  end

  @doc """
  Runs K-Means on `list_of_points` until the centroids stop changing.

  The second argument is either:

    * a positive integer `k` - `k` random points are taken as the initial
      centroids, or
    * a list of initial centroids (points).

  Returns the labeled points of the final iteration. Because labels are
  accumulated by prepending, the result is in reverse input order.

  A centroid that attracts no points keeps its previous position, so the
  number of centroids and their indices stay stable across iterations.
  """
  @spec converge([point()], pos_integer() | [point()]) :: [labeled_point()]
  def converge(list_of_points, k)
      when is_list(list_of_points) and is_integer(k) and k > 0 do
    initial_centroids = Enum.take_random(list_of_points, k)

    converge(list_of_points, initial_centroids)
  end

  def converge(list_of_points, list_of_centroids)
      when is_list(list_of_centroids) and is_list(list_of_points) do
    labeled_points = label_points(list_of_points, list_of_centroids)

    members_by_centroid =
      Enum.group_by(labeled_points, & &1.centroid, & &1.dimensions)

    # Walk the *existing* centroids by index rather than the grouped map, so an
    # empty cluster is preserved instead of silently disappearing.
    new_centroids =
      list_of_centroids
      |> Enum.with_index()
      |> Enum.map(fn {centroid, index} ->
        case Map.get(members_by_centroid, index) do
          nil -> centroid
          members -> mean_point(members)
        end
      end)

    if new_centroids != list_of_centroids,
      do: converge(list_of_points, new_centroids),
      else: labeled_points
  end

  @doc """
  Returns the within-cluster sum of squares of `labeled_points`.

  This is the sum, over all points, of the squared distance between the point
  and its assigned centroid. Returns `0` for an empty list.
  """
  @spec wcss([labeled_point()]) :: number()
  def wcss(labeled_points) when is_list(labeled_points) do
    labeled_points
    |> Enum.map(&(&1.distance_to_centroid * &1.distance_to_centroid))
    |> Enum.sum()
  end

  # Parses a CSV cell as a float, accepting integer-formatted text such as "41".
  defp parse_number(text) when is_binary(text) do
    case Float.parse(String.trim(text)) do
      {number, ""} -> number
      _other -> raise ArgumentError, "expected a numeric value, got: #{inspect(text)}"
    end
  end

  # Scales `value` into 0.0..1.0 given the dimension bounds; constant dimensions map to 0.0.
  defp scale(_value, min, max) when min == max, do: 0.0
  defp scale(value, min, max), do: (value - min) / (max - min)

  # Tags every point with the index of, and distance to, its nearest centroid.
  # Ties go to the lowest centroid index (Enum.min_by/2 keeps the first minimum).
  defp label_points(points, centroids) do
    Enum.reduce(points, [], fn point, acc ->
      {min_distance, nearest_centroid} =
        centroids
        |> Enum.map(&distance(&1, point))
        |> Enum.with_index()
        |> Enum.min_by(fn {dist, _index} -> dist end)

      labeled = %{
        dimensions: point,
        centroid: nearest_centroid,
        distance_to_centroid: min_distance
      }

      [labeled | acc]
    end)
  end

  # Computes the per-dimension arithmetic mean of a non-empty list of points.
  defp mean_point(members) do
    count = length(members)

    Enum.zip_with(members, fn values -> Enum.sum(values) / count end)
  end
end

Runtime

# Resolve the notebook attachment to a path on disk.
file_path = Kino.FS.file_path("housing.csv")

# Columns of the dataset used as clustering dimensions.
feature_columns = [
  "latitude",
  "longitude",
  "median_house_value",
  "housing_median_age"
]

# Load the selected columns and min-max normalize them into a list of points.
points = KMeans.read_csv(file_path, feature_columns, :normalize)

# Candidate cluster counts to evaluate.
search_values = 1..20

# Cluster the points once per candidate k and record the resulting WCSS,
# keeping the values in the same order as `search_values`.
wcss_by_k =
  Enum.map(search_values, fn k ->
    points
    |> KMeans.converge(k)
    |> KMeans.wcss()
  end)

# Column-oriented data for the chart: k on the x axis, WCSS on the y axis.
plottable = %{x: Enum.to_list(search_values), y: wcss_by_k}

alias VegaLite, as: Vl

# Line chart of WCSS against k, built from the `plottable` columns.
elbow_chart = Vl.new(width: 1000, height: 500, title: "WCSS por valor de K")

elbow_chart
|> Vl.data_from_values(plottable, only: ["x", "y"])
|> Vl.mark(:line)
|> Vl.encode_field(:x, "x", type: :quantitative)
|> Vl.encode_field(:y, "y", type: :quantitative)

Other notebooks:

@TomBers

livebookNotes

Attractors

attractors.livemd

decimal vega_lite kino

2022-8-18
Kevin Pan
@feng19

spider_man

ElixirJobs

elixirjobs.livemd

spider_man floki nimble_csv kino

2022-8-18
@TomBers

livebookNotes

Fun with Graphs

graphs.livemd

vega_lite kino math

2022-8-18
@TomBers

livebookNotes

Epicycloid - draw Curves with Straight Lines

Epicycloid.livemd

vega_lite kino math

2022-8-18
Stewart
@imakestews

cur

Math Module Testing

deprecated_math_module_testing.livemd

jason kino youtube hidden_cell

2025-6-29
Ryo Wakabayashi
@RyoWakabayashi

elixir-learning

Hybrid RAG

hybrid_rag.livemd

openai_ex boltx kino req

2025-1-21
Edgar Gomes
@lostbean

project51

Agent Prototyping

magus_example_2.livemd

magus kino ex_dot

2025-4-7

Back