
Building Makemore: Part 1

Mix.install([
  {:axon, "~> 0.5.1"},
  {:nx, "~> 0.5.2"},
  {:kino, "~> 0.9.0"},
  {:vega_lite, "~> 0.1.7"},
  {:kino_vega_lite, "~> 0.1.8"}
])
:ok

Introduction & Makemore Overview

This Livebook follows the video The spelled-out intro to language modeling: building makemore, created by Andrej Karpathy.

Makemore Overview

GitHub

Makemore takes one text file as input, where each line is assumed to be one training thing, and generates more things like it. Under the hood, it is an autoregressive character-level language model, with a wide choice of models from bigrams all the way to a Transformer (exactly as seen in GPT). For example, we can feed it a database of names, and makemore will generate cool baby name ideas that all sound name-like, but are not already existing names. Or if we feed it a database of company names, we can generate new ideas for company names. Or we can just feed it valid Scrabble words and generate English-like babble.

Loading Names Dataset

dataset_path = "/Users/charlie/ML/datasets/names.txt"
"/Users/charlie/ML/datasets/names.txt"
names =
  dataset_path
  |> File.read!()
  |> String.split("\n", trim: true)
["emma", "olivia", "ava", "isabella", "sophia", "charlotte", "mia", "amelia", "harper", "evelyn",
 "abigail", "emily", "elizabeth", "mila", "ella", "avery", "sofia", "camila", "aria", "scarlett",
 "victoria", "madison", "luna", "grace", "chloe", "penelope", "layla", "riley", "zoey", "nora",
 "lily", "eleanor", "hannah", "lillian", "addison", "aubrey", "ellie", "stella", "natalie", "zoe",
 "leah", "hazel", "violet", "aurora", "savannah", "audrey", "brooklyn", "bella", "claire", "skylar",
 ...]
number_of_names = length(names)
32033
minimum_char_count = names |> Enum.min_by(&String.length/1) |> String.length()
2
maximum_char_count = names |> Enum.max_by(&String.length/1) |> String.length()
15
chars =
  97..122
  # codepoints 97..122 are ?a..?z; <<uc>> builds the single-character string
  |> Enum.map(fn uc -> <<uc>> end)
  |> Enum.with_index()
[
  {"a", 0},
  {"b", 1},
  {"c", 2},
  {"d", 3},
  {"e", 4},
  {"f", 5},
  {"g", 6},
  {"h", 7},
  {"i", 8},
  {"j", 9},
  {"k", 10},
  {"l", 11},
  {"m", 12},
  {"n", 13},
  {"o", 14},
  {"p", 15},
  {"q", 16},
  {"r", 17},
  {"s", 18},
  {"t", 19},
  {"u", 20},
  {"v", 21},
  {"w", 22},
  {"x", 23},
  {"y", 24},
  {"z", 25}
]
itos =
  chars
  |> Enum.reduce(%{}, fn {c, i}, acc -> Map.put(acc, i + 1, c) end)
  |> Map.put(0, ".")
%{
  0 => ".",
  1 => "a",
  2 => "b",
  3 => "c",
  4 => "d",
  5 => "e",
  6 => "f",
  7 => "g",
  8 => "h",
  9 => "i",
  10 => "j",
  11 => "k",
  12 => "l",
  13 => "m",
  14 => "n",
  15 => "o",
  16 => "p",
  17 => "q",
  18 => "r",
  19 => "s",
  20 => "t",
  21 => "u",
  22 => "v",
  23 => "w",
  24 => "x",
  25 => "y",
  26 => "z"
}
stoi =
  chars
  |> Enum.reduce(%{}, fn {c, i}, acc -> Map.put(acc, c, i + 1) end)
  |> Map.put(".", 0)
%{
  "." => 0,
  "a" => 1,
  "b" => 2,
  "c" => 3,
  "d" => 4,
  "e" => 5,
  "f" => 6,
  "g" => 7,
  "h" => 8,
  "i" => 9,
  "j" => 10,
  "k" => 11,
  "l" => 12,
  "m" => 13,
  "n" => 14,
  "o" => 15,
  "p" => 16,
  "q" => 17,
  "r" => 18,
  "s" => 19,
  "t" => 20,
  "u" => 21,
  "v" => 22,
  "w" => 23,
  "x" => 24,
  "y" => 25,
  "z" => 26
}
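
As a quick round-trip check (an illustrative addition, not from the original notebook), a name can be encoded to indices with stoi and decoded back with itos:

"emma" |> String.graphemes() |> Enum.map(&Map.get(stoi, &1))
# => [5, 13, 13, 1]
[5, 13, 13, 1] |> Enum.map(&Map.get(itos, &1)) |> Enum.join()
# => "emma"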

Bigram Language Model

The bigram model works with only two characters at a time: we look at a single character and try to predict the next one. It is a simple and weak language model, but a good place to start.
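
To make the "." start/end framing concrete before the full pass over the dataset below, here is how a single name decomposes into bigram pairs (a quick worked example, not part of the original notebook):

# "emma" framed with the "." token yields five {previous, next} pairs
"emma"
|> then(&("." <> &1 <> "."))
|> String.graphemes()
|> Enum.chunk_every(2, 1, :discard)
|> Enum.map(&List.to_tuple/1)
# => [{".", "e"}, {"e", "m"}, {"m", "m"}, {"m", "a"}, {"a", "."}]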

bigrams =
  names
  |> Enum.flat_map(fn name ->
    chunks = name |> String.graphemes() |> Enum.chunk_every(2, 1, ["."])
    [[first_ch | _other_ch] | _rest] = chunks
    [[".", first_ch] | chunks]
  end)
  |> Enum.map(&List.to_tuple/1)
[
  {".", "e"},
  {"e", "m"},
  {"m", "m"},
  {"m", "a"},
  {"a", "."},
  {".", "o"},
  {"o", "l"},
  {"l", "i"},
  {"i", "v"},
  {"v", "i"},
  {"i", "a"},
  {"a", "."},
  {".", "a"},
  {"a", "v"},
  {"v", "a"},
  {"a", "."},
  {".", "i"},
  {"i", "s"},
  {"s", "a"},
  {"a", "b"},
  {"b", "e"},
  {"e", "l"},
  {"l", "l"},
  {"l", "a"},
  {"a", "."},
  {".", "s"},
  {"s", "o"},
  {"o", "p"},
  {"p", "h"},
  {"h", "i"},
  {"i", "a"},
  {"a", "."},
  {".", "c"},
  {"c", "h"},
  {"h", "a"},
  {"a", "r"},
  {"r", "l"},
  {"l", "o"},
  {"o", "t"},
  {"t", "t"},
  {"t", "e"},
  {"e", "."},
  {".", "m"},
  {"m", "i"},
  {"i", "a"},
  {"a", "."},
  {".", "a"},
  {"a", "m"},
  {"m", ...},
  {...},
  ...
]
bigram_map =
  bigrams
  |> Enum.reduce(%{}, fn bigram, acc ->
    Map.update(acc, bigram, 1, fn count -> count + 1 end)
  end)
%{
  {".", "i"} => 591,
  {"i", "i"} => 82,
  {"o", "s"} => 504,
  {"e", "v"} => 463,
  {"e", "e"} => 1271,
  {"t", "g"} => 2,
  {"h", "d"} => 24,
  {"n", "g"} => 273,
  {"l", "a"} => 2623,
  {"k", "l"} => 139,
  {"y", "z"} => 78,
  {"d", "h"} => 118,
  {"e", "u"} => 69,
  {"t", "u"} => 78,
  {"n", "x"} => 6,
  {"v", "h"} => 1,
  {"g", "g"} => 25,
  {"c", "."} => 97,
  {"b", "i"} => 217,
  {"s", "w"} => 24,
  {"m", "t"} => 4,
  {".", "p"} => 515,
  {"b", "c"} => 1,
  {"y", "c"} => 115,
  {"q", "e"} => 1,
  {"v", "e"} => 568,
  {"o", "e"} => 132,
  {"d", "n"} => 31,
  {"i", "a"} => 2445,
  {"m", "s"} => 35,
  {"e", "w"} => 50,
  {"x", "h"} => 1,
  {"o", "g"} => 44,
  {"l", "y"} => 1588,
  {"f", "h"} => 1,
  {"y", "y"} => 23,
  {"u", "t"} => 82,
  {"g", "y"} => 31,
  {"x", "a"} => 103,
  {"s", "c"} => 60,
  {"j", "l"} => 9,
  {".", "r"} => 1639,
  {"x", "c"} => 4,
  {"c", "q"} => 11,
  {"v", "k"} => 3,
  {"a", "c"} => 470,
  {"z", "b"} => 4,
  {"d", "f"} => 5,
  {"r", ...} => 99,
  {...} => 2,
  ...
}
index_counts =
  bigrams
  |> Enum.map(fn {ch1, ch2} ->
    {Map.get(stoi, ch1), Map.get(stoi, ch2)}
  end)
  |> Enum.group_by(fn t -> t end)
  |> Enum.reduce(%{}, fn {location, occurrences}, acc ->
    Map.put(acc, location, length(occurrences))
  end)
%{
  {1, 26} => 435,
  {4, 5} => 1283,
  {20, 3} => 17,
  {19, 22} => 14,
  {16, 10} => 1,
  {11, 8} => 307,
  {26, 21} => 73,
  {10, 25} => 10,
  {11, 19} => 95,
  {6, 18} => 114,
  {15, 17} => 3,
  {10, 13} => 5,
  {5, 9} => 818,
  {12, 16} => 15,
  {14, 16} => 5,
  {6, 23} => 4,
  {10, 19} => 7,
  {17, 0} => 28,
  {14, 15} => 496,
  {3, 15} => 380,
  {13, 25} => 287,
  {14, 17} => 2,
  {13, 6} => 1,
  {4, 19} => 29,
  {12, 8} => 19,
  {5, 17} => 14,
  {3, 16} => 1,
  {15, 3} => 114,
  {23, 8} => 23,
  {5, 22} => 463,
  {1, 2} => 541,
  {16, 15} => 59,
  {15, 16} => 95,
  {8, 5} => 674,
  {15, 2} => 140,
  {1, 24} => 182,
  {26, 4} => 2,
  {23, 23} => 2,
  {25, 17} => 6,
  {25, 0} => 2007,
  {18, 10} => 25,
  {4, 15} => 378,
  {14, 24} => 6,
  {4, 18} => 424,
  {21, 24} => 34,
  {4, 26} => 1,
  {24, 15} => 41,
  {14, 8} => 26,
  {6, ...} => 10,
  {...} => 181,
  ...
}
t =
  0..26
  |> Enum.map(fn i ->
    0..26
    |> Enum.map(fn j ->
      Map.get(index_counts, {i, j}, 0)
    end)
  end)
  |> Nx.tensor(type: :f32)
#Nx.Tensor<
  f32[27][27]
  [
    [0.0, 4410.0, 1306.0, 1542.0, 1690.0, 1531.0, 417.0, 669.0, 874.0, 591.0, 2422.0, 2963.0, 1572.0, 2538.0, 1146.0, 394.0, 515.0, 92.0, 1639.0, 2055.0, 1308.0, 78.0, 376.0, 307.0, 134.0, 535.0, 929.0],
    [6640.0, 556.0, 541.0, 470.0, 1042.0, 692.0, 134.0, 168.0, 2332.0, 1650.0, 175.0, 568.0, 2528.0, 1634.0, 5438.0, 63.0, 82.0, 60.0, 3264.0, 1118.0, 687.0, 381.0, 834.0, ...],
    ...
  ]
>

We want to sample from the tensor t. To do this, we need to convert the raw counts into probability vectors. A vector is a proper probability distribution if its elements sum to 1.

This can be confirmed with:

t_sum = t[0] |> Nx.sum()
# dividing a row by its sum should yield elements that sum to 1.0
t0_sum = t[0] |> Nx.divide(t_sum) |> Nx.sum()
t[0] |> Nx.divide(t_sum)
#Nx.Tensor<
  f32[27]
  [0.0, 0.13767053186893463, 0.040770456194877625, 0.048137858510017395, 0.05275809392333031, 0.047794461250305176, 0.013017825782299042, 0.020884713158011436, 0.02728436328470707, 0.018449723720550537, 0.07560952752828598, 0.09249836206436157, 0.04907439276576042, 0.07923079282045364, 0.03577560558915138, 0.012299816124141216, 0.01607717014849186, 0.0028720381669700146, 0.05116598680615425, 0.06415259093046188, 0.040832892060279846, 0.002434988971799612, 0.01173789519816637, 0.009583866223692894, 0.0041831862181425095, 0.016701526939868927, 0.02900134213268757]
>
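
Rather than normalizing one row at a time, all 27 rows can be normalized at once. A minimal sketch of the vectorized version (an addition for illustration, relying on Nx broadcasting over the trailing axis; the resulting p binding is reused in the sampling sketch further below):

# row_sums has shape {27, 1}, so the division broadcasts across each row
row_sums = Nx.sum(t, axes: [1], keep_axes: true)
p = Nx.divide(t, row_sums)
# sanity check: every row of p should sum to 1.0
Nx.sum(p, axes: [1])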

Sampling from the distributions

key = Nx.Random.key(2_147_483_647)
{values, _} = Nx.Random.normal(key, 0, 1, shape: {1, 3}, type: :f32)
v_sum = values |> Nx.sum()
values |> Nx.divide(v_sum)
#Nx.Tensor<
  f32[1][3]
  [
    [1.0721101760864258, -0.2846589684486389, 0.21254877746105194]
  ]
>
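
Note that normal samples can be negative (as seen above), so dividing by the sum does not produce a true probability vector; the video samples with torch.multinomial over uniform probabilities. Below is a minimal sketch of equivalent sampling in Nx (an addition, assuming the p and itos bindings from above): take the cumulative sum of a row, draw a uniform sample, and pick the first index whose cumulative probability reaches it.

defmodule BigramSampler do
  # Walk the bigram matrix starting from the "." row (index 0), sampling each
  # next character via the inverse-CDF trick, until "." is sampled again.
  def sample_name(p, itos, key, ix \\ 0, acc \\ []) do
    cdf = Nx.cumulative_sum(p[ix])
    {u, key} = Nx.Random.uniform(key)
    next = Nx.greater_equal(cdf, u) |> Nx.argmax() |> Nx.to_number()

    if next == 0 do
      acc |> Enum.reverse() |> Enum.join()
    else
      sample_name(p, itos, key, next, [Map.get(itos, next) | acc])
    end
  end
end

key = Nx.Random.key(2_147_483_647)
BigramSampler.sample_name(p, itos, key)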

Visualize Tensor

# TODO: This requires more advanced knowledge of VegaLite and Explorer
# See https://youtu.be/PaCmpygFfXo?t=1219 for example of visualization trying to create
# display =
#   VegaLite.new(width: 400, height: 400)
#   |> VegaLite.data_from_values(i: 0..27, j: 0..27)
#   |> VegaLite.mark(:line)
#   |> VegaLite.encode_field(:x, "i", type: :quantitative)
#   |> VegaLite.encode_field(:y, "j", type: :quantitative)
#   |> Kino.VegaLite.new()
#   |> Kino.render()

# Enum.each(index_counts, fn {loc, count} ->
#   {i, j} = loc
#   Kino.VegaLite.push(display, %{"i" => i, "j" => j})
# end)
nil
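
One possible way to build the bigram-count heatmap from the video (a hedged sketch using the VegaLite bindings installed above, not the notebook's original code): shade each (ch1, ch2) cell by how often ch1 is followed by ch2.

# flatten the index counts into rows VegaLite can consume
heatmap_data =
  for {{i, j}, count} <- index_counts do
    %{"ch1" => Map.get(itos, i), "ch2" => Map.get(itos, j), "count" => count}
  end

VegaLite.new(width: 500, height: 500, title: "Bigram counts")
|> VegaLite.data_from_values(heatmap_data)
|> VegaLite.mark(:rect)
|> VegaLite.encode_field(:x, "ch2", type: :nominal)
|> VegaLite.encode_field(:y, "ch1", type: :nominal)
|> VegaLite.encode_field(:color, "count", type: :quantitative)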