Chapter 2: Working with Text Data
Mix.install([
{:nx, "~> 0.5"},
{:exla, "~> 0.5"},
{:axon, "~> 0.5"},
{:tiktoken, "~> 0.3.2"},
{:table_rex, "~> 3.1.1"},
])
2.1 Understanding word embeddings
The concept of converting data into a vector format is often referred to as embedding.
We use an embedding model to transform raw data into a dense vector representation that deep learning architectures can easily understand and process.
Different data formats require distinct embedding models.
An embedding is a mapping from discrete objects, such as words, images, or even entire documents, to points in a continuous vector space.
The primary purpose of embeddings is to convert non-numeric data into a format that neural networks can process.
Retrieval-augmented generation combines generation (producing text) with retrieval (searching an external knowledge base) to pull in relevant information when generating text; this technique is beyond the scope of this book.
A higher dimensionality might capture more nuanced relationships but at the cost of computational efficiency.
LLMs commonly produce their own embeddings that are part of the input layer and are updated during training.
The embedding size is a trade-off between performance and efficiency.
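To make the idea of a vector space concrete, here is a minimal toy sketch (hand-picked values, not embeddings learned in this chapter): three words mapped to 3-dimensional vectors, with cosine similarity showing that related words sit closer together once embeddings have been learned.
# Hypothetical toy embeddings: three words mapped to hand-picked 3-dimensional vectors.
toy_embeddings = %{
  "cat" => Nx.tensor([0.9, 0.1, 0.2]),
  "dog" => Nx.tensor([0.8, 0.2, 0.1]),
  "car" => Nx.tensor([0.1, 0.9, 0.7])
}
# Cosine similarity between two vectors; learned embeddings place related words closer together.
cosine = fn a, b ->
  Nx.divide(Nx.dot(a, b), Nx.multiply(Nx.LinAlg.norm(a), Nx.LinAlg.norm(b)))
end
cosine.(toy_embeddings["cat"], toy_embeddings["dog"])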
2.2 Tokenizing text
Tokens are either individual words or special characters, including punctuation characters.
path =
"/home/alde/Documents/MyDevelopment/Build_A_Large_Language_Model/the-verdict.txt"
{:ok, raw_text} = File.read(path)
String.length(raw_text) |> IO.inspect()
{line, rest} = String.split_at(raw_text, 99)
IO.puts(line)
text = "Hello, world. This, is a test."
{:ok, regex} = Regex.compile("(\s)")
Regex.split(regex, text, include_captures: true)
{:ok, regex} = Regex.compile("([,.]|\s)")
result = Regex.split(regex, text, include_captures: true)
Enum.filter(result, fn x -> not (x == " " or x == "") end)
Removing whitespace reduces memory and computing requirements. However, keeping whitespace can be useful if we train models that are sensitive to the exact structure of the text (for example, Python code, which is sensitive to indentation and spacing).
text = "Hello, world. Is this-- a test?"
{:ok, regex} = Regex.compile("([,.:;?_!\"()\']|--|\s)")
result = Regex.split(regex, text, include_captures: true)
Enum.filter(result, fn x -> not (x == " " or x == "") end)
result = Regex.split(regex, raw_text, include_captures: true)
preprocessed = Enum.filter(result, fn x -> not (x == " " or x == "") end)
Enum.count(preprocessed) |> IO.inspect()
{head, tail} = Enum.split(preprocessed, 30)
IO.inspect(head)
2.3 Converting tokens into token IDs
In this section, we will convert these tokens from strings to an integer representation to produce the so-called token IDs.
This conversion is an intermediate step before converting the token IDs into embedding vectors.
To map the previously generated tokens into token IDs, we have to build a so-called vocabulary first.
We build a vocabulary by tokenizing the entire text in a training dataset into individual tokens.
These individual tokens are then sorted alphabetically, and duplicate tokens are removed.
The unique tokens are then aggregated into a vocabulary that defines a mapping from each unique token to a unique integer value.
The depicted vocabulary is purposefully small for illustration purposes and contains no punctuation or special characters for simplicity.
vocabulary =
preprocessed
|> Enum.uniq()
|> Enum.sort()
|> Enum.with_index()
|> Enum.reduce(%{}, fn {value, index}, acc -> Map.put(acc, value, index) end)
When we want to convert the outputs of an LLM from numbers back into text, we also need a way to turn token IDs back into text.
defmodule SimpleTokenizerV1 do
def encode(vocabulary, text) do
{:ok, regex} = Regex.compile("([,.:;?_!\"()\']|--|\s)")
result = Regex.split(regex, text, include_captures: true)
preprocessed = Enum.filter(result, fn x -> not (x == " " or x == "") end)
Enum.map(preprocessed, fn token -> Map.get(vocabulary, token) end)
end
def decode(reverse_vocabulary, ids) do
tokens = Enum.map(ids, fn id -> Map.get(reverse_vocabulary, id) end)
text = Enum.reduce(tokens, "", fn token, acc -> acc <> token <> " " end)
{:ok, regex} = Regex.compile("\s+([,.?!\"()\'])")
String.replace(text, regex, "\\1")
end
end
reverse_vocabulary =
vocabulary
|> Enum.map(fn {key, value} -> {value, key} end)
|> Enum.into(%{})
text = """
It's the last he painted, you know," Mrs. Gisburn said with pardonable pride.\
"""
ids = SimpleTokenizerV1.encode(vocabulary, text)
SimpleTokenizerV1.decode(reverse_vocabulary, ids)
text = "Hello, do you like tea?"
ids = SimpleTokenizerV1.encode(vocabulary, text)
2.4 Adding special context tokens
To handle certain contexts, we add an <|unk|> token to represent new and unknown words that were not part of the training data and thus not part of the existing vocabulary. Furthermore, we add an <|endoftext|> token that we can use to separate two unrelated text sources.
build_vocabulary = fn text ->
{:ok, regex} = Regex.compile("([,.:;?_!\"()\']|--|\s)")
Regex.split(regex, text, include_captures: true)
|> Enum.map(fn token -> String.trim(token, "\n") end)
|> Enum.filter(fn x -> not (x == " " or x == "") end)
|> Enum.uniq()
|> Enum.sort()
|> Enum.concat(["<|endoftext|>", "<|unk|>"])
|> Enum.with_index()
|> Enum.reduce(%{}, fn {value, index}, acc -> Map.put(acc, value, index) end)
end
build_reverse_vocabulary = fn vocabulary ->
vocabulary
|> Enum.map(fn {key, value} -> {value, key} end)
|> Enum.into(%{})
end
vocabulary = build_vocabulary.(raw_text)
reverse_vocabulary = build_reverse_vocabulary.(vocabulary)
vocabulary["<|unk|>"]
Enum.count(vocabulary)
vocabulary["yourself"]
defmodule SimpleTokenizerV2 do
def encode(vocabulary, text) do
{:ok, regex} = Regex.compile("([,.:;?_!\"()\']|--|\s)")
result = Regex.split(regex, text, include_captures: true)
preprocessed = Enum.filter(result, fn x -> not (x == " " or x == "") end)
Enum.map(
preprocessed,
fn token ->
id = Map.get(vocabulary, token)
if is_nil(id), do: Map.get(vocabulary, "<|unk|>"), else: id
end
)
end
def decode(reverse_vocabulary, ids) do
tokens = Enum.map(ids, fn id -> Map.get(reverse_vocabulary, id) end)
text = Enum.reduce(tokens, "", fn token, acc -> acc <> token <> " " end)
{:ok, regex} = Regex.compile("\s+([,.?!\"()\'])")
String.replace(text, regex, "\\1")
end
end
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 <> " <|endoftext|> " <> text2
IO.puts(text)
ids = SimpleTokenizerV2.encode(vocabulary, text)
SimpleTokenizerV2.decode(reverse_vocabulary, ids)
Depending on the LLM, some researchers also consider additional special tokens such as the following:
- [BOS] (beginning of sequence): This token marks the start of a text. It signifies to the LLM where a piece of content begins.
- [EOS] (end of sequence): This token is positioned at the end of a text and is especially useful when concatenating multiple unrelated texts, similar to <|endoftext|>. For instance, when combining two different Wikipedia articles or books, the [EOS] token indicates where one article ends and the next one begins.
- [PAD] (padding): When training LLMs with batch sizes larger than one, the batch might contain texts of varying lengths. To ensure all texts have the same length, the shorter texts are extended or "padded" using the [PAD] token, up to the length of the longest text in the batch.
When training on batched inputs, we typically use a mask, meaning we don’t attend to padded tokens. Thus, the specific token chosen for padding becomes inconsequential.
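As a minimal sketch of padding and masking (using a hypothetical pad ID of 0, not a token from our vocabulary), shorter sequences are extended to the batch's maximum length and a mask marks which positions hold real tokens:
pad_id = 0
sequences = [[15, 27, 33], [42, 7]]
max_len = sequences |> Enum.map(&length/1) |> Enum.max()
# Extend each sequence with the pad ID up to the longest length in the batch.
padded =
  for sequence <- sequences do
    sequence ++ List.duplicate(pad_id, max_len - length(sequence))
  end
padded_batch = Nx.tensor(padded)
# 1 marks a real token, 0 marks padding; masked positions are not attended to.
pad_mask = Nx.not_equal(padded_batch, pad_id)
{padded_batch, pad_mask}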
2.5 Byte pair encoding
This section covers a more sophisticated tokenization scheme based on a concept called byte pair encoding (BPE).
In Elixir, there is also a tiktoken wrapper; to add it to the project, use:
{:tiktoken, "~> 0.3.2"}
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
# The gpt2 encoding is not supported by this wrapper, so we use the gpt-3.5-turbo encoding instead
{:ok, ids} = Tiktoken.encode("gpt-3.5-turbo", text, ["<|endoftext|>"])
Tiktoken.decode("gpt-3.5-turbo", ids)
Tiktoken.decode("gpt-4", ids)
The BPE tokenizer can handle any unknown word.
The algorithm underlying BPE breaks down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words.
The algorithm builds its vocabulary by iteratively merging frequent characters into subwords and frequent subwords into words. The merges are determined by a frequency cutoff.
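To see this subword behavior in action, here is a small sketch that decodes each token ID of an out-of-vocabulary word individually (the exact pieces depend on the tokenizer's learned merges):
{:ok, unknown_ids} = Tiktoken.encode("gpt-3.5-turbo", "someunknownPlace")
# Decode each ID on its own to reveal the subword pieces BPE falls back to.
for id <- unknown_ids do
  {:ok, piece} = Tiktoken.decode("gpt-3.5-turbo", [id])
  IO.puts("#{id} -> #{inspect(piece)}")
end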
2.6 Data sampling with a sliding window
Given a text sample, we extract input blocks as subsamples that serve as input to the LLM; the LLM's prediction task during training is to predict the next word that follows the input block. During training, we mask out all words that are past the target.
{:ok, ids} = Tiktoken.encode("gpt-3.5-turbo", raw_text)
Enum.count(ids)
{_unused, enc_sample} = Enum.split(ids, 50)
context_size = 4
enc_tensor = Nx.tensor(enc_sample)
x = enc_tensor[0..context_size-1]
y = enc_tensor[1..context_size]
{x,y}
for index <- 0..context_size-1 do
context = enc_tensor[0..index]
desired = enc_tensor[index+1]
{:ok, context_text} = context |> Nx.to_list() |> then(fn ids -> Tiktoken.decode("gpt-3.5-turbo", ids) end)
{:ok, desired_text} = [Nx.to_number(desired)] |> then(fn ids -> Tiktoken.decode("gpt-3.5-turbo", ids) end)
IO.puts("#{context_text} ---> #{desired_text}")
end
In particular, we are interested in returning two tensors: an input tensor containing the text that the LLM sees and a target tensor that includes the targets for the LLM to predict.
defmodule GPTDatasetV1 do
def build_dataset(txt, tokenizer_model, max_length, stride) do
{:ok, token_ids} = Tiktoken.encode(tokenizer_model, txt)
token_ids_tensor = Nx.tensor(token_ids)
text_length = length(token_ids)
linespace =
Enum.to_list(0..(length(token_ids) - max_length - 1)) |> Enum.take_every(stride)
for i <- linespace, reduce: %{input_ids: [], target_ids: []} do
%{input_ids: input_ids, target_ids: target_ids} = acc ->
{input_ids, target_ids} =
cond do
i + max_length > text_length - 1 ->
{input_ids, target_ids}
i + max_length + 1 > text_length - 1 ->
{input_ids, target_ids}
true ->
input_chunk = token_ids_tensor[i..(i + max_length - 1)]
target_chunk = token_ids_tensor[(i + 1)..(i + max_length)]
{input_ids ++ [input_chunk], target_ids ++ [target_chunk]}
end
%{acc | input_ids: input_ids, target_ids: target_ids}
end
end
end
dataset = GPTDatasetV1.build_dataset(raw_text, "gpt-3.5-turbo", 4, 4)
{:ok, input_text} = dataset[:input_ids] |> Enum.at(1) |> Nx.to_list() |> then(fn ids -> Tiktoken.decode("gpt-3.5-turbo", ids) end)
{:ok, target_text} = dataset[:target_ids] |> Enum.at(1) |> Nx.to_list() |> then(fn ids -> Tiktoken.decode("gpt-3.5-turbo", ids) end)
IO.puts("INPUT> #{input_text}\n")
IO.puts("TARGET> #{target_text}")
input_ids = Nx.stack(dataset[:input_ids])
target_ids = Nx.stack(dataset[:target_ids])
2.7 Creating token embeddings
The last step for preparing the input text for LLM training is to convert the token IDs into embedding vectors.
We initialize these embedding weights with random values as a preliminary step. This initialization serves as the starting point for the LLM’s learning process.
A continuous vector representation, or embedding, is necessary since GPT-like LLMs are deep neural networks trained with the backpropagation algorithm.
# PyTorch: embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# The Axon equivalent of PyTorch's torch.nn.Embedding is Axon.embedding/3
dummy_model =
Axon.input("txt", shape: {1})
|> Axon.embedding(6, 3)
template = Nx.template({1, 1}, :s64)
Axon.Display.as_table(dummy_model, template)
|> IO.puts
{init_fn, predict_fn} = Axon.build(dummy_model)
params = init_fn.(template, %{})
dbg(params)
input = Nx.tensor([3])
predict_fn.(params, input)
The output matches the fourth row (index 3) of the weight matrix: the embedding layer is essentially a look-up operation that retrieves rows from the embedding layer's weight matrix via a token ID.
input = Nx.tensor([2, 3, 5, 1])
predict_fn.(params, input)
Embedding layers perform a look-up operation, retrieving the embedding vector corresponding to the token ID from the embedding layer’s weight matrix.
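As a sketch of why this works (using a throwaway weight matrix, not the trained params above), the look-up is equivalent to multiplying a one-hot encoding of the token IDs by the weight matrix; Nx.take is simply the more efficient form:
token_ids = Nx.tensor([2, 3, 5, 1])
weights = Nx.iota({6, 3}, type: :f32)
# Direct row look-up.
via_take = Nx.take(weights, token_ids, axis: 0)
# One-hot encode the IDs ({4, 6}) and multiply by the weight matrix ({6, 3}).
one_hot =
  Nx.equal(Nx.new_axis(token_ids, -1), Nx.iota({1, 6}))
  |> Nx.as_type(:f32)
via_matmul = Nx.dot(one_hot, weights)
# Both approaches produce the same {4, 3} result.
Nx.equal(via_take, via_matmul) |> Nx.all()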
2.8 Encoding word positions
A minor shortcoming of LLMs is that their self-attention mechanism doesn't have a notion of position or order for the tokens within a sequence.
The same token ID always gets mapped to the same vector representation, regardless of where the token ID is positioned in the input sequence.
In principle, the deterministic, position-independent embedding of the token ID is good for reproducibility purposes. However, since the self-attention mechanism of LLMs itself is also position-agnostic, it is helpful to inject additional position information into the LLM.
Absolute positional embeddings are directly associated with specific positions in a sequence. For each position in the input sequence, a unique embedding is added to the token’s embedding to convey its exact location.
Instead of focusing on the absolute position of a token, the emphasis of relative positional embeddings is on the relative position or distance between tokens. This means the model learns the relationships in terms of “how far apart” rather than “at which exact position.” The advantage here is that the model can generalize better to sequences of varying lengths, even if it hasn’t seen such lengths during training.
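As an illustrative sketch only (GPT models use the absolute approach implemented below), relative schemes operate on pairwise offsets between positions, which can be computed like this:
# Pairwise relative offsets for a sequence of length 4: entry (i, j) holds j - i.
positions = Nx.iota({4})
Nx.subtract(Nx.new_axis(positions, 0), Nx.new_axis(positions, 1))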
# For a GPT model's absolute embedding approach, we just need to create another
# embedding layer that has the same dimension as the token_embedding_layer
vocabulary_size = 100257
output_dim = 256
dummy_model =
Axon.input("txt")
|> Axon.embedding(vocabulary_size, output_dim) # token_embedding_layer
|> Axon.embedding(context_size, output_dim) # pos_embedding_layer
input = Nx.tensor([[1, 2, 4, 5], [4, 3, 2, 9]])
kernels = Nx.tensor([
[0.46299999952316284, 0.5562999844551086, 0.18170000612735748],
[0.9801999926567078, 0.09780000150203705, 0.5333999991416931],
[0.6980000138282776, 0.9240999817848206, 0.23479999601840973],
[0.31929999589920044, 0.42250001430511475, 0.7865999937057495],
[0.5519000291824341, 0.5662999749183655, 0.20559999346733093],
[0.1898999959230423, 0.9311000108718872, 0.8356000185012817],
[0.6383000016212463, 0.8794000148773193, 0.5282999873161316],
[0.9523000121116638, 0.7597000002861023, 0.08250000327825546],
[0.6622999906539917, 0.02329999953508377, 0.8205999732017517],
[0.9855999946594238, 0.36419999599456787, 0.5372999906539917]])
Axon.Layers.embedding(Nx.tensor([1,2]), kernels)
defmodule PosEmbedding do
import Nx.Defn
def pos_embedding(%Axon{} = x, vocab_size, embedding_size, opts \\ []) do
opts = Keyword.validate!(opts, [:name, kernel_initializer: :uniform])
kernel_shape = &Axon.Shape.embedding_kernel(&1, vocab_size, embedding_size)
kernel = Axon.param("kernel", kernel_shape, initializer: opts[:kernel_initializer])
Axon.layer(&pos_embedding_impl/3, [x, kernel], name: opts[:name], op_name: :pos_embedding)
end
defnp pos_embedding_impl(x, kernel, _opts \\ []) do
{_batch_size, sequence_size} = Nx.shape(x)
input = Nx.iota({1, sequence_size})
Nx.take(kernel, input, axis: 0)
end
end
input = Axon.input("sequence", shape: {nil, 4})
model1 =
input
|> PosEmbedding.pos_embedding(1024, 768)
{init_fn1, predict_fn1} = Axon.build(model1, compiler: EXLA)
template = Nx.template({1, 4}, :f32)
params = init_fn1.(template, %{})
input_batch = [
[6109, 3629, 6100, 345],
[6109, 1110, 6622, 257]
]
batch = Nx.tensor(input_batch)
result = predict_fn1.(params, batch)
# All 4 * 768 = 3072 entries match: the first four rows of the positional kernel were returned
Nx.equal(params["pos_embedding_0"]["kernel"][0..3], result[0]) |> Nx.sum()
model2 =
input
|> Axon.embedding(50257, 768)
{init_fn2, predict_fn2} = Axon.build(model2, compiler: EXLA)
template = Nx.template({1, 4}, :s64)
params = init_fn2.(template, %{})
model = Axon.add(model1, model2)
{init_fn, predict_fn} = Axon.build(model, compiler: EXLA)
template = Nx.template({1, 4}, :s64)
params = init_fn.(template, %{})
In the input processing pipeline, the input text is first broken up into individual tokens. These tokens are then converted into token IDs using a vocabulary. The token IDs are converted into embedding vectors, to which positional embeddings of a similar size are added, resulting in input embeddings that are used as input for the main LLM layers.
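As a final sketch reusing the pieces built above (the combined model's predict_fn and params, and the batch of token IDs), the pipeline turns a {2, 4} batch of token IDs into {2, 4, 768} input embeddings:
# Token embeddings plus broadcast positional embeddings for each position in the sequence.
input_embeddings = predict_fn.(params, batch)
Nx.shape(input_embeddings)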