
Chapter 09

Mix.install([
  {:scidata, "~> 0.1"},
  {:axon, "~> 0.5"},
  {:exla, "~> 0.6"},
  {:nx, "~> 0.6"},
  {:table_rex, "~> 3.1.1"},
  {:kino, "~> 0.7"}
])

Nx.default_backend(EXLA.Backend)

Section

data = Scidata.IMDBReviews.download()

{train_data, test_data} =
  data.review
  |> Enum.zip(data.sentiment)
  |> Enum.shuffle()
  |> Enum.split(23_000)
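
The split holds out 2,000 shuffled examples for testing. A quick check (a sketch; assumes Scidata's IMDB training set contains 25,000 reviews):

{length(train_data), length(test_data)}
#=> {23000, 2000}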

review = "I didn't like the movie. It was so bad; nobody should ever watch it."

review
|> String.downcase()
|> String.replace(~r/[\p{P}\p{S}]/, "")
|> String.split()

frequencies = %{
  "diplomaswho" => 1,
  "egyptologists" => 1,
  "characther" => 1,
  "nearlyempty" => 1,
  "blobs" => 4,
  "doubletwist" => 1,
  "thingshe" => 1,
  "loleralacartelort7890" => 1,
  "placebo" => 1,
  "betterif" => 1,
  "smarttalk" => 1,
  "sorcererin" => 1,
  "celies" => 6,
  "tenancier" => 1,
  "ladies" => 284
}
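
The map above shows only a handful of entries from the full word-frequency map, which must cover the whole training corpus for the vocabulary selection below to be meaningful. A minimal sketch of building it (an assumption, not shown here; it reuses the same normalization as the tokenizer):

# Count normalized word occurrences across every training review
frequencies =
  Enum.reduce(train_data, %{}, fn {review, _label}, freqs ->
    review
    |> String.downcase()
    |> String.replace(~r/[\p{P}\p{S}]/, "")
    |> String.split()
    |> Enum.reduce(freqs, fn word, acc -> Map.update(acc, word, 1, &(&1 + 1)) end)
  end)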

num_tokens = 1024

tokens =
  frequencies
  |> Enum.sort_by(&elem(&1, 1), :desc)
  |> Enum.take(num_tokens)

tokens =
  frequencies
  |> Enum.sort_by(&elem(&1, 1), :desc)
  |> Enum.take(num_tokens)
  |> Enum.with_index(fn {token, _}, i -> {token, i} end)
  |> Map.new()

review = "The Departed is Martin Scorsese's best work, and anybody who disagrees is wrong. This movie is amazing."

tokenize = fn review ->
  review
  |> String.downcase()
  |> String.replace(~r/[\p{P}\p{S}]/, "")
  |> String.split()
  |> Enum.map(&Map.get(tokens, &1))
end

tokenize.(review)
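
Note that Map.get/2 returns nil for any word outside the top 1,024, so the token list above may contain nils. nil cannot go into a tensor, which is why the next cells reserve a dedicated unknown-token id before calling Nx.tensor/1.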
review = "The Departed is Martin Scorsese's best work, and anybody who disagrees is wrong. This movie is amazing."
unknown_token = 0

tokens =
  frequencies
  |> Enum.sort_by(&elem(&1, 1), :desc)
  |> Enum.take(num_tokens)
  |> Enum.with_index(fn {token, _}, i -> {token, i + 1} end)
  |> Map.new()

tokenize = fn review ->
  review
  |> String.downcase()
  |> String.replace(~r/[\p{P}\p{S}]/, "")
  |> String.split()
  |> Enum.map(&Map.get(tokens, &1, unknown_token))
  |> Nx.tensor()
end

tokenize.(review)

batch_size = 64

train_pipeline =
  train_data
  |> Stream.map(fn {review, label} ->
    {tokenize.(review), Nx.tensor(label)}
  end)
  |> Stream.chunk_every(batch_size, batch_size, :discard)
  |> Stream.map(fn reviews_and_labels ->
    {review, label} = Enum.unzip(reviews_and_labels)
    {Nx.stack(review), Nx.stack(label) |> Nx.new_axis(-1)}
  end)

test_pipeline =
  test_data
  |> Stream.map(fn {review, label} ->
    {tokenize.(review), Nx.tensor(label)}
  end)
  |> Stream.chunk_every(batch_size, batch_size, :discard)
  |> Stream.map(fn reviews_and_labels ->
    {review, label} = Enum.unzip(reviews_and_labels)
    {Nx.stack(review), Nx.stack(label) |> Nx.new_axis(-1)}
  end)
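
One catch with these first pipelines: Nx.stack/1 requires every tensor in a batch to share a shape, so stacking variable-length reviews raises as soon as two reviews in a batch differ in length. The cells below fix this by reserving a padding token and forcing every review to a fixed max_seq_len.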
pad_token = 0
unknown_token = 1
max_seq_len = 64

tokens =
  frequencies
  |> Enum.sort_by(&elem(&1, 1), :desc)
  |> Enum.take(num_tokens)
  |> Enum.with_index(fn {token, _}, i -> {token, i + 2} end)
  |> Map.new()

tokenize = fn review ->
  review
  |> String.downcase()
  |> String.replace(~r/[\p{P}\p{S}]/, "")
  |> String.split()
  |> Enum.map(&Map.get(tokens, &1, unknown_token))
  |> Nx.tensor()
  |> then(&Nx.pad(&1, pad_token, [{0, max_seq_len - Nx.size(&1), 0}]))
end
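
Nx.pad/3 takes a {pre, post, interior} padding amount per axis, and negative amounts trim instead of pad, so reviews longer than max_seq_len are truncated rather than padded. A toy sketch (not from the original):

Nx.pad(Nx.tensor([5, 6, 7]), 0, [{0, 3, 0}])
#=> s64[6] [5, 6, 7, 0, 0, 0]

Nx.pad(Nx.tensor([5, 6, 7]), 0, [{0, -1, 0}])
#=> s64[2] [5, 6]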

batch_size = 64

train_pipeline =
  train_data
  |> Stream.map(fn {review, label} ->
    {tokenize.(review), Nx.tensor(label)}
  end)
  |> Stream.chunk_every(batch_size, batch_size, :discard)
  |> Stream.map(fn reviews_and_labels ->
    {review, label} = Enum.unzip(reviews_and_labels)
    {Nx.stack(review), Nx.stack(label) |> Nx.new_axis(-1)}
  end)

test_pipeline =
  test_data
  |> Stream.map(fn {review, label} ->
    {tokenize.(review), Nx.tensor(label)}
  end)
  |> Stream.chunk_every(batch_size, batch_size, :discard)
  |> Stream.map(fn reviews_and_labels ->
    {review, label} = Enum.unzip(reviews_and_labels)
    {Nx.stack(review), Nx.stack(label) |> Nx.new_axis(-1)}
  end)

Enum.take(train_pipeline, 1)
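
Each pipeline element is a {reviews, labels} tuple; with batch_size and max_seq_len both set to 64, the shapes should come out as {64, 64} and {64, 1}. A quick sanity check (a sketch, not in the original):

[{reviews, labels}] = Enum.take(train_pipeline, 1)
{Nx.shape(reviews), Nx.shape(labels)}
#=> {{64, 64}, {64, 1}}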
model =
  Axon.input("review")
  |> Axon.embedding(num_tokens + 2, 64)
  |> Axon.flatten()
  |> Axon.dense(64, activation: :relu)
  |> Axon.dense(1)

input_template = Nx.template({64, 64}, :s64)
Axon.Display.as_graph(model, input_template)

loss =
  &Axon.Losses.binary_cross_entropy(&1, &2,
    from_logits: true,
    reduction: :mean
  )

optimizer = Axon.Optimizers.adam(1.0e-4)

trained_model_state =
  model
  |> Axon.Loop.trainer(loss, optimizer)
  |> Axon.Loop.metric(:accuracy)
  |> Axon.Loop.run(train_pipeline, %{}, epochs: 10, compiler: EXLA)

model
|> Axon.Loop.evaluator()
|> Axon.Loop.metric(:accuracy)
|> Axon.Loop.run(test_pipeline, trained_model_state, compiler: EXLA)
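
With a trained state in hand, a single review can be scored end to end. A hedged sketch (not in the original): the model emits a raw logit, so Nx.sigmoid/1 turns it into a probability of positive sentiment.

"The Departed is Martin Scorsese's best work, and anybody who disagrees is wrong. This movie is amazing."
|> tokenize.()
|> Nx.new_axis(0)
|> then(&Axon.predict(model, trained_model_state, &1))
|> Nx.sigmoid()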
sequence = Axon.input("review")

embedded = sequence |> Axon.embedding(num_tokens + 2, 64)
mask = Axon.mask(sequence, 0)
{rnn_sequence, _state} = Axon.lstm(embedded, 64, mask: mask, unroll: :static)
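
Before attaching a classifier head to the recurrent output, the next cell gives a plain-Elixir analogy for what the LSTM is doing: Enum.map_reduce/3 threads an accumulator through a sequence one element at a time, much as a recurrent layer threads hidden state across timesteps.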
magic_word = "sixers"

input = ["rockets", "mavericks", "sixers", "sixers", "magic", "nets", "sixers"]

{output, _} =
  Enum.map_reduce(input, 0, fn entry, count ->
    if entry == magic_word do
      {count + 1, count + 1}
    else
      {count, count}
    end
  end)
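
Here output is [0, 0, 1, 2, 2, 2, 3]: each emitted value depends on everything seen so far, which is exactly the kind of running state an LSTM's hidden state carries.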
final_token = Axon.nx(rnn_sequence, fn seq ->
  Nx.squeeze(seq[[0..-1 // 1, -1, 0..-1 // 1]])
end)
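
The slice seq[[0..-1//1, -1, 0..-1//1]] keeps every batch entry and every hidden unit but only the last timestep, so the dense layers below see a single 64-dimensional summary of each review.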

model =
  final_token
  |> Axon.dense(64, activation: :relu)
  |> Axon.dense(1)

input_template = Nx.template({64, 64}, :s64)
Axon.Display.as_graph(model, input_template)

loss =
  &Axon.Losses.binary_cross_entropy(&1, &2,
    from_logits: true,
    reduction: :mean
  )

optimizer = Axon.Optimizers.adam(1.0e-4)

trained_model_state =
  model
  |> Axon.Loop.trainer(loss, optimizer)
  |> Axon.Loop.metric(:accuracy)
  |> Axon.Loop.run(train_pipeline, %{}, epochs: 10, compiler: EXLA)

model
|> Axon.Loop.evaluator()
|> Axon.Loop.metric(:accuracy)
|> Axon.Loop.run(test_pipeline, trained_model_state, compiler: EXLA)