Powered by AppSignal & Oban Pro

Sophoclean Vocabulary Comparisons

logos-sophokleios.livemd

Sophoclean Vocabulary Comparisons

Mix.install([
  {:csv, "~> 3.2"},
  {:kino_vega_lite, "~> 0.1.8"},
  {:scholar, "~> 0.3.1"},
  {:similarity, "~> 0.2"},
  {:tucan, "~> 0.2.1"}
])

Module for data processing

defmodule SophoclesTfIdf do
  @moduledoc """
  Helpers for computing TF-IDF scores over lemmatized word tables and for
  comparing the vocabulary of the speakers within a play.

  Input is read from tab-separated `*.cp-words.csv` files. Rows are handled as
  string-keyed maps; the keys this module reads are `"speaker"`, `"lemma"`,
  `"n"` and `"tf-idf"`.
  """

  # Ordered {urn suffix, title} pairs used by urn_to_title/1.
  @titles_by_urn_suffix [
    {"tlg0011.tlg001.perseus-grc2.cp-words", "Trachiniae"},
    {"tlg0011.tlg002.perseus-grc2.cp-words", "Antigone"},
    {"tlg0011.tlg003.perseus-grc2.cp-words", "Ajax"},
    {"tlg0011.tlg004.perseus-grc2.cp-words", "Oedipus Tyrannus"},
    {"tlg0011.tlg005.perseus-grc2.cp-words", "Electra"},
    {"tlg0011.tlg006.perseus-grc2.cp-words", "Philoctetes"},
    {"tlg0011.tlg007.perseus-grc2.cp-words", "Oedipus at Colonus"}
  ]

  @doc """
  Reads every word CSV and returns `%{urn => %{speaker => [row, ...]}}`.

  The `urn` is the file's basename without its final extension.
  """
  def speaker_documents_by_play do
    Map.new(csvs(), fn filename ->
      urn = filename |> Path.basename() |> Path.rootname()
      speaker_doc = filename |> decode_rows() |> Enum.group_by(&Map.get(&1, "speaker"))

      {urn, speaker_doc}
    end)
  end

  @doc """
  Returns one lazily decoded stream of rows per word CSV.
  """
  def plays_as_documents do
    Enum.map(csvs(), &decode_rows/1)
  end

  @doc """
  Counts how often each `"lemma"` occurs in `speaker_document`.
  """
  def speaker_frequencies(speaker_document) do
    Enum.frequencies_by(speaker_document, &Map.get(&1, "lemma"))
  end

  @doc """
  Scores `words` against `other_speaker_document`.

  Each word contributes the `"tf-idf"` of the first row in the other document
  with the same lemma (0 when there is none). The mean contribution is then
  divided by the other document's maximum `"tf-idf"`.

  Returns `0.0` when `words` or `other_speaker_document` is empty, or when the
  maximum is zero, instead of raising.
  """
  def score_line(words, other_speaker_document) do
    if Enum.empty?(words) or Enum.empty?(other_speaker_document) do
      0.0
    else
      lookup = tf_idf_by_lemma(other_speaker_document)

      other_speaker_max_tf_idf =
        other_speaker_document |> Enum.map(&Map.get(&1, "tf-idf")) |> Enum.max()

      if other_speaker_max_tf_idf == 0 do
        0.0
      else
        total =
          words
          |> Enum.map(&Map.get(lookup, Map.get(&1, "lemma"), 0))
          |> Enum.sum()

        total / Enum.count(words) / other_speaker_max_tf_idf
      end
    end
  end

  @doc """
  Compares `words` with `other_speaker_document` using cosine similarity
  (`Similarity.cosine/2`).

  For every word, its own `"tf-idf"` is paired with the `"tf-idf"` of the first
  row in the other document that has the same lemma. Pairs are dropped when the
  other score is missing or zero, or when the word's own score is zero, so no ε
  smoothing is applied here. When no pair remains the result is `0.0`.

  The per-line scores are meant to be aggregated afterwards (for example as the
  area under the line for each speaker comparison) to get a similarity
  heuristic.
  """
  def score_line_by_cosine_similarity(words, other_speaker_document) do
    cosine_score(words, tf_idf_by_lemma(other_speaker_document))
  end

  @doc """
  Scores every line of every speaker against every speaker's document.

  `speaker_documents` is an enumerable of `{speaker, rows}`. Rows are grouped
  by their `"n"` value; the result is a list of
  `{speaker, [{n, [{other_speaker, score}, ...]}, ...]}`.
  """
  def compare_speakers(speaker_documents) do
    # The lemma lookup of a speaker does not depend on the line being scored,
    # so build it once per speaker rather than once per line.
    lookups =
      Enum.map(speaker_documents, fn {speaker, document} ->
        {speaker, tf_idf_by_lemma(document)}
      end)

    Enum.map(speaker_documents, fn {speaker, document} ->
      scored_document =
        document
        |> Enum.group_by(&Map.get(&1, "n"))
        |> Enum.map(fn {n, line_words} ->
          line_scores =
            Enum.map(lookups, fn {other_speaker, lookup} ->
              {other_speaker, cosine_score(line_words, lookup)}
            end)

          {n, line_scores}
        end)

      {speaker, scored_document}
    end)
  end

  @doc """
  Term frequency of `term` in `document`, a `%{term => count}` map.

  Returns `0.0` for a document with no terms.
  """
  def tf(term, document) do
    total_terms = document |> Map.values() |> Enum.sum()

    if total_terms == 0 do
      0.0
    else
      Map.get(document, term, 0) / total_terms
    end
  end

  @doc """
  Inverse document frequency of `term` across `documents`, an enumerable of
  `{id, rows}` pairs: `log(number_of_documents / documents_containing_term)`.

  Returns `0.0` when no document contains the term.
  """
  def idf(term, documents) do
    df =
      Enum.count(documents, fn {_id, document} ->
        Enum.any?(document, &(Map.get(&1, "lemma") == term))
      end)

    if df == 0 do
      0.0
    else
      :math.log(Enum.count(documents) / df)
    end
  end

  @doc """
  Converts a line label such as `"12"` or `"12a"` to an integer.

  Alphabetic characters are stripped when the label is not a plain integer.
  Raises `ArgumentError` if what remains is still not an integer.
  """
  def clean_line_number(n) do
    case Integer.parse(n) do
      {number, ""} ->
        number

      _ ->
        ~r/[[:alpha:]]/u |> Regex.replace(n, "") |> String.to_integer()
    end
  end

  @doc """
  Maps a URN ending in one of the known `*.cp-words` identifiers to the play's
  title, or returns `"Invalid URN for this project"`.
  """
  def urn_to_title(urn) do
    Enum.find_value(@titles_by_urn_suffix, "Invalid URN for this project", fn {suffix, title} ->
      if String.ends_with?(urn, suffix), do: title
    end)
  end

  # Decodes a tab-separated word file into a stream of string-keyed row maps.
  defp decode_rows(filename) do
    filename
    |> File.stream!()
    |> CSV.decode!(escape_character: ?|, headers: true, separator: ?\t)
  end

  # Maps each lemma to the "tf-idf" of its first row, mirroring a linear
  # first-match search through the document.
  defp tf_idf_by_lemma(document) do
    Enum.reduce(document, %{}, fn word, acc ->
      Map.put_new(acc, Map.get(word, "lemma"), Map.get(word, "tf-idf"))
    end)
  end

  # Cosine similarity between the words' own scores and the looked-up scores.
  defp cosine_score(words, lookup) do
    pairs =
      Enum.flat_map(words, fn word ->
        my_score = Map.get(word, "tf-idf")
        their_score = Map.get(lookup, Map.get(word, "lemma"))

        if is_nil(their_score) or their_score == 0 or my_score == 0 do
          []
        else
          [{my_score, their_score}]
        end
      end)

    case Enum.unzip(pairs) do
      {[], []} -> 0.0
      {my_word_scores, other_word_scores} -> Similarity.cosine(my_word_scores, other_word_scores)
    end
  end

  # Lists the word CSVs. Path.join/2 supplies the separator between __DIR__
  # and the relative glob, so a "." directory yields "./../apps/files/...".
  defp csvs do
    glob =
      if __DIR__ == "." do
        "../apps/files/*.cp-words.csv"
      else
        "files/*.cp-words.csv"
      end

    __DIR__ |> Path.join(glob) |> Path.wildcard()
  end
end
# For every play, attach "tf", "idf" and "tf-idf" to each word of each
# speaker, then score every line against every speaker of the same play.
# Result: a list of {urn, comparisons_by_speaker}.
compared_lines =
  SophoclesTfIdf.speaker_documents_by_play()
  |> Enum.map(fn {urn, speaker_documents} ->
    speaker_tf_idfs =
      Map.new(speaker_documents, fn {speaker, document} ->
        frequencies = SophoclesTfIdf.speaker_frequencies(document)

        # tf and idf depend only on the lemma, so compute them once per
        # distinct lemma instead of once per word occurrence.
        scores_by_lemma =
          Map.new(frequencies, fn {lemma, _count} ->
            tf = SophoclesTfIdf.tf(lemma, frequencies)
            idf = SophoclesTfIdf.idf(lemma, speaker_documents)

            {lemma, {tf, idf}}
          end)

        scored_document =
          Enum.map(document, fn word ->
            {tf, idf} = Map.fetch!(scores_by_lemma, Map.get(word, "lemma"))

            word
            |> Map.put("tf", tf)
            |> Map.put("idf", idf)
            |> Map.put("tf-idf", tf * idf)
          end)

        {speaker, scored_document}
      end)

    {urn, SophoclesTfIdf.compare_speakers(speaker_tf_idfs)}
  end)

Plotting the data

defmodule ComparisonToCsv do
  @moduledoc """
  Writes per-line similarity scores to tab-separated CSV files, one file per
  speaker per play.
  """

  @doc """
  Writes one `<urn>.<speaker>.csv` file for each speaker in `compared_lines`.

  `compared_lines` is an enumerable of `{urn, comparisons_by_speaker}`, where
  each comparison is `{speaker, [{n, [{name, score}, ...]}, ...]}`. Rows are
  ordered by the integer form of `n` and carry the columns `"line"`, `"name"`
  and `"similarity_score"`. A `"urn:cts:greekLit:"` prefix is removed from the
  URN before it is used in the file name.
  """
  def to_csv(compared_lines) do
    Enum.each(compared_lines, fn {urn, comparisons_by_speaker} ->
      urn = String.replace(urn, "urn:cts:greekLit:", "")

      Enum.each(comparisons_by_speaker, fn {speaker, scored_lines} ->
        data =
          scored_lines
          # Parse each line label once; it is used both for sorting and output.
          |> Enum.map(fn {n, scores} -> {SophoclesTfIdf.clean_line_number(n), scores} end)
          |> Enum.sort_by(fn {line, _scores} -> line end)
          |> Enum.flat_map(fn {line, scores} ->
            Enum.map(scores, fn {name, score} ->
              %{
                "line" => line,
                "name" => name,
                "similarity_score" => score
              }
            end)
          end)

        write("#{urn}.#{speaker}.csv", data)
      end)
    end)
  end

  @doc """
  Encodes `data` as tab-separated CSV and writes it to `filename` inside the
  files directory, creating that directory if needed.

  Returns `:ok`. Raises if the directory cannot be created or the file cannot
  be opened.
  """
  def write(filename, data) do
    path =
      if __DIR__ == "." do
        __DIR__ <> "/../apps/files/" <> filename
      else
        __DIR__ <> "/files/" <> filename
      end

    dirname = Path.dirname(path)

    # mkdir_p succeeds when the directory already exists and also creates
    # missing parent directories.
    case File.mkdir_p(dirname) do
      :ok -> :ok
      {:error, reason} -> raise "Error creating directory #{dirname}: #{reason}"
    end

    # The function form of File.open!/3 closes the file when the function
    # returns or raises.
    File.open!(path, [:write, :utf8], fn file ->
      data
      |> CSV.encode(
        escape_character: ?|,
        headers: ["line", "name", "similarity_score"],
        separator: ?\t
      )
      |> Enum.each(&IO.write(file, &1))
    end)
  end
end
# Write one CSV of per-line similarity scores for each speaker of each play.
ComparisonToCsv.to_csv(compared_lines)
defmodule ComparisonPlots do
  @moduledoc """
  Loads per-speaker similarity CSVs and turns them into Tucan charts. Also
  provides a simple Greek-to-ASCII transliteration helper.
  """

  @doc """
  Loads the similarity CSV for `speaker` in the play identified by `urn`.

  `filter` selects which rows to keep and may be:

    * a one-argument function receiving the decoded row map,
    * a list of names (rows whose `"name"` is in the list are kept), or
    * a single name.

  Rows comparing the speaker with themselves are always dropped.

  Returns `{lines, areas}`. `lines` is a list of maps with `"line"`, `"name"`
  and `"similarity"`. `areas` is a list of maps with `"name"` and `"area"`,
  the trapezoidal integral of that name's similarity values.
  """
  def csv_to_dataset(urn, speaker, filter \\ fn _x -> true end)

  def csv_to_dataset(urn, speaker, filter) when is_function(filter) do
    lines =
      urn
      |> dataset_path(speaker)
      |> File.stream!()
      |> CSV.decode!(escape_character: ?|, headers: true, separator: ?\t)
      |> Enum.filter(fn row -> filter.(row) end)
      |> Enum.reject(fn row -> Map.get(row, "name") == speaker end)
      |> Enum.map(fn %{
                       "line" => line,
                       "name" => name,
                       "similarity_score" => similarity_score
                     } ->
        %{
          "line" => SophoclesTfIdf.clean_line_number(line),
          "name" => name,
          "similarity" => parse_similarity(similarity_score)
        }
      end)

    areas =
      lines
      |> Enum.group_by(&Map.get(&1, "name"))
      |> Enum.map(fn {name, name_lines} ->
        area =
          name_lines
          |> Enum.map(&Map.get(&1, "similarity"))
          |> Nx.tensor()
          |> Scholar.Integrate.trapezoidal_uniform()
          |> Nx.to_number()

        %{"name" => name, "area" => area}
      end)

    {lines, areas}
  end

  def csv_to_dataset(urn, speaker, filter) when is_list(filter) do
    csv_to_dataset(urn, speaker, &Enum.member?(filter, Map.get(&1, "name")))
  end

  def csv_to_dataset(urn, speaker, filter) when is_binary(filter) do
    csv_to_dataset(urn, speaker, &(Map.get(&1, "name") == filter))
  end

  @doc """
  Builds a grouped bar chart of similarity per line, colored by speaker name.
  """
  def to_bar_chart(title, lines) do
    lines
    |> Tucan.bar(
      "line",
      "similarity",
      x: [type: :ordinal],
      mode: :grouped,
      color_by: "name",
      title: title,
      tooltip: :data
    )
    |> Tucan.set_width(8000)
    |> Tucan.Legend.put_options(:color, label_font_size: 12, title: "Speaker")
  end

  @doc """
  Builds a vertical strip plot of the per-speaker areas, colored by name.
  """
  def to_stripplot(title, areas) do
    areas
    |> Tucan.stripplot(
      "area",
      color_by: "name",
      title: title,
      tooltip: :data,
      style: :point,
      orient: :vertical
    )
    |> Tucan.set_height(300)
    |> Tucan.set_width(300)
    |> Tucan.Legend.put_options(:color, label_font_size: 12, title: "Speaker")
  end

  @doc """
  Transliterates a Greek string to capitalized ASCII, e.g. `"Κρέων"` becomes
  `"Creon"`.

  The string is decomposed (NFD) and every non-word character, combining mark
  and `῾` is removed before the remaining letters are mapped one by one.
  """
  def transliterate(s) do
    s
    |> String.normalize(:nfd)
    # \p{M} is listed explicitly so that combining diacritics are stripped
    # regardless of whether the regex engine counts them as word characters.
    # NOTE(review): breathing marks are removed here, so the "h" mapping in
    # greek_to_ascii/1 is never reached from this function.
    |> String.replace(~r/[\W\p{M}]|῾/u, "")
    |> String.graphemes()
    |> Enum.map_join(&greek_to_ascii(String.downcase(&1)))
    |> String.capitalize()
  end

  @doc """
  Maps a single lowercase Greek letter to its ASCII rendering.

  Characters without a mapping are returned unchanged.
  """
  def greek_to_ascii(c) do
    case c do
      "῾" -> "h"
      "α" -> "a"
      "β" -> "b"
      "γ" -> "g"
      "δ" -> "d"
      "ε" -> "e"
      "ζ" -> "z"
      "η" -> "e"
      "θ" -> "th"
      "ι" -> "i"
      "κ" -> "c"
      "λ" -> "l"
      "μ" -> "m"
      "ν" -> "n"
      "ξ" -> "x"
      "ο" -> "o"
      "π" -> "p"
      "ρ" -> "r"
      "σ" -> "s"
      "ς" -> "s"
      "τ" -> "t"
      "υ" -> "u"
      "φ" -> "ph"
      "χ" -> "ch"
      "ψ" -> "ps"
      "ω" -> "o"
      # Pass through anything without a mapping instead of raising.
      other -> other
    end
  end

  # Location of the CSV holding the scores for `speaker` in play `urn`.
  defp dataset_path(urn, speaker) do
    if __DIR__ == "." do
      __DIR__ <> "/../apps/files/#{urn}.#{speaker}.csv"
    else
      __DIR__ <> "/files/#{urn}.#{speaker}.csv"
    end
  end

  # Parses a score cell. Float.parse/1 also accepts integer-looking text such
  # as "1", which String.to_float/1 rejects.
  defp parse_similarity(text) do
    case Float.parse(text) do
      {value, ""} -> value
      _ -> raise ArgumentError, "invalid similarity score: #{inspect(text)}"
    end
  end
end
defmodule SelectComparisons do
  @moduledoc """
  Ready-made comparison plots for selected speakers.

  Each zero-arity function loads one speaker's similarity dataset and renders
  a strip plot stacked above a bar chart.
  """

  # Dataset identifiers, one per play.
  @trachiniae "tlg0011.tlg001.perseus-grc2.cp-words"
  @antigone "tlg0011.tlg002.perseus-grc2.cp-words"
  @ajax "tlg0011.tlg003.perseus-grc2.cp-words"
  @electra "tlg0011.tlg005.perseus-grc2.cp-words"
  @philoctetes "tlg0011.tlg006.perseus-grc2.cp-words"

  @doc """
  Plots `name`'s dataset for `urn`, keeping only the rows selected by `filter`.
  """
  def plot(urn, name, title, filter) do
    urn
    |> ComparisonPlots.csv_to_dataset(name, filter)
    |> render(title)
  end

  @doc """
  Plots `name`'s dataset for `urn` against all other speakers.
  """
  def plot(urn, name, title) do
    urn
    |> ComparisonPlots.csv_to_dataset(name)
    |> render(title)
  end

  def deianira do
    plot(
      @trachiniae,
      "Δηιάνειρα",
      "Deianira compared to all other speakers in Trachiniae"
    )
  end

  def hyllus do
    plot(
      @trachiniae,
      "Ὕλλος",
      "Hyllus compared to all other speakers in Trachiniae"
    )
  end

  def lichas do
    plot(
      @trachiniae,
      "Λίχας",
      "Lichas compared to all other speakers in Trachiniae"
    )
  end

  def messenger do
    plot(
      @trachiniae,
      "Ἄγγελος",
      "Messenger compared to all other speakers in Trachiniae"
    )
  end

  def nurse do
    plot(
      @trachiniae,
      "Τροφός",
      "Nurse compared to all other speakers in Trachiniae"
    )
  end

  def servant do
    plot(
      @trachiniae,
      "Θεράπαινα",
      "Servant compared to all other speakers in Trachiniae"
    )
  end

  def antigone do
    plot(@antigone, "Ἀντιγόνη", "Antigone compared to all other speakers")
  end

  def antigone_ismene_kreon do
    names = ["Ἀντιγόνη", "Ἰσμήνη", "Κρέων"]

    plot(@antigone, "Ἀντιγόνη", "Antigone compared to Ismene and Creon", names)
  end

  def creon do
    plot(@antigone, "Κρέων", "Creon compared to all other speakers")
  end

  def creon_antigone_ismene do
    names = ["Κρέων", "Ἀντιγόνη", "Ἰσμήνη"]

    plot(@antigone, "Κρέων", "Creon compared to Antigone and Ismene", names)
  end

  def ismene do
    plot(@antigone, "Ἰσμήνη", "Ismene compared to all other speakers")
  end

  def ismene_antigone_creon do
    names = ["Ἰσμήνη", "Ἀντιγόνη", "Κρέων"]

    plot(@antigone, "Ἰσμήνη", "Ismene compared to Antigone and Creon", names)
  end

  def menelaus do
    plot(@ajax, "Μενέλαος", "Menelaus compared to all other speakers")
  end

  def menelaus_odysseus do
    names = ["Μενέλαος", "Ὀδυσσεύς"]

    plot(@ajax, "Μενέλαος", "Menelaus compared to Odysseus", names)
  end

  def heracles do
    plot(@philoctetes, "Ἡρακλῆς", "Heracles compared to all other speakers")
  end

  def merchant do
    plot(@philoctetes, "Ἔμπορος", "Merchant compared to all other speakers")
  end

  def odysseus do
    plot(@philoctetes, "Ὀδυσσεύς", "Odysseus compared to all other speakers")
  end

  def tutor do
    plot(@electra, "Παιδαγωγός", "Tutor compared to all other speakers")
  end

  # Stacks the strip plot above the bar chart and applies the shared theme.
  defp render({lines, areas}, title) do
    bars = ComparisonPlots.to_bar_chart(title, lines)
    strip = ComparisonPlots.to_stripplot(title, areas)

    [strip, bars]
    |> Tucan.vconcat()
    |> Tucan.set_theme(:five_thirty_eight)
  end
end
# Render the comparison charts for a selection of speakers.
SelectComparisons.lichas()
SelectComparisons.messenger()
SelectComparisons.antigone()
SelectComparisons.creon()
SelectComparisons.ismene()
SelectComparisons.menelaus()
SelectComparisons.merchant()
SelectComparisons.heracles()
SelectComparisons.odysseus()
SelectComparisons.tutor()