Sophoclean Vocabulary Comparisons
Mix.install([
{:csv, "~> 3.2"},
{:kino_vega_lite, "~> 0.1.8"},
{:scholar, "~> 0.3.1"},
{:similarity, "~> 0.2"},
{:tucan, "~> 0.2.1"}
])
Module for data processing
defmodule SophoclesTfIdf do
  @moduledoc """
  Data-processing helpers for comparing the vocabulary of speakers in the
  plays of Sophocles.

  Words are maps keyed by CSV column name. The functions here read the
  `"speaker"`, `"lemma"`, `"n"` and (once it has been added by the caller)
  `"tf-idf"` keys.
  """

  # Options shared by every read of a tab-separated word list.
  @csv_options [escape_character: ?|, headers: true, separator: ?\t]

  # URN suffix of each play's word list, paired with the play's title.
  @titles_by_urn_suffix [
    {"tlg0011.tlg001.perseus-grc2.cp-words", "Trachiniae"},
    {"tlg0011.tlg002.perseus-grc2.cp-words", "Antigone"},
    {"tlg0011.tlg003.perseus-grc2.cp-words", "Ajax"},
    {"tlg0011.tlg004.perseus-grc2.cp-words", "Oedipus Tyrannus"},
    {"tlg0011.tlg005.perseus-grc2.cp-words", "Electra"},
    {"tlg0011.tlg006.perseus-grc2.cp-words", "Philoctetes"},
    {"tlg0011.tlg007.perseus-grc2.cp-words", "Oedipus at Colonus"}
  ]

  @doc """
  Reads every `*.cp-words.csv` file and returns a map of
  `urn => %{speaker => [word]}`.

  The URN is the file's basename without its final extension.
  """
  def speaker_documents_by_play do
    Map.new(csvs(), fn filename ->
      speaker_documents =
        filename
        |> File.stream!()
        |> CSV.decode!(@csv_options)
        |> Enum.group_by(&Map.get(&1, "speaker"))

      {filename |> Path.basename() |> Path.rootname(), speaker_documents}
    end)
  end

  @doc """
  Returns one decoded CSV row enumerable per `*.cp-words.csv` file.

  The rows are whatever `CSV.decode!/2` returns for the file stream; they are
  not forced into a list here.
  """
  def plays_as_documents do
    Enum.map(csvs(), fn filename ->
      filename
      |> File.stream!()
      |> CSV.decode!(@csv_options)
    end)
  end

  @doc """
  Counts how often each lemma occurs in `speaker_document`.
  """
  def speaker_frequencies(speaker_document) do
    Enum.frequencies_by(speaker_document, &Map.get(&1, "lemma"))
  end

  @doc """
  Scores a line against another speaker's document.

  Each word contributes the other speaker's tf-idf for the same lemma (0 when
  the other speaker never uses it). The mean contribution is divided by the
  other speaker's maximum tf-idf.

  Returns `0.0` when `words` is empty, when the other document is empty, or
  when the other speaker's maximum tf-idf is zero, instead of raising on a
  division by zero.
  """
  def score_line([], _other_speaker_document), do: 0.0
  def score_line(_words, []), do: 0.0

  def score_line(words, other_speaker_document) do
    other_speaker_max_tf_idf =
      other_speaker_document
      |> Enum.max_by(&Map.get(&1, "tf-idf"))
      |> Map.get("tf-idf")

    if other_speaker_max_tf_idf == 0 do
      0.0
    else
      their_scores = tf_idf_by_lemma(other_speaker_document)

      total =
        words
        |> Enum.map(&Map.get(their_scores, Map.get(&1, "lemma"), 0))
        |> Enum.sum()

      total / Enum.count(words) / other_speaker_max_tf_idf
    end
  end

  @doc """
  Compares a line with another speaker's document using cosine similarity
  (`Similarity.cosine/2`).

  For every word in `words`, its own tf-idf is paired with the other
  speaker's tf-idf for the same lemma. Pairs are dropped when the other
  speaker does not use the lemma or when either score is zero, so no
  zero-length vector is ever handed to `Similarity.cosine/2`. When no pair
  survives the result is `0.0`.

  With similarity scores in hand, the area under the line for each speaker
  comparison can be calculated to get a similarity heuristic.
  """
  def score_line_by_cosine_similarity(words, other_speaker_document) do
    cosine_against_index(words, tf_idf_by_lemma(other_speaker_document))
  end

  @doc """
  Scores every line of every speaker against every speaker's document.

  Returns a list of `{speaker, [{line_number, [{other_speaker, score}]}]}`.
  Lines are grouped by the word's `"n"` value.
  """
  def compare_speakers(speaker_documents) do
    # Build each speaker's lemma index once instead of once per scored line.
    indexes =
      Enum.map(speaker_documents, fn {speaker, document} ->
        {speaker, tf_idf_by_lemma(document)}
      end)

    for {speaker, document} <- speaker_documents do
      scored_document =
        for {n, line_words} <- Enum.group_by(document, &Map.get(&1, "n")) do
          line_scores =
            for {other_speaker, index} <- indexes do
              {other_speaker, cosine_against_index(line_words, index)}
            end

          {n, line_scores}
        end

      {speaker, scored_document}
    end
  end

  @doc """
  Term frequency: the count of `term` in `document` (a `lemma => count` map)
  divided by the total count of all terms.

  Returns `0.0` for a document with no terms.
  """
  def tf(term, document) do
    total_terms = document |> Map.values() |> Enum.sum()

    if total_terms == 0 do
      0.0
    else
      Map.get(document, term, 0) / total_terms
    end
  end

  @doc """
  Inverse document frequency: the natural log of the number of documents
  divided by the number of documents that contain `term` as a lemma.

  `documents` is an enumerable of `{id, document}` pairs. Returns `0.0` when
  no document contains `term`, rather than dividing by zero.
  """
  def idf(term, documents) do
    df =
      Enum.count(documents, fn {_id, document} ->
        Enum.any?(document, &(Map.get(&1, "lemma") == term))
      end)

    if df == 0 do
      0.0
    else
      :math.log(Enum.count(documents) / df)
    end
  end

  @doc """
  Converts a line label such as `"123"` or `"123a"` to an integer.

  Alphabetic characters are removed before parsing when the label is not a
  plain integer. Raises `ArgumentError` if nothing parseable remains.
  """
  def clean_line_number(n) do
    case Integer.parse(n) do
      {number, ""} ->
        number

      _ ->
        ~r/[[:alpha:]]/u
        |> Regex.replace(n, "")
        |> String.to_integer()
    end
  end

  @doc """
  Maps a URN ending in one of the known word-list suffixes to the play's title.

  Returns `"Invalid URN for this project"` for anything else.
  """
  def urn_to_title(urn) do
    Enum.find_value(@titles_by_urn_suffix, "Invalid URN for this project", fn {suffix, title} ->
      String.ends_with?(urn, suffix) && title
    end)
  end

  # Pairs each word's tf-idf with the indexed tf-idf for the same lemma and
  # returns their cosine similarity, or 0.0 when no usable pair exists.
  defp cosine_against_index(words, their_scores) do
    {my_word_scores, other_word_scores} =
      words
      |> Enum.flat_map(fn word ->
        my_score = Map.get(word, "tf-idf")
        their_score = Map.get(their_scores, Map.get(word, "lemma"))

        if is_nil(their_score) || their_score == 0 || my_score == 0 do
          []
        else
          [{my_score, their_score}]
        end
      end)
      |> Enum.unzip()

    if my_word_scores == [] do
      0.0
    else
      Similarity.cosine(my_word_scores, other_word_scores)
    end
  end

  # Indexes a document as lemma => tf-idf. The first word seen for a lemma
  # wins, which matches a first-match linear search over the document.
  defp tf_idf_by_lemma(document) do
    Enum.reduce(document, %{}, fn word, index ->
      Map.put_new(index, Map.get(word, "lemma"), Map.get(word, "tf-idf"))
    end)
  end

  # Lists the word-list CSVs. Path.join/1 supplies the separators, so the
  # `__DIR__ == "."` case resolves to "./../apps/files" like the other modules
  # in this file rather than to ".../apps/files".
  defp csvs do
    files_dir =
      if __DIR__ == "." do
        Path.join([__DIR__, "..", "apps", "files"])
      else
        Path.join(__DIR__, "files")
      end

    files_dir
    |> Path.join("*.cp-words.csv")
    |> Path.wildcard()
  end
end
# For every play, annotate each speaker's words with "tf", "idf" and "tf-idf",
# then score every line of every speaker against every speaker of that play.
# The result is a list of {urn, comparisons} tuples.
compared_lines =
  SophoclesTfIdf.speaker_documents_by_play()
  |> Enum.map(fn {urn, speaker_documents} ->
    speaker_tf_idfs =
      Map.new(speaker_documents, fn {speaker, document} ->
        frequencies = Enum.frequencies_by(document, &Map.get(&1, "lemma"))

        # tf and idf depend only on the lemma, so compute them once per
        # distinct lemma instead of once per word occurrence.
        scores_by_lemma =
          Map.new(frequencies, fn {lemma, _count} ->
            tf = SophoclesTfIdf.tf(lemma, frequencies)
            idf = SophoclesTfIdf.idf(lemma, speaker_documents)
            {lemma, {tf, idf}}
          end)

        scored_document =
          Enum.map(document, fn word ->
            {tf, idf} = Map.fetch!(scores_by_lemma, Map.get(word, "lemma"))

            word
            |> Map.put("tf", tf)
            |> Map.put("idf", idf)
            |> Map.put("tf-idf", tf * idf)
          end)

        {speaker, scored_document}
      end)

    {urn, SophoclesTfIdf.compare_speakers(speaker_tf_idfs)}
  end)
Plotting the data
defmodule ComparisonToCsv do
  @moduledoc """
  Writes speaker comparison scores to tab-separated CSV files, one file per
  play and speaker.
  """

  @doc """
  Writes one `<urn>.<speaker>.csv` file per speaker in `compared_lines`.

  `compared_lines` is an enumerable of `{urn, [{speaker, [{n, scores}]}]}`
  where `scores` is a list of `{name, score}` tuples. The
  `"urn:cts:greekLit:"` prefix is removed from the URN, and rows are ordered
  by the numeric value of the line label `n`. Returns `:ok`.
  """
  def to_csv(compared_lines) do
    Enum.each(compared_lines, fn {urn, comparisons_by_speaker} ->
      urn = String.replace(urn, "urn:cts:greekLit:", "")

      Enum.each(comparisons_by_speaker, fn {speaker, scored_lines} ->
        write("#{urn}.#{speaker}.csv", rows(scored_lines))
      end)
    end)
  end

  @doc """
  Writes `data` (a list of maps with `"line"`, `"name"` and
  `"similarity_score"` keys) to `filename` inside the project's files
  directory, creating that directory if needed.

  Returns `:ok`. Raises a `RuntimeError` if the directory cannot be created.
  """
  def write(filename, data) do
    path =
      if __DIR__ == "." do
        Path.join([__DIR__, "..", "apps", "files", filename])
      else
        Path.join([__DIR__, "files", filename])
      end

    dirname = Path.dirname(path)

    case File.mkdir_p(dirname) do
      :ok -> :ok
      {:error, reason} -> raise "Error creating directory #{dirname}: #{reason}"
    end

    # The function form of File.open!/3 closes the file once the callback
    # returns or raises, so the handle is never leaked.
    File.open!(path, [:write, :utf8], fn file ->
      data
      |> CSV.encode(
        escape_character: ?|,
        headers: ["line", "name", "similarity_score"],
        separator: ?\t
      )
      |> Enum.each(&IO.write(file, &1))
    end)
  end

  # Flattens one speaker's scored lines into CSV rows ordered by line number.
  # The line label is converted to an integer once and reused for sorting.
  defp rows(scored_lines) do
    scored_lines
    |> Enum.map(fn {n, scores} -> {SophoclesTfIdf.clean_line_number(n), scores} end)
    |> Enum.sort_by(fn {line, _scores} -> line end)
    |> Enum.flat_map(fn {line, scores} ->
      Enum.map(scores, fn {name, score} ->
        %{"line" => line, "name" => name, "similarity_score" => score}
      end)
    end)
  end
end
# Persist the scores in `compared_lines` as one tab-separated CSV per play and speaker.
ComparisonToCsv.to_csv(compared_lines)
defmodule ComparisonPlots do
  @moduledoc """
  Loads per-speaker similarity CSVs and turns them into Tucan charts, plus a
  small helper for transliterating Greek names.
  """

  # Lower-case Greek letters (and the rough-breathing sign) and their ASCII
  # equivalents.
  @greek_to_ascii %{
    "῾" => "h",
    "α" => "a",
    "β" => "b",
    "γ" => "g",
    "δ" => "d",
    "ε" => "e",
    "ζ" => "z",
    "η" => "e",
    "θ" => "th",
    "ι" => "i",
    "κ" => "c",
    "λ" => "l",
    "μ" => "m",
    "ν" => "n",
    "ξ" => "x",
    "ο" => "o",
    "π" => "p",
    "ρ" => "r",
    "σ" => "s",
    "ς" => "s",
    "τ" => "t",
    "υ" => "u",
    "φ" => "ph",
    "χ" => "ch",
    "ψ" => "ps",
    "ω" => "o"
  }

  @doc """
  Reads the `<urn>.<speaker>.csv` file and returns `{lines, areas}`.

  `lines` is a list of maps with `"line"`, `"name"` and `"similarity"` keys
  for every row that passes `filter` and whose name is not `speaker` itself.
  `areas` has one `%{"name" => name, "area" => area}` map per remaining name,
  where the area is that name's similarity values integrated with
  `Scholar.Integrate.trapezoidal_uniform/1`.

  `filter` may be:

    * a function taking a row map and returning a truthy value to keep it
      (the default keeps every row),
    * a list of names to keep, or
    * a single name to keep.
  """
  def csv_to_dataset(urn, speaker, filter \\ fn _x -> true end)

  def csv_to_dataset(urn, speaker, filter) when is_function(filter) do
    lines =
      urn
      |> dataset_path(speaker)
      |> File.stream!()
      |> CSV.decode!(escape_character: ?|, headers: true, separator: ?\t)
      |> Enum.filter(filter)
      |> Enum.reject(&(Map.get(&1, "name") == speaker))
      |> Enum.map(fn %{"line" => line, "name" => name, "similarity_score" => score} ->
        %{
          "line" => SophoclesTfIdf.clean_line_number(line),
          "name" => name,
          "similarity" => String.to_float(score)
        }
      end)

    areas =
      lines
      |> Enum.group_by(&Map.get(&1, "name"), &Map.get(&1, "similarity"))
      |> Enum.map(fn {name, similarities} ->
        %{"name" => name, "area" => area_under(similarities)}
      end)

    {lines, areas}
  end

  def csv_to_dataset(urn, speaker, filter) when is_list(filter) do
    csv_to_dataset(urn, speaker, &Enum.member?(filter, Map.get(&1, "name")))
  end

  def csv_to_dataset(urn, speaker, filter) when is_binary(filter) do
    csv_to_dataset(urn, speaker, &(Map.get(&1, "name") == filter))
  end

  @doc """
  Builds a grouped bar chart of similarity per line, colored by speaker name.
  """
  def to_bar_chart(title, lines) do
    lines
    |> Tucan.bar(
      "line",
      "similarity",
      x: [type: :ordinal],
      mode: :grouped,
      color_by: "name",
      title: title,
      tooltip: :data
    )
    |> Tucan.set_width(8000)
    |> Tucan.Legend.put_options(:color, label_font_size: 12, title: "Speaker")
  end

  @doc """
  Builds a 300x300 vertical strip plot of the per-speaker areas.
  """
  def to_stripplot(title, areas) do
    areas
    |> Tucan.stripplot(
      "area",
      color_by: "name",
      title: title,
      tooltip: :data,
      style: :point,
      orient: :vertical
    )
    |> Tucan.set_height(300)
    |> Tucan.set_width(300)
    |> Tucan.Legend.put_options(:color, label_font_size: 12, title: "Speaker")
  end

  @doc """
  Transliterates a Greek string to capitalized ASCII, e.g. `"Λίχας"` becomes
  `"Lichas"`.

  The string is decomposed (NFD) and everything that is not a letter, digit
  or underscore is removed, which strips accents and breathing marks. The
  character class is spelled out because the meaning of `\\W` for combining
  marks depends on the regex engine version. Characters without a Greek
  mapping are kept unchanged.
  """
  def transliterate(s) do
    s
    |> String.normalize(:nfd)
    |> String.replace(~r/[^\p{L}\p{N}_]/u, "")
    |> String.graphemes()
    |> Enum.map_join("", &greek_to_ascii(String.downcase(&1)))
    |> String.capitalize()
  end

  @doc """
  Maps a single lower-case Greek letter to its ASCII equivalent.

  Any other character is returned unchanged instead of raising.
  """
  def greek_to_ascii(c) do
    Map.get(@greek_to_ascii, c, c)
  end

  # Integrates a list of similarity values and returns the result as a number.
  defp area_under(similarities) do
    similarities
    |> Nx.tensor()
    |> Scholar.Integrate.trapezoidal_uniform()
    |> Nx.to_number()
  end

  # Location of the CSV written for `speaker` in the play identified by `urn`.
  defp dataset_path(urn, speaker) do
    filename = "#{urn}.#{speaker}.csv"

    if __DIR__ == "." do
      Path.join([__DIR__, "..", "apps", "files", filename])
    else
      Path.join([__DIR__, "files", filename])
    end
  end
end
defmodule SelectComparisons do
  @moduledoc """
  Ready-made comparison plots for selected speakers.

  Every function returns a strip plot of per-speaker areas stacked above a
  bar chart of per-line similarity, built from the CSV written for that
  speaker.
  """

  # Word-list URNs of the plays used below (titles as in
  # SophoclesTfIdf.urn_to_title/1).
  @trachiniae "tlg0011.tlg001.perseus-grc2.cp-words"
  @antigone "tlg0011.tlg002.perseus-grc2.cp-words"
  @ajax "tlg0011.tlg003.perseus-grc2.cp-words"
  @electra "tlg0011.tlg005.perseus-grc2.cp-words"
  @philoctetes "tlg0011.tlg006.perseus-grc2.cp-words"

  @doc """
  Plots speaker `name` of play `urn` against the speakers selected by
  `filter` (see `ComparisonPlots.csv_to_dataset/3` for accepted filters).
  """
  def plot(urn, name, title, filter) do
    urn
    |> ComparisonPlots.csv_to_dataset(name, filter)
    |> render(title)
  end

  @doc """
  Plots speaker `name` of play `urn` against every other speaker.
  """
  def plot(urn, name, title) do
    urn
    |> ComparisonPlots.csv_to_dataset(name)
    |> render(title)
  end

  @doc "Deianira against all other speakers in Trachiniae."
  def deianira do
    plot(@trachiniae, "Δηιάνειρα", "Deianira compared to all other speakers in Trachiniae")
  end

  @doc "Hyllus against all other speakers in Trachiniae."
  def hyllus do
    plot(@trachiniae, "Ὕλλος", "Hyllus compared to all other speakers in Trachiniae")
  end

  @doc "Lichas against all other speakers in Trachiniae."
  def lichas do
    plot(@trachiniae, "Λίχας", "Lichas compared to all other speakers in Trachiniae")
  end

  @doc "Messenger against all other speakers in Trachiniae."
  def messenger do
    plot(@trachiniae, "Ἄγγελος", "Messenger compared to all other speakers in Trachiniae")
  end

  @doc "Nurse against all other speakers in Trachiniae."
  def nurse do
    plot(@trachiniae, "Τροφός", "Nurse compared to all other speakers in Trachiniae")
  end

  @doc "Servant against all other speakers in Trachiniae."
  def servant do
    plot(@trachiniae, "Θεράπαινα", "Servant compared to all other speakers in Trachiniae")
  end

  @doc "Antigone against all other speakers."
  def antigone do
    plot(@antigone, "Ἀντιγόνη", "Antigone compared to all other speakers")
  end

  @doc "Antigone against Ismene and Creon only."
  def antigone_ismene_creon do
    plot(@antigone, "Ἀντιγόνη", "Antigone compared to Ismene and Creon", [
      "Ἀντιγόνη",
      "Ἰσμήνη",
      "Κρέων"
    ])
  end

  @doc """
  Same as `antigone_ismene_creon/0`; kept under its original spelling so
  existing callers continue to work.
  """
  def antigone_ismene_kreon, do: antigone_ismene_creon()

  @doc "Creon against all other speakers."
  def creon do
    plot(@antigone, "Κρέων", "Creon compared to all other speakers")
  end

  @doc "Creon against Antigone and Ismene only."
  def creon_antigone_ismene do
    plot(@antigone, "Κρέων", "Creon compared to Antigone and Ismene", [
      "Κρέων",
      "Ἀντιγόνη",
      "Ἰσμήνη"
    ])
  end

  @doc "Ismene against all other speakers."
  def ismene do
    plot(@antigone, "Ἰσμήνη", "Ismene compared to all other speakers")
  end

  @doc "Ismene against Antigone and Creon only."
  def ismene_antigone_creon do
    plot(@antigone, "Ἰσμήνη", "Ismene compared to Antigone and Creon", [
      "Ἰσμήνη",
      "Ἀντιγόνη",
      "Κρέων"
    ])
  end

  @doc "Menelaus against all other speakers."
  def menelaus do
    plot(@ajax, "Μενέλαος", "Menelaus compared to all other speakers")
  end

  @doc "Menelaus against Odysseus only."
  def menelaus_odysseus do
    plot(@ajax, "Μενέλαος", "Menelaus compared to Odysseus", [
      "Μενέλαος",
      "Ὀδυσσεύς"
    ])
  end

  @doc "Heracles against all other speakers."
  def heracles do
    plot(@philoctetes, "Ἡρακλῆς", "Heracles compared to all other speakers")
  end

  @doc "Merchant against all other speakers."
  def merchant do
    plot(@philoctetes, "Ἔμπορος", "Merchant compared to all other speakers")
  end

  @doc "Odysseus against all other speakers."
  def odysseus do
    plot(@philoctetes, "Ὀδυσσεύς", "Odysseus compared to all other speakers")
  end

  @doc "Tutor against all other speakers."
  def tutor do
    plot(@electra, "Παιδαγωγός", "Tutor compared to all other speakers")
  end

  # Stacks the strip plot above the bar chart and applies the shared theme.
  defp render({lines, areas}, title) do
    bar_chart = ComparisonPlots.to_bar_chart(title, lines)
    stripplot = ComparisonPlots.to_stripplot(title, areas)

    [stripplot, bar_chart]
    |> Tucan.vconcat()
    |> Tucan.set_theme(:five_thirty_eight)
  end
end
# Build the selected comparison plots. Each call reads the CSV written for that
# speaker and returns a themed Tucan chart (strip plot stacked above bar chart).
# Other plots defined in SelectComparisons (e.g. deianira/0, hyllus/0) are not
# invoked here.
SelectComparisons.lichas()
SelectComparisons.messenger()
SelectComparisons.antigone()
SelectComparisons.creon()
SelectComparisons.ismene()
SelectComparisons.menelaus()
SelectComparisons.merchant()
SelectComparisons.heracles()
SelectComparisons.odysseus()
SelectComparisons.tutor()