Creating a TEI XML Version from Scratch
alias TextServer.Collections
alias TextServer.Languages
alias TextServer.TextGroups
alias TextServer.Works
alias TextServer.Versions
{:ok, collection} =
Collections.find_or_create_collection(%{
repository: "https://github.com/gregorycrane/Wolf1807",
urn: "urn:cts:greekLit",
title: "Towards a Smart App. Crit.: Sophocles' Ajax"
})
{:ok, text_group} =
TextGroups.find_or_create_text_group(%{
title: "Sophocles",
urn: "urn:cts:greekLit:tlg0011",
collection_id: collection.id
})
{:ok, work} =
Works.find_or_create_work(%{
description: "",
english_title: "Ajax",
original_title: "Αἶας",
urn: "urn:cts:greekLit:tlg0011.tlg003",
text_group_id: text_group.id
})
{:ok, language} = Languages.find_or_create_language(%{slug: "grc", title: "Greek"})
xml = File.read!("./priv/static/xml/lloyd-jones1994/tlg0011.tlg003.ajmc-lj.xml")
{:ok, version} =
Versions.find_or_create_version(%{
description: "edited by Hugh Lloyd-Jones",
filename: "lloyd-jones1994/tlg0011.tlg003.ajmc-lj.xml",
filemd5hash: :crypto.hash(:md5, xml) |> Base.encode16(case: :lower),
label: "Sophocles' Ajax",
language_id: language.id,
urn: "urn:cts:greekLit:tlg0011.tlg003.ajmc-lj",
version_type: :edition,
work_id: work.id
})
# Attach the raw XML to the version only if it hasn't been stored yet, so
# re-running this section doesn't duplicate the document.
version = TextServer.Repo.preload(version, :xml_document)

if is_nil(version.xml_document) do
  Versions.create_xml_document!(version, %{document: xml})
end
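Because every step above goes through a find_or_create_* helper, re-running this section should be a no-op. As a quick sanity check (a sketch; it assumes the code above has already run), force a fresh preload and confirm the XML document is attached:

version = TextServer.Repo.preload(version, :xml_document, force: true)
false = is_nil(version.xml_document)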
XML Spelunking with DataSchema
# Re-fetch the version with its XML document and parse the document into
# structs according to the DataSchemas.Version schema.
version = TextServer.Versions.get_version!(1) |> TextServer.Repo.preload(:xml_document)

{:ok, xml} = TextServer.Versions.XmlDocuments.get_text_body(version.xml_document)
{:ok, lloyd_jones_body} = DataSchema.to_struct(version.xml_document, DataSchemas.Version)
# Walk the lines once, giving every word a document-wide index and its
# character offset within its line.
%{word_count: _word_count, lines: lines} =
  lloyd_jones_body.body.lines
  |> Enum.reduce(%{word_count: 0, lines: []}, fn line, acc ->
    text = line.text |> String.trim()
    word_count = acc.word_count

    words =
      Regex.split(~r/[[:space:]]+/, text)
      |> Enum.with_index()
      |> Enum.map(fn {word, index} ->
        # Note that String.split/3 locates the *first* occurrence of the
        # word, so a word repeated within a line gets the offset of its
        # first occurrence.
        offset =
          case String.split(text, word, parts: 2) do
            [left, _] -> String.length(left)
            [_] -> nil
          end

        %{
          xml_id: "word_index_#{word_count + index}",
          offset: offset,
          text: word
        }
      end)

    new_line = %{elements: line.elements, location: [line.n], text: text, words: words}

    %{word_count: word_count + length(words), lines: [new_line | acc.lines]}
  end)
lines = Enum.reverse(lines)
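After the reverse, each entry in lines has roughly this shape (the values here are invented for illustration):

%{
  elements: [],  # TEI elements from the parse: name, attributes, offsets
  location: ["1"],
  text: "ἀεὶ μέν, ὦ παῖ Λαρτίου",
  words: [
    %{xml_id: "word_index_0", offset: 0, text: "ἀεὶ"},
    %{xml_id: "word_index_1", offset: 4, text: "μέν,"}
  ]
}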
alias TextServer.ElementTypes
alias TextServer.TextElements
alias TextServer.TextNodes
# Persist each line as a TextNode, then attach its TEI elements as
# TextElements anchored by character offsets within that node.
lines
|> Enum.each(fn line ->
{:ok, text_node} =
TextNodes.find_or_create_text_node(%{
location: line.location,
text: line.text,
urn: "#{version.urn}:#{Enum.at(line.location, 0)}",
version_id: version.id
})
line.elements
|> Enum.each(fn element ->
{:ok, element_type} = ElementTypes.find_or_create_element_type(%{name: element.name})
{:ok, _text_element} =
%{
attributes: Map.new(element.attributes),
end_offset: element.end_offset,
element_type_id: element_type.id,
end_text_node_id: text_node.id,
start_offset: element.start_offset,
start_text_node_id: text_node.id
}
|> TextElements.find_or_create_text_element()
end)
end)
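To spot-check the import, fetch a line back by its URN (a sketch; it assumes the version contains a line numbered 1):

text_node = TextNodes.get_text_node_by(%{urn: "#{version.urn}:1"})
IO.inspect(text_node.text, label: "line 1")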
Schematizing AjMC Canonical JSON
defmodule WordText do
  @moduledoc """
  Helpers for turning AjMC word ranges (inclusive indexes into the
  canonical `words` list) back into strings.
  """

  def get_words_for_range(words, %Range{} = range) do
    words
    |> Enum.slice(range)
    |> Enum.map(&Map.get(&1, "text"))
    |> Enum.join(" ")
  end

  def get_words_for_range(words, [first, last]), do: get_words_for_range(words, first..last)

  def filter_commentaries_containing_range(commentaries, %Range{} = range) do
    commentaries
    |> Enum.filter(fn c ->
      [f, l] = Map.get(c, "word_range")
      !Range.disjoint?(f..l, range)
    end)
  end
end
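With a toy word list and commentary list, the helpers behave like this:

toy_words = [%{"text" => "a"}, %{"text" => "b"}, %{"text" => "c"}]

WordText.get_words_for_range(toy_words, 0..1)
#=> "a b"

WordText.get_words_for_range(toy_words, [1, 2])
#=> "b c"

WordText.filter_commentaries_containing_range(
  [%{"word_range" => [0, 1]}, %{"word_range" => [5, 9]}],
  1..2
)
#=> [%{"word_range" => [0, 1]}]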
path =
"/Users/pletcher/code/ajmc/ajmc-multicommentary/priv/static/json/3467O2_tess_retrained.json"
s = File.read!(path)
json = Jason.decode!(s)
children = json |> Map.get("children")
lemmas = children |> Map.get("lemmas") |> Enum.filter(&(Map.get(&1, "label") == "word-anchor"))
words = children |> Map.get("words")
regions = children |> Map.get("regions")
# Exploratory: starting from the first primary_text region, peek at how the
# region types cluster. The result is not bound or used below.
regions
|> Enum.drop_while(&(Map.get(&1, "region_type") != "primary_text"))
|> Enum.chunk_by(&Map.get(&1, "region_type"))
commentaries = regions |> Enum.filter(&(Map.get(&1, "region_type") == "commentary"))
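For orientation, the parts of the canonical JSON used below look roughly like this (structure inferred from the code; the values are invented):

%{
  "children" => %{
    "lemmas" => [
      %{
        "label" => "word-anchor",
        "word_range" => [10, 12],
        "anchor_target" => ~s({"selector": "tei-l@n=331[5]:tei-l@n=332[12]"})
      }
    ],
    "words" => [%{"text" => "ἀεὶ"}, %{"text" => "μέν,"}],
    "regions" => [
      %{"region_type" => "primary_text", "word_range" => [0, 9]},
      %{"region_type" => "commentary", "word_range" => [10, 250]}
    ]
  }
}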
prepped_lemmas =
  lemmas
  |> Enum.with_index()
  |> Enum.map(fn {lemma, index} ->
    [lemma_first, lemma_last] = Map.get(lemma, "word_range")
    lemma_range = lemma_first..lemma_last
    lemma_words = WordText.get_words_for_range(words, lemma_range)

    # A lemma's glossa runs from the lemma itself up to the start of the next
    # lemma. When a next lemma exists, it delimits the glossa; for the final
    # lemma (the `nil ->` branch below), take everything after the lemma.
    with next_lemma when not is_nil(next_lemma) <- Enum.at(lemmas, index + 1) do
      [next_lemma_first, next_lemma_last] = Map.get(next_lemma, "word_range")
      next_lemma_range = next_lemma_first..next_lemma_last
      next_lemma_words = WordText.get_words_for_range(words, next_lemma_range)

      # Collect the commentary regions that overlap the span from this lemma
      # to the next, and join their words into one string.
      commentaries =
        WordText.filter_commentaries_containing_range(
          commentaries,
          lemma_first..next_lemma_first
        )

      g =
        commentaries
        |> Enum.map(&WordText.get_words_for_range(words, Map.get(&1, "word_range")))
        |> Enum.join(" ")

      # Cut away everything before the lemma, stop at the next lemma, and
      # glue the lemma back onto the front of the glossa.
      no_lemma = String.split(g, lemma_words, parts: 2, trim: true) |> List.last()

      no_next_lemma =
        String.split(no_lemma, next_lemma_words, parts: 2, trim: true) |> List.first()

      glossa = (lemma_words <> no_next_lemma) |> String.trim()

      Map.merge(lemma, %{
        "content" => glossa,
        "words" => lemma_words,
        "commentary_word_ranges" => commentaries |> Enum.map(&Map.get(&1, "word_range"))
      })
    else
      nil ->
        # Final lemma: no delimiter ahead, so the glossa is everything in the
        # overlapping commentary regions after the lemma itself.
        commentaries =
          WordText.filter_commentaries_containing_range(commentaries, lemma_range)

        [_, g] =
          commentaries
          |> Enum.map(&WordText.get_words_for_range(words, Map.get(&1, "word_range")))
          |> Enum.join(" ")
          |> String.split(lemma_words, parts: 2, trim: true)

        glossa = (lemma_words <> g) |> String.trim()

        Map.merge(lemma, %{
          "content" => glossa,
          "words" => lemma_words,
          "commentary_word_ranges" => commentaries |> Enum.map(&Map.get(&1, "word_range"))
        })
    end
  end)
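Each prepped lemma is the original lemma map plus three derived keys (values illustrative):

%{
  "label" => "word-anchor",
  "word_range" => [10, 12],
  "anchor_target" => "...",  # unchanged from the input
  "content" => "the lemma words followed by the glossa text",
  "words" => "the lemma words",
  "commentary_word_ranges" => [[10, 250]]
}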
passage_regex =
  ~r/tei-l@n=(?<first_line_n>\d+)\[(?<first_line_offset>\d+)\]:tei-l@n=(?<last_line_n>\d+)\[(?<last_line_offset>\d+)\]/
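Applied to a sample selector (an invented value), the regex yields string-valued captures:

Regex.named_captures(passage_regex, "tei-l@n=331[5]:tei-l@n=332[12]")
#=> %{
#=>   "first_line_n" => "331",
#=>   "first_line_offset" => "5",
#=>   "last_line_n" => "332",
#=>   "last_line_offset" => "12"
#=> }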
urn = "urn:cts:greekLit:tlg0011.tlg003.ajmc-lj"
{:ok, comment_element_type} = ElementTypes.find_or_create_element_type(%{name: "comment"})
prepped_lemmas
|> Enum.each(fn lemma ->
with anchor_target when not is_nil(anchor_target) <- Map.get(lemma, "anchor_target") do
j = Jason.decode!(anchor_target)
selector = Map.get(j, "selector")

# Re-join words hyphenated across line breaks, and pop "content" off the
# lemma so the remaining fields can be stored as the element's attributes.
content = Map.get(lemma, "content") |> String.replace("- ", "")
{_, popped_content_lemma} = Map.pop(lemma, "content")
with %{
"first_line_n" => first_line_n,
"first_line_offset" => first_line_offset,
"last_line_n" => last_line_n,
"last_line_offset" => last_line_offset
} <- Regex.named_captures(passage_regex, selector) do
if first_line_n != last_line_n do
  # If the lemma spans multiple lines, create two comments (essentially how
  # Word represents block-spanning comments in its .docx format).
  # Normalize so that first_n is the smaller line number and each offset
  # travels with the line it belongs to.
  first_n = min(String.to_integer(first_line_n), String.to_integer(last_line_n))

  [first_offset, last_offset] =
    if first_n == String.to_integer(first_line_n) do
      [first_line_offset, last_line_offset]
    else
      [last_line_offset, first_line_offset]
    end
first_text_node = TextNodes.get_text_node_by(%{urn: "#{urn}:#{first_n}"})

# First half of the comment: from its start offset to the end of the first line.
{:ok, _text_element} =
%{
attributes: popped_content_lemma,
content: content,
end_offset: String.length(first_text_node.text),
element_type_id: comment_element_type.id,
end_text_node_id: first_text_node.id,
start_offset: first_offset,
start_text_node_id: first_text_node.id
}
|> TextElements.find_or_create_text_element()
last_n = max(String.to_integer(first_line_n), String.to_integer(last_line_n))
last_text_node = TextNodes.get_text_node_by(%{urn: "#{urn}:#{last_n}"})

# Second half: from the start of the last line to the comment's end offset,
# with a :comment_end attribute marking where the comment closes.
{:ok, _text_element} =
%{
attributes: popped_content_lemma |> Map.put(:comment_end, true),
content: "",
end_offset: last_offset,
element_type_id: comment_element_type.id,
end_text_node_id: last_text_node.id,
start_offset: 0,
start_text_node_id: last_text_node.id
}
|> TextElements.find_or_create_text_element()
else
  # The lemma sits on a single line, so one comment covers it.
  text_node = TextNodes.get_text_node_by(%{urn: "#{urn}:#{first_line_n}"})
{:ok, _text_element} =
%{
attributes: popped_content_lemma,
content: content,
end_offset: last_line_offset,
element_type_id: comment_element_type.id,
end_text_node_id: text_node.id,
start_offset: first_line_offset,
start_text_node_id: text_node.id
}
|> TextElements.find_or_create_text_element()
end
else
nil ->
IO.warn("Invalid selector #{selector}.")
end
end
end)