TEI XML Explorations
# Setup cell: make the surrounding text_server project available to this notebook.
# The project root is taken to be the parent of the directory containing this file.
text_server_root = Path.join(__DIR__, "..")
# Install text_server as a path dependency in the :dev environment, reusing the
# project's own config file and lockfile so the notebook resolves the same
# configuration and dependency versions as the application.
Mix.install(
[
{:text_server, path: text_server_root, env: :dev}
],
config_path: Path.join(text_server_root, "config/config.exs"),
lockfile: Path.join(text_server_root, "mix.lock")
)
Read XML file and get replacement patterns from TEI header
# Load the Perseus Greek edition of Pausanias from the local checkout and
# register it as an :edition version of its work.
# The path is relative, so it is resolved against the current working directory.
pausanias_f = "tmp/canonical-greekLit/data/tlg0525/tlg001/tlg0525.tlg001.perseus-grc2.xml"
# File.read!/1 raises if the file is missing or unreadable.
xml_document = File.read!(pausanias_f)
# NOTE(review): this URN has no "urn:cts:" prefix, unlike the version URN below
# and the later lookup of the same work in this notebook -- confirm which form
# get_work_by_urn/1 expects. `work.id` below raises if the lookup yields nil.
work = TextServer.Works.get_work_by_urn("greekLit:tlg0525.tlg001")
# The match on {:ok, version} raises a MatchError for any other return value.
{:ok, version} =
TextServer.Xml.find_or_create_version(%{
version_type: :edition,
urn: "urn:cts:greekLit:tlg0525.tlg001.perseus-grc2",
xml_document: xml_document,
work_id: work.id
})
What do we need from the document?
defmodule TeiExplorer.TableOfContents do
  @moduledoc """
  Builds flat citation lists from the per-level `n` values of a document.

  The input ("passage refs") is a list with one entry per citation level,
  ordered from the deepest level to the shallowest, for example
  `[sections, chapters, books]` or `[lines, books]`. Each entry is the list of
  `n` values found at that level (strings holding base-10 integers).

  Parent/child relationships are inferred purely from the numbering: a run of
  strictly increasing values at the deepest level is taken to belong to a
  single parent, and the next parent starts as soon as the numbering stops
  increasing. This presumes the values arrive in document order.
  """

  @doc """
  Collects citations for the given passage refs.

    * A single level (`[leaf_refs]`) returns the values converted to integers,
      in their original order.
    * Any other input is delegated to `collect_citations/2` with an empty
      accumulator.

  Raises `ArgumentError` if a single-level value is not a base-10 integer.
  """
  def collect_citations([leaf_refs]), do: Enum.map(leaf_refs, &String.to_integer/1)
  def collect_citations(passage_refs), do: collect_citations(passage_refs, [])

  @doc """
  Collects citations for two or three citation levels.

  Returns a flat list of `{book, chapter, section}` or `{book, line}` tuples
  whose elements are the original `n` strings. The list is in *reverse*
  document order (the last citation comes first), because runs and groups are
  accumulated by prepending.

  `grouped` holds the groups collected so far; callers normally leave it at
  its default via `collect_citations/1`. An empty `passage_refs` returns
  `grouped` unchanged.

  Raises `MatchError` when a parent level runs out of values while deeper
  values remain, and `FunctionClauseError` for an unsupported number of levels.

  ## Example

      collect_citations([["1", "2", "1"], ["1", "2"]], [])
      #=> [{"2", "1"}, {"1", "2"}, {"1", "1"}]
  """
  def collect_citations([], grouped), do: grouped

  # Nothing left at the deepest level: whatever was gathered is the result,
  # regardless of any unused parent values.
  def collect_citations([[], _chapters, _books], grouped), do: List.flatten(grouped)
  def collect_citations([[], _books], grouped), do: List.flatten(grouped)

  def collect_citations([sections, chapters, books], grouped) do
    {current_sections, rest_sections} = take_ascending_run(sections)
    [current_chapter | rest_chapters] = chapters
    [current_book | rest_books] = books

    citations =
      for section <- current_sections, do: {current_book, current_chapter, section}

    grouped = [citations | grouped]

    if rest_sections == [] do
      List.flatten(grouped)
    else
      next_books =
        if starts_new_book?(current_chapter, rest_chapters), do: rest_books, else: books

      collect_citations([rest_sections, rest_chapters, next_books], grouped)
    end
  end

  def collect_citations([lines, books], grouped) do
    {current_lines, rest_lines} = take_ascending_run(lines)
    [current_book | rest_books] = books

    # Only the lines of the current run belong to the current book; the
    # remaining lines are attributed to later books by the recursive call.
    citations = for line <- current_lines, do: {current_book, line}
    grouped = [citations | grouped]

    if rest_lines == [] do
      List.flatten(grouped)
    else
      collect_citations([rest_lines, rest_books], grouped)
    end
  end

  # Splits off the leading run of strictly increasing values. Returns
  # `{run, rest}` where `run` is in reverse order (last value first). The first
  # value always belongs to the run, so numbering that starts at 0 still
  # makes progress.
  defp take_ascending_run([]), do: {[], []}
  defp take_ascending_run([first | rest]), do: extend_run(rest, [first])

  # Grows `run` (kept in reverse) for as long as the next value is greater
  # than the most recently taken one.
  defp extend_run([], run), do: {run, []}

  defp extend_run([next | rest] = remaining, [last | _] = run) do
    if greater?(next, last) do
      extend_run(rest, [next | run])
    else
      {run, remaining}
    end
  end

  # A new book starts when chapter numbering stops increasing. Equality counts
  # as a restart so that a book with a single chapter is handled correctly.
  defp starts_new_book?(_current_chapter, []), do: true

  defp starts_new_book?(current_chapter, [next_chapter | _]) do
    not greater?(next_chapter, current_chapter)
  end

  # Numeric comparison of two `n` values.
  defp greater?(x, y), do: to_comparable(x) > to_comparable(y)

  defp to_comparable(x) when is_binary(x), do: String.to_integer(x)
  defp to_comparable(x) when is_integer(x), do: x
end
defmodule TeiExplorer do
  @moduledoc """
  Exploratory helpers that query the XML document stored with a
  `TextServer.Xml.Version` through PostgreSQL's `xpath` function, driven by
  the replacement patterns of the version's refs declaration.
  """

  alias TextServer.Repo
  alias TextServer.Xml.Version

  import Ecto.Query

  @doc """
  Gets the list of possible references for `version`.

  Every replacement pattern of the version's refs declaration is reduced to a
  plain XPath (see `clean_xpath_string/1`), the `@n` attribute of the matching
  nodes is fetched, and the per-level results are handed to
  `TeiExplorer.TableOfContents.collect_citations/1`.

  ## Open design question

  What if, instead of doing all of this reference build-up for marginal gain,
  we just expanded to the second-deepest level of the citation schema and
  highlighted the requested node?

  E.g., "get Pausanias 3.4.2" would retrieve all of Book 3 Chapter 4 and
  highlight Section 2. "get Pausanias 3.4" would get the same page but not
  highlight anything. "get Pausanias 3" would get Book 3 Chapter 1 and not
  highlight anything.

  For two levels of citation, the top one is our page? (Should we display
  whole books of Homer at once? Or at least fetch them?)

  For one level of citation, fetch a reasonable number of nodes. (One level
  often means tragedy --- how to include speaker tags?)
  """
  def get_table_of_contents(%Version{} = version) do
    version.refs_declaration.replacement_patterns
    |> Enum.map(fn pattern ->
      path = clean_xpath_string(pattern)
      get_xpath_result(version, path <> "/@n")
    end)
    |> TeiExplorer.TableOfContents.collect_citations()
  end

  @doc """
  Gets all of the leaf nodes in the text tree.

  Uses the first replacement pattern of the refs declaration, which the rest
  of this module treats as the deepest citation level.
  """
  def get_text_nodes(%Version{} = version) do
    pattern = List.first(version.refs_declaration.replacement_patterns)
    path = clean_xpath_string(pattern)
    get_xpath_result(version, path)
  end

  @doc """
  Gets the nodes matched by the second entry of the reversed replacement
  patterns.

  NOTE(review): with patterns ordered deepest-first, this selects the middle
  level for three levels but the deepest level for two levels, and there is no
  second entry for a single level -- confirm that this is the intended "page"
  level.
  """
  def get_pages(%Version{} = version) do
    patterns = version.refs_declaration.replacement_patterns |> Enum.reverse()
    pattern = Enum.at(patterns, 1)
    path = clean_xpath_string(pattern)
    get_xpath_result(version, path)
  end

  @doc """
  Get the set of element names that are children
  of any text node in the document.

  Not implemented yet; currently returns `nil`.
  """
  def get_element_names(%Version{} = _version) do
    # TODO: implement.
    nil
  end

  @doc """
  Extracts the XPath from a replacement pattern of the form `#xpath(...)` and
  strips every `[@n='$<digits>']` placeholder predicate from it.

  Raises `ArgumentError` if `s` contains no `#xpath(...)` expression.
  """
  def clean_xpath_string(s) do
    # The capture must be named "path": the result map is keyed by that name.
    case Regex.named_captures(~r/\#xpath\((?<path>.*)\)/, s) do
      %{"path" => path} ->
        String.replace(path, ~r/\[@n='\$\d+'\]/, "")

      nil ->
        raise ArgumentError,
              "expected a replacement pattern containing #xpath(...), got: #{inspect(s)}"
    end
  end

  @doc """
  Queries the given version using PostgreSQL's built-in
  xpath support.

  `path` is evaluated against the version's `xml_document` column with the
  `tei` prefix bound to the TEI namespace, and the matches are cast to a text
  array. Returns whatever `Repo.one/1` yields for that query.
  """
  def get_xpath_result(%Version{} = version, path) do
    Version
    |> where([v], v.id == ^version.id)
    |> select(
      fragment(
        """
        xpath(
        ?,
        xml_document,
        ARRAY[ARRAY['tei', 'http://www.tei-c.org/ns/1.0']]
        )::text[]
        """,
        ^path
      )
    )
    |> Repo.one()
  end
end
Create a version from CTS-compliant docs
defmodule TextServer.Livebook.CTSVersions do
  @moduledoc """
  Notebook helpers that create versions (commentaries, editions, translations)
  of a work from the files of a local CTS-style directory tree rooted at
  `base_cts_dir/0`.
  """

  alias TextServer.Languages
  alias TextServer.TextGroups.TextGroup
  alias TextServer.Versions
  alias TextServer.Works.Work

  @doc """
  Returns the root directory of the CTS data, relative to the current working
  directory.
  """
  def base_cts_dir do
    "tmp/canonical-greekLit/data"
  end

  @doc "Creates a `:commentary` version of `work`; see `create_version/3`."
  def create_commentary(work, version_data) do
    create_version(work, version_data, :commentary)
  end

  @doc "Creates an `:edition` version of `work`; see `create_version/3`."
  def create_edition(work, version_data) do
    create_version(work, version_data, :edition)
  end

  @doc "Creates a `:translation` version of `work`; see `create_version/3`."
  def create_translation(work, version_data) do
    create_version(work, version_data, :translation)
  end

  @doc """
  Finds or creates a version of `work` and stores its XML document.

  `version_data` must provide `:urn` and `:language` (a language slug);
  `:description` and `:label` are copied when present. The XML file is located
  from the parsed URN, and its MD5 digest (lowercase hex) is recorded with the
  version.

  Raises `File.Error` if the XML file cannot be read, and `MatchError` if
  `Versions.find_or_create_version/1` does not return `{:ok, version}`.
  """
  def create_version(work, version_data, version_type) do
    urn = Map.get(version_data, :urn) |> CTS.URN.parse()

    # Read from the computed path rather than from get_version_file/1: that
    # function returns the atom :enoent for a missing file, which File.read!/1
    # cannot report usefully. This way a missing file raises a File.Error that
    # names the path.
    file = version_file_path(urn)
    xml_raw = File.read!(file)
    md5 = :crypto.hash(:md5, xml_raw) |> Base.encode16(case: :lower)
    language = Languages.get_language_by_slug(version_data.language)

    {:ok, version} =
      Map.take(version_data, [:description, :label])
      |> Map.merge(%{
        filename: file,
        filemd5hash: md5,
        language_id: language.id,
        urn: urn,
        version_type: version_type,
        work_id: work.id
      })
      |> Versions.find_or_create_version()

    Versions.create_xml_document!(version, %{document: xml_raw})
  end

  @doc """
  Creates every commentary, edition and translation listed in the CTS metadata
  of `work`.
  """
  def create_versions_of_work(%Work{} = work) do
    {:ok, work_cts_data} = get_work_cts_data(work)

    Map.get(work_cts_data, :commentaries) |> Enum.each(&create_commentary(work, &1))
    Map.get(work_cts_data, :editions) |> Enum.each(&create_edition(work, &1))
    Map.get(work_cts_data, :translations) |> Enum.each(&create_translation(work, &1))
  end

  @doc """
  Returns the path of the XML file for `urn`, or `:enoent` if it does not
  exist.
  """
  def get_version_file(urn) do
    path = version_file_path(urn)

    if File.exists?(path) do
      path
    else
      :enoent
    end
  end

  @doc """
  Reads the `__cts__.xml` metadata file of `work` and converts it with
  `DataSchema.to_struct/2` using `DataSchemas.Work.CTSDocument`.

  Raises `File.Error` if the metadata file cannot be read.
  """
  def get_work_cts_data(%Work{} = work) do
    # See create_version/3 for why the computed path is read directly.
    cts_data_raw = work |> work_cts_file_path() |> File.read!()

    DataSchema.to_struct(cts_data_raw, DataSchemas.Work.CTSDocument)
  end

  @doc """
  Returns the path of the `__cts__.xml` file of `work`, or `:enoent` if it
  does not exist.
  """
  def get_work_cts_file(work) do
    path = work_cts_file_path(work)

    if File.exists?(path) do
      path
    else
      :enoent
    end
  end

  @doc """
  Returns the directory of a work relative to `base_cts_dir/0`, in the form
  `<text_group>/<work>`.
  """
  def get_work_dir(%CTS.URN{} = urn) do
    "#{urn.text_group}/#{urn.work}"
  end

  @doc """
  Lists the text group's own `__cts__.xml` path followed by the `__cts__.xml`
  files found in its immediate subdirectories. The first path is not checked
  for existence.
  """
  def list_text_group_files(%TextGroup{} = text_group) do
    text_group_cts_file = base_cts_dir() <> "/#{text_group.urn.text_group}/__cts__.xml"

    work_cts_files =
      Path.wildcard(base_cts_dir() <> "/#{text_group.urn.text_group}/*/__cts__.xml")

    [text_group_cts_file | work_cts_files]
  end

  # Expected location of the XML file for a version URN (not checked for existence).
  defp version_file_path(urn) do
    base_cts_dir() <> "/" <> get_work_dir(urn) <> "/#{urn.work_component}.xml"
  end

  # Expected location of a work's CTS metadata file (not checked for existence).
  defp work_cts_file_path(work) do
    base_cts_dir() <> "/#{get_work_dir(work.urn)}/__cts__.xml"
  end
end
# Try the helpers above on Pausanias: read the work's CTS metadata, then
# inspect the refs declaration of the stored Greek edition.
pausanias_work = TextServer.Works.get_work_by_urn("urn:cts:greekLit:tlg0525.tlg001")
# Reads and parses the work's __cts__.xml; the result is not bound to a variable.
TextServer.Livebook.CTSVersions.get_work_cts_data(pausanias_work)
# Rebinds `version`, replacing the value bound earlier in this notebook.
# The bang lookup presumably raises when no version has this URN -- confirm.
version =
TextServer.Versions.get_version_by_urn!(
CTS.URN.parse("urn:cts:greekLit:tlg0525.tlg001.perseus-grc2")
)
# Preload the association so `version.xml_document` can be read below.
|> TextServer.Repo.preload(:xml_document)
TextServer.Versions.XmlDocuments.get_refs_decl(version.xml_document)