TEI XML Explorations

livebooks/tei_xml.livemd

@Open-Commentaries

open-commentaries

Share to X

Share to Bluesky

More notebooks

TEI XML Explorations

# The text_server project root is the parent of the directory holding this notebook.
text_server_root = Path.join(__DIR__, "..")

# Install text_server as a path dependency in the :dev environment, reusing
# the project's own config file and lockfile.
Mix.install(
  [
    {:text_server, path: text_server_root, env: :dev}
  ],
  config_path: Path.join(text_server_root, "config/config.exs"),
  lockfile: Path.join(text_server_root, "mix.lock")
)

Read XML file and get replacement patterns from TEI header

# Relative path to the Pausanias TEI XML edition; it is resolved against the
# current working directory when read.
pausanias_f = "tmp/canonical-greekLit/data/tlg0525/tlg001/tlg0525.tlg001.perseus-grc2.xml"
# Raises if the file cannot be read.
xml_document = File.read!(pausanias_f)
# NOTE(review): this URN has no "urn:cts:" prefix, while a later cell looks up
# the same work as "urn:cts:greekLit:tlg0525.tlg001" -- confirm which form
# get_work_by_urn expects.
work = TextServer.Works.get_work_by_urn("greekLit:tlg0525.tlg001")

# Find or create the edition record for this XML document. The match asserts
# an {:ok, version} result and raises MatchError on anything else.
{:ok, version} =
  TextServer.Xml.find_or_create_version(%{
    version_type: :edition,
    urn: "urn:cts:greekLit:tlg0525.tlg001.perseus-grc2",
    xml_document: xml_document,
    work_id: work.id
  })

What do we need from the document?

defmodule TeiExplorer.TableOfContents do
  @moduledoc """
  Builds a flat list of citations from the per-level `n` values of a document.

  The input is one list of `n` values per citation level, ordered from the
  deepest level to the shallowest (for example `[sections, chapters, books]`).
  Each list holds every value of that level in document order; a new parent
  unit is detected whenever the numbering stops increasing.
  """

  @doc """
  Collects citations from `passage_refs`.

    * `[]` returns `grouped` unchanged.
    * One level (`[refs]`) returns the values converted to integers, in input
      order. `grouped` is ignored.
    * Two levels (`[lines, books]`) returns `{book, line}` tuples.
    * Three levels (`[sections, chapters, books]`) returns
      `{book, chapter, section}` tuples.

  For two and three levels the tuple elements are the original values (not
  converted), and the result is in reverse document order: the last citation
  of the document comes first.

  `grouped` is the accumulator used by the recursion; callers normally omit it.

  Raises `FunctionClauseError` for more than three levels, and `MatchError`
  when a parent level runs out of values before its children do.
  """
  def collect_citations(passage_refs, grouped \\ [])

  def collect_citations([], grouped), do: grouped

  # Single-level schema. This lives in the arity-2 function because a separate
  # `collect_citations/1` clause conflicts with the default argument declared
  # in the function head above and prevents the module from compiling.
  def collect_citations([refs], _grouped) do
    Enum.map(refs, &String.to_integer/1)
  end

  def collect_citations([lines, books], grouped) do
    current_lines = get_current_level(lines)
    [current_book | rest_books] = books

    # Only the lines of the current run belong to the current book.
    citations = for line <- current_lines, do: {current_book, line}

    # `current_lines` is exactly the (reversed) prefix of `lines`.
    case Enum.drop(lines, length(current_lines)) do
      [] ->
        List.flatten([citations | grouped])

      rest_lines ->
        collect_citations([rest_lines, rest_books], [citations | grouped])
    end
  end

  def collect_citations([sections, chapters, books], grouped) do
    current_sections = get_current_level(sections)
    [current_chapter | rest_chapters] = chapters
    [current_book | rest_books] = books

    citations =
      for section <- current_sections do
        {current_book, current_chapter, section}
      end

    case Enum.drop(sections, length(current_sections)) do
      [] ->
        List.flatten([citations | grouped])

      rest_sections ->
        # The book continues only while the chapter number keeps increasing.
        # A next chapter that is equal or lower (or missing) starts a new
        # book, which also covers a book consisting of a single chapter.
        next_books =
          if greater?(List.first(rest_chapters), current_chapter) do
            books
          else
            rest_books
          end

        collect_citations([rest_sections, rest_chapters, next_books], [citations | grouped])
    end
  end

  # Returns the leading run of strictly increasing values, in reverse order.
  # The run ends at the first value that is not greater than its predecessor.
  defp get_current_level(sections) do
    Enum.reduce_while(sections, [], fn section, acc ->
      if greater?(section, List.first(acc)) do
        {:cont, [section | acc]}
      else
        {:halt, acc}
      end
    end)
  end

  defp greater?(x, y), do: to_comparable(x) > to_comparable(y)

  # `nil` (no previous/next value) compares as 0 so any positive number beats it.
  defp to_comparable(nil), do: 0
  defp to_comparable(x) when is_binary(x), do: String.to_integer(x, 10)
  defp to_comparable(x) when is_integer(x), do: x
end

defmodule TeiExplorer do
  @moduledoc """
  Exploratory helpers that query a version's stored TEI XML with XPath
  expressions derived from its reference declaration.

  `version.refs_declaration.replacement_patterns` is treated as ordered from
  the deepest citation level to the shallowest: `get_text_nodes/1` takes the
  first pattern for the leaf nodes, and the table of contents is destructured
  as `[sections, chapters, books]`.
  """

  alias TextServer.Repo
  alias TextServer.Xml.Version

  import Ecto.Query

  @doc """
  Get the list of possible references.

  Fetches the `@n` attribute values for every citation level of `version`
  and hands them to `TeiExplorer.TableOfContents.collect_citations/1`.

  What if instead of doing all of this reference build-up
  for marginal gain, we just expanded to the second-deepest
  level of the citation schema and highlighted the requested node?

  E.g., "get Pausanias 3.4.2" would retrieve all of
  Book 3 Chapter 4, and highlight Section 2.

  "get Pausanias 3.4" would get the same page but not
  highlight anything.

  "get Pausanias 3" would get Book 3 Chapter 1 and not
  highlight anything.

  For two levels of citation, the top one is our page? (Should we
  display whole books of Homer at once? Or at least fetch them?)

  For one level of citation, fetch a reasonable number of nodes.
  (One level often means tragedy --- how to include speaker tags?)
  """
  def get_table_of_contents(%Version{} = version) do
    version.refs_declaration.replacement_patterns
    |> Enum.map(fn pattern ->
      path = clean_xpath_string(pattern)
      get_xpath_result(version, path <> "/@n")
    end)
    |> TeiExplorer.TableOfContents.collect_citations()
  end

  @doc """
  Get all of the leaf nodes in the text tree.

  Uses the first replacement pattern of `version`.
  """
  def get_text_nodes(%Version{} = version) do
    pattern = List.first(version.refs_declaration.replacement_patterns)
    path = clean_xpath_string(pattern)

    get_xpath_result(version, path)
  end

  @doc """
  Get the nodes at the second-deepest citation level (the "pages").

  For a book/chapter/section schema this selects the chapters; for a
  book/line schema it selects the books. A schema with a single level has no
  second-deepest pattern, and this function fails for it.
  """
  def get_pages(%Version{} = version) do
    # The patterns are ordered deepest-first, so index 1 is the second-deepest
    # level regardless of how many levels there are. Reversing the list first
    # would pick the second level from the top, which is the leaf level for
    # two-level schemas.
    pattern = Enum.at(version.refs_declaration.replacement_patterns, 1)
    path = clean_xpath_string(pattern)

    get_xpath_result(version, path)
  end

  @doc """
  Get the set of element names that are children
  of any text node in the document.

  Not implemented yet: the body is empty, so this currently returns `nil`.
  """
  def get_element_names(%Version{} = _version) do
  end

  @doc """
  Extracts the XPath expression from a `#xpath(...)` replacement pattern and
  removes every `[@n='$<digits>']` predicate from it.

  Raises if `s` does not contain `#xpath(...)`.
  """
  def clean_xpath_string(s) do
    Regex.named_captures(~r/\#xpath\((?<path>.*)\)/, s)
    |> Map.get("path")
    |> String.replace(~r/\[@n='\$\d+'\]/, "")
  end

  @doc """
  Queries the given version using PostgreSQL's built-in
  xpath support.

  The `tei` prefix is bound to the `http://www.tei-c.org/ns/1.0` namespace
  for the query, and the matches are cast to a text array. The query is
  restricted to the row whose id equals `version.id`.
  """
  def get_xpath_result(%Version{} = version, path) do
    Version
    |> where([v], v.id == ^version.id)
    |> select(
      fragment(
        """
        xpath(
          ?,
          xml_document,
          ARRAY[ARRAY['tei', 'http://www.tei-c.org/ns/1.0']]
        )::text[]
        """,
        ^path
      )
    )
    |> Repo.one()
  end
end

Create a version from CTS-compliant docs

defmodule TextServer.Livebook.CTSVersions do
  @moduledoc """
  Notebook helpers that create versions of a work from the XML files and
  `__cts__.xml` metadata found under `base_cts_dir/0`.
  """

  alias TextServer.Languages
  alias TextServer.TextGroups.TextGroup
  alias TextServer.Versions
  alias TextServer.Works.Work

  @doc """
  Root directory (relative path) of the CTS data files.
  """
  def base_cts_dir, do: "tmp/canonical-greekLit/data"

  @doc """
  Creates a version of type `:commentary`. See `create_version/3`.
  """
  def create_commentary(work, version_data),
    do: create_version(work, version_data, :commentary)

  @doc """
  Creates a version of type `:edition`. See `create_version/3`.
  """
  def create_edition(work, version_data),
    do: create_version(work, version_data, :edition)

  @doc """
  Creates a version of type `:translation`. See `create_version/3`.
  """
  def create_translation(work, version_data),
    do: create_version(work, version_data, :translation)

  @doc """
  Finds or creates a version of `work` from `version_data`, then stores the
  raw XML read from the version's file as its XML document.

  `version_data` must provide `:urn` and `:language`; `:description` and
  `:label` are copied over when present. Raises when the file cannot be read
  or when `find_or_create_version` does not return `{:ok, version}`.
  """
  def create_version(work, version_data, version_type) do
    urn = version_data |> Map.get(:urn) |> CTS.URN.parse()
    file = get_version_file(urn)
    xml_raw = File.read!(file)
    # Lowercase hex MD5 digest of the raw file contents.
    md5 = Base.encode16(:crypto.hash(:md5, xml_raw), case: :lower)
    language = Languages.get_language_by_slug(version_data.language)

    attrs =
      version_data
      |> Map.take([:description, :label])
      |> Map.merge(%{
        filename: file,
        filemd5hash: md5,
        language_id: language.id,
        urn: urn,
        version_type: version_type,
        work_id: work.id
      })

    {:ok, version} = Versions.find_or_create_version(attrs)

    Versions.create_xml_document!(version, %{document: xml_raw})
  end

  @doc """
  Creates every commentary, edition and translation listed in the CTS
  metadata of `work`, in that order.
  """
  def create_versions_of_work(%Work{} = work) do
    {:ok, work_cts_data} = get_work_cts_data(work)

    work_cts_data |> Map.get(:commentaries) |> Enum.each(&create_commentary(work, &1))
    work_cts_data |> Map.get(:editions) |> Enum.each(&create_edition(work, &1))
    work_cts_data |> Map.get(:translations) |> Enum.each(&create_translation(work, &1))
  end

  @doc """
  Returns the path of the XML file for `urn`, or `:enoent` when no such file
  exists.
  """
  def get_version_file(urn) do
    path = "#{base_cts_dir()}/#{get_work_dir(urn)}/#{urn.work_component}.xml"

    case File.exists?(path) do
      true -> path
      false -> :enoent
    end
  end

  @doc """
  Reads the `__cts__.xml` file of `work` and converts it with
  `DataSchema.to_struct/2` using `DataSchemas.Work.CTSDocument`.
  """
  def get_work_cts_data(%Work{} = work) do
    work
    |> get_work_cts_file()
    |> File.read!()
    |> DataSchema.to_struct(DataSchemas.Work.CTSDocument)
  end

  @doc """
  Returns the path of the `__cts__.xml` file for `work`, or `:enoent` when no
  such file exists.
  """
  def get_work_cts_file(work) do
    path = "#{base_cts_dir()}/#{get_work_dir(work.urn)}/__cts__.xml"

    case File.exists?(path) do
      true -> path
      false -> :enoent
    end
  end

  @doc """
  Returns the `text_group/work` directory fragment for `urn`.
  """
  def get_work_dir(%CTS.URN{text_group: text_group, work: work}) do
    "#{text_group}/#{work}"
  end

  @doc """
  Lists the text group's own `__cts__.xml` path followed by the `__cts__.xml`
  files found one directory below it.
  """
  def list_text_group_files(%TextGroup{} = text_group) do
    group_dir = "#{base_cts_dir()}/#{text_group.urn.text_group}"
    work_files = Path.wildcard("#{group_dir}/*/__cts__.xml")

    ["#{group_dir}/__cts__.xml" | work_files]
  end
end

# Look up the Pausanias work and parse its __cts__.xml metadata.
pausanias_work = TextServer.Works.get_work_by_urn("urn:cts:greekLit:tlg0525.tlg001")
TextServer.Livebook.CTSVersions.get_work_cts_data(pausanias_work)

# Rebinds `version` to the stored perseus-grc2 edition with its XML document
# preloaded.
version =
  TextServer.Versions.get_version_by_urn!(
    CTS.URN.parse("urn:cts:greekLit:tlg0525.tlg001.perseus-grc2")
  )
  |> TextServer.Repo.preload(:xml_document)

# Read the reference declaration from the version's XML document.
TextServer.Versions.XmlDocuments.get_refs_decl(version.xml_document)

Other notebooks:

@Open-Commentaries

open-commentaries

Docx with Pandoc via Panpipe

docx_with_pandoc_via_panpipe.livemd

tutorial advanced text_server

2023-12-11
Ajax Multi-Commentary
@AjaxMultiCommentary

ajmc-elixir

TEI XML Explorations

tei_xml.livemd

tutorial advanced text_server

2024-6-13
Ammar Massoud
@ammar-mohamed-massoud

Dockyard-Academy

Sublist

deprecated_sublist.livemd

algorithms intermediate jason kino youtube hidden_cell

2026-7-11
PJ DiIorio
@pjdiiori

adventofcode

Advent of Code 2022

AOC_2022.livemd

tutorial algorithms intermediate kino vega_lite kino_vega_lite

2024-12-9
Marco Delaurenti
@mfdela

elixir_livebook

Sierpinsky Triangle

sierpinsky.livemd

tutorial intermediate kino_vega_lite vega_lite kino

2026-7-7
edmondfrank
@EdmondFrank

llmgan

Basic LLMGAN - Json

json.livemd

tutorial advanced data-science llmgan vega_lite kino_vega_lite

2026-2-4
Guillaume BAILLEUL
@laibulle

quant

Strategy optimization examples

strategy_optimization.livemd

advanced data-science quant explorer kino decimal kino_vega_lite vega_lite

2026-7-12

Back