Creating a TEI XML Version from Scratch

livebooks/tei_xml_from_scratch.livemd

Ajax Multi-Commentary

@AjaxMultiCommentary

ajmc-elixir

Share to X

Share to Bluesky

More notebooks

Creating a TEI XML Version from Scratch

Section

alias TextServer.Collections
alias TextServer.Languages
alias TextServer.TextGroups
alias TextServer.Works
alias TextServer.Versions

# Look up (or insert) the collection record via Collections.find_or_create_collection/1.
{:ok, collection} =
  Collections.find_or_create_collection(%{
    repository: "https://github.com/gregorycrane/Wolf1807",
    urn: "urn:cts:greekLit",
    title: "Towards a Smart App. Crit.: Sophocles' Ajax"
  })

# The text group is linked to the collection through `collection_id`.
{:ok, text_group} =
  TextGroups.find_or_create_text_group(%{
    title: "Sophocles",
    urn: "urn:cts:greekLit:tlg0011",
    collection_id: collection.id
  })

# The work is linked to the text group through `text_group_id`.
{:ok, work} =
  Works.find_or_create_work(%{
    description: "",
    english_title: "Ajax",
    original_title: "Αἶας",
    urn: "urn:cts:greekLit:tlg0011.tlg003",
    text_group_id: text_group.id
  })

# NOTE(review): unlike the sibling find_or_create_* calls, this result is not
# matched against {:ok, _}. `language.id` is read below, so confirm that
# find_or_create_language/1 returns the bare record rather than a tagged tuple.
language = Languages.find_or_create_language(%{slug: "grc", title: "Greek"})

# Read the TEI XML file; the path is relative to the current working directory
# and File.read!/1 raises if the file cannot be read.
xml = File.read!("./priv/static/xml/lloyd-jones1994/tlg0011.tlg003.ajmc-lj.xml")

# `filemd5hash` is the lowercase hex-encoded MD5 digest of the file contents.
{:ok, version} =
  Versions.find_or_create_version(%{
    description: "edited by Hugh Lloyd-Jones",
    filename: "lloyd-jones1994/tlg0011.tlg003.ajmc-lj.xml",
    filemd5hash: :crypto.hash(:md5, xml) |> Base.encode16(case: :lower),
    label: "Sophocles' Ajax",
    language_id: language.id,
    urn: "urn:cts:greekLit:tlg0011.tlg003.ajmc-lj",
    version_type: :edition,
    work_id: work.id
  })

# Preload the :xml_document association so its presence can be checked.
version = TextServer.Repo.preload(version, :xml_document)

# Only create the XML document when the version does not have one yet.
if is_nil(version.xml_document) do
  Versions.create_xml_document!(version, %{document: xml})
end

XML Spelunking with DataSchema

# Fetch the version with ID 1 and preload its :xml_document association.
# NOTE(review): the ID is hard-coded; presumably it is the version created
# earlier in this notebook — confirm before running against another database.
version = TextServer.Versions.get_version!(1) |> TextServer.Repo.preload(:xml_document)
# Rebinds `xml`; the new value is not referenced again in the visible notebook code.
{:ok, xml} = TextServer.Versions.XmlDocuments.get_text_body(version.xml_document)

# Convert the XML document into a struct using the DataSchemas.Version schema.
{:ok, lloyd_jones_body} = DataSchema.to_struct(version.xml_document, DataSchemas.Version)

# Fold over the parsed lines, producing one plain map per line with its trimmed
# text, its elements, its location (`[line.n]`) and its whitespace-separated
# words. `word_count` is carried through the fold so every word receives a
# document-wide index in its `xml_id`. Lines are accumulated by prepending and
# reversed once at the end.
%{word_count: _word_count, lines: lines} =
  lloyd_jones_body.body.lines
  |> Enum.reduce(%{word_count: 0, lines: []}, fn line, acc ->
    text = String.trim(line.text)

    # The inner accumulator is {text after the previous word, graphemes consumed}.
    # Searching only the remaining text gives a repeated word (or a word that
    # also occurs inside an earlier word) its own offset instead of the offset
    # of the first occurrence in the line.
    {words, _cursor} =
      ~r/[[:space:]]+/
      |> Regex.split(text)
      |> Enum.with_index(acc.word_count)
      |> Enum.map_reduce({text, 0}, fn {word, word_index}, {remaining, consumed} ->
        case String.split(remaining, word, parts: 2) do
          [before, rest] ->
            offset = consumed + String.length(before)
            entry = %{xml_id: "word_index_#{word_index}", offset: offset, text: word}

            {entry, {rest, offset + String.length(word)}}

          [_] ->
            # Word not found in the remaining text: keep a nil offset and leave
            # the cursor where it was.
            entry = %{xml_id: "word_index_#{word_index}", offset: nil, text: word}

            {entry, {remaining, consumed}}
        end
      end)

    new_line = %{elements: line.elements, location: [line.n], text: text, words: words}

    %{word_count: acc.word_count + length(words), lines: [new_line | acc.lines]}
  end)

lines = Enum.reverse(lines)

alias TextServer.ElementTypes
alias TextServer.TextElements
alias TextServer.TextNodes

# For every prepared line, find or create its text node, then find or create
# each of the line's elements anchored to that same node at both ends.
Enum.each(lines, fn entry ->
  node_attrs = %{
    location: entry.location,
    text: entry.text,
    urn: "#{version.urn}:#{List.first(entry.location)}",
    version_id: version.id
  }

  {:ok, node} = TextNodes.find_or_create_text_node(node_attrs)

  Enum.each(entry.elements, fn el ->
    {:ok, type} = ElementTypes.find_or_create_element_type(%{name: el.name})

    {:ok, _created} =
      TextElements.find_or_create_text_element(%{
        attributes: Map.new(el.attributes),
        end_offset: el.end_offset,
        element_type_id: type.id,
        end_text_node_id: node.id,
        start_offset: el.start_offset,
        start_text_node_id: node.id
      })
  end)
end)

Schematizing AjMC Canonical JSON

defmodule WordText do
  @moduledoc """
  Helpers for working with word ranges over a list of word maps.
  """

  @doc """
  Returns the `"text"` values of the words selected by `range`, joined with
  single spaces.

  `range` is either a `Range` or a two-element `[first, last]` list, which is
  converted to `first..last`. Words are selected with `Enum.slice/2`.
  """
  def get_words_for_range(words, %Range{} = range) do
    words
    |> Enum.slice(range)
    |> Enum.map_join(" ", &Map.get(&1, "text"))
  end

  def get_words_for_range(words, [first, last]), do: get_words_for_range(words, first..last)

  @doc """
  Keeps the commentaries whose `"word_range"` (a two-element `[first, last]`
  list) overlaps `range`, i.e. is not disjoint from it.
  """
  def filter_commentaries_containing_range(commentaries, %Range{} = range) do
    Enum.filter(commentaries, fn commentary ->
      [first, last] = Map.get(commentary, "word_range")

      not Range.disjoint?(first..last, range)
    end)
  end
end

# NOTE(review): this is an absolute path on the author's machine; adjust it to
# wherever the canonical JSON file lives before running the cell.
path =
  "/Users/pletcher/code/ajmc/ajmc-multicommentary/priv/static/json/3467O2_tess_retrained.json"

# File.read!/1 and Jason.decode!/1 both raise on failure.
s = File.read!(path)
json = Jason.decode!(s)

children = json |> Map.get("children")
# Only lemmas labelled "word-anchor" are kept.
lemmas = children |> Map.get("lemmas") |> Enum.filter(&(Map.get(&1, "label") == "word-anchor"))
words = children |> Map.get("words")
regions = children |> Map.get("regions")

# Exploration only: the result is not bound to a variable. Skips everything
# before the first "primary_text" region, then groups consecutive regions that
# share a region type.
regions
|> Enum.drop_while(&(Map.get(&1, "region_type") != "primary_text"))
|> Enum.chunk_by(&Map.get(&1, "region_type"))

commentaries = regions |> Enum.filter(&(Map.get(&1, "region_type") == "commentary"))

# Enrich every lemma with:
#   "words"                  - the lemma's own words, joined with spaces
#   "content"                - the lemma words followed by the commentary text
#                              found between this lemma and the next one
#   "commentary_word_ranges" - the word ranges of the commentary regions used
#
# Each lemma is paired with its successor (nil for the last lemma) up front, so
# the successor does not have to be looked up by index on every iteration.
prepped_lemmas =
  lemmas
  |> Enum.zip(Enum.drop(lemmas, 1) ++ [nil])
  |> Enum.map(fn {lemma, next_lemma} ->
    [lemma_first, lemma_last] = Map.get(lemma, "word_range")
    lemma_range = lemma_first..lemma_last
    lemma_words = WordText.get_words_for_range(words, lemma_range)

    case next_lemma do
      nil ->
        # Last lemma: only the commentary regions overlapping the lemma itself.
        matching = WordText.filter_commentaries_containing_range(commentaries, lemma_range)

        # NOTE(review): this match requires the split to yield exactly two
        # parts, i.e. non-empty text on both sides of the lemma words — confirm
        # that holds for the input data.
        [_, g] =
          matching
          |> Enum.map_join(" ", &WordText.get_words_for_range(words, Map.get(&1, "word_range")))
          |> String.split(lemma_words, parts: 2, trim: true)

        glossa = String.trim(lemma_words <> g)

        Map.merge(lemma, %{
          "content" => glossa,
          "words" => lemma_words,
          "commentary_word_ranges" => Enum.map(matching, &Map.get(&1, "word_range"))
        })

      next_lemma ->
        [next_lemma_first, next_lemma_last] = Map.get(next_lemma, "word_range")
        next_lemma_range = next_lemma_first..next_lemma_last
        next_lemma_words = WordText.get_words_for_range(words, next_lemma_range)

        # Commentary regions overlapping the span from the start of this lemma
        # to the start of the next one.
        matching =
          WordText.filter_commentaries_containing_range(
            commentaries,
            lemma_first..next_lemma_first
          )

        g =
          Enum.map_join(
            matching,
            " ",
            &WordText.get_words_for_range(words, Map.get(&1, "word_range"))
          )

        # Drop everything up to and including this lemma's words ...
        no_lemma = String.split(g, lemma_words, parts: 2, trim: true) |> List.last()

        # ... and everything from the next lemma's words onwards.
        no_next_lemma =
          String.split(no_lemma, next_lemma_words, parts: 2, trim: true) |> List.first()

        glossa = String.trim(lemma_words <> no_next_lemma)

        Map.merge(lemma, %{
          "content" => glossa,
          "words" => lemma_words,
          "commentary_word_ranges" => Enum.map(matching, &Map.get(&1, "word_range"))
        })
    end
  end)

# Matches selectors of the form `tei-l@n=<line>[<offset>]:tei-l@n=<line>[<offset>]`.
# The group names are the keys destructured from Regex.named_captures/2 when the
# comment elements are created; all captured values are strings of digits.
passage_regex =
  ~r/tei-l@n=(?<first_line_n>\d+)\[(?<first_line_offset>\d+)\]:tei-l@n=(?<last_line_n>\d+)\[(?<last_line_offset>\d+)\]/

# URN prefix used to look up text nodes as "<urn>:<line number>".
urn = "urn:cts:greekLit:tlg0011.tlg003.ajmc-lj"

{:ok, comment_element_type} = ElementTypes.find_or_create_element_type(%{name: "comment"})

# Create "comment" text elements for every prepared lemma that has an
# "anchor_target". Lemmas without one are skipped. The anchor target is a JSON
# string whose "selector" is parsed with `passage_regex`; a selector that does
# not match produces a warning instead of an element.
prepped_lemmas
|> Enum.each(fn lemma ->
  with anchor_target when not is_nil(anchor_target) <- Map.get(lemma, "anchor_target") do
    selector = anchor_target |> Jason.decode!() |> Map.get("selector")
    content = lemma |> Map.get("content") |> String.replace("- ", "")
    # The lemma without its "content" is stored as the element's attributes.
    attributes = Map.delete(lemma, "content")

    case Regex.named_captures(passage_regex, selector) do
      %{
        "first_line_n" => first_line_n,
        "first_line_offset" => first_line_offset,
        "last_line_n" => last_line_n,
        "last_line_offset" => last_line_offset
      } ->
        # Parse the captures once so that line numbers are compared numerically
        # and every offset handed to TextElements is an integer, like the
        # String.length/1 based offset used below.
        start_n = String.to_integer(first_line_n)
        end_n = String.to_integer(last_line_n)
        start_offset = String.to_integer(first_line_offset)
        end_offset = String.to_integer(last_line_offset)

        if start_n != end_n do
          # If the lemma spans multiple lines, create two comments
          # (This is essentially how Word handles block-spanning comments in their docx)
          {{first_n, first_offset}, {last_n, last_offset}} =
            if start_n < end_n do
              {{start_n, start_offset}, {end_n, end_offset}}
            else
              {{end_n, end_offset}, {start_n, start_offset}}
            end

          first_text_node = TextNodes.get_text_node_by(%{urn: "#{urn}:#{first_n}"})

          # First comment: from the start offset to the end of the earlier line;
          # it carries the comment content.
          {:ok, _text_element} =
            TextElements.find_or_create_text_element(%{
              attributes: attributes,
              content: content,
              end_offset: String.length(first_text_node.text),
              element_type_id: comment_element_type.id,
              end_text_node_id: first_text_node.id,
              start_offset: first_offset,
              start_text_node_id: first_text_node.id
            })

          last_text_node = TextNodes.get_text_node_by(%{urn: "#{urn}:#{last_n}"})

          # Second comment: from the beginning of the later line to the end
          # offset, with empty content and a "comment_end" marker. The marker
          # uses a string key so the attributes map stays uniformly string-keyed.
          {:ok, _text_element} =
            TextElements.find_or_create_text_element(%{
              attributes: Map.put(attributes, "comment_end", true),
              content: "",
              end_offset: last_offset,
              element_type_id: comment_element_type.id,
              end_text_node_id: last_text_node.id,
              start_offset: 0,
              start_text_node_id: last_text_node.id
            })
        else
          text_node = TextNodes.get_text_node_by(%{urn: "#{urn}:#{start_n}"})

          {:ok, _text_element} =
            TextElements.find_or_create_text_element(%{
              attributes: attributes,
              content: content,
              end_offset: end_offset,
              element_type_id: comment_element_type.id,
              end_text_node_id: text_node.id,
              start_offset: start_offset,
              start_text_node_id: text_node.id
            })
        end

      nil ->
        IO.warn("Invalid selector #{selector}.")
    end
  end
end)

Other notebooks:

Ryo Wakabayashi
@RyoWakabayashi

elixir-learning

Image Classification Client

Image_classification_client.livemd

nx exla axon_onnx evision flow req kino

2023-8-21
Charlie Roth
@charlieroth

makemore

Makemore

makemore.livemd

explorer kino axon tucan kino_vega_lite

2024-1-4
Michael Russo
@mjrusso

hex2txt

hex2context

hex2context.livemd

sqlite_vec ecto ecto_sql ecto_sqlite3 kino nx bumblebee exla axon text_chunker req plug

2024-12-24
@DockYard-Academy

curriculum

Anagram

anagram.livemd

jason kino youtube hidden_cell

2023-3-21
Shane Sveller
@shanesveller

advent-of-code

2023-09

2023-09.livemd

benchee exla kino kino_aoc nimble_parsec nx

2023-12-11
Alex Filatov
@alexfilatov

deepgram-elixir-sdk

Deepgram AI Voice Agent Examples

ai_agent_examples.livemd

deepgram kino jason

2025-7-15
Lucian Parvu
@lucian

mls

erlang multiline examples

examples.livemd

mls bbmustache jsx

2024-4-6

Back