Powered by AppSignal & Oban Pro

Bible Scrap

bible-scraper.livemd

Bible Scrap

Mix.install([
  {:req, "~> 0.5.16"},
  {:floki, "~> 0.38.0"}
])

Section

# Download the Genesis 1 (NRSVUE) passage page and keep only the response body.
%Req.Response{body: html} =
  Req.get!("https://www.biblegateway.com/passage/?search=genesis+1&version=NRSVUE")

# Parse the HTML into a Floki document tree.
doc = Floki.parse_document!(html)

# Narrow the tree to the passage container, then collect the text of its <h3> headings.
passage = Floki.find(doc, ".passage-content")
titles = passage |> Floki.find("h3") |> Floki.text()
defmodule CrossrefScraper do
  @moduledoc """
  Scrapes `<li>` elements structured as Floki-like tuples into a map of
  id => list of Bible refs.

  Each `<li>` is expected to look like `{"li", attrs, children}`. The key is
  the value of its `id` attribute; the value comes from the `data-bibleref`
  attribute of the first `<a>` (searched depth-first) whose `class` attribute
  contains the `crossref-link` class.
  """

  @doc """
  Builds the `id => [ref, ...]` map from a list of nodes.

  Nodes that are not `{"li", attrs, children}` tuples are ignored, as are
  `<li>` nodes without an `id` or without a usable cross-reference link.
  If two `<li>` nodes share an id, the later one wins.
  """
  def scrape(li_nodes) when is_list(li_nodes) do
    for {"li", attrs, children} <- li_nodes,
        id = get_attr(attrs, "id"),
        refs = find_crossref_data_bibleref(children),
        into: %{} do
      {id, split_refs(refs)}
    end
  end

  # Returns the value of attribute `key`, or nil when it is absent.
  defp get_attr(attrs, key),
    do: Enum.find_value(attrs, fn {k, v} -> if k == key, do: v end)

  # True when the whitespace-separated `class` attribute contains `class`.
  # Token matching (instead of string equality) also accepts multi-class
  # values such as "crossref-link extra".
  defp has_class?(attrs, class) do
    class in String.split(get_attr(attrs, "class") || "")
  end

  # Depth-first search for the first crossref link carrying `data-bibleref`.
  defp find_crossref_data_bibleref(nodes) do
    Enum.find_value(nodes, fn
      {"a", attrs, _children} ->
        if has_class?(attrs, "crossref-link"), do: get_attr(attrs, "data-bibleref")

      {_tag, _attrs, children} when is_list(children) ->
        find_crossref_data_bibleref(children)

      _other ->
        nil
    end)
  end

  # Splits a comma-separated reference string into trimmed entries.
  defp split_refs(nil), do: []

  defp split_refs(str) do
    str
    |> String.split(~r/,\s*/)
    |> Enum.map(&String.trim/1)
  end
end
  defmodule FootnoteScraper do
    @moduledoc """
    Scrapes `<li>` elements containing a `<span class="footnote-text">` into a
    map of id => text.

    Each `<li>` is expected to be a Floki-like `{"li", attrs, children}` tuple.
    The key is the value of its `id` attribute; the value is the plain text of
    the first footnote-text span found depth-first, with runs of whitespace
    collapsed to a single space and the ends trimmed.
    """

    @doc """
    Builds the `id => text` map from a list of nodes.

    Nodes that are not `{"li", attrs, children}` tuples are ignored, as are
    `<li>` nodes without an `id` or without a footnote-text span. If two
    `<li>` nodes share an id, the later one wins.
    """
    def scrape(li_nodes) when is_list(li_nodes) do
      for {"li", attrs, children} <- li_nodes,
          id = get_attr(attrs, "id"),
          text = find_footnote_text(children),
          into: %{} do
        {id, normalize_text(text)}
      end
    end

    # Extract attribute from tag attrs; nil when the attribute is absent.
    defp get_attr(attrs, key),
      do: Enum.find_value(attrs, fn {k, v} -> if k == key, do: v end)

    # True when the whitespace-separated `class` attribute contains `class`,
    # so multi-class values such as "footnote-text extra" also match.
    defp has_class?(attrs, class) do
      class in String.split(get_attr(attrs, "class") || "")
    end

    # Recursively find the first <span class="footnote-text"> and return its
    # flattened text. Spans without that class are searched as well, so a
    # footnote-text span wrapped in another span is still found.
    defp find_footnote_text(nodes) when is_list(nodes) do
      Enum.find_value(nodes, fn
        {"span", attrs, children} ->
          if has_class?(attrs, "footnote-text") do
            flatten_text(children)
          else
            find_footnote_text(children)
          end

        {_tag, _attrs, children} when is_list(children) ->
          find_footnote_text(children)

        _other ->
          nil
      end)
    end

    defp find_footnote_text(_), do: nil

    # Recursively flatten nested tags into plain text.
    defp flatten_text(nodes) when is_list(nodes) do
      Enum.map_join(nodes, fn
        text when is_binary(text) -> text
        {_tag, _attrs, children} -> flatten_text(children)
        _other -> ""
      end)
    end

    defp flatten_text(_), do: ""

    # Normalize whitespace and trim.
    defp normalize_text(text) do
      text
      |> String.replace(~r/\s+/, " ")
      |> String.trim()
    end
  end
  # Lookup table: cross-reference <li> id => list of Bible references.
  crossrefs =
    passage
    |> Floki.find("div.crossrefs ol li")
    |> CrossrefScraper.scrape()

  # Lookup table: footnote <li> id => footnote text.
  footnotes =
    passage
    |> Floki.find("div.footnotes ol li")
    |> FootnoteScraper.scrape()
    defmodule VerseScraper do
      @moduledoc """
      Scrapes a verse span (a Floki-style `{"span", attrs, children}` tuple)
      into a map with the verse number and its content segments:

          %{
            verse: 1,
            content: [
              %{text: "...", footnotes: ["..."], crossrefs: ["..."]}
            ]
          }

      `content` is in document order. Each `<sup>` carrying a `data-fn` or
      `data-cr` attribute adds that id (without a leading `#`) to the
      `:footnotes` / `:crossrefs` list of the segment that precedes it.
      """

      @doc """
      Converts one verse span into `%{verse: integer, content: [segment]}`.

      Raises `FunctionClauseError` for anything that is not a
      `{"span", attrs, children}` tuple.
      """
      def scrape({"span", _attrs, children} = span) do
        %{
          verse: parse_num(span),
          content: parse_content(children)
        }
      end

      @doc """
      Returns the verse number taken from the `.versenum` element(s) of `span`.

      All non-digit characters are stripped from the element text. When no
      digits remain (including when there is no `.versenum` element at all)
      the result is `1`.
      """
      def parse_num(span) do
        digits =
          span
          |> Floki.find(".versenum")
          |> Floki.text()
          |> String.replace(~r/[^\d]/, "")

        case digits do
          "" -> 1
          _ -> String.to_integer(digits)
        end
      end

      # Walks the child nodes and returns the trimmed segments in document order.
      defp parse_content(nodes) do
        nodes
        |> do_parse([])
        |> Enum.map(&normalize_segment/1)
      end

      # `acc` holds the segments newest-first, so the segment a marker attaches
      # to is always the head; it is reversed exactly once, when input runs out.
      defp do_parse([], acc), do: Enum.reverse(acc)

      # Text node: becomes a new segment (unless it is blank, see add_segment/2).
      defp do_parse([text | rest], acc) when is_binary(text) do
        segment = %{text: clean_text(text), footnotes: [], crossrefs: []}
        do_parse(rest, add_segment(acc, segment))
      end

      # Skip the chapter number (matched only when `class` is the first attribute).
      defp do_parse([{"span", [{"class", "chapternum"} | _], _} | rest], acc),
        do: do_parse(rest, acc)

      # <sup>: footnote / cross-reference marker; its own text is never emitted.
      defp do_parse([{"sup", _, _} = sup | rest], acc),
        do: do_parse(rest, handle_sup(sup, acc))

      # Any other element: splice its children into the stream in place.
      defp do_parse([{_tag, _attrs, children} | rest], acc) when is_list(children),
        do: do_parse(children ++ rest, acc)

      # Anything else (comments, unexpected shapes) is ignored.
      defp do_parse([_other | rest], acc), do: do_parse(rest, acc)

      # A marker that appears before any segment exists has nothing to attach to.
      defp handle_sup({"sup", _attrs, _}, []), do: []

      defp handle_sup({"sup", attrs, _}, acc) do
        cond do
          fn_id = get_attr(attrs, "data-fn") ->
            attach_to_last(acc, :footnotes, strip_ref(fn_id))

          cr_id = get_attr(attrs, "data-cr") ->
            attach_to_last(acc, :crossrefs, strip_ref(cr_id))

          true ->
            acc
        end
      end

      # Appends `id` to the `key` list of the most recent segment (the head).
      defp attach_to_last([latest | older], key, id) do
        [Map.update(latest, key, [id], &(&1 ++ [id])) | older]
      end

      # The first segment is always kept, even when blank, which gives a marker
      # that precedes the first real text a segment to attach to.
      defp add_segment([], seg), do: [seg]

      # Later blank segments are dropped; everything else is prepended.
      defp add_segment(acc, %{text: text} = seg) do
        if String.trim(text) == "", do: acc, else: [seg | acc]
      end

      # Trim the segment text (inner whitespace is already collapsed by clean_text/1).
      defp normalize_segment(%{text: t} = seg),
        do: %{seg | text: String.trim(t)}

      # Returns the value of attribute `key`, or nil when it is absent.
      defp get_attr(attrs, key),
        do: Enum.find_value(attrs, fn {k, v} -> if k == key, do: v end)

      # Drops the leading "#" of an id reference.
      defp strip_ref("#" <> id), do: id
      defp strip_ref(id), do: id

      # Replaces non-breaking spaces and collapses whitespace runs to one space.
      defp clean_text(txt),
        do: txt |> String.replace("\u00A0", " ") |> String.replace(~r/\s+/, " ")
    end
    
    # Scratch check on a single span:
    # [span|span_passages] = passage |> Floki.find("p>span.text")
    # VerseScraper.scrape(span)

    # Scrape every verse span, then replace the footnote / cross-reference ids
    # collected per segment with the data from the `footnotes` and `crossrefs`
    # lookup tables.
    verses =
      passage
      |> Floki.find("p span.text")
      |> Enum.map(&VerseScraper.scrape/1)
      |> Enum.map(fn verse ->
        new_content =
          Enum.map(verse.content, fn content ->
            content
            # An id missing from the table contributes no references instead of
            # making flat_map raise on nil.
            |> Map.put(
              :crossrefs,
              Enum.flat_map(content.crossrefs, fn key -> Map.get(crossrefs, key, []) end)
            )
            # An id missing from the footnote table yields nil in the list.
            |> Map.put(:footnotes, Enum.map(content.footnotes, fn key -> footnotes[key] end))
          end)

        Map.put(verse, :content, new_content)
      end)