Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Bible Indexing

lib/data/bible_index.livemd

Bible Indexing

Download King James Bible

# Install the notebook's dependencies, then download the Project Gutenberg text.
Mix.install([
  {:httpoison, "~> 1.8"},
  {:stemmer, "~> 1.0"},
  {:flow, "~> 1.1"}
])

# `bible` is bound to the body of the HTTP response.
%{body: bible} =
  "https://www.gutenberg.org/files/10/10-0.txt"
  |> HTTPoison.get!()

Split the Bible into lines of text

# Break the download into individual lines on CRLF and print how many there are.
all_lines = String.split(bible, "\r\n")
all_lines |> length() |> IO.puts()

# Discard the first 99 and the last 353 lines, keeping what lies between.
lines =
  all_lines
  |> Enum.drop(99)
  |> Enum.drop(-353)

# Position of the final retained line.
last_line_of_revelation = length(lines) - 1

Find the start of Genesis

# Show the first eight retained lines.
Enum.take(lines, 8)

Find the end of Revelation

# Show whatever remains from the last retained line onward.
Enum.drop(lines, last_line_of_revelation)

Re-combine remaining text lines

# Glue the retained lines back into one string separated by "\n".
recombined_text = Enum.join(lines, "\n")

Remove KJV book titles between verses

# Book headings that get stripped from the text before it is split into verses.
# Each string is interpolated verbatim into a search pattern, so spacing is
# significant (note the two spaces after "Moses:" in the first entry).
full_book_names = [
  "The First Book of Moses:  Called Genesis",
  "The Second Book of Moses: Called Exodus",
  "The Third Book of Moses: Called Leviticus",
  "The Fourth Book of Moses: Called Numbers",
  "The Fifth Book of Moses: Called Deuteronomy",
  "The Book of Joshua",
  "The Book of Judges",
  "The Book of Ruth",
  "The First Book of Samuel",
  "The Second Book of Samuel",
  "The First Book of the Kings",
  "The Second Book of the Kings",
  "The First Book of the Chronicles",
  "The Second Book of the Chronicles",
  "Ezra",
  "The Book of Nehemiah",
  "The Book of Esther",
  "The Book of Job",
  "The Book of Psalms",
  "The Proverbs",
  "Ecclesiastes",
  "The Song of Solomon",
  "The Book of the Prophet Isaiah",
  "The Book of the Prophet Jeremiah",
  "The Lamentations of Jeremiah",
  "The Book of the Prophet Ezekiel",
  "The Book of Daniel",
  "Hosea",
  "Joel",
  "Amos",
  "Obadiah",
  "Jonah",
  "Micah",
  "Nahum",
  "Habakkuk",
  "Zephaniah",
  "Haggai",
  "Zechariah",
  "Malachi",
  "The Gospel According to Saint Matthew",
  "The Gospel According to Saint Mark",
  "The Gospel According to Saint Luke",
  "The Gospel According to Saint John",
  "The Acts of the Apostles",
  "The Epistle of Paul the Apostle to the Romans",
  "The First Epistle of Paul the Apostle to the Corinthians",
  "The Second Epistle of Paul the Apostle to the Corinthians",
  "The Epistle of Paul the Apostle to the Galatians",
  "The Epistle of Paul the Apostle to the Ephesians",
  "The Epistle of Paul the Apostle to the Philippians",
  "The Epistle of Paul the Apostle to the Colossians",
  "The First Epistle of Paul the Apostle to the Thessalonians",
  "The Second Epistle of Paul the Apostle to the Thessalonians",
  "The First Epistle of Paul the Apostle to Timothy",
  "The Second Epistle of Paul the Apostle to Timothy",
  "The Epistle of Paul the Apostle to Titus",
  "The Epistle of Paul the Apostle to Philemon",
  "The Epistle of Paul the Apostle to the Hebrews",
  "The General Epistle of James",
  "The First Epistle General of Peter",
  "The Second General Epistle of Peter",
  "The First Epistle General of John",
  "The Second Epistle General of John",
  "The Third Epistle General of John",
  "The General Epistle of Jude",
  "The Revelation of Saint John the Divine"
]

# Walk the heading list and delete each heading, together with the four
# newlines before it and the two after it, from the recombined text.
raw_verses =
  List.foldl(full_book_names, recombined_text, fn heading, text ->
    String.replace(text, "\n\n\n\n#{heading}\n\n", "")
  end)

Find verse labels

# Matches a "chapter:verse" label such as "3:16". The two numbers are exposed
# as the named groups "chapter" and "verse", which the mapping below reads.
# (The group names are required: `(?\d+)` is not a valid group.)
verse_match = ~r/(?<chapter>\d+):(?<verse>\d+)/

# One %{chapter: integer, verse: integer} map for every whitespace-separated
# token of recombined_text that contains a label, in order of appearance.
verse_labels =
  recombined_text
  |> String.split(~r/\s/)
  |> Enum.map(fn token ->
    Regex.named_captures(verse_match, token)
  end)
  # named_captures/2 yields nil for tokens that carry no label
  |> Enum.reject(&is_nil/1)
  |> Enum.map(fn %{"chapter" => chapter, "verse" => verse} ->
    %{
      chapter: String.to_integer(chapter),
      verse: String.to_integer(verse)
    }
  end)

Find text of each verse

# Text found between consecutive chapter:verse labels, whitespace-trimmed.
# The first piece (everything ahead of the first label) is thrown away.
verse_list =
  ~r/\d+:\d+/
  |> Regex.split(raw_verses)
  |> Enum.drop(1)
  |> Enum.map(fn piece -> String.trim(piece) end)

verse_list |> length() |> IO.puts()

# Bare expression carried over from the original; its value is not used here.
verse_list

# Serialize in Erlang's external term format, gzip it, and save to disk.
content = :zlib.gzip(:erlang.term_to_binary(verse_list))

File.write!("verse_list.data", content)

Include verse text in the verse array

# Combine each label with its position and its text as {coordinate, index, text}.
verse_database =
  for {{coordinate, index}, text} <- Enum.zip(Enum.with_index(verse_labels), verse_list) do
    {coordinate, index, text}
  end

Index the Bible by book

# Short book names used as the :book value in verse coordinates and as keys of
# the book index. They are zipped positionally against the verses labelled 1:1,
# so the order of this list matters.
books_of_the_bible = [
  "Genesis",
  "Exodus",
  "Leviticus",
  "Numbers",
  "Deuteronomy",
  "Joshua",
  "Judges",
  "Ruth",
  "First Samuel",
  "Second Samuel",
  "First Kings",
  "Second Kings",
  "First Chronicles",
  "Second Chronicles",
  "Ezra",
  "Nehemiah",
  "Esther",
  "Job",
  "Psalms",
  "Proverbs",
  "Ecclesiastes",
  "Song of Solomon",
  "Isaiah",
  "Jeremiah",
  "Lamentations",
  "Ezekiel",
  "Daniel",
  "Hosea",
  "Joel",
  "Amos",
  "Obadiah",
  "Jonah",
  "Micah",
  "Nahum",
  "Habakkuk",
  "Zephaniah",
  "Haggai",
  "Zechariah",
  "Malachi",
  "Matthew",
  "Mark",
  "Luke",
  "John",
  "Acts",
  "Romans",
  "First Corinthians",
  "Second Corinthians",
  "Galatians",
  "Ephesians",
  "Philippians",
  "Colossians",
  "First Thessalonians",
  "Second Thessalonians",
  "First Timothy",
  "Second Timothy",
  "Titus",
  "Philemon",
  "Hebrews",
  "James",
  "First Peter",
  "Second Peter",
  "First John",
  "Second John",
  "Third John",
  "Jude",
  "Revelation"
]

# Entries of the verse database whose label is exactly chapter 1, verse 1.
chapter1_indexes =
  for {coordinate, _index, _text} = entry <- verse_database,
      coordinate == %{chapter: 1, verse: 1},
      do: entry

# Pair those entries positionally with the book names, producing
# {coordinate-with-:book, verse index} tuples.
starting_verse_book_index =
  for {{coordinate, index, _text}, book} <- Enum.zip(chapter1_indexes, books_of_the_bible) do
    {Map.put(coordinate, :book, book), index}
  end

Book verse ranges

# Index of the final verse. The ranges below are expressed in verse positions,
# so the last book must end here rather than at last_line_of_revelation, which
# is an index into the text lines.
last_verse_index = length(verse_labels) - 1

# List of {book, {first_verse_index, last_verse_index}} in canonical order.
# Each book ends one position before the next book starts; the final book
# (the only chunk with a single element) ends at the last verse.
starting_verse_index =
  starting_verse_book_index
  |> Enum.map(fn {coordinate, start} -> {start, coordinate} end)
  |> Enum.sort()
  |> Enum.chunk_every(2, 1)
  |> Enum.map(fn
    [{a_ref, %{book: book}}, {b_ref, _b_coord}] ->
      {book, {a_ref, b_ref - 1}}

    [{verse_ref, %{book: book}}] ->
      {book, {verse_ref, last_verse_index}}
  end)

# Same data keyed by book name for direct lookup.
book_index =
  starting_verse_index
  |> Map.new()

Verse index

# Return the name of the first book whose {low, high} range contains verse_id.
find_book = fn verse_id ->
  in_range? = fn {_book, {low, high}} -> verse_id in low..high end
  elem(Enum.find(starting_verse_index, in_range?), 0)
end

# Every verse label, extended with the book it falls in.
verse_labels_list =
  for {coordinate, index} <- Enum.with_index(verse_labels) do
    Map.put(coordinate, :book, find_book.(index))
  end

# %{book, chapter, verse} => verse position
verse_index = Map.new(Enum.with_index(verse_labels_list))

# verse position => %{book, chapter, verse}
reverse_verse_index = Map.new(verse_index, fn {label, position} -> {position, label} end)

# Serialize in Erlang's external term format, gzip it, and save to disk.
content = :zlib.gzip(:erlang.term_to_binary(reverse_verse_index))

File.write!("reverse_verse_index.data", content)

How many verses in the Bible?

# Number of chapter:verse labels that were found.
length(verse_labels)

Find the start of the book of John

# Tuple form of the verse texts, so a verse can be read by position with elem/2.
verse_tuple = List.to_tuple(verse_list)

# Print the verse at the first position of John's range.
{low, high} = book_index["John"]
verse_tuple |> elem(low) |> IO.puts()

Expand lines of text around the line searched for context

# Returns {verse_number, label, context}, where label is the entry of
# reverse_verse_index for that position (nil when absent) and context is the
# text at verse_number and the four positions after it.
# Positions are clamped to the bounds of verse_tuple on both sides, so asking
# for one of the last verses repeats the final verse instead of raising.
report_verse = fn verse_number ->
  last_index = tuple_size(verse_tuple) - 1

  context =
    Enum.map(0..4, fn offset ->
      position = (verse_number + offset) |> max(0) |> min(last_index)
      elem(verse_tuple, position)
    end)

  {verse_number, Map.get(reverse_verse_index, verse_number), context}
end

{low, high} = Map.get(book_index, "John")

report_verse.(low)

Verse Search

# Prompt for a book name, chapter number and verse number.
book =
  IO.gets("book")
  |> String.trim()

chapter =
  IO.gets("chapter")
  |> String.trim()
  |> String.to_integer()

verse =
  IO.gets("verse")
  |> String.trim()
  |> String.to_integer()

key = %{book: book, chapter: chapter, verse: verse}

# Look the reference up without rebinding verse_index (the map stays a map),
# and report an unknown reference instead of crashing in elem/2 on nil.
case Map.fetch(verse_index, key) do
  {:ok, verse_number} ->
    IO.puts("#{book} #{chapter}:#{verse}\n")
    IO.puts(elem(verse_tuple, verse_number))

  :error ->
    IO.puts("#{book} #{chapter}:#{verse} is not in the verse index")
end

String sanitization

When searching for exact text from imperfect human memory, we are often wrong in terms of tense, gender, number, etc. It would be nice to remove these sources of inaccuracy from consideration. We can approximate this approach with a technique called stemming. In this process, a word is converted into a root string of letters. In fact, the resulting root may not be an English word!

# Turns a line of text into a list of distinct word stems.
#
# Steps: remove chapter:verse labels, split on non-word characters, drop empty
# pieces, lowercase, stem, then de-duplicate. De-duplication happens after
# stemming so that different words sharing a stem contribute that stem once.
string_sanitization = fn line ->
  line = String.replace(line, ~r/\d+:\d+/, "")

  Regex.split(~r/\W/, line, trim: true)
  |> Enum.map(&String.downcase/1)
  |> Enum.map(&Stemmer.stem/1)
  |> Enum.uniq()
end

string_sanitization.("Jesus wept.")

Create word index

# with Flow
word_index =
  verse_list
  |> Stream.with_index()
  |> Flow.from_enumerable()
  |> Flow.map(fn {verse, number} ->
    {string_sanitization.(verse), number}
  end)
  |> Flow.filter(fn {x, _} -> x != [] end)
  |> Enum.to_list()

# With Enum
# verse_list
# |> Enum.with_index()
# |> Enum.map(fn {verse, number} -> 
#   {string_sanitization.(verse), number} end)
# |> Enum.filter(fn {x, _} -> x != [] end)

Create Inverse index of words

# Inverted index: stem => ascending list of verse numbers containing it.
reverse_index =
  for({words, line} <- word_index, word <- words, do: {word, line})
  |> Enum.group_by(fn {word, _line} -> word end, fn {_word, line} -> line end)
  |> Map.new(fn {word, line_numbers} -> {word, Enum.sort(line_numbers)} end)

# Keep only the stems that have fewer than 1000 entries.
reverse_index =
  for {word, line_numbers} <- reverse_index, length(line_numbers) < 1000, into: %{} do
    {word, line_numbers}
  end

# Serialize in Erlang's external term format, gzip it, and save to disk.
content = :zlib.gzip(:erlang.term_to_binary(reverse_index))

File.write!("inverse_index.data", content)

Search from inverse index

# Verse numbers for each of the two stems. A stem can be missing from
# reverse_index (stems with 1000 or more entries were removed), so default to
# an empty list rather than letting `++` raise on nil.
jesus = Map.get(reverse_index, "jesus", [])
wept = Map.get(reverse_index, "wept", [])

# A verse number that occurs more than once in the combined list is printed
# and returned as {line_number, :ok, text}; :ok is the value of IO.puts/1.
(jesus ++ wept)
|> Enum.frequencies()
|> Enum.filter(fn {_line, count} -> count > 1 end)
|> Enum.map(fn {line_number, _count} ->
  text = elem(verse_tuple, line_number)
  {line_number, IO.puts(text), text}
end)

Search the Bible

# Free-text search: stem the query, collect the verse numbers of every stem,
# and keep the verses that matched the largest number of stems.
# Prints each hit and binds `input` to a list of
# {verse_number, label, text} tuples.
input =
  IO.gets("query")
  |> string_sanitization.()
  |> Enum.flat_map(fn word ->
    Map.get(reverse_index, word, [])
  end)
  # verse_number => number of query stems found in that verse
  |> Enum.frequencies()
  #  |> Enum.filter(fn {_k, v} -> v > 1 end)
  |> Enum.group_by(fn {_verse_number, hits} -> hits end)
  # highest hit count first; only that group is kept
  |> Enum.sort(&(&1 > &2))
  |> Enum.take(1)
  |> Enum.flat_map(fn {_hits, matches} -> matches end)
  # every remaining entry has the same hit count, so this orders by verse number
  |> Enum.sort(fn {k1, v1}, {k2, v2} -> {k2, v1} > {k1, v2} end)
  |> Enum.map(fn {verse_number, _} -> verse_number end)
  |> Enum.map(fn verse_number ->
    %{book: book, chapter: chapter, verse: verse} = Map.get(reverse_verse_index, verse_number)
    IO.puts("#{book} #{chapter}:#{verse}")
    IO.puts(elem(verse_tuple, verse_number))
    {verse_number, Map.get(reverse_verse_index, verse_number), elem(verse_tuple, verse_number)}
  end)