Powered by AppSignal & Oban Pro

01_pdf_extraction.livemd

notebooks/pdf_extraction.livemd

01_pdf_extraction.livemd



Mix.install([
  {:pdf, "~> 0.7.2"},
  {:tablex, "~> 0.3.1"},
  {:nx, "~> 0.5"},
  {:jason, "~> 1.4"}
])

Text (via pdftotext)

# Load the sample invoice through the PDF library and print the text of
# every page, each preceded by its page number.
# NOTE(review): `PDF.from_file/1` and the `pages` / `number` / `text` fields
# are assumed to be provided by the `:pdf` dependency — TODO confirm this API.
pdf_path = "data/sample_pdfs/invoicesample.pdf" |> Path.expand()

# A non-`{:ok, _}` result fails the match and stops the cell here.
{:ok, pdf} = PDF.from_file(pdf_path)

Enum.each(pdf.pages, fn page ->
  IO.puts("Page #{page.number}:")
  IO.puts(page.text)
end)
# Run the `pdftotext` command-line tool on the sample invoice and print
# whatever it writes to standard output.
pdf_path = "data/sample_pdfs/invoicesample.pdf" |> Path.expand()

pdftotext_args = ["-layout", pdf_path, "-"]

# Matching the exit status against 0 makes the cell fail with a MatchError
# when the tool exits with any other status.
{output, 0} = System.cmd("pdftotext", pdftotext_args)

IO.puts(output)

Tables (rows split on runs of whitespace)

# Split the text held in `output` into rows of cells.
#
# Lines are separated on any run of CR, LF or form-feed characters, so a
# page-break marker or Windows line ending never ends up inside a cell.
# Each line is trimmed and blank lines are dropped before splitting, which
# avoids empty `[]` rows and stray leading spaces in the first cell.
# Cells within a line are separated by two or more whitespace characters.
rows =
  output
  |> String.split(~r/[\r\n\f]+/, trim: true)
  |> Enum.map(&String.trim/1)
  |> Enum.reject(&(&1 == ""))
  |> Enum.map(&String.split(&1, ~r/\s{2,}/, trim: true))

IO.inspect(rows, label: "Extracted rows")

Images (via pdfimages)

# Extract the images embedded in the PDF at `pdf_path` with the `pdfimages`
# CLI, using "outputs/invoice_img" as the output file prefix.
# The target directory is created first so the tool has somewhere to write
# (assumes pdfimages does not create missing directories — TODO confirm).
File.mkdir_p!("outputs")

{images_log, images_status} =
  System.cmd("pdfimages", ["-png", pdf_path, "outputs/invoice_img"], stderr_to_stdout: true)

# Report a failure instead of silently discarding the result; the notebook
# keeps running either way.
if images_status != 0 do
  IO.puts("pdfimages failed (exit status #{images_status}): #{images_log}")
end
# Run `ocrmypdf` on the PDF at `pdf_path`, writing the result to `ocr_path`,
# then print the tool's log and its exit status.
ocr_path = "outputs/ocr_invoice.pdf"

# `stderr_to_stdout: true` folds the tool's stderr into the captured output;
# without it `log` only holds stdout and diagnostics are lost. This matches
# the later OCR cells in this notebook.
{log, status} = System.cmd("ocrmypdf", [pdf_path, ocr_path], stderr_to_stdout: true)

IO.puts("OCR log: #{log}")
# Surface the exit status so a failed run is visible (it was bound but unused).
IO.puts("OCR exit status: #{status}")

# OCR the sample invoice using absolute input and output paths, print the
# captured standard output and the exit status, then check whether the
# output file exists.
pdf_path = "data/sample_pdfs/invoicesample.pdf" |> Path.expand()
output_path = "outputs/ocr_invoice.pdf" |> Path.expand()

{result, status} =
  System.cmd(
    "ocrmypdf",
    [pdf_path, output_path]
  )

IO.puts(result)
IO.inspect(status)

# The value of this check is the last expression of the block.
File.exists?("outputs/ocr_invoice.pdf")

# OCR the sample invoice again, this time folding the tool's stderr into the
# captured output so its diagnostics are printed along with stdout.
pdf_path = "data/sample_pdfs/invoicesample.pdf" |> Path.expand()
output_path = "outputs/ocr_invoice.pdf" |> Path.expand()

{result, status} =
  System.cmd(
    "ocrmypdf",
    [pdf_path, output_path],
    stderr_to_stdout: true
  )

IO.puts(result)
IO.inspect(status)
# OCR the scanned article, passing `--force-ocr` to the tool, and print the
# combined stdout/stderr output followed by the exit status.
pdf_path = "data/sample_pdfs/scanned_article.pdf" |> Path.expand()
output_path = "outputs/ocr_scanned.pdf" |> Path.expand()

{result, status} =
  System.cmd(
    "ocrmypdf",
    ["--force-ocr", pdf_path, output_path],
    stderr_to_stdout: true
  )

IO.puts(result)
IO.inspect(status)
# Gather what was extracted from the OCR'd scan (text, table rows, image file
# names, OCR output path) and save it as pretty-printed JSON.
ocr_path = "outputs/ocr_scanned.pdf" |> Path.expand()

# Read the text of the OCR'd PDF; a non-zero exit status fails the match.
{text_output, 0} = System.cmd("pdftotext", ["-layout", ocr_path, "-"])

# No table rows are produced for this document.
rows = []

# File names in "outputs" that contain the scanned-article image marker.
# NOTE(review): no visible cell writes files containing "scanned_article_img"
# (the pdfimages call uses the "invoice_img" prefix), so this list is
# presumably empty — confirm whether an extraction step is missing.
images =
  "outputs"
  |> File.ls!()
  |> Enum.filter(fn name -> String.contains?(name, "scanned_article_img") end)

results = %{
  text: text_output,
  tables: rows,
  images: images,
  ocr_pdf: "outputs/ocr_scanned.pdf"
}

json = Jason.encode!(results, pretty: true)
File.write!("outputs/scanned_article.json", json)
IO.puts("Saved structured results to outputs/scanned_article.json")