Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Job Description to Markdown

req--job_description.livemd

Job Description to Markdown

Mix.install([
  {:kino, github: "livebook-dev/kino", override: true},
  {:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
  {:floki, "~> 0.32"},
  {:pandex, "~> 0.2.0"},
  {:utilities, path: "utilities"}
])

Summary

Using the excellent req library, we fetch the HTML of the job post URL and convert its contents to Markdown. The conversion to Markdown is handled by pandex, which wraps the pandoc application.

NOTE: We require pandoc to be installed to convert from HTML to Markdown

Specifications

  1. [x] If the HTML file has not been downloaded yet:
    1. Use req to download the full HTML page into data/#{host}/#{hash}.html.
  2. [x] If the HTML file already exists:
    1. Ask to overwrite?
    2. Skip to next step (3).
  3. [x] Detect ATS system, one of [breezyhr, greenhouse, lever, unknown]
  4. [x] Parse HTML into Markdown
    1. [x] Pandoc
    2. [ ] Manually
    3. [ ] EasyHTML - Was unaware this was just a wrapper around Floki
  5. [x] Add [ ] to every list item
    1. We do this specifically to help with a manual checklist.
    2. I would not apply to a position that had very few checked off.
    3. This list conversion has been almost 100% universal, to the point that this process should “be a thing.”

Code

# Create a URL input widget labelled "URL".
url = Kino.Input.url("URL")
# Read the value currently entered in that input.
# NOTE(review): Livebook normally requires the input to be rendered in an earlier
# cell before it can be read — confirm the original cell split.
url_value = Kino.Input.read(url)
# The helper returns a {directory, filename} tuple for this URL (presumably the
# on-disk location of the downloaded page — verify against Utilities.Persistence).
# NOTE(review): neither binding is used in the code visible here — confirm whether
# other cells depend on them.
{directory, filename} = Utilities.Persistence.get_path(url_value)
# Extracts the page title as plain text, for use as the Markdown heading.
#
# `document` may be a raw HTML string or an already-parsed Floki tree.
# Returns the text of the first <title> element with surrounding whitespace
# trimmed, or "" when the document has no <title> element.
transform_title = fn document ->
  # Parse explicitly: handing a raw binary to Floki.find/2 is deprecated.
  tree = if is_binary(document), do: Floki.parse_document!(document), else: document

  case Floki.find(tree, "title") do
    # Floki.text/1 yields only the text content, so the <title>/</title> tags
    # never need to be stripped by hand. Trimming keeps stray newlines inside
    # the tag from breaking the single-line "# ..." heading.
    [title_node | _] -> title_node |> Floki.text() |> String.trim()
    # No <title> at all: fall back to an empty title instead of crashing on hd/1.
    [] -> ""
  end
end

# Converts an HTML document to Markdown via pandoc and prepends the page title
# as a level-1 heading.
#
# Returns a keyword list: [title: <title text>, content: <full Markdown>].
# Raises MatchError if the pandoc conversion does not return {:ok, _}.
transform_description = fn document ->
  {:ok, raw_markdown} = Pandex.html_to_markdown_strict(document)
  heading = transform_title.(document)

  checklist_markdown =
    raw_markdown
    # pandoc escapes the checkbox brackets; restore them to a literal "[ ]"
    |> String.replace("\\[ \\]", "[ ]")
    # normalise pandoc's "-   " bullet prefix on checkbox items to "* "
    |> String.replace("-   [ ]", "* [ ]")

  # The title goes on the first line, followed by a blank line and the body.
  [title: heading, content: "# #{heading}\n\n" <> checklist_markdown]
end

# Load the HTML associated with this URL through the persistence helper
# (presumably the copy downloaded earlier — verify against Utilities.Persistence).
html = Utilities.Persistence.read(url_value)

# Run detection over the HTML; the helper returns a {type, description} tuple.
# NOTE(review): `type` and `description` are only referenced by the commented-out
# experiments below; the active pipeline converts the full `html` instead.
{type, description} = Utilities.Detection.find_description(html)

# [title, content] =
#   case type do
#     # :breezy_hr -> Utilities.Detection.BreezyHR.transform_description(description)
#     # :lever_co -> Utilities.Detection.LeverCo.transform_description(description)
#     # :greenhouse_io -> Utilities.Detection.GreenhouseIo.transform_description(description)
#     :greenhouse_io -> [title: "", content: description]
#     :unknown -> [title: "", content: description]
#   end

# transformed = description
#   |> Floki.find("#app_body")
#   |> hd()
#   |> Floki.children(include_text: false)
#   |> Floki.traverse_and_update(fn
#     {"li", attrs, [text]} ->
#       cond do
#         is_binary(text) -> {"li", attrs, ["[ ] " <> text]}
#         is_tuple(text) -> 
#           Floki.traverse_and_update(text, fn 
#             {_, _attrs, [text]} -> {"li", [], ["[ ] " <> text]}
#           end)
#         true -> {"li", attrs, [text]}
#       end
#     tag -> tag
#   end)

# Convert the page and unpack the keyword list in one match: each entry is a
# {key, value} pair, bound both whole (`title`, `content`) and by its value.
[{_, title_markdown} = title, {_, content_markdown} = content] =
  transform_description.(html)

# Persist the generated Markdown alongside the source page, with an "md" extension.
Utilities.Persistence.save(url_value, content_markdown, "md")

# Last expression of the cell: show the extracted title as the cell output.
title_markdown

View Results

# Render the generated Markdown as the cell output.
content_markdown |> Kino.Markdown.new()