Job Description to Markdown
Mix.install([
{:kino, github: "livebook-dev/kino", override: true},
{:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
{:floki, "~> 0.32"},
{:pandex, "~> 0.2.0"},
{:utilities, path: "utilities"}
])
Summary
Using the excellent req
library, we fetch the HTML of the job post URL and convert its contents to Markdown. The conversion to Markdown is handled via pandex
and the pandoc application.
NOTE: pandoc must be installed for the conversion from HTML to Markdown to work.
Specifications
-
[x] If
:
-
Use
req
to download the full HTML page into data/#{host}/#{hash}.html
.
-
Use
-
[x] If
:
- Ask to overwrite?
- Skip to next step (3).
-
[x] Detect ATS system, one of
[breezyhr, greenhouse, lever, unknown]
-
[x] Parse HTML into Markdown
- [x] Pandoc
- [ ] Manually
- [ ] EasyHTML - Was unaware this was just a wrapper around Floki
-
[x] Add
[ ]
to every list item.
- We do this specifically to help with a manual checklist.
- I would not apply to a position that had very few checked off.
- This list conversion has been almost 100% universal, to the point that this process should “be a thing.”
Code
# Create a URL input widget labelled "URL" for the job post address.
url = Kino.Input.url("URL")
# Read the value currently entered in the input.
# NOTE(review): presumably nil while the field is empty — confirm that
# Utilities.Persistence.get_path/1 copes with that.
url_value = Kino.Input.read(url)
# Ask the project's persistence helper where this URL is stored.
# NOTE(review): presumably the data/#{host}/#{hash}.html location described in
# the specifications — verify in Utilities.Persistence. `directory` and
# `filename` are not referenced again in the code visible here.
{directory, filename} = Utilities.Persistence.get_path(url_value)
# Extracts the page title as plain text, suitable for a Markdown heading.
#
# `document` may be a raw HTML binary or an already parsed Floki tree.
# Returns the trimmed text of the first <title> element, or "" when the
# document has no <title> (previously `hd/1` raised on such pages).
transform_title = fn document ->
  # Parse explicitly: calling Floki.find/2 on a binary is deprecated.
  tree =
    if is_binary(document) do
      {:ok, parsed} = Floki.parse_document(document)
      parsed
    else
      document
    end

  case Floki.find(tree, "title") do
    # Only the first match is used; later matches can be unrelated <title>
    # elements (for example inside inline SVG).
    [first | _rest] ->
      # Floki.text/1 yields the text content without the surrounding tag.
      # The earlier String.replace("", "") calls were no-ops and left the
      # <title>…</title> markup in the result.
      first |> Floki.text() |> String.trim()

    [] ->
      ""
  end
end
# Converts a job post's HTML into Markdown with the page title as a level-one
# heading.
#
# Returns a keyword list: `[title: heading_text, content: full_markdown]`.
# Raises a MatchError when Pandex does not return `{:ok, markdown}`.
transform_description = fn html_source ->
  {:ok, raw_markdown} = Pandex.html_to_markdown_strict(html_source)
  heading = transform_title.(html_source)

  # Pandoc escapes checkbox brackets as \[ \]; restore them first ...
  unescaped = String.replace(raw_markdown, "\\[ \\]", "[ ]")
  # ... then switch dash-style checklist bullets to asterisks.
  body = String.replace(unescaped, "- [ ]", "* [ ]")

  # The title becomes the first line of the document.
  [title: heading, content: "# #{heading}\n\n#{body}"]
end
# Load the stored content for this URL through the project's persistence helper.
html = Utilities.Persistence.read(url_value)
# Run detection on the loaded content; the result is a {type, description} pair.
# NOTE(review): `type` and `description` are only referenced by the
# commented-out code that follows, and the conversion further down is run on
# the full `html` rather than on `description` — confirm that is intended.
{type, description} = Utilities.Detection.find_description(html)
# [title, content] =
# case type do
# # :breezy_hr -> Utilities.Detection.BreezyHR.transform_description(description)
# # :lever_co -> Utilities.Detection.LeverCo.transform_description(description)
# # :greenhouse_io -> Utilities.Detection.GreenhouseIo.transform_description(description)
# :greenhouse_io -> [title: "", content: description]
# :unknown -> [title: "", content: description]
# end
# transformed = description
# |> Floki.find("#app_body")
# |> hd()
# |> Floki.children(include_text: false)
# |> Floki.traverse_and_update(fn
# {"li", attrs, [text]} ->
# cond do
# is_binary(text) -> {"li", attrs, ["[ ] " <> text]}
# is_tuple(text) ->
# Floki.traverse_and_update(text, fn
# {_, _attrs, [text]} -> {"li", [], ["[ ] " <> text]}
# end)
# true -> {"li", attrs, [text]}
# end
# tag -> tag
# end)
# Convert the loaded HTML and unpack the keyword list it returns in one match.
[title: title_markdown, content: content_markdown] = transform_description.(html)

# Store the Markdown for this URL with the "md" extension.
Utilities.Persistence.save(url_value, content_markdown, "md")

# Leave the title as the final expression so it is shown as the cell result.
title_markdown
View Results
# Render the generated Markdown as formatted output in the notebook.
Kino.Markdown.new(content_markdown)