# larajobs

```elixir
Mix.install([
{:kino, "~> 0.8.0"},
# {:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
# {:kino_vega_lite, "~> 0.1.1"},
# {:kino_db, "~> 0.1.1"},
{:req, "~> 0.3.4"},
{:floki, "~> 0.32"},
{:nimble_csv, "~> 1.1"},
{:timex, "~> 3.0"}
])
```
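Each dependency has a job here: `req` fetches the page, `floki` parses the HTML, `timex` turns relative dates into real ones, `nimble_csv` writes the results, and `kino` renders the final table.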
## Summary
Using the excellent [req](https://github.com/wojtekmach/req) library, we want to get the list of jobs from the [larajobs](https://larajobs.com/) jobs site.

The bulk of our notebooks use `spider_man`, but that library is generally better suited to crawling paginated results. A number of sites, larajobs included, stuff everything into the initial response and use JavaScript to show/hide the results by "page".
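Because everything arrives in that first response, a single request plus a CSS selector is all the crawling we need. A minimal sketch (using the `a.job-link` selector worked out in the Code section below):

```elixir
# Sketch: every job anchor is already present in the initial HTML,
# so there is no pagination to follow.
html = Req.get!("https://larajobs.com/").body
{:ok, document} = Floki.parse_document(html)
document |> Floki.find("a.job-link") |> Enum.count()
```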
## Specifications

- [x] Map job items
  - [x] New jobs: `#app .container div` (no classes applied)
  - [x] Older jobs: `#app .container div` (no classes applied)
- [x] New jobs
  - [x] Gather fields
    - [x] `date`
    - [x] `title`
    - [x] `company`
    - [x] `location`
    - [x] `tags`
    - [x] `source_url`
    - [x] `post_url`
- [x] Store results as CSV
- [x] Sort CSV file by `date`
- [x] Convert date from relative textual representation to a rough date
  - We can't be too precise unfortunately, so we'll just use the first of the month, etc.
## Code
### Get Items from Parsing HTML

```elixir
# larajobs renders dates as relative strings: "3d" (days), "2w" (weeks),
# "1mo" (months). Shift backwards from today to approximate a real Date.
parse_date = fn date ->
day_groups = Regex.run(~r/(\d+?)d/, date, capture: :all_but_first)
week_groups = Regex.run(~r/(\d+?)w/, date, capture: :all_but_first)
month_groups = Regex.run(~r/(\d+?)mo/, date, capture: :all_but_first)
days =
case day_groups do
[number] -> String.to_integer(number)
nil -> 0
end
weeks =
case week_groups do
[number] -> String.to_integer(number)
nil -> 0
end
months =
case month_groups do
[number] -> String.to_integer(number)
nil -> 0
end
# IO.inspect(days, label: "days")
# IO.inspect(weeks, label: "weeks")
# IO.inspect(months, label: "months")
  # Only one of days/weeks/months is non-zero for a given input; the others
  # shift by zero and leave the date untouched.
  Timex.today()
  |> Timex.shift(days: -days)
  |> Timex.shift(weeks: -weeks)
  |> Timex.shift(months: -months)
end
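
# Quick sanity check (hypothetical: if today were 2023-02-01):
#   parse_date.("3d")  #=> ~D[2023-01-29]
#   parse_date.("2w")  #=> ~D[2023-01-18]
#   parse_date.("1mo") #=> ~D[2023-01-01]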
base_url = "https://larajobs.com/"
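# Build a request struct that raises on 4xx/5xx responses rather than
# returning them, so a bad fetch fails loudly.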
req_html = Req.new(http_errors: :raise)
html = Req.get!(req_html, url: base_url).body
{:ok, document} = Floki.parse_document(html)
jobs = Floki.find(document, "a.job-link")
items =
Enum.map(jobs, fn job ->
    # `job` is already the matched <a class="job-link"> element, so the
    # ".job-link" selector re-matches it; hd/1 takes the single attribute value.
    source_url = Floki.attribute(job, ".job-link", "href") |> hd()
    post_url = Floki.attribute(job, ".job-link", "data-url") |> hd()
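    # Within a card, the first <p> holds the company name and the second the title.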
title = Floki.find(job, "p:nth-of-type(2)") |> Floki.text() |> String.trim()
company = Floki.find(job, "p:nth-of-type(1)") |> Floki.text() |> String.trim()
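    # The remaining fields carry no distinguishing classes, so we navigate the
    # card positionally; these selectors will break if the markup changes.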
location =
Floki.find(
job,
".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:nth-child(1)"
)
|> Floki.text()
|> String.trim()
tags =
Floki.find(
job,
".job-link > div > div:last-child > div:last-child > div:last-child .text-sm"
)
|> Enum.map(&(&1 |> Floki.text() |> String.trim()))
|> Enum.join(" | ")
date =
Floki.find(
job,
".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:last-child"
)
|> Floki.text()
|> String.trim()
full_date = parse_date.(date)
# IO.inspect(title, label: "title")
# IO.inspect(company, label: "company")
# IO.inspect(location, label: "location")
# IO.inspect(tags, label: "tags")
# IO.inspect(date, label: "date")
# IO.inspect(source_url, label: "source_url")
# IO.inspect(post_url, label: "post_url")
# IO.inspect(job, label: "job")
%{
posted_at: full_date,
title: title,
company: company,
location: location,
tags: tags,
      # The href is site-relative, so drop its leading slash before joining.
      source_url: base_url <> String.trim_leading(source_url, "/"),
post_url: post_url
}
end)
items
```
### Write Items to CSV

```elixir
alias NimbleCSV.RFC4180, as: CSV
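# Project a job map onto the header order to produce one CSV row, e.g.
# take.(%{title: "t", company: "c"}, [:title, :company]) #=> ["t", "c"]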
take = fn item, header_keys ->
Enum.map(header_keys, &Map.get(item, &1))
end
header_keys = [
:posted_at,
:title,
:company,
:location,
:tags,
:source_url,
:post_url
]
sorted_path = "./data/larajobs-sorted.csv"

# Make sure the output directory exists, then start from a clean file.
File.mkdir_p!(Path.dirname(sorted_path))
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :utf8])
header = CSV.dump_to_iodata([header_keys])
# See https://www.coletiv.com/blog/elixir-sort-lists-by-date/
# https://elixir-lang.org/blog/2020/01/27/elixir-v1-10-0-released/#improvements-to-sort-based-apis-in-enum
csv =
  items
  |> Enum.sort_by(& &1.posted_at, {:desc, Date})
  |> Enum.map(&take.(&1, header_keys))
  |> CSV.dump_to_iodata()
:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)
```
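Since the whole result set fits in memory, the open/write/close sequence above could also collapse into a single write. A minimal sketch, reusing `items`, `take`, and `header_keys` from above:

```elixir
# Sketch: dump the header and all rows in one shot instead of appending.
rows =
  items
  |> Enum.sort_by(& &1.posted_at, {:desc, Date})
  |> Enum.map(&take.(&1, header_keys))

File.write!(sorted_path, CSV.dump_to_iodata([header_keys | rows]))
```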
### Display the Sorted CSV

```elixir
data =
"./data/larajobs-sorted.csv"
|> File.read!()
|> CSV.parse_string()
data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
%{
id: index,
posted_at: Enum.at(row, 0),
title: Enum.at(row, 1),
company: Enum.at(row, 2),
location: Enum.at(row, 3),
tags: Enum.at(row, 4),
source_url: Enum.at(row, 5),
post_url: Enum.at(row, 6)
}
end)
|> Kino.DataTable.new()
```
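The table's column order follows the maps' key order. `Kino.DataTable.new/2` also accepts a `keys:` option to pin it explicitly; a sketch of the same cell with fixed columns:

```elixir
# Variant: zip the header keys with each row and pin the column order.
header = [:posted_at, :title, :company, :location, :tags, :source_url, :post_url]

"./data/larajobs-sorted.csv"
|> File.read!()
|> CSV.parse_string()
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
  header |> Enum.zip(row) |> Map.new() |> Map.put(:id, index)
end)
|> Kino.DataTable.new(keys: [:id | header])
```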