Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

larajobs

req--larajobs.livemd

larajobs

Mix.install([
  {:kino, "~> 0.8.0"},
  # {:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
  # {:kino_vega_lite, "~> 0.1.1"},
  # {:kino_db, "~> 0.1.1"},
  {:req, "~> 0.3.4"},
  {:floki, "~> 0.32"},
  {:nimble_csv, "~> 1.1"},
  {:timex, "~> 3.0"}
])

Summary

Using the excellent req library, we want to get the list of jobs from the jobs site. The bulk of our notebooks use spider man but that is generally better suited to crawling paginated results. A number of sites stuff everything into the immediate response and use javascript to show/hide the results by “page”.

Specifications

  1. [x] Map job items
    1. [x] New jobs #app .container div (no classes applied)
    2. [x] Older jobs #app .container div (no classed applied)
  2. [x] Gather fields
    1. [x] date
    2. [x] title
    3. [x] company
    4. [x] location
    5. [x] tags
    6. [x] source_url
    7. [x] post_url
  3. [x] Store results as CSV
  4. [x] Sort CSV file by date
  5. [x] Convert date from relative textual representation to a rough date.
    1. We can’t be too precise unfortunately so we’ll just use the first of the month, etc.

Code

Get Items from Parsing HTML

###

parse_date = fn date ->
  # xd
  # xw
  # xmo
  day_groups = Regex.run(~r/(\d+?)d/, date, capture: :all_but_first)
  week_groups = Regex.run(~r/(\d+?)w/, date, capture: :all_but_first)
  month_groups = Regex.run(~r/(\d+?)mo/, date, capture: :all_but_first)

  days =
    case day_groups do
      [number] -> String.to_integer(number)
      nil -> 0
    end

  weeks =
    case week_groups do
      [number] -> String.to_integer(number)
      nil -> 0
    end

  months =
    case month_groups do
      [number] -> String.to_integer(number)
      nil -> 0
    end

  # IO.inspect(days, label: "days")
  # IO.inspect(weeks, label: "weeks")
  # IO.inspect(months, label: "months")
  Timex.shift(Timex.today(), days: -days)
  |> Timex.shift(weeks: -weeks)
  |> Timex.shift(months: -months)
end

base_url = "https://larajobs.com/"

req_html = Req.new(http_errors: :raise)

html = Req.get!(req_html, url: base_url).body

{:ok, document} = Floki.parse_document(html)

jobs = Floki.find(document, "a.job-link")

items =
  Enum.map(jobs, fn job ->
    # source_url = Floki.attribute(job, ".job-link", "href")
    # post_url = Floki.attribute(job, ".job-link", "data-url")
    source_url = Floki.attribute(job, ".job-link", "href") |> hd()
    post_url = Floki.attribute(job, ".job-link", "data-url") |> hd()

    title = Floki.find(job, "p:nth-of-type(2)") |> Floki.text() |> String.trim()
    company = Floki.find(job, "p:nth-of-type(1)") |> Floki.text() |> String.trim()

    location =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:nth-child(1)"
      )
      |> Floki.text()
      |> String.trim()

    tags =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:last-child .text-sm"
      )
      |> Enum.map(&(&1 |> Floki.text() |> String.trim()))
      |> Enum.join(" | ")

    date =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:last-child"
      )
      |> Floki.text()
      |> String.trim()

    full_date = parse_date.(date)
    # IO.inspect(title, label: "title")
    # IO.inspect(company, label: "company")
    # IO.inspect(location, label: "location")
    # IO.inspect(tags, label: "tags")
    # IO.inspect(date, label: "date")
    # IO.inspect(source_url, label: "source_url")
    # IO.inspect(post_url, label: "post_url")
    # IO.inspect(job, label: "job")

    %{
      posted_at: full_date,
      title: title,
      company: company,
      location: location,
      tags: tags,
      source_url: base_url <> String.slice(source_url, 1..-1),
      post_url: post_url
    }
  end)

# IO.inspect(jobs_elements, label: "jobs_elements")

items

Write Items to CSV

alias NimbleCSV.RFC4180, as: CSV

take = fn item, header_keys ->
  Enum.map(header_keys, &amp;Map.get(item, &amp;1))
end

header_keys = [
  :posted_at,
  :title,
  :company,
  :location,
  :tags,
  :source_url,
  :post_url
]

sorted_path = "./data/larajobs-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])

header = CSV.dump_to_iodata([header_keys])

# See https://www.coletiv.com/blog/elixir-sort-lists-by-date/
# https://elixir-lang.org/blog/2020/01/27/elixir-v1-10-0-released/#improvements-to-sort-based-apis-in-enum
csv =
  items
  |> Enum.sort(&amp;(Date.compare(&amp;1.posted_at, &amp;2.posted_at) != :lt))
  |> Enum.map(&amp;take.(&amp;1, header_keys))
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)

Display the Sorted CSV

data =
  "./data/larajobs-sorted.csv"
  |> File.read!()
  |> CSV.parse_string()

data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
  %{
    id: index,
    posted_at: Enum.at(row, 0),
    title: Enum.at(row, 1),
    company: Enum.at(row, 2),
    location: Enum.at(row, 3),
    tags: Enum.at(row, 4),
    source_url: Enum.at(row, 5),
    post_url: Enum.at(row, 6)
  }
end)
|> Kino.DataTable.new()