larajobs

Mix.install([
  # Kino renders the final data table.
  {:kino, "~> 0.8.0"},
  # {:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
  # {:kino_vega_lite, "~> 0.1.1"},
  # {:kino_db, "~> 0.1.1"},
  # Req fetches the page, Floki parses the HTML, NimbleCSV writes the
  # CSV, and Timex converts relative ages into dates.
  {:req, "~> 0.3.4"},
  {:floki, "~> 0.32"},
  {:nimble_csv, "~> 1.1"},
  {:timex, "~> 3.0"}
])

Summary

Using the excellent req library, we want to get the list of jobs from the larajobs site. The bulk of our notebooks use spider_man, but that library is generally better suited to crawling paginated results. A number of sites, this one included, stuff everything into the initial response and use JavaScript to show/hide the results by “page”, so a single request is all we need here. The sketch below confirms that.
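
A minimal check (a sketch; the a.job-link selector is the same one the parsing code below relies on) fetches the page once and counts every listing in that single response:

resp = Req.get!("https://larajobs.com/", http_errors: :raise)

resp.body
|> Floki.parse_document!()
|> Floki.find("a.job-link")
|> length()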

Specifications

  1. [x] Map job items
    1. [x] New jobs #app .container div (no classes applied)
    2. [x] Older jobs #app .container div (no classes applied)
  2. [x] Gather fields
    1. [x] date
    2. [x] title
    3. [x] company
    4. [x] location
    5. [x] tags
    6. [x] source_url
    7. [x] post_url
  3. [x] Store results as CSV
  4. [x] Sort CSV file by date
  5. [x] Convert date from relative textual representation to a rough date.
    1. We can’t be too precise unfortunately, so we just shift back from today by the stated number of days, weeks, or months.

Code

Get Items by Parsing HTML


# Listing ages are shown as relative strings such as "3d", "2w", or "1mo".
parse_date = fn date ->
  # Pull the number out of each possible unit suffix:
  # "<n>d" = days, "<n>w" = weeks, "<n>mo" = months.
  day_groups = Regex.run(~r/(\d+?)d/, date, capture: :all_but_first)
  week_groups = Regex.run(~r/(\d+?)w/, date, capture: :all_but_first)
  month_groups = Regex.run(~r/(\d+?)mo/, date, capture: :all_but_first)

  # Regex.run/3 returns either ["<n>"] or nil; treat nil as zero.
  to_int = fn
    [number] -> String.to_integer(number)
    nil -> 0
  end

  days = to_int.(day_groups)
  weeks = to_int.(week_groups)
  months = to_int.(month_groups)

  # IO.inspect(days, label: "days")
  # IO.inspect(weeks, label: "weeks")
  # IO.inspect(months, label: "months")
  Timex.today()
  |> Timex.shift(days: -days)
  |> Timex.shift(weeks: -weeks)
  |> Timex.shift(months: -months)
end
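
A quick sanity check on parse_date (a sketch; the exact dates depend on when you run it):

# Hypothetical inputs mirroring the formats the site uses.
parse_date.("3d")  #=> Timex.today() shifted back three days
parse_date.("2w")  #=> Timex.today() shifted back two weeks
parse_date.("1mo") #=> Timex.today() shifted back one month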

base_url = "https://larajobs.com/"

# Raise on 4xx/5xx responses instead of returning them as values.
req_html = Req.new(http_errors: :raise)

html = Req.get!(req_html, url: base_url).body

{:ok, document} = Floki.parse_document(html)

# Every listing on the page is a single <a class="job-link"> element.
jobs = Floki.find(document, "a.job-link")

items =
  Enum.map(jobs, fn job ->
    # Both URLs are attributes on the anchor itself; hd/1 unwraps the
    # single-element list Floki.attribute/3 returns.
    source_url = Floki.attribute(job, ".job-link", "href") |> hd()
    post_url = Floki.attribute(job, ".job-link", "data-url") |> hd()

    title = Floki.find(job, "p:nth-of-type(2)") |> Floki.text() |> String.trim()
    company = Floki.find(job, "p:nth-of-type(1)") |> Floki.text() |> String.trim()

    # The card markup has no semantic classes, so the selectors walk the
    # nested divs positionally to reach each field.
    location =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:nth-child(1)"
      )
      |> Floki.text()
      |> String.trim()

    # Collapse the tag pills into one pipe-separated string so they fit
    # in a single CSV column.
    tags =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:last-child .text-sm"
      )
      |> Enum.map(&(&1 |> Floki.text() |> String.trim()))
      |> Enum.join(" | ")

    date =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:last-child"
      )
      |> Floki.text()
      |> String.trim()

    full_date = parse_date.(date)
    # IO.inspect(title, label: "title")
    # IO.inspect(company, label: "company")
    # IO.inspect(location, label: "location")
    # IO.inspect(tags, label: "tags")
    # IO.inspect(date, label: "date")
    # IO.inspect(source_url, label: "source_url")
    # IO.inspect(post_url, label: "post_url")
    # IO.inspect(job, label: "job")

    %{
      posted_at: full_date,
      title: title,
      company: company,
      location: location,
      tags: tags,
      # href is site-relative, so drop the leading slash before joining
      # it onto base_url.
      source_url: base_url <> String.trim_leading(source_url, "/"),
      post_url: post_url
    }
  end)

items
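
Each element of items ends up shaped roughly like this (values are illustrative, not a real listing):

%{
  posted_at: ~D[2023-01-05],
  title: "Senior Laravel Developer",
  company: "Acme Inc",
  location: "Remote",
  tags: "full-time | remote",
  source_url: "https://larajobs.com/senior-laravel-developer",
  post_url: "https://example.com/careers/apply"
}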

Write Items to CSV

alias NimbleCSV.RFC4180, as: CSV

take = fn item, header_keys ->
  Enum.map(header_keys, &Map.get(item, &1))
end

header_keys = [
  :posted_at,
  :title,
  :company,
  :location,
  :tags,
  :source_url,
  :post_url
]
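
take/2 simply projects an item map onto that column order; a quick check with a made-up item:

# Hypothetical item; real ones come from the scrape above.
example = %{
  posted_at: ~D[2023-01-05],
  title: "Dev",
  company: "Acme",
  location: "Remote",
  tags: "",
  source_url: "",
  post_url: ""
}

take.(example, header_keys)
#=> [~D[2023-01-05], "Dev", "Acme", "Remote", "", "", ""]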

sorted_path = "./data/larajobs-sorted.csv"
# Ensure the output directory exists, then start from a clean file.
File.mkdir_p!(Path.dirname(sorted_path))
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :binary, :utf8])

header = CSV.dump_to_iodata([header_keys])

# See https://www.coletiv.com/blog/elixir-sort-lists-by-date/
# https://elixir-lang.org/blog/2020/01/27/elixir-v1-10-0-released/#improvements-to-sort-based-apis-in-enum
csv =
  items
  |> Enum.sort_by(& &1.posted_at, {:desc, Date})
  |> Enum.map(&take.(&1, header_keys))
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)
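
Since the whole CSV already sits in memory as iodata, the open/write/close sequence above could also be collapsed into a single call (an equivalent sketch for this data size):

File.write!(sorted_path, [header, csv])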

Display the Sorted CSV

data =
  sorted_path
  |> File.read!()
  # parse_string/1 skips the header row by default.
  |> CSV.parse_string()

data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
  %{
    id: index,
    posted_at: Enum.at(row, 0),
    title: Enum.at(row, 1),
    company: Enum.at(row, 2),
    location: Enum.at(row, 3),
    tags: Enum.at(row, 4),
    source_url: Enum.at(row, 5),
    post_url: Enum.at(row, 6)
  }
end)
|> Kino.DataTable.new()
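
If the default column ordering ever comes out wrong, Kino.DataTable.new/2 also accepts a keys: option to pin it explicitly (a sketch; rows stands for the list of maps built in the pipeline above):

# `rows` is assumed bound to the list of row maps from the previous cell.
Kino.DataTable.new(rows, keys: [:id | header_keys])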