ElixirRadar Jobs

Mix.install([
  {:spider_man, "~> 0.3"},
  {:floki, "~> 0.31"},
  {:nimble_csv, "~> 1.1"},
  {:kino, "~> 0.8.0"}
])

Summary

Using the excellent spider_man library, we fetch the paginated list of job opportunities from https://elixir-radar.com/jobs. We also use floki to extract the relevant parts of the HTML and nimble_csv to parse and sort the resulting CSV.

SpiderMan appears to spawn a process for each page, so results arrive as processing completes rather than in page order. This means that page 500 could finish before page 2, so we sort the CSV results to keep the newest entries towards the top.

Since Elixir Radar doesn’t include dates in its entries, we sort by page number instead.

Noted Variables and Defaults

  1. base_url = https://elixir-radar.com/jobs
    1. Sets the base URL for all other relative paths.
  2. total_page = 6
    1. We hardcode our page count to 6 to roughly get the last 6 months.
  3. ETS file path = ./data/radar_jobs.ets
    1. Erlang Term Storage dump. We don’t work with this directly (see the sketch after this list).
  4. CSV file path = ./data/radar_jobs.csv
    1. Raw CSV output; records are appended as they are processed.
  5. CSV sorted file path = ./data/radar_jobs-sorted.csv
    1. Sorted copy of the CSV, ordered by page_number.
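
The ETS dump isn’t used by the rest of this notebook, but it can be inspected after a run. A minimal sketch, assuming SpiderMan.Storage.ETS writes the table with :ets.tab2file/2 (not verified here):

# Load the dumped table and peek at a few stored items (sketch; the exact
# record shape depends on SpiderMan.Storage.ETS).
{:ok, table} = "./data/radar_jobs.ets" |> String.to_charlist() |> :ets.file2tab()
:ets.tab2list(table) |> Enum.take(3)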

Specifications

  1. [x] Analyze the page numbers and hardcode a range covering roughly the last 6 months.
    1. This is currently 6 pages; it may need to change in the future.
  2. [x] Map job items
    1. .job-board ul > li
  3. [x] Gather fields (see the Floki sketch after this list)
    1. [x] title
    2. [x] company
    3. [x] location
    4. [x] workplace
    5. [x] source_description
    6. [x] post_url
    7. [x] page_number
  4. [x] Delete ETS and CSV file before next run
  5. [x] Sort CSV file by page_number
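
As a rough illustration of specs 2 and 3, the sketch below runs the same selectors against a hypothetical job li (the markup is assumed, not copied from the site):

# Hypothetical fragment shaped like one .job-board list item (assumed markup).
html = """
<div class="job-board"><ul><li>
  <a class="job-post-link" href="https://example.com/job/123">Senior Elixir Engineer</a>
  <div class="job-board-job-location">(Remote) Acme Inc - Berlin, Germany</div>
  <div class="job-board-job-description">Build and maintain Phoenix services.</div>
</li></ul></div>
"""

{:ok, document} = Floki.parse_document(html)

[job] =
  document
  |> Floki.find(".job-board ul")
  |> hd()
  |> Floki.children(include_text: false)

%{
  title: job |> Floki.find(".job-post-link") |> Floki.text() |> String.trim(),
  post_url: job |> Floki.attribute(".job-post-link", "href") |> hd(),
  full_location: job |> Floki.find(".job-board-job-location") |> Floki.text() |> String.trim()
}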

Configure Settings

Build settings for spider

base_url = "https://elixir-radar.com/jobs"

requester_options = [
  base_url: base_url,
  middlewares: [
    {SpiderMan.Middleware.UserAgent,
     [
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
     ]},
    {Tesla.Middleware.Headers,
     [
       {"referer", base_url},
       {"accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
       {"accept-encoding", "gzip, deflate"},
       {"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
     ]},
    Tesla.Middleware.DecompressResponse
  ]
]

settings = [
  log2file: false,
  downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
  spider_options: [pipelines: []],
  item_processor_options: [
    storage: [
      {SpiderMan.Storage.ETS, "./data/radar_jobs.ets"},
      {SpiderMan.Storage.CSV,
       file: "./data/radar_jobs.csv",
       headers: [
         :title,
         :company,
         :location,
         :workplace,
         :source_description,
         :post_url,
         :page_number
       ]}
    ]
  ]
]
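
To sanity-check the base URL and headers outside of SpiderMan, a bare Tesla client can be built from the same middlewares. This is only a sketch: Tesla is assumed to be available as a spider_man dependency, and Tesla’s default :httpc adapter is used.

# Optional sanity check (sketch): fetch page 1 with a plain Tesla client.
{:ok, _} = Application.ensure_all_started(:inets)
{:ok, _} = Application.ensure_all_started(:ssl)

client =
  Tesla.client([
    {Tesla.Middleware.BaseUrl, base_url},
    {Tesla.Middleware.Headers, [{"referer", base_url}]},
    Tesla.Middleware.DecompressResponse
  ])

{:ok, %Tesla.Env{status: status}} = Tesla.get(client, "/?page=1")
status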

Configure Parsing

Prepare callbacks for spider

import SpiderMan
import SpiderMan.Utils
require Logger

spider = SpiderList.ElixirRadar

# Seed the spider with the first page; the :first_page flag tells
# handle_response to compute pagination and enqueue the remaining pages.
init = fn state ->
  build_request(base_url)
  |> set_flag(:first_page)
  |> then(&SpiderMan.insert_request(spider, &1))

  state
end

handle_list_page = fn body, n ->
  Logger.info("processing page #{n}")
  {:ok, document} = Floki.parse_document(body)

  jobs =
    Floki.find(document, ".job-board ul")
    |> hd()
    |> Floki.children(include_text: false)
    |> Enum.filter(&match?({"li", _, _}, &1))

  items =
    Enum.map(jobs, fn job ->
      title = Floki.find(job, ".job-post-link") |> Floki.text() |> String.trim()
      post_url = Floki.attribute(job, ".job-post-link", "href") |> hd()

      full_location =
        Floki.find(job, ".job-board-job-location")
        |> Floki.text()
        |> String.trim()
        |> String.replace(~r/\s+/, " ")
        |> String.replace(~r/\t/, " ")

      description =
        Floki.find(job, ".job-board-job-description")
        |> Floki.text()
        |> String.trim()
        |> String.replace(~r/\s+/, " ")
        |> String.replace(~r/\t/, " ")

      [company_parts, location | _] = full_location |> String.split(" - ", trim: true)

      # If the company prefix includes "(Remote)", strip it and record the
      # workplace; otherwise the whole prefix is the company name.
      [workplace, company] =
        case Regex.run(~r/(\(Remote\)) (.*)/, company_parts, capture: :all_but_first) do
          [_remote, company] ->
            ["Remote", company]

          nil ->
            ["Onsite", company_parts]
        end

      build_item(
        post_url,
        %{
          title: title,
          company: company,
          location: location,
          workplace: workplace,
          source_description: description,
          post_url: post_url,
          page_number: n
        }
      )
    end)

  %{items: items}
end

handle_response = fn
  %{env: env, flag: :first_page}, _context ->
    # total_page =
    #   Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
    #   |> hd()
    #   |> String.to_integer()
    # Hardcoded to roughly the last 6 months of listings (see Summary)
    total_page = 6

    Logger.info("total: #{total_page}")

    requests =
      Enum.map(2..total_page, fn n ->
        build_request("/?page=#{n}")
        |> set_flag({:list_page, n})
      end)

    handle_list_page.(env.body, 1)
    |> Map.put(:requests, requests)

  %{env: env, flag: {:list_page, n}}, _context ->
    handle_list_page.(env.body, n)
end

callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)
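
The workplace/company split in handle_list_page relies on the shape of the .job-board-job-location text. A small worked example, assuming an entry of the form "(Remote) Company - Location" (the exact site format is not verified here):

# Worked example on an assumed input format.
full_location = "(Remote) Acme Inc - Berlin, Germany"
[company_parts, location | _] = String.split(full_location, " - ", trim: true)

case Regex.run(~r/(\(Remote\)) (.*)/, company_parts, capture: :all_but_first) do
  [_remote, company] -> {"Remote", company, location}
  nil -> {"Onsite", company_parts, location}
end
# => {"Remote", "Acme Inc", "Berlin, Germany"}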

Executing

Run the spider

# Delete previous dumps
File.rm_rf("./data/radar_jobs.csv")
File.rm_rf("./data/radar_jobs.ets")
SpiderMan.run_until_zero(spider, settings, 5_000)
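
A quick optional check after the run: count how many job rows landed in the raw CSV (NimbleCSV’s parse_string skips the header row by default).

# How many job entries did the crawl write? (illustrative check)
"./data/radar_jobs.csv"
|> File.read!()
|> NimbleCSV.RFC4180.parse_string()
|> length()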

Sorting the Results

Sort the CSV by page_number, ascending (page 1 holds the newest posts)

alias NimbleCSV.RFC4180, as: CSV

headers = [
  :title,
  :company,
  :location,
  :workplace,
  :source_description,
  :post_url,
  :page_number
]

sorted_path = "./data/radar_jobs-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :utf8])

header = CSV.dump_to_iodata([headers])

csv =
  "./data/radar_jobs.csv"
  |> File.read!()
  |> CSV.parse_string()
  # page_number is parsed back as a string, so compare it as an integer
  |> Enum.sort_by(fn row -> row |> List.last() |> String.to_integer() end)
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)
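
If the dump ever grows large, a streaming variant (sketch below) avoids holding the raw file as a single binary, though the sort itself still materializes every row in memory:

# Streaming sketch: parse lazily, sort on the integer page number, rewrite.
sorted_rows =
  "./data/radar_jobs.csv"
  |> File.stream!()
  |> CSV.parse_stream()
  |> Enum.sort_by(fn row -> row |> List.last() |> String.to_integer() end)

File.write!(sorted_path, CSV.dump_to_iodata([headers | sorted_rows]))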

Display the Sorted CSV

data =
  "./data/radar_jobs-sorted.csv"
  |> File.read!()
  |> CSV.parse_string()

data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
  %{
    id: index,
    title: Enum.at(row, 0),
    company: Enum.at(row, 1),
    location: Enum.at(row, 2),
    workplace: Enum.at(row, 3),
    source_description: Enum.at(row, 4),
    post_url: Enum.at(row, 5),
    page_number: Enum.at(row, 6)
  }
end)
|> Kino.DataTable.new()
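
If the column order in the table matters, Kino.DataTable.new/2 also accepts a :keys option (assumed to be supported by the installed Kino version), e.g.:

# Same rows, but with an explicit column order via :keys.
data
|> Enum.with_index()
|> Enum.map(fn {row, index} -> Map.new(Enum.zip([:id | headers], [index | row])) end)
|> Kino.DataTable.new(keys: [:id, :page_number, :title, :company, :location, :workplace, :post_url])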