Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

ElixirJobs

notebooks/elixirjobs.livemd

ElixirJobs

# Notebook dependencies.
Mix.install([
  # Crawler framework (the `SpiderMan.*` modules used throughout this notebook).
  {:spider_man, "~> 0.6"},
  # HTML parsing and CSS-selector queries (`Floki.*`).
  {:floki, "~> 0.34"},
  # NOTE(review): not called directly in this notebook — presumably required by
  # `SpiderMan.Storage.CSV`; confirm before removing.
  {:nimble_csv, "~> 1.2"},
  # Livebook widgets (`Kino.*`) used by the "Run it!" cell.
  {:kino, "~> 0.12"}
])

Configure the spider

# Name under which the spider is started, queried and stopped in the cells below.
spider = SpiderList.ElixirJobs
# Site root: used as the requester's base URL, as the `referer` header and as the
# prefix for the absolute job links built while parsing list pages.
base_url = "https://elixirjobs.net/"

# Options handed to the HTTP requester (see `downloader_options` in `settings`).
requester_options = [
  base_url: base_url,
  middlewares: [
    # Pool of desktop Chrome user-agent strings.
    # NOTE(review): presumably the middleware picks one of these per request — confirm
    # against the SpiderMan.Middleware.UserAgent docs.
    {SpiderMan.Middleware.UserAgent,
     [
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
     ]},
    # Static browser-like headers sent with every request.
    {Tesla.Middleware.Headers,
     [
       {"referer", base_url},
       {"accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
       # Compressed responses are requested here; DecompressResponse below is the
       # counterpart that turns them back into plain HTML for the parser.
       {"accept-encoding", "gzip, deflate"},
       {"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
     ]},
    Tesla.Middleware.DecompressResponse
  ]
]

# Spider settings; merged with the callbacks further down and then passed to
# `SpiderMan.start/2` by the "Run it!" cell.
settings = [
  log2file: false,
  print_stats: false,
  # HTTP layer: Finch-based requester configured with `requester_options` above.
  downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
  spider_options: [pipelines: []],
  item_processor_options: [
    storage: [
      # ETS storage. `give_away: {self(), :got_data}` names the process evaluating this
      # cell as recipient; the final cell waits for the matching
      # `{:"ETS-TRANSFER", tid, _, :got_data}` message.
      # NOTE(review): this assumes the final cell is evaluated by the same process — confirm.
      {SpiderMan.Storage.ETS, %{file_path: "./data/jobs.ets", give_away: {self(), :got_data}}},
      # CSV export; the headers mirror the keys of the item maps built in
      # `handle_list_page`.
      {SpiderMan.Storage.CSV,
       file: "./data/jobs.csv",
       headers: [
         :link,
         :title,
         :sub_title,
         :date,
         :workplace,
         :type
       ]}
    ]
  ]
]

Code for the spider

import SpiderMan
import SpiderMan.Utils
require Logger

# `init` callback: seeds the crawl with a request for the site root, flagged
# `:first_page` so `handle_response` knows to read the page count from it.
# The spider state is handed back untouched.
init = fn state ->
  first_request = base_url |> build_request() |> set_flag(:first_page)
  SpiderMan.insert_request(spider, first_request)

  state
end

# Reference dates, captured once when this cell is evaluated.
# `:erlang.date/0` returns the local calendar date of the runtime.
today = Date.from_erl!(:erlang.date())
utc_now = DateTime.utc_now()

# Abbreviated month name => zero-padded month number, e.g. "Jan" => "01", "Oct" => "10".
# Every value is a string, and the day is pinned to 1 so each struct handed to
# `Calendar.strftime/2` is a valid date (the current day may not exist in every month).
months =
  Map.new(1..12, fn month ->
    name = Calendar.strftime(%{utc_now | month: month, day: 1}, "%b")
    number = month |> Integer.to_string() |> String.pad_leading(2, "0")
    {name, number}
  end)

# Converts the date label shown on a job listing into a `Date`.
#
# Accepted labels:
#   * "Today" / "Yesterday"  - relative to `today`
#   * "<day> <Mon>"          - the year defaults to `today.year`
#   * "<day> <Mon> <year>"
#
# Raises `KeyError` for an unknown month abbreviation, `CaseClauseError` for a label
# with an unexpected number of words, and `ArgumentError` if the parts do not form a
# valid calendar date.
parse_date = fn
  "Today" ->
    today

  "Yesterday" ->
    Date.add(today, -1)

  label ->
    {day, month_name, year} =
      case String.split(label) do
        [day, month_name] -> {day, month_name, today.year}
        [day, month_name, year] -> {day, month_name, year}
      end

    # Assertive lookup: an unexpected month fails here, naming the offending key,
    # instead of producing a malformed ISO string further down.
    month = Map.fetch!(months, month_name)

    Date.from_iso8601!("#{year}-#{month}-#{String.pad_leading(day, 2, "0")}")
end

# Parses one job-list page into spider items.
#
#   body - HTML of the list page
#   n    - page number, used only for the log line
#
# Returns `%{items: items}` with one item per `<a>` child of the first
# `.offers-index` element. Crashes (match error / `hd/1` on an empty list) when the
# page does not have the expected structure.
handle_list_page = fn body, n ->
  Logger.info("processing page #{n}")
  {:ok, document} = Floki.parse_document(body)

  jobs =
    document
    |> Floki.find(".offers-index")
    |> hd()
    |> Floki.children(include_text: false)
    |> Enum.filter(&match?({"a", _, _}, &1))

  items =
    Enum.map(jobs, fn job ->
      title = job |> Floki.find(".title strong") |> Floki.text() |> String.trim()
      sub_title = job |> Floki.find(".title small") |> Floki.text() |> String.trim()
      link = job |> Floki.attribute("a", "href") |> hd()

      # Six tags are expected per job; only the 2nd, 4th and 6th carry data we keep.
      [_, date, _, workplace, _, type] =
        job
        |> Floki.find(".control .tag")
        |> Enum.map(fn tag -> tag |> Floki.text() |> String.trim() end)

      # NOTE(review): the raw href is passed as the first argument, presumably the
      # item's unique key — confirm against SpiderMan.Utils.build_item.
      build_item(
        link,
        %{
          # `base_url` already ends with "/", so the first character of the href is
          # dropped before joining. The explicit `//1` step avoids the deprecated
          # implicitly-descending `1..-1` range.
          link: base_url <> String.slice(link, 1..-1//1),
          title: title,
          sub_title: sub_title,
          date: parse_date.(date),
          workplace: workplace,
          type: type
        }
      )
    end)

  %{items: items}
end

# `handle_response` callback, dispatching on the flag attached to the request.
#
#   * `:first_page`     - reads the total page count from the pager text, enqueues
#                         requests for pages 2..total and parses page 1 itself.
#   * `{:list_page, n}` - parses list page `n`.
#
# Returns the map produced by `handle_list_page` (`%{items: ...}`), with a `:requests`
# key added for the first page.
handle_response = fn
  %{env: env, flag: :first_page}, _context ->
    total_page =
      case Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first) do
        [count] ->
          String.to_integer(count)

        nil ->
          # NOTE(review): assumes a missing pager means there is only one page — confirm.
          Logger.warning("pagination text not found, assuming a single page")
          1
      end

    Logger.info("total: #{total_page}")

    # The explicit `//1` step keeps the range empty when there is a single page;
    # a bare `2..1` counts down and would enqueue pages 2 and 1.
    requests =
      Enum.map(2..total_page//1, fn n ->
        "/?page=#{n}"
        |> build_request()
        |> set_flag({:list_page, n})
      end)

    Map.put(handle_list_page.(env.body, 1), :requests, requests)

  %{env: env, flag: {:list_page, n}}, _context ->
    handle_list_page.(env.body, n)
end

# Register the two anonymous functions as the spider's callbacks and merge them into
# the settings. `settings` is rebound to the merged result, which is what the
# "Run it!" cell passes to `SpiderMan.start/2`.
callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)
# Last expression of the cell, so the merged settings are shown as its output.
settings

Run it!

# Clean old data: remove every file left in ./data by a previous run.
# The directory is created first so a fresh checkout does not crash on `File.ls!/1`.
File.mkdir_p!("data")

"data"
|> File.ls!()
|> Enum.each(&File.rm!(Path.join("data", &1)))

# Frame that first shows the "Run it!" button and, once clicked, the crawl progress.
frame = Kino.Frame.new()
Kino.render(frame)
button = Kino.Control.button("Run it!")
Kino.Frame.render(frame, button)

# On click: start the spider, give it two seconds, then poll once per second until
# it reports no remaining tasks.
Kino.listen(button, fn _event ->
  Kino.Frame.render(frame, Kino.Markdown.new("**Waiting...**"))
  {:ok, _} = SpiderMan.start(spider, settings)
  Process.sleep(2000)

  # One markdown bullet per component with its success/total/failed counters.
  throughput_line = fn info ->
    shown_tps = if info.tps > 999, do: "999+", else: info.tps
    component_name = info.component |> Atom.to_string() |> Macro.camelize()

    "- #{component_name}:[Success/Total: #{info.success}/#{info.total} tps: #{shown_tps}/s Failed:#{info.fail}]"
  end

  # The accumulator is the elapsed-seconds counter; it starts at 2 because of the
  # sleep above.
  Kino.listen(1000, 2, fn _tick, elapsed ->
    if SpiderMan.check_zero_task?(spider) do
      SpiderMan.stop(spider)
      Kino.Frame.append(frame, Kino.Markdown.new("**Spider Stopped**"))
      :halt
    else
      report = spider |> SpiderMan.throughput() |> Enum.map_join("\n", throughput_line)

      progress =
        Kino.Markdown.new("""
        **Used: #{elapsed} seconds**
        #{report}
        """)

      Kino.Frame.render(frame, progress)
      {:cont, elapsed + 1}
    end
  end)
end)

# Frame holding a placeholder until the scraped jobs arrive.
result_frame = Kino.Frame.new()
md = Kino.Markdown.new("**Waiting Result...**")
Kino.Frame.render(result_frame, md)

# Blocks until ownership of the ETS table is transferred to this process with the
# `:got_data` tag configured under `give_away` in the storage settings.
receive do
  {:"ETS-TRANSFER", tid, _old_owner, :got_data} ->
    table =
      tid
      |> :ets.tab2list()
      # Each row is a tuple; its second element is the item map (it carries `:date`).
      |> Stream.map(&elem(&1, 1))
      # Newest jobs first.
      |> Enum.sort_by(& &1.date, {:desc, Date})
      |> Kino.DataTable.new(sorting_enabled: true)

    Kino.Frame.render(result_frame, table)
end

result_frame