ElixirJobs
Mix.install([
{:spider_man, "~> 0.6"},
{:floki, "~> 0.34"},
{:nimble_csv, "~> 1.2"},
{:kino, "~> 0.12"}
])
Configure the spider
# Name the spider is started, queried and stopped under in the cells below.
spider = SpiderList.ElixirJobs
base_url = "https://elixirjobs.net/"

# Browser user-agent strings given to SpiderMan.Middleware.UserAgent.
user_agents = [
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
]

# Static request headers given to Tesla.Middleware.Headers. The advertised
# "accept-encoding" goes together with Tesla.Middleware.DecompressResponse,
# which is listed last in the middleware stack.
request_headers = [
  {"referer", base_url},
  {"accept",
   "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
  {"accept-encoding", "gzip, deflate"},
  {"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
]

# Options for the HTTP requester; referenced by `settings` under :downloader_options.
requester_options = [
  base_url: base_url,
  middlewares: [
    {SpiderMan.Middleware.UserAgent, user_agents},
    {Tesla.Middleware.Headers, request_headers},
    Tesla.Middleware.DecompressResponse
  ]
]
# Field names passed to the CSV storage as its headers.
csv_headers = ~w(link title sub_title date workplace type)a

# Two storages are configured for scraped items:
#   * an ETS table with a file path; `give_away: {self(), :got_data}` names the
#     evaluating process and the :got_data tag that the result cell matches on
#     in its :"ETS-TRANSFER" message;
#   * a CSV file with the headers above.
storages = [
  {SpiderMan.Storage.ETS, %{file_path: "./data/jobs.ets", give_away: {self(), :got_data}}},
  {SpiderMan.Storage.CSV, file: "./data/jobs.csv", headers: csv_headers}
]

# Spider settings; merged with the callbacks further down before the spider starts.
settings = [
  log2file: false,
  print_stats: false,
  downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
  spider_options: [pipelines: []],
  item_processor_options: [storage: storages]
]
Code for the spider
import SpiderMan
import SpiderMan.Utils
require Logger
# `init` callback: queues the request for the landing page, flagged
# :first_page so `handle_response` can tell it apart from later list pages,
# and returns the state it was given unchanged.
init = fn state ->
  first_page_request =
    base_url
    |> build_request()
    |> set_flag(:first_page)

  SpiderMan.insert_request(spider, first_page_request)
  state
end
# Local calendar date at evaluation time; reference point for "Today"/"Yesterday".
today = Date.from_erl!(:erlang.date())
utc_now = DateTime.utc_now()

# Maps the abbreviated month name produced by "%b" to the month part of an
# ISO-8601 date string: a zero-padded string for months 1..9 and the plain
# integer for 10..12 (both interpolate to two characters).
# `utc_now` only serves as a carrier struct for the month number.
months =
  for month_number <- 1..12, into: %{} do
    abbreviation = Calendar.strftime(%{utc_now | month: month_number}, "%b")
    iso_part = if month_number >= 10, do: month_number, else: "0#{month_number}"
    {abbreviation, iso_part}
  end
# Converts the date label of a job card into a `Date`.
#
# Accepted labels:
#   * "Today" / "Yesterday"  - relative to `today`
#   * "<day> <Mon> <year>"   - e.g. "5 Jan 2023"
#   * "<day> <Mon>"          - no year; resolved to the most recent such date
#                              that is not after `today`
#
# Raises:
#   * KeyError        - month abbreviation not present in `months`
#   * CaseClauseError - label splits into an unexpected number of parts
#   * ArgumentError   - parts do not form a valid calendar date
parse_date = fn
  "Today" ->
    today

  "Yesterday" ->
    Date.add(today, -1)

  label ->
    # Builds the date through its ISO-8601 text form. `months` already holds
    # two-character month parts; the day is left-padded to two characters.
    # Map.fetch!/2 makes an unknown month fail with a KeyError instead of
    # silently interpolating an empty string.
    to_date = fn year, month_name, day ->
      month = Map.fetch!(months, month_name)
      Date.from_iso8601!("#{year}-#{month}-#{String.pad_leading(day, 2, "0")}")
    end

    case String.split(label) do
      [day, month_name, year] ->
        to_date.(year, month_name, day)

      [day, month_name] ->
        candidate = to_date.(today.year, month_name, day)

        # A year-less label parsed shortly after New Year (e.g. "28 Dec" read
        # in January) would otherwise land in the future; it belongs to the
        # previous year.
        if Date.compare(candidate, today) == :gt do
          to_date.(today.year - 1, month_name, day)
        else
          candidate
        end
    end
end
# Parses one listing page and returns `%{items: [...]}` with one item per job.
#
# Arguments:
#   * body - HTML of the listing page
#   * n    - page number, used for logging only
#
# Every direct `<a>` child of the first ".offers-index" element is treated as
# a job card. The function is assertive: a page without ".offers-index", a
# card without an href, or a card whose ".control .tag" elements are not
# exactly six raises instead of producing partial items.
handle_list_page = fn body, n ->
  Logger.info("processing page #{n}")
  {:ok, document} = Floki.parse_document(body)

  job_cards =
    document
    |> Floki.find(".offers-index")
    |> hd()
    |> Floki.children(include_text: false)
    |> Enum.filter(&match?({"a", _, _}, &1))

  items =
    Enum.map(job_cards, fn job ->
      title = job |> Floki.find(".title strong") |> Floki.text() |> String.trim()
      sub_title = job |> Floki.find(".title small") |> Floki.text() |> String.trim()
      link = job |> Floki.attribute("a", "href") |> hd()

      # Of the six tags only positions 2, 4 and 6 are used: date, workplace, type.
      [_, date, _, workplace, _, type] =
        job
        |> Floki.find(".control .tag")
        |> Enum.map(fn tag -> tag |> Floki.text() |> String.trim() end)

      # `base_url` already ends with "/", so the first character of the href is
      # dropped before concatenating. The explicit `//1` step is required: the
      # bare `1..-1` form is a negative-step range, which String.slice/2
      # deprecates.
      absolute_link = base_url <> String.slice(link, 1..-1//1)

      build_item(
        link,
        %{
          link: absolute_link,
          title: title,
          sub_title: sub_title,
          date: parse_date.(date),
          workplace: workplace,
          type: type
        }
      )
    end)

  %{items: items}
end
# `handle_response` callback, dispatching on the flag attached to the request.
#
#   * :first_page     - reads the total page count from the "Showing page 1 of N"
#                       text, queues one request per remaining page and parses
#                       the jobs on page 1 itself.
#   * {:list_page, n} - parses the jobs on page n.
#
# Both clauses return the map produced by `handle_list_page`; the first one
# additionally carries the follow-up requests under :requests.
# Raises if the page-count text is not found in the first page.
handle_response = fn
  %{env: env, flag: :first_page}, _context ->
    total_page =
      ~r/Showing page 1 of (\d+)/
      |> Regex.run(env.body, capture: :all_but_first)
      |> hd()
      |> String.to_integer()

    Logger.info("total: #{total_page}")

    # The explicit `//1` step matters when there is only one page: a bare
    # `2..1` is a descending range and would queue pages 2 and 1 again.
    requests =
      Enum.map(2..total_page//1, fn n ->
        "/?page=#{n}"
        |> build_request()
        |> set_flag({:list_page, n})
      end)

    env.body
    |> handle_list_page.(1)
    |> Map.put(:requests, requests)

  %{env: env, flag: {:list_page, n}}, _context ->
    handle_list_page.(env.body, n)
end
# Hand the two anonymous callbacks to SpiderMan.CommonSpider and rebind
# `settings` to the result it returns; a non-{:ok, _} reply raises a
# MatchError. The final expression shows the resulting settings as the
# cell output.
callbacks = [
  {:init, init},
  {:handle_response, handle_response}
]

{:ok, settings} =
  SpiderMan.CommonSpider.check_callbacks_and_merge_settings(
    callbacks,
    settings
  )

settings
Run it!
# Clean old data. The directory is created first so that a first run, when
# "data" does not exist yet, does not fail in File.ls!/1.
File.mkdir_p!("data")

"data"
|> File.ls!()
|> Enum.each(&File.rm!("data/#{&1}"))

# The frame is rendered up front so it can be updated from the button listener.
frame = Kino.Frame.new()
Kino.render(frame)
button = Kino.Control.button("Run it!")
Kino.Frame.render(frame, button)

# On click: replace the button with a waiting note, start the spider, then
# refresh throughput statistics once per second until no task is left.
Kino.listen(button, fn _event ->
  md = Kino.Markdown.new("**Waiting...**")
  Kino.Frame.render(frame, md)
  {:ok, _} = SpiderMan.start(spider, settings)
  Process.sleep(2000)

  # The accumulator counts elapsed seconds; it starts at 2 to account for
  # the sleep above.
  Kino.listen(1000, 2, fn _e, acc ->
    if SpiderMan.check_zero_task?(spider) do
      SpiderMan.stop(spider)
      md = Kino.Markdown.new("**Spider Stopped**")
      Kino.Frame.append(frame, md)
      :halt
    else
      # One markdown bullet per component reported by SpiderMan.throughput/1.
      infos =
        spider
        |> SpiderMan.throughput()
        |> Enum.map_join("\n", fn info ->
          tps = if info.tps > 999, do: "999+", else: info.tps
          component = info.component |> Atom.to_string() |> Macro.camelize()

          "- #{component}:[Success/Total: #{info.success}/#{info.total} tps: #{tps}/s Failed:#{info.fail}]"
        end)

      md =
        Kino.Markdown.new("""
        **Used: #{acc} seconds**
        #{infos}
        """)

      Kino.Frame.render(frame, md)
      {:cont, acc + 1}
    end
  end)
end)
# Shows the scraped jobs as a sortable table once the ETS table is handed over.
#
# The frame is rendered before blocking in `receive`: a cell's return value
# is only displayed after evaluation finishes, so without this the
# "Waiting Result..." placeholder would never be visible.
result_frame = Kino.Frame.new()
Kino.render(result_frame)
md = Kino.Markdown.new("**Waiting Result...**")
Kino.Frame.render(result_frame, md)

# Blocks (no timeout) until the ETS table is transferred to this process with
# the :got_data tag configured under `give_away` in the storage settings.
receive do
  {:"ETS-TRANSFER", tid, _old_owner, :got_data} ->
    # Each ETS entry is a tuple whose second element is the item map;
    # newest jobs come first.
    table =
      tid
      |> :ets.tab2list()
      |> Stream.map(&elem(&1, 1))
      |> Enum.sort_by(& &1.date, {:desc, Date})
      |> Kino.DataTable.new(sorting_enabled: true)

    Kino.Frame.render(result_frame, table)
end

# The frame is already on screen; returning it again would display it twice.
Kino.nothing()