Elixir Companies
Mix.install([
{:kino, "~> 0.9.4"},
{:spider_man, "~> 0.4.6"},
{:floki, "~> 0.34.2"},
{:nimble_csv, "~> 1.2"},
{:explorer, "~> 0.5.7"},
{:kino_explorer, "~> 0.1.6"}
])
Configure Settings
Build settings for spider
data_directory = "./data"
base_url = "https://elixir-companies.com/en/companies"
requester_options = [
base_url: base_url,
middlewares: [
{SpiderMan.Middleware.UserAgent,
[
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
]},
{Tesla.Middleware.Headers,
[
{"referer", base_url},
{"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
{"accept-encoding", "gzip, deflate"},
{"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
]},
Tesla.Middleware.DecompressResponse
]
]
settings = [
log2file: false,
downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
spider_options: [pipelines: []],
item_processor_options: [
storage: [
{SpiderMan.Storage.ETS, "#{data_directory}/companies.ets"},
{SpiderMan.Storage.CSV,
file: "#{data_directory}/companies.csv",
headers: [
:name,
:industry,
:location,
:website_url,
:github_url,
:blog_url,
:source_url,
:source_description,
:page_number
]}
]
]
]
Configure Parsing
Prepare callbacks for spider
import SpiderMan
import SpiderMan.Utils
require Logger
spider = SpiderList.ElixirCompanies
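# How the callbacks below fit together (added summary of the code that follows):
#   * init seeds the queue with a single request for the first page, flagged :first_page.
#   * handle_response for the first page decides how many list pages exist and
#     queues requests for pages 2..total_page, each flagged {:list_page, n}.
#   * handle_list_page parses one page of HTML with Floki and turns every
#     company entry into an item for the storage pipelines.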
init = fn state ->
build_request(base_url)
|> set_flag(:first_page)
|> then(&SpiderMan.insert_request(spider, &1))
state
end
handle_list_page = fn body, n ->
Logger.info("processing page #{n}")
{:ok, document} = Floki.parse_document(body)
companies =
Floki.find(document, "#company-index")
|> hd()
|> Floki.children(include_text: false)
|> Enum.filter(&match?({"div", _, _}, &1))
items =
Enum.map(companies, fn company ->
# In the form en/companies/{id}
source_url = Floki.attribute(company, ".company .title a", "href") |> hd()
name = Floki.find(company, ".company .title a") |> Floki.text() |> String.trim()
info =
Floki.find(company, ".company .company-info p")
|> Floki.text()
|> String.trim()
|> String.split("\n")
industry = info |> List.first() |> String.trim()
# If no location is defined, the last info entry is the "GitHub" link text, so strip it
location = info |> List.last() |> String.trim() |> String.replace("GitHub", "")
urls = Floki.attribute(company, ".company .company-info p a", "href")
website_url = urls |> List.first()
# Pull the GitHub or blog URL out of the remaining links
remaining_links = urls |> tl()
github_url =
remaining_links |> Enum.filter(&String.match?(&1, ~r/github.com/)) |> List.first()
blog_url =
remaining_links
|> Enum.reject(&String.match?(&1, ~r/github.com/))
|> List.first()
source_description =
Floki.find(company, ".company .company-description p")
|> Floki.text()
|> String.trim()
|> String.replace(~r/\s+/, " ")
|> String.replace(~r/\t/, " ")
# Logger.info("name: #{name}")
# Logger.info("industry: #{industry}")
# Logger.info("location: #{location}")
# Logger.info("website_url: #{website_url}")
# Logger.info("github_url: #{github_url}")
# Logger.info("blog_url: #{blog_url}")
# Logger.info("source_url: #{base_url <> String.slice(source_url, 13..-1)}")
# Logger.info("source_description: #{source_description}")
# Logger.info("page_number: #{to_string(n)}")
build_item(
source_url,
%{
name: name,
industry: industry,
location: location,
website_url: website_url,
github_url: github_url,
blog_url: blog_url,
source_url: base_url <> String.slice(source_url, 13..-1),
source_description: source_description,
page_number: n
}
)
end)
%{items: items}
end
handle_response = fn
%{env: env, flag: :first_page}, _context ->
# We have to guess here as it looks like it now stops at page 21
# total_page =
# Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
# |> hd()
# |> String.to_integer()
total_page = 41
# total_page = 2
Logger.info("total: #{total_page}")
requests =
Enum.map(2..total_page, fn n ->
build_request("/?page=#{n}")
|> set_flag({:list_page, n})
end)
handle_list_page.(env.body, 1)
|> Map.put(:requests, requests)
%{env: env, flag: {:list_page, n}}, _context ->
handle_list_page.(env.body, n)
end
callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)
Executing
Run the spider
# Delete previous dumps
File.rm_rf("#{data_directory}/companies.csv")
File.rm_rf("#{data_directory}/companies.ets")
SpiderMan.run_until_zero(spider, settings, 5_000)
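A quick way to sanity-check the crawl without opening the CSV is to load the ETS dump back into memory. This is an added sketch, assuming `SpiderMan.Storage.ETS` persisted the table with Erlang's standard `:ets.tab2file` mechanism.
# Hedged sketch (not part of the original notebook): inspect a few stored items.
with {:ok, table} <- :ets.file2tab(String.to_charlist("#{data_directory}/companies.ets")) do
  table
  |> :ets.tab2list()
  |> Enum.take(3)
  |> IO.inspect(label: "sample items")
end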
Sorting the Results
Sort the CSV by company name (the first column), ascending
alias NimbleCSV.RFC4180, as: CSV
headers = [
:name,
:industry,
:location,
:website_url,
:github_url,
:blog_url,
:source_url,
:source_description,
:page_number
]
sorted_path = "#{data_directory}/companies-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])
header = CSV.dump_to_iodata([headers])
csv =
"#{data_directory}/companies.csv"
|> File.read!()
|> CSV.parse_string()
|> Enum.sort_by(&List.first/1, :asc)
|> CSV.dump_to_iodata()
:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)
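If you would rather order the rows by page number, the same pipeline works with a numeric sort key: the page number is the last field in each row. This is an added sketch (the `page_sorted_csv` name is mine), kept separate from the flow above.
# Added sketch: sort numerically by page_number, the last column of each row.
page_sorted_csv =
  "#{data_directory}/companies.csv"
  |> File.read!()
  |> CSV.parse_string()
  |> Enum.sort_by(fn row -> row |> List.last() |> String.to_integer() end)
  |> CSV.dump_to_iodata()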
Display Information
This is where we want to set our summary or other sections using `Kino.Markdown.new()`. We can intersperse code blocks for displaying, filtering, or adding download buttons.
Kino.Markdown.new("""
## Summary
Using the excellent spider_man library, we want to get the list of companies from the elixir-companies.com website in CSV format.
The website uses infinite scroll, but it also accepts a `?page=` query string parameter to choose which page to display first. Infinite scroll then takes over again from that page and restarts at 1. A number of websites accept a `page` parameter like this even when they never display page numbers.
Running this Livebook should produce the list of companies sorted by name. I have added extra filtering that unfortunately has to be configured in the source. Hopefully Livebook will eventually allow dynamic transformations and exports instead of the manual process I'm using now.
""")
Display the CSV
Kino.Markdown.new("""
## Display Results
We build the full DataFrame first as we need a variable to bind the `Data transform` Smart Cell to.
""")
dataframe =
sorted_path
|> File.read!()
|> Explorer.DataFrame.load_csv!()
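Before filtering, it can help to confirm the frame loaded the columns you expect. This small check is an addition to the original notebook:
# Row count and column names of the loaded frame.
{Explorer.DataFrame.n_rows(dataframe), Explorer.DataFrame.names(dataframe)}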
Filtering companies
Kino.Markdown.new("""
## Filtering companies
We filter the DataFrame to display companies whose location includes `Remote` or `USA`.
**NOTE: Currently we're not using the Smart Cell as it crashes while detecting possible data frames.**
""")
require Explorer.DataFrame
dataframe_filtered =
dataframe
|> Explorer.DataFrame.filter(contains(location, "Remote") or contains(location, "USA"))
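If you only care about a few columns of the filtered frame, `Explorer.DataFrame.select/2` narrows the view. An added sketch; adjust the column list to taste:
# Keep just the columns that matter for a quick scan of the filtered companies.
dataframe_filtered
|> Explorer.DataFrame.select(["name", "location", "website_url"])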
Download Results
Kino.Markdown.new("""
## Download Results
In the event you wish to work with the CSV locally, the following button should download the contents.
**NOTE: There is also a down arrow icon next to the `x entries` section that lets you download the data in formats like CSV or JSON. The buttons below are primarily a convenience.**
""")
filename = "companies-sorted.csv"
filtered_filename = "companies-filtered.csv"
# Kino.Download.new/2 takes a zero-arity function, so the file contents are only read when the download button is clicked.
download_file = fn ->
sorted_path
|> File.read!()
end
download_filtered = fn ->
dataframe_filtered |> Explorer.DataFrame.to_csv!("#{data_directory}/#{filtered_filename}")
"#{data_directory}/#{filtered_filename}"
|> File.read!()
end
Kino.Layout.grid([
Kino.Download.new(download_file, filename: filename, label: "Download #{filename}"),
Kino.Download.new(download_filtered,
filename: filtered_filename,
label: "Download #{filtered_filename}"
)
])