Notesclub

Elixir Companies

public-apps/elixir-companies.livemd

Elixir Companies

# Dependencies: spider_man + floki for scraping, nimble_csv for CSV handling,
# kino/explorer/kino_explorer for Livebook display and data frames.
deps = [
  {:kino, "~> 0.9.4"},
  {:spider_man, "~> 0.4.6"},
  {:floki, "~> 0.34.2"},
  {:nimble_csv, "~> 1.2"},
  {:explorer, "~> 0.5.7"},
  {:kino_explorer, "~> 0.1.6"}
]

Mix.install(deps)

Configure Settings

Build settings for spider

# Output files are written to the current working directory.
data_directory = System.get_env("PWD")
base_url = "https://elixir-companies.com/en/companies"

# A pool of desktop Chrome user-agent strings; the UserAgent middleware
# rotates through them per request.
user_agents = [
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
]

# Browser-like request headers; accept-encoding is paired with the
# DecompressResponse middleware below.
request_headers = [
  {"referer", base_url},
  {"accept",
   "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
  {"accept-encoding", "gzip, deflate"},
  {"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
]

requester_options = [
  base_url: base_url,
  middlewares: [
    {SpiderMan.Middleware.UserAgent, user_agents},
    {Tesla.Middleware.Headers, request_headers},
    Tesla.Middleware.DecompressResponse
  ]
]

# Column order for the CSV storage backend; each scraped item supplies these keys.
csv_headers = [
  :name,
  :industry,
  :location,
  :website_url,
  :github_url,
  :blog_url,
  :source_url,
  :source_description,
  :page_number
]

settings = [
  log2file: false,
  downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
  spider_options: [pipelines: []],
  item_processor_options: [
    # Persist items both as a raw ETS dump and as a CSV with a fixed header row.
    storage: [
      {SpiderMan.Storage.ETS, "#{data_directory}/companies.ets"},
      {SpiderMan.Storage.CSV, file: "#{data_directory}/companies.csv", headers: csv_headers}
    ]
  ]
]

Configure Parsing

Prepare callbacks for spider

import SpiderMan
import SpiderMan.Utils
require Logger

spider = SpiderList.ElixirCompanies

# Seed the spider with the first listing page. The :first_page flag routes
# its response to the clause that fans out the remaining page requests.
init = fn state ->
  first_request =
    base_url
    |> build_request()
    |> set_flag(:first_page)

  SpiderMan.insert_request(spider, first_request)

  state
end

# Parse one listing page of elixir-companies.com into spider items.
# body: raw HTML of the page; n: 1-based page number, recorded on every item.
# Returns %{items: [...]} as expected by SpiderMan's response pipeline.
handle_list_page = fn body, n ->
  Logger.info("processing page #{n}")
  {:ok, document} = Floki.parse_document(body)

  # Each company card is a direct <div> child of the #company-index container;
  # non-div children (text nodes are already excluded) are filtered out.
  companies =
    Floki.find(document, "#company-index")
    |> hd()
    |> Floki.children(include_text: false)
    |> Enum.filter(&match?({"div", _, _}, &1))

  items =
    Enum.map(companies, fn company ->
      # In the form en/companies/{id}
      source_url = Floki.attribute(company, ".company .title a", "href") |> hd()
      name = Floki.find(company, ".company .title a") |> Floki.text() |> String.trim()

      # The info paragraph's text carries one field per line
      # (industry first, location last).
      info =
        Floki.find(company, ".company .company-info p")
        |> Floki.text()
        |> String.trim()
        |> String.split("\n")

      industry = info |> List.first() |> String.trim()
      # We replace GitHub as the last value in the list if there is no location defined
      location = info |> List.last() |> String.trim() |> String.replace("GitHub", "")

      urls = Floki.attribute(company, ".company .company-info p a", "href")
      # Assumes the first link in the info paragraph is the company website —
      # TODO confirm against the page markup.
      website_url = urls |> List.first()
      # Process github or blog url from the remaining list items
      remaining_links = urls |> tl()

      # NOTE(review): the regex dot is unescaped, so "github.com" also matches
      # e.g. "githubXcom"; harmless in practice but worth knowing.
      github_url =
        remaining_links |> Enum.filter(&String.match?(&1, ~r/github.com/)) |> List.first()

      blog_url =
        remaining_links
        |> Enum.filter(&(String.match?(&1, ~r/github.com/) == false))
        |> List.first()

      # Collapse all whitespace runs in the description to single spaces.
      source_description =
        Floki.find(company, ".company .company-description p")
        |> Floki.text()
        |> String.trim()
        |> String.replace(~r/\s+/, " ")
        |> String.replace(~r/\t/, " ")

      # Logger.info("name: #{name}")
      # Logger.info("industry: #{industry}")
      # Logger.info("location: #{location}")
      # Logger.info("website_url: #{website_url}")
      # Logger.info("github_url: #{github_url}")
      # Logger.info("blog_url: #{blog_url}")
      # Logger.info("source_url: #{base_url <> String.slice(source_url, 13..-1)}")
      # Logger.info("source_description: #{source_description}")
      # Logger.info("page_number: #{to_string(n)}")

      # source_url is relative ("en/companies/{id}"); slicing from index 13
      # drops the "en/companies/" prefix before appending to base_url.
      build_item(
        source_url,
        %{
          name: name,
          industry: industry,
          location: location,
          website_url: website_url,
          github_url: github_url,
          blog_url: blog_url,
          source_url: base_url <> String.slice(source_url, 13..-1),
          source_description: source_description,
          page_number: n
        }
      )
    end)

  %{items: items}
end

# Two response clauses: the first page also fans out requests for the
# remaining listing pages; every other page is parsed directly.
handle_response = fn
  %{env: env, flag: :first_page}, _context ->
    # We have to guess here as it looks like it now stops at page 21
    # total_page =
    #   Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
    #   |> hd()
    #   |> String.to_integer()
    total_page = 41
    # total_page = 2

    Logger.info("total: #{total_page}")

    # Pages 2..total_page, each flagged with its page number so the second
    # clause can record it on the items.
    requests =
      for n <- 2..total_page do
        "/?page=#{n}"
        |> build_request()
        |> set_flag({:list_page, n})
      end

    env.body
    |> handle_list_page.(1)
    |> Map.put(:requests, requests)

  %{env: env, flag: {:list_page, n}}, _context ->
    handle_list_page.(env.body, n)
end

callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)

Executing

Run the spider

# Delete dumps from any previous run, then crawl until the request queue
# stays empty for 5 seconds.
for dump <- ["companies.csv", "companies.ets"] do
  File.rm_rf("#{data_directory}/#{dump}")
end

SpiderMan.run_until_zero(spider, settings, 5_000)

Sorting the Results

Sort the CSV by company name (the first column) ascending

alias NimbleCSV.RFC4180, as: CSV

# Header row for the sorted output; must match the spider's CSV storage order.
headers = [
  :name,
  :industry,
  :location,
  :website_url,
  :github_url,
  :blog_url,
  :source_url,
  :source_description,
  :page_number
]

sorted_path = "#{data_directory}/companies-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])

header = CSV.dump_to_iodata([headers])

# Sort rows by the first column (company name). NimbleCSV's parse_string/1
# skips the original header row by default, so only data rows are sorted.
# FIX: the capture operator was HTML-entity-garbled ("&amp;") and did not compile.
csv =
  "#{data_directory}/companies.csv"
  |> File.read!()
  |> CSV.parse_string()
  |> Enum.sort_by(&List.first/1, :asc)
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)

Display Information

This is where we want to set our summary or other sections using Kino.Markdown.new(). We can intersperse code blocks for displaying, filtering, or adding download buttons.

# Render the notebook's summary section.
summary = """
## Summary

Using the excellent spider_man library, we want to get the list of companies from the elixir-companies.com website in CSV format.

The website uses infinite scroll functionality but also accepts a `?page=` query string parameter to specify the page number to display first. Infinite scroll takes back over from this page though and starts back at 1. A number of websites use the `page` parameter, even when they do not specify page numbers.

The result of running this Livebook should return the sorted list of companies by name. I have added extra filtering that unfortunately has to be configured in the source. Hopefully at some point Livebook will allow for dynamic transformations and exports over the manual process I'm using now.
"""

Kino.Markdown.new(summary)

Display the CSV

Kino.Markdown.new("""
## Display Results

We build the full DataFrame first as we need a variable to bind the `Data transform` Smart Cell to.

""")

# Read the sorted CSV and load it into an Explorer DataFrame.
sorted_csv = File.read!(sorted_path)
dataframe = Explorer.DataFrame.load_csv!(sorted_csv)

Filtering companies

Kino.Markdown.new("""
## Filtering companies

We filter the DataFrame to display companies with location including `Remote` or `USA`.

**NOTE: Currently we're not using the Smart Cell as it crashes when detecting possible data frames**
""")

# filter/2 is a macro, so the module must be required first.
require Explorer.DataFrame

# Keep rows whose location column contains "Remote" or "USA" (substring match).
dataframe_filtered =
  dataframe
  |> Explorer.DataFrame.filter(contains(location, "Remote") or contains(location, "USA"))

Download Results

Kino.Markdown.new("""
## Download Results

In the event you wish to work with the CSV locally, the following button should download the contents.

**NOTE: There is also a down arrow icon next to the `x entries` section where you can download multiple formats like CSV or JSON. This is primarily for convenience.**
""")

filename = "companies-sorted.csv"
filtered_filename = "companies-filtered.csv"

# Kino.Download expects a zero-arity function: the file is read lazily,
# only when the user clicks the button.
download_file = fn ->
  File.read!(sorted_path)
end

# Export the filtered DataFrame to disk, then return its contents.
download_filtered = fn ->
  filtered_path = "#{data_directory}/#{filtered_filename}"
  Explorer.DataFrame.to_csv!(dataframe_filtered, filtered_path)
  File.read!(filtered_path)
end

Kino.Layout.grid([
  # FIX: the label interpolation was corrupted ("#(unknown)"); restored to
  # match the filtered button's "Download #{...}" pattern below.
  Kino.Download.new(download_file, filename: filename, label: "Download #{filename}"),
  Kino.Download.new(download_filtered,
    filename: filtered_filename,
    label: "Download #{filtered_filename}"
  )
])