Notesclub

Elixir Companies

public-apps/elixir-companies.livemd

Elixir Companies

# Dependencies: spider_man + floki for scraping, nimble_csv for CSV handling,
# kino/explorer/kino_explorer for Livebook display and data frames.
deps = [
  {:kino, "~> 0.9.4"},
  {:spider_man, "~> 0.4.6"},
  {:floki, "~> 0.34.2"},
  {:nimble_csv, "~> 1.2"},
  {:explorer, "~> 0.5.7"},
  {:kino_explorer, "~> 0.1.6"}
]

Mix.install(deps)

Configure Settings

Build settings for spider

# Output files are written to the current working directory.
data_directory = System.get_env("PWD")
base_url = "https://elixir-companies.com/en/companies"

# A pool of desktop Chrome user-agent strings; the UserAgent middleware
# rotates through them per request.
user_agents = [
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
]

# Browser-like request headers; accept-encoding is paired with the
# DecompressResponse middleware below.
request_headers = [
  {"referer", base_url},
  {"accept",
   "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
  {"accept-encoding", "gzip, deflate"},
  {"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
]

requester_options = [
  base_url: base_url,
  middlewares: [
    {SpiderMan.Middleware.UserAgent, user_agents},
    {Tesla.Middleware.Headers, request_headers},
    Tesla.Middleware.DecompressResponse
  ]
]

# Column order for the CSV storage backend; each scraped item supplies these keys.
csv_headers = [
  :name,
  :industry,
  :location,
  :website_url,
  :github_url,
  :blog_url,
  :source_url,
  :source_description,
  :page_number
]

settings = [
  log2file: false,
  downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
  spider_options: [pipelines: []],
  item_processor_options: [
    # Persist items both as a raw ETS dump and as a CSV with a fixed header row.
    storage: [
      {SpiderMan.Storage.ETS, "#{data_directory}/companies.ets"},
      {SpiderMan.Storage.CSV, file: "#{data_directory}/companies.csv", headers: csv_headers}
    ]
  ]
]

Configure Parsing

Prepare callbacks for spider

import SpiderMan
import SpiderMan.Utils
require Logger

spider = SpiderList.ElixirCompanies

# Seed the spider with the first listing page. The :first_page flag routes
# its response to the clause that fans out the remaining page requests.
init = fn state ->
  first_request =
    base_url
    |> build_request()
    |> set_flag(:first_page)

  SpiderMan.insert_request(spider, first_request)

  state
end

# Parse one listing page of elixir-companies.com into spider items.
# body: raw HTML of the page; n: 1-based page number, recorded on every item.
# Returns %{items: [...]} as expected by SpiderMan's response pipeline.
handle_list_page = fn body, n ->
  Logger.info("processing page #{n}")
  {:ok, document} = Floki.parse_document(body)

  # Each company card is a direct <div> child of the #company-index container;
  # non-div children (text nodes are already excluded) are filtered out.
  companies =
    Floki.find(document, "#company-index")
    |> hd()
    |> Floki.children(include_text: false)
    |> Enum.filter(&match?({"div", _, _}, &1))

  items =
    Enum.map(companies, fn company ->
      # In the form en/companies/{id}
      source_url = Floki.attribute(company, ".company .title a", "href") |> hd()
      name = Floki.find(company, ".company .title a") |> Floki.text() |> String.trim()

      # The info paragraph's text carries one field per line
      # (industry first, location last).
      info =
        Floki.find(company, ".company .company-info p")
        |> Floki.text()
        |> String.trim()
        |> String.split("\n")

      industry = info |> List.first() |> String.trim()
      # We replace GitHub as the last value in the list if there is no location defined
      location = info |> List.last() |> String.trim() |> String.replace("GitHub", "")

      urls = Floki.attribute(company, ".company .company-info p a", "href")
      # Assumes the first link in the info paragraph is the company website —
      # TODO confirm against the page markup.
      website_url = urls |> List.first()
      # Process github or blog url from the remaining list items
      remaining_links = urls |> tl()

      # NOTE(review): the regex dot is unescaped, so "github.com" also matches
      # e.g. "githubXcom"; harmless in practice but worth knowing.
      github_url =
        remaining_links |> Enum.filter(&String.match?(&1, ~r/github.com/)) |> List.first()

      blog_url =
        remaining_links
        |> Enum.filter(&(String.match?(&1, ~r/github.com/) == false))
        |> List.first()

      # Collapse all whitespace runs in the description to single spaces.
      source_description =
        Floki.find(company, ".company .company-description p")
        |> Floki.text()
        |> String.trim()
        |> String.replace(~r/\s+/, " ")
        |> String.replace(~r/\t/, " ")

      # Logger.info("name: #{name}")
      # Logger.info("industry: #{industry}")
      # Logger.info("location: #{location}")
      # Logger.info("website_url: #{website_url}")
      # Logger.info("github_url: #{github_url}")
      # Logger.info("blog_url: #{blog_url}")
      # Logger.info("source_url: #{base_url <> String.slice(source_url, 13..-1)}")
      # Logger.info("source_description: #{source_description}")
      # Logger.info("page_number: #{to_string(n)}")

      # source_url is relative ("en/companies/{id}"); slicing from index 13
      # drops the "en/companies/" prefix before appending to base_url.
      build_item(
        source_url,
        %{
          name: name,
          industry: industry,
          location: location,
          website_url: website_url,
          github_url: github_url,
          blog_url: blog_url,
          source_url: base_url <> String.slice(source_url, 13..-1),
          source_description: source_description,
          page_number: n
        }
      )
    end)

  %{items: items}
end

# Two response clauses: the first page also fans out requests for the
# remaining listing pages; every other page is parsed directly.
handle_response = fn
  %{env: env, flag: :first_page}, _context ->
    # We have to guess here as it looks like it now stops at page 21
    # total_page =
    #   Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
    #   |> hd()
    #   |> String.to_integer()
    total_page = 41
    # total_page = 2

    Logger.info("total: #{total_page}")

    # Pages 2..total_page, each flagged with its page number so the second
    # clause can record it on the items.
    requests =
      for n <- 2..total_page do
        "/?page=#{n}"
        |> build_request()
        |> set_flag({:list_page, n})
      end

    env.body
    |> handle_list_page.(1)
    |> Map.put(:requests, requests)

  %{env: env, flag: {:list_page, n}}, _context ->
    handle_list_page.(env.body, n)
end

callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)

Executing

Run the spider

# Delete dumps from any previous run, then crawl until the request queue
# stays empty for 5 seconds.
for dump <- ["companies.csv", "companies.ets"] do
  File.rm_rf("#{data_directory}/#{dump}")
end

SpiderMan.run_until_zero(spider, settings, 5_000)

Sorting the Results

Sort the CSV by company name (the first column) ascending

alias NimbleCSV.RFC4180, as: CSV

# Header row for the sorted output; must match the spider's CSV storage order.
headers = [
  :name,
  :industry,
  :location,
  :website_url,
  :github_url,
  :blog_url,
  :source_url,
  :source_description,
  :page_number
]

sorted_path = "#{data_directory}/companies-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])

header = CSV.dump_to_iodata([headers])

# Sort rows by the first column (company name). NimbleCSV's parse_string/1
# skips the original header row by default, so only data rows are sorted.
# FIX: the capture operator was HTML-entity-garbled ("&amp;") and did not compile.
csv =
  "#{data_directory}/companies.csv"
  |> File.read!()
  |> CSV.parse_string()
  |> Enum.sort_by(&List.first/1, :asc)
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)

Display Information

This is where we want to set our summary or other sections using Kino.Markdown.new(). We can intersperse code blocks for displaying, filtering, or adding download buttons.

# Render the notebook's summary section.
summary = """
## Summary

Using the excellent spider_man library, we want to get the list of companies from the elixir-companies.com website in CSV format.

The website uses infinite scroll functionality but also accepts a `?page=` query string parameter to specify the page number to display first. Infinite scroll takes back over from this page though and starts back at 1. A number of websites use the `page` parameter, even when they do not specify page numbers.

The result of running this Livebook should return the sorted list of companies by name. I have added extra filtering that unfortunately has to be configured in the source. Hopefully at some point Livebook will allow for dynamic transformations and exports over the manual process I'm using now.
"""

Kino.Markdown.new(summary)

Display the CSV

Kino.Markdown.new("""
## Display Results

We build the full DataFrame first as we need a variable to bind the `Data transform` Smart Cell to.

""")

# Read the sorted CSV and load it into an Explorer DataFrame.
sorted_csv = File.read!(sorted_path)
dataframe = Explorer.DataFrame.load_csv!(sorted_csv)

Filtering companies

Kino.Markdown.new("""
## Filtering companies

We filter the DataFrame to display companies with location including `Remote` or `USA`.

**NOTE: Currently we're not using the Smart Cell as it crashes when detecting possible data frames**
""")

# filter/2 is a macro, so the module must be required first.
require Explorer.DataFrame

# Keep rows whose location column contains "Remote" or "USA" (substring match).
dataframe_filtered =
  dataframe
  |> Explorer.DataFrame.filter(contains(location, "Remote") or contains(location, "USA"))

Download Results

Kino.Markdown.new("""
## Download Results

In the event you wish to work with the CSV locally, the following button should download the contents.

**NOTE: There is also a down arrow icon next to the `x entries` section where you can download multiple formats like CSV or JSON. This is primarily for convenience.**
""")

filename = "companies-sorted.csv"
filtered_filename = "companies-filtered.csv"

# Kino.Download expects a zero-arity function: the file is read lazily,
# only when the user clicks the button.
download_file = fn ->
  File.read!(sorted_path)
end

# Export the filtered DataFrame to disk, then return its contents.
download_filtered = fn ->
  filtered_path = "#{data_directory}/#{filtered_filename}"
  Explorer.DataFrame.to_csv!(dataframe_filtered, filtered_path)
  File.read!(filtered_path)
end

Kino.Layout.grid([
  # FIX: the label interpolation was corrupted ("#(unknown)"); restored to
  # match the filtered button's "Download #{...}" pattern below.
  Kino.Download.new(download_file, filename: filename, label: "Download #{filename}"),
  Kino.Download.new(download_filtered,
    filename: filtered_filename,
    label: "Download #{filtered_filename}"
  )
])