
Game data mälving

Mix.install(
  [
    {:poison, "~> 3.1"},
    {:crawly, "~> 0.16"},
    {:floki, "~> 0.35"},
    {:kino_vega_lite, "~> 0.1.10"}
  ],
  config: [
    crawly: [
      middlewares: [
        Crawly.Middlewares.DomainFilter,
        # be nice and honor robots.txt
        Crawly.Middlewares.RobotsTxt,
        Crawly.Middlewares.UniqueRequest,
        {Crawly.Middlewares.UserAgent,
         user_agents: [
           # be nice and say who we are
           "Mozilla/5.0 (compatible; guess the game analysis bot; github.com/ilkka)"
         ]}
      ],
      pipelines: [
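        # encode each scraped item as JSON and write it, one item per line, to a .jl file under /data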
        Crawly.Pipelines.JSONEncoder,
        {Crawly.Pipelines.WriteToFile, folder: "/data", extension: "jl", include_timestamp: false}
      ]
    ]
  ]
)

Get game titles from data

If we had used the crawler from the Mix project, this code would parse its results: read the JSON Lines data, decode each line, and grab the game titles.

# games =
#  File.read!("/data/results-2022.jl")
#  |> String.splitter("\n")
#  |> Stream.filter(fn s -> String.length(s) > 0 end)
#  |> Stream.map(&Poison.decode!/1)
#  |> Enum.into([])
#  |> Enum.map(fn g -> Map.fetch!(g, "game") end)

Get game titles from API

response = Crawly.fetch("https://api.guessthe.game/api/all_puzzle_titles")
{:ok, answers} = Poison.decode(response.body)
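# `answers` is assumed to look like %{"puzzle_titles" => [["Game title", ...], ...]},
# hence taking the first element of each entry below.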

games =
  answers
  |> Map.get("puzzle_titles")
  |> Enum.map(&List.first/1)

length(games)

Get publication years from Wikipedia

We read Wikidata because it provides structured data keyed by Wikipedia page title. The trouble is that page titles are not always easily guessable from the game titles. For example we need to drop “Sid Meier’s” from the Civilization titles, and another particular change is “NieR:Automata”, whose actual Wikipedia page title is the less stylized “Nier: Automata”. Many of the missing data points are probably due to transformations that are still missing here.
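
As a quick sanity check, here is what those two replacements (copied verbatim from the spider’s game_to_url/1 below) do to a couple of example titles:

# the regexes below are the same ones used in game_to_url/1
["Sid Meier’s Civilization VI", "NieR:Automata"]
|> Enum.map(fn game ->
  game
  |> String.replace(~r/Sid Meier\S+s /, "")
  |> String.replace(~r/NieR:/, "Nier: ")
end)
# => ["Civilization VI", "Nier: Automata"]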

Let’s hope they don’t throttle us…

defmodule WikiGameYearSpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url do
    "https://www.wikidata.org"
  end

  defp game_to_url(game) do
    wikiurl_title =
      game
      |> String.replace(~r/Sid Meier\S+s /, "")
      |> String.replace(~r/NieR:/, "Nier: ")
      |> String.replace(" ", "+")

    "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=#{wikiurl_title}&languages=en&format=json"
  end

  @impl Crawly.Spider
  def init(opts) do
    urls =
      opts[:games]
      |> Enum.map(&game_to_url/1)

    [start_urls: urls]
  end

  @impl Crawly.Spider
  def parse_item(response) do
    item =
      case Poison.decode(response.body) do
        {:ok, data} ->
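          # "entities" is a map keyed by Wikidata entity ID; a title that
          # wasn't found comes back under the key "-1"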
          case data["entities"] |> Enum.at(0) do
            {"-1", _} ->
              %{url: response.request_url, error: "Game not found"}

            {_entityid, entity} ->
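              # P577 is the Wikidata "publication date" property; take the
              # first claim's timestamp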
              {:ok, pubdate, _offset} =
                entity
                |> get_in(["claims", "P577"])
                |> Enum.at(0)
                |> get_in(["mainsnak", "datavalue", "value", "time"])
                |> DateTime.from_iso8601()

              game =
                entity
                |> get_in(["labels", "en", "value"])

              %{year: pubdate.year, game: game}
          end

        _ ->
          %{url: response.request_url, error: "Error from API"}
      end

    %Crawly.ParsedItem{
      :items => [item],
      :requests => []
    }
  end
end

# Uncomment to actually crawl (or just use existing data, below):
# Crawly.Engine.start_spider(WikiGameYearSpider, games: games)
# Uncomment and evaluate to stop a run:
# Crawly.Engine.stop_spider(WikiGameYearSpider)

Read the Wikidata results

games_years =
  File.read!("/data/WikiGameYearSpider_f310dbfa-7731-11ee-a5b4-0242ac110002.jl")
  |> String.splitter("\n")
  |> Stream.filter(fn s -> String.length(s) > 0 end)
  |> Stream.map(&Poison.decode!/1)
  |> Enum.into([])
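  # drop the error items (they only have "url" and "error" keys, no "game")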
  |> Enum.filter(fn g -> Map.has_key?(g, "game") end)

length(games_years)

Crawl game publication data

For this we need another spider since we’re using a different site.

First though, some prototyping on how we’re gonna do this. How do we generate the requests for the individual years?

{:ok, document} =
  "https://videogamesstats.com/video-games-by-release-year-1971-today/"
  |> Crawly.fetch()
  |> Map.get(:body)
  |> Floki.parse_document()

document
|> Floki.find(".entry-content ul li a")
|> Floki.attribute("href")
|> Enum.filter(fn x -> String.match?(x, ~r/\/ReleaseYear\/[0-9]+/) end)

Alrighty, then how to find all the pages for a given year? Find the page URLs, find the biggest page number, boom.

document =
  "https://videogamesstats.com/ReleaseYear/2009/"
  |> Crawly.fetch()
  |> Map.get(:body)
  |> Floki.parse_document!()

last_page_num =
  document
  |> Floki.find("a.page-numbers")
  |> Floki.attribute("href")
  |> Enum.map(fn s -> String.replace(s, ~r/.*\/page\/([0-9]+).*/, "\\1") end)
  |> Enum.map(&String.to_integer/1)
  |> Enum.sort()
  |> List.last()

Make links:

template_page_link =
  document
  |> Floki.find("a.page-numbers")
  |> Floki.attribute("href")
  |> List.first()

for n <- 2..last_page_num do
  "#{String.replace(template_page_link, ~r/\/page\/.*/, "")}/page/#{n}"
end

Ok then how to find games from a game listing page and create data items from that:

document
|> Floki.find(".entry-content article a")
|> Floki.attribute("title")
|> Enum.uniq()
|> Enum.map(fn g -> %{:year => "2009", :game => g} end)

Finally we need to know how to grab the year from the URL, since we want to store that with the items. Otherwise we can’t really do year stats.

"https://videogamesstats.com/ReleaseYear/2009/page/5"
|> String.replace(~r/.*\/ReleaseYear\/([0-9]+).*/, "\\1")

Test generating page links:

# reuse last_page_num computed from the 2009 listing above
year = 2009

2..last_page_num
|> Enum.map(fn n -> "https://videogamesstats.com/ReleaseYear/#{year}/page/#{n}" end)

Ok, that proto seems to work, now define the spider:

defmodule GamesstatsSpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url do
    "https://videogamesstats.com/"
  end

  @impl Crawly.Spider
  def init(_options \\ %{}) do
    [
      start_urls: ["https://videogamesstats.com/video-games-by-release-year-1971-today/"]
    ]
  end

  @impl Crawly.Spider
  def parse_item(response) do
    # Always need to parse
    case Floki.parse_document(response.body) do
      {:ok, document} ->
        # Parsing successful, what page was it?
        cond do
          String.match?(response.request_url, ~r/video-games-by-release-year/) ->
            # It was the main years listing, make individual year requests
            %Crawly.ParsedItem{
              :items => [],
              :requests => make_year_requests(document)
            }

          String.match?(response.request_url, ~r/\/ReleaseYear\/[0-9]+\/page/) ->
            # It was page 2..n for a year
            year = response.request_url |> String.replace(~r/.*\/ReleaseYear\/([0-9]+).*/, "\\1")

            %Crawly.ParsedItem{
              :items => parse_games_from_page(document, year),
              # no new requests, all pages are in queue already
              :requests => []
            }

          String.match?(response.request_url, ~r/\/ReleaseYear\/[0-9]+/) ->
            # It was the first page for a year. Only generate page requests from
            # this one and not from pages 2..n, although the dupes filter _should_
            # let us just regenerate requests for _all_ pages without actually
            # performing them again. Then we wouldn't need the cond clause above.
            # Anyway, this seems safer.
            year = response.request_url |> String.replace(~r/.*\/ReleaseYear\/([0-9]+).*/, "\\1")

            %Crawly.ParsedItem{
              :items => parse_games_from_page(document, year),
              :requests => make_page_requests(document, year)
            }
        end

      _ ->
        %Crawly.ParsedItem{:items => [], :requests => []}
    end
  end

  # Finds links to pages for listings of games for individual years from the document,
  # and returns a list of requests for those pages.
  #
  # The years probably have more than one page of games, but these will be the first pages.
  defp make_year_requests(document) do
    document
    |> Floki.find(".entry-content ul li a")
    |> Floki.attribute("href")
    |> Enum.filter(fn x -> String.match?(x, ~r/\/ReleaseYear\/[0-9]+/) end)
    |> Enum.map(&Crawly.Utils.request_from_url/1)
  end

  # Takes the first page of results doc, finds out how many pages the year has in total,
  # and returns a list of requests for pages 2..n.
  defp make_page_requests(document, year) do
    last_page_num =
      document
      |> Floki.find("a.page-numbers")
      |> Floki.attribute("href")
      |> Enum.map(fn s -> String.replace(s, ~r/.*\/page\/([0-9]+).*/, "\\1") end)
      |> Enum.map(&String.to_integer/1)
      |> Enum.sort()
      |> List.last()

    # This actually breaks for years with only a single page of games, because
    # then there are no pagination links, last_page_num is nil, and 2..nil is
    # invalid. As such years are outliers (only a few in the 70s to early 80s)
    # we don't care.
    2..last_page_num
    |> Enum.map(fn n -> "https://videogamesstats.com/ReleaseYear/#{year}/page/#{n}/" end)
    |> Enum.map(&Crawly.Utils.request_from_url/1)
  end

  # Document is a game listing page, so find all the game titles
  # and return a list of maps we want to store, including the title
  # and the publication year.
  defp parse_games_from_page(document, year) do
    document
    |> Floki.find(".entry-content article a")
    |> Floki.attribute("title")
    # there's multiple links to the same game, drop dupes
    |> Enum.uniq()
    |> Enum.map(fn g -> %{:year => String.to_integer(year), :game => g} end)
  end
end
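
If the single-page years ever start to matter, the nil case is easy to guard against. Here is a minimal sketch of a safer variant of make_page_requests/2 (SafePageRequests is just an illustrative name; the selectors and URL scheme are the same as in the spider above):

defmodule SafePageRequests do
  # Same logic as GamesstatsSpider.make_page_requests/2, but a year whose listing
  # has no pagination links simply produces no extra requests instead of crashing
  # on 2..nil.
  def make(document, year) do
    last_page_num =
      document
      |> Floki.find("a.page-numbers")
      |> Floki.attribute("href")
      |> Enum.map(fn s -> String.replace(s, ~r/.*\/page\/([0-9]+).*/, "\\1") end)
      |> Enum.map(&String.to_integer/1)
      |> Enum.sort()
      |> List.last()

    case last_page_num do
      # only one page of games for this year, nothing more to fetch
      nil ->
        []

      last ->
        2..last
        |> Enum.map(fn n -> "https://videogamesstats.com/ReleaseYear/#{year}/page/#{n}/" end)
        |> Enum.map(&Crawly.Utils.request_from_url/1)
    end
  end
end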

Now run the videogamesstats spider:

# uncomment or use static data
# Crawly.Engine.start_spider(GamesstatsSpider)
# this can be used to monitor the execution, i.e. what requests
# are in the queue.
{:requests, req} = Crawly.RequestsStorage.requests(GamesstatsSpider)

req
|> Enum.map(&Map.get(&1, :url))
# uncomment to stop prematurely
# Crawly.Engine.stop_spider(GamesstatsSpider)

Chart the results against number of games published

First we need to load the publication data. We read everything into memory; the dataset is not very big, and this keeps the charting step simple.

publications_per_year =
  File.stream!(
    "/data/GamesstatsSpider_837d495c-7738-11ee-bf8d-0242ac110002.jl",
    [],
    :line
  )
  |> Stream.map(&Poison.decode!/1)
  |> Enum.into([])

VegaLite.new(width: 600, height: 300)
|> VegaLite.layers([
  VegaLite.new()
  |> VegaLite.data_from_values(games_years, only: ["year"])
  |> VegaLite.mark(:bar, color: "blue")
  |> VegaLite.encode_field(:x, "year", type: :quantitative, scale: [type: :linear], title: "Year")
  |> VegaLite.encode(:y, aggregate: :count, title: "GtG answers"),
  VegaLite.new()
  |> VegaLite.data_from_values(publications_per_year, only: ["year"])
  |> VegaLite.mark(:line, color: "green")
  |> VegaLite.encode_field(:x, "year", type: :quantitative)
  |> VegaLite.encode(:y, aggregate: :count, title: "Published games")
])
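# let each layer keep its own y scale instead of sharing one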
|> VegaLite.resolve(:scale, y: :independent)