Query archive.org

Mix.install([
  {:req, "~> 0.5.0"},
  {:kino, "~> 0.14.0"}
])

Supporting code

Req.get!("https://api.github.com/repos/wojtekmach/req").body["description"]

Arch: operates on a list of items extracted from the API

[
  %{
    digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
    length: "20432",
    mimetype: "image/gif",
    original: "http://www.pangea.va.it:80/agora/agb.gif",
    statuscode: "200",
    timestamp: "19980627141111",
    urlkey: "it,va,pangea)/agora/agb.gif"
  }
]

defmodule Arch do
  def valid(items) do
    items
    |> Enum.filter(fn %{statuscode: status} -> status == "200" end)
  end

  def filter_by_type(items, mimetype) do
    items
    |> Enum.filter(fn %{mimetype: m} -> String.starts_with?(m, mimetype) end)
  end

  def filter_by_name(items, substring) do
    items
    |> Enum.filter(fn %{original: url} ->
      String.contains?(String.downcase(url), String.downcase(substring))
    end)
  end

  def excluding_name(items, substring) do
    items
    |> Enum.filter(fn %{original: url} ->
      !String.contains?(String.downcase(url), String.downcase(substring))
    end)
  end

  # Keeps items larger than `length_in_kb` kilobytes (the CDX `length` field is in bytes).
  def min_length(items, length_in_kb) do
    items
    |> Enum.filter(fn %{length: len} ->
      {val, ""} = Integer.parse(len)
      val > length_in_kb * 1024
    end)
  end

  @doc """
  Dato un set di items,
  """
  def latest_version_by_url(items) do
    items
    |> Enum.group_by(fn %{original: url} -> url end)
    |> Enum.map(fn {_k, v} ->
      v
      |> Enum.sort_by(fn %{timestamp: t} -> t end, :desc)
      |> List.first()
    end)
  end

  @doc """
  Di tutti gli elementi con lo stesso digest, tiene il più antico.
  """
  def unique_by_digest(items) do
    items
    |> Enum.group_by(fn %{digest: d} -> d end)
    |> Enum.map(fn {_k, v} ->
      v
      |> Enum.sort_by(fn %{timestamp: t} -> t end, :desc)
      |> List.first(v)
    end)
  end

  # Builds the Wayback Machine URL for a capture, e.g.
  #   archive_url(%{timestamp: "19980627141111", original: "http://www.pangea.va.it:80/agora/agb.gif"})
  #   => "https://web.archive.org/web/19980627141111/http://www.pangea.va.it:80/agora/agb.gif"
  def archive_url(%{timestamp: t, original: u}) do
    "https://web.archive.org/web/#{URI.encode(t)}/#{URI.encode(u)}"
  end

  def with_archive_url(items) do
    items
    |> Enum.map(fn i -> Map.put(i, :archive_url, archive_url(i)) end)
  end

  def statistics(items) do
    items
    |> Enum.group_by(fn %{mimetype: m} -> m end)
    |> Enum.map(fn {k, v} ->
      unq = v |> Enum.map(fn %{original: url} -> url end) |> Enum.uniq()
      %{mimetype: k, unique_urls: length(unq), all_items: length(v)}
    end)
  end

  @doc """
  Stampa le statistiche (divise per mime type)
  """
  def statistics_kino(items) do
    items
    |> statistics()
    |> Kino.DataTable.new(
      keys: [:mimetype, :unique_urls, :all_items],
      name: "Statistics for #{length(items)} items"
    )
  end

  @doc """
  Stampa una serie di URL come tabella Kino
  """
  def urls_kino(items) do
    items
    |> with_archive_url()
    |> Kino.DataTable.new(
      keys: [:original, :timestamp, :archive_url],
      name: "Found #{length(items)} items"
    )
  end
end
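
A quick sketch of how the filters compose, using the one-element sample list from above:

sample = [
  %{
    digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
    length: "20432",
    mimetype: "image/gif",
    original: "http://www.pangea.va.it:80/agora/agb.gif",
    statuscode: "200",
    timestamp: "19980627141111",
    urlkey: "it,va,pangea)/agora/agb.gif"
  }
]

sample
|> Arch.valid()
|> Arch.filter_by_type("image")
|> Arch.min_length(15)
# => the item survives: status 200, mimetype "image/gif", 20432 bytes > 15 * 1024
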
defmodule HTML do
  use Kino.JS

  def new(html) do
    Kino.JS.new(__MODULE__, html)
  end

  def img_table(all_l, opts \\ []) when is_list(all_l) do
    # `page` is 1-based in the options, 0-based internally
    page =
      case Keyword.get(opts, :page, 1) do
        p when p >= 1 -> p - 1
        _ -> 0
      end

    pagesize = Keyword.get(opts, :pagesize, 1000)
    pages = ceil(length(all_l) / pagesize)

    l =
      all_l
      |> Enum.drop(page * pagesize)
      |> Enum.take(pagesize)

    # split into 4 columns
    cols =
      l
      |> Enum.with_index()
      |> Enum.group_by(fn {_e, i} -> rem(i, 4) end, fn {e, _i} -> e end)

    rendered_cols =
      for c <- [0, 1, 2, 3] do
        images =
          Map.get(cols, c, [])
          |> Enum.map(&img_html/1)

        """
        <div class="column">
          #{Enum.join(images)}
        </div>
        """
      end

    """
    <p>All items: #{length(all_l)} - Showing #{length(l)} - Page #{page + 1} of #{pages}</p>
    <div class="row">
      #{Enum.join(rendered_cols)}
    </div>
    """
    |> new()
  end

  def img_html(%{timestamp: t, original: u} = item) do
    img_html(Arch.archive_url(item), u, t)
  end

  def img_html(url, name, comment) do
    """
    <a href="#{url}"><img src="#{url}" alt="#{url}" title="#{name} #{comment}" /></a>
    """
  end

  asset "main.js" do
    """
    export function init(ctx, html) {
      ctx.importCSS("./main.css");
      ctx.root.innerHTML = html;
    }
    """
  end

  asset "main.css" do
    """
    .row {
      display: flex;
      flex-wrap: wrap;
      padding: 0 4px;
    }

    /* Create four equal columns that sit next to each other */
    .column {
      flex: 22%;
      max-width: 22%;
      padding: 0 4px;
    }

    .column img {
      margin-top: 8px;
      vertical-align: middle;
      width: 100%;
      max-height: 300px;
    }
    """
  end
end
defmodule R do
  def json(url) do
    Req.get!(url: url, decode_json: [keys: :atoms], receive_timeout: 30_000).body
  end

  def decode_table(rows) do
    [title | data] = rows

    title_atoms =
      title
      |> Enum.map(fn t -> String.to_atom(t) end)

    for r <- data do
      Enum.zip(title_atoms, r)
      |> Map.new()
    end
  end

  def table(url) do
    json(url)
    |> decode_table()
  end

  def lines(url) do
    Req.get!(url: url).body
    |> String.split(["\n", "\r", "\r\n"])
  end
end
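
For example, R.decode_table/1 turns the CDX JSON output (whose first row holds the column names) into a list of maps:

R.decode_table([
  ["urlkey", "timestamp", "original"],
  ["it,va,pangea)/agora/agb.gif", "19980627141111", "http://www.pangea.va.it:80/agora/agb.gif"]
])
# => [%{original: "http://www.pangea.va.it:80/agora/agb.gif",
#       timestamp: "19980627141111",
#       urlkey: "it,va,pangea)/agora/agb.gif"}]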

Archive APIs

  • Get the latest snapshot (wayback/available)
  • List snapshots (CDX)

The trick is to pass the site name with a trailing asterisk, e.g. "pangea.va.it*".

For the CDX API, see https://archive.org/developers/wayback-cdx-server.html
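
For reference, a query built by the AApi helpers below looks roughly like this (the asterisk is form-encoded as %2A):

https://web.archive.org/cdx/search/cdx?limit=10&output=json&url=pangea.va.it%2A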

defmodule AApi do
  def new_site_json(site), do: %{"output" => "json", "url" => "#{site}*"}

  def new_site(site), do: %{"url" => "#{site}*"}

  # Keeps only captures before January 1st of `year` (CDX "to" parameter).
  def before(api, year) do
    Map.put(api, "to", "#{year}0101")
  end

  # Keeps only captures from January 1st of `year` onwards (CDX "from" parameter).
  def since(api, year) do
    Map.put(api, "from", "#{year}0101")
  end

  def year(api, year),
    do:
      api
      |> before(year + 1)
      |> since(year)

  def limit(api, n), do: Map.put(api, "limit", "#{n}")

  def jpeg(api), do: Map.put(api, "filter", "mimetype:image/jpeg")
  def images(api), do: Map.put(api, "filter", "mimetype:image/*")
  def videos(api), do: Map.put(api, "filter", "mimetype:video/*")

  
  def url(api) do
    parms =
      api
      |> Enum.map(fn {k, v} -> "#{k}=#{URI.encode_www_form(v)}" end)
      |> Enum.join("&")

    "https://web.archive.org/cdx/search/cdx?#{parms}"
  end
end
# The wayback/available API returns just the latest snapshot:
# site = "http://www.pangea.va.it"
# Req.get!(url: "https://archive.org/wayback/available?url=#{site}",
#   decode_json: [keys: :atoms]).body
AApi.new_site("pangea.va.it")
|> AApi.limit(10)
|> AApi.jpeg()
|> AApi.url()
|> R.lines()
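
Without output=json, the CDX server replies with one space-separated line per capture, with the fields urlkey, timestamp, original, mimetype, statuscode, digest, length (see the CDX docs linked above), e.g.:

it,va,pangea)/agora/agb.gif 19980627141111 http://www.pangea.va.it:80/agora/agb.gif image/gif 200 V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX 20432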

Browsing a site

Download all the URLs of a site.

Each URL may appear multiple times.

    %{
      digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
      length: "20432",
      mimetype: "image/gif",
      original: "http://www.pangea.va.it:80/agora/agb.gif",
      statuscode: "200",
      timestamp: "19980627141111",
      urlkey: "it,va,pangea)/agora/agb.gif"
    }

Ideally I should deduplicate by digest: if the same resource appears multiple times, there is no point in showing it again.
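
A sketch of that dedup step, with a second, later timestamp invented for illustration:

[
  %{digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
    original: "http://www.pangea.va.it:80/agora/agb.gif",
    timestamp: "19980627141111"},
  %{digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
    original: "http://www.pangea.va.it:80/agora/agb.gif",
    timestamp: "20010315120000"}
]
|> Arch.unique_by_digest()
# => only the 19980627141111 capture remains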

After making a change, I press "⌘ ⇧ ↵": Evaluate current and all outdated cells.

site = "pangea.va.it"

v =
  AApi.new_site_json(site)
  # |> AApi.images()
  # |> AApi.videos()
  # |> AApi.year(2007)
  # |> AApi.limit(100)
  |> AApi.url()
  |> R.table()

Arch.statistics_kino(v)

v
|> Arch.filter_by_type("image")
|> Arch.min_length(15)
|> Arch.valid()
|> Arch.latest_version_by_url()
|> Arch.unique_by_digest()
|> HTML.img_table(page: 1)
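
HTML.img_table/2 also takes a pagesize option (default 1000); for instance HTML.img_table(items, page: 2, pagesize: 200) would show items 201-400 (items standing for any filtered list like the one above).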

ZIP files

zip =
  v
  |> Arch.filter_by_name("zip")
  |> Arch.valid()
  |> Arch.latest_version_by_url()
  |> Arch.unique_by_digest()
  |> Arch.urls_kino()

v
|> Arch.filter_by_name("/lenz")
|> Arch.excluding_name("index.php")
|> Arch.valid()
|> Arch.latest_version_by_url()
|> Arch.unique_by_digest()
|> Arch.urls_kino()