Query archive.org
Mix.install([
  {:req, "~> 0.5.0"},
  {:kino, "~> 0.14.0"}
])
Support code
Req.get!("https://api.github.com/repos/wojtekmach/req").body["description"]
Arch: works on a list of items extracted from the API, each shaped like this:
[
  %{
    digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
    length: "20432",
    mimetype: "image/gif",
    original: "http://www.pangea.va.it:80/agora/agb.gif",
    statuscode: "200",
    timestamp: "19980627141111",
    urlkey: "it,va,pangea)/agora/agb.gif"
  }
]
defmodule Arch do
  @doc """
  Keeps only the items that were captured successfully (HTTP 200).
  """
  def valid(items) do
    items
    |> Enum.filter(fn %{statuscode: status} -> status == "200" end)
  end

  def filter_by_type(items, mimetype) do
    items
    |> Enum.filter(fn %{mimetype: m} -> String.starts_with?(m, mimetype) end)
  end

  def filter_by_name(items, substring) do
    items
    |> Enum.filter(fn %{original: url} ->
      String.contains?(String.downcase(url), String.downcase(substring))
    end)
  end

  def excluding_name(items, substring) do
    items
    |> Enum.filter(fn %{original: url} ->
      !String.contains?(String.downcase(url), String.downcase(substring))
    end)
  end

  @doc """
  Keeps only the items larger than `length_in_kb` kilobytes.
  """
  def min_length(items, length_in_kb) do
    items
    |> Enum.filter(fn %{length: len} ->
      {val, ""} = Integer.parse(len)
      val > length_in_kb * 1024
    end)
  end
@doc """
Dato un set di items,
"""
def latest_version_by_url(items) do
items
|> Enum.group_by(fn %{original: url} -> url end)
|> Enum.map(fn {_k, v} ->
v
|> Enum.sort_by(fn %{timestamp: t} -> t end, :desc)
|> List.first()
end)
end
@doc """
Di tutti gli elementi con lo stesso digest, tiene il più antico.
"""
def unique_by_digest(items) do
items
|> Enum.group_by(fn %{digest: d} -> d end)
|> Enum.map(fn {_k, v} ->
v
|> Enum.sort_by(fn %{timestamp: t} -> t end, :desc)
|> List.first(v)
end)
end
  def archive_url(%{timestamp: t, original: u}) do
    "https://web.archive.org/web/#{URI.encode(t)}/#{URI.encode(u)}"
  end

  def with_archive_url(items) do
    items
    |> Enum.map(fn i -> Map.put(i, :archive_url, archive_url(i)) end)
  end

  def statistics(items) do
    items
    |> Enum.group_by(fn %{mimetype: m} -> m end)
    |> Enum.map(fn {k, v} ->
      unq = v |> Enum.map(fn %{original: url} -> url end) |> Enum.uniq()
      %{mimetype: k, unique_urls: length(unq), all_items: length(v)}
    end)
  end
@doc """
Stampa le statistiche (divise per mime type)
"""
def statistics_kino(items) do
items
|> statistics()
|> Kino.DataTable.new(
keys: [:mimetype, :unique_urls, :all_items],
name: "Statistics for #{length(items)} items"
)
end
@doc """
Stampa una serie di URL come tabella Kino
"""
def urls_kino(items) do
items
|> with_archive_url()
|> Kino.DataTable.new(
keys: [:original, :timestamp, :archive_url],
name: "Found #{length(items)} items"
)
end
end
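As a sanity check, a minimal sketch of how these helpers compose; the sample binding is hypothetical, holding the one-item list shown at the top of this section:

sample
|> Arch.valid()
|> Arch.filter_by_type("image")
|> Arch.min_length(10)
|> Arch.with_archive_url()
# Keeps the 20 KB GIF (valid, an image, above 10 KB) and adds
# :archive_url => "https://web.archive.org/web/19980627141111/http://www.pangea.va.it:80/agora/agb.gif"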
defmodule HTML do
  use Kino.JS

  def new(html) do
    Kino.JS.new(__MODULE__, html)
  end

  def img_table(all_l, opts \\ []) when is_list(all_l) do
    page =
      case Keyword.get(opts, :page, 1) do
        p when p >= 1 -> p - 1
        _ -> 0
      end

    pagesize = Keyword.get(opts, :pagesize, 1000)
    pages = ceil(length(all_l) / pagesize)

    l =
      all_l
      |> Enum.drop(page * pagesize)
      |> Enum.take(pagesize)

    # split into 4 columns
    cols =
      l
      |> Enum.with_index()
      |> Enum.group_by(fn {_e, i} -> rem(i, 4) end, fn {e, _i} -> e end)

    rendered_cols =
      for c <- [0, 1, 2, 3] do
        images =
          Map.get(cols, c, [])
          |> Enum.map(&img_html/1)

        """
        <div class="column">
        #{Enum.join(images)}
        </div>
        """
      end

    """
    <p>All items: #{length(all_l)} - Showing #{length(l)} - Page #{page + 1} of #{pages}</p>
    <div class="row">
    #{Enum.join(rendered_cols)}
    </div>
    """
    |> new()
  end
  def img_html(%{timestamp: t, original: u} = item) do
    img_html(Arch.archive_url(item), u, t)
  end

  def img_html(url, name, comment) do
    """
    <a href="#{url}">
    <img src="#{url}" title="#{name} #{comment}" />
    </a>
    """
  end
asset "main.js" do
"""
export function init(ctx, html) {
ctx.importCSS("./main.css");
ctx.root.innerHTML = html;
}
"""
end
asset "main.css" do
"""
.row {
display: flex;
flex-wrap: wrap;
padding: 0 4px;
}
/* Create four equal columns that sits next to each other */
.column {
flex: 22%;
max-width: 22%;
padding: 0 4px;
}
.column img {
margin-top: 8px;
vertical-align: middle;
width: 100%;
max-height: 300px;
}
"""
end
end
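Usage sketch: page and pagesize are the two options img_table reads; items is a hypothetical binding holding any list of CDX item maps (img_html needs the :original and :timestamp keys):

items
|> HTML.img_table(page: 2, pagesize: 40)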
defmodule R do
  def json(url) do
    Req.get!(url: url, decode_json: [keys: :atoms], receive_timeout: 30_000).body
  end

  # The first row holds the column titles; zip each data row with them.
  def decode_table(rows) do
    [title | data] = rows

    title_atoms =
      title
      |> Enum.map(fn t -> String.to_atom(t) end)

    for r <- data do
      Enum.zip(title_atoms, r)
      |> Map.new()
    end
  end

  def table(url) do
    json(url)
    |> decode_table()
  end

  def lines(url) do
    Req.get!(url: url).body
    |> String.split(["\n", "\r", "\r\n"])
  end
end
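The CDX server's JSON output is a table: the first row holds the column names, every following row is a record, and decode_table zips them into maps. A sketch with a made-up two-row payload:

R.decode_table([
  ["urlkey", "timestamp", "original"],
  ["it,va,pangea)/", "19980627141111", "http://www.pangea.va.it:80/"]
])
# => [%{urlkey: "it,va,pangea)/", timestamp: "19980627141111",
#       original: "http://www.pangea.va.it:80/"}]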
Archive APIs
- Gets the latest snapshot
- Lists the snapshots
- …
The trick is to pass the site name with a trailing asterisk, e.g. "pangea.va.it*".
For the CDX API, see https://archive.org/developers/wayback-cdx-server.html
defmodule AApi do
  def new_site_json(site), do: %{"output" => "json", "url" => "#{site}*"}
  def new_site(site), do: %{"url" => "#{site}*"}

  def before(api, year) do
    Map.put(api, "to", "#{year}0101")
  end

  def since(api, year) do
    Map.put(api, "from", "#{year}0101")
  end

  def year(api, year),
    do:
      api
      |> before(year + 1)
      |> since(year)

  def limit(api, n), do: Map.put(api, "limit", "#{n}")
  def jpeg(api), do: Map.put(api, "filter", "mimetype:image/jpeg")
  def images(api), do: Map.put(api, "filter", "mimetype:image/*")
  def videos(api), do: Map.put(api, "filter", "mimetype:video/*")

  def url(api) do
    params =
      api
      |> Enum.map(fn {k, v} -> "#{k}=#{URI.encode_www_form(v)}" end)
      |> Enum.join("&")

    "https://web.archive.org/cdx/search/cdx?#{params}"
  end
end
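A sketch of the query URL the builder produces; the exact parameter order depends on map iteration, and URI.encode_www_form turns the trailing asterisk into %2A:

AApi.new_site_json("pangea.va.it")
|> AApi.year(2007)
|> AApi.url()
# => something like
# "https://web.archive.org/cdx/search/cdx?from=20070101&output=json&to=20080101&url=pangea.va.it%2A"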
#site = "http://www.pangea.va.it"
#Req.get!(url: "https://archive.org/wayback/available?url=#{site}",
# decode_json: [keys: :atoms]).body
AApi.new_site("pangea.va.it")
|> AApi.limit(10)
|> AApi.jpeg()
|> AApi.url()
|> R.lines()
Browsing a site
Download all the URLs from a site.
Each URL can appear multiple times:
%{
  digest: "V4XSYM24JSG2JGHCQNPXU2L3DQGIAHEX",
  length: "20432",
  mimetype: "image/gif",
  original: "http://www.pangea.va.it:80/agora/agb.gif",
  statuscode: "200",
  timestamp: "19980627141111",
  urlkey: "it,va,pangea)/agora/agb.gif"
}
Ideally I should deduplicate by digest: if the same resource appears multiple times, there is no point in showing it more than once.
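To see the difference between the two dedup strategies, a sketch with two hypothetical captures of the same resource (same digest, two timestamps):

dups = [
  %{digest: "SAME", original: "http://example.org/a.gif", timestamp: "19980101000000"},
  %{digest: "SAME", original: "http://example.org/a.gif", timestamp: "20010101000000"}
]

# Both keep a single item, but pick different captures:
Arch.latest_version_by_url(dups) # keeps the 2001 capture (newest per URL)
Arch.unique_by_digest(dups)      # keeps the 1998 capture (oldest per digest)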
When I make a change, I hit ⌘ ⇧ ↵: Evaluate current and all outdated cells.
site = "pangea.va.it"
v =
  AApi.new_site_json(site)
  # |> AApi.images()
  # |> AApi.videos()
  # |> AApi.year(2007)
  # |> AApi.limit(100)
  |> AApi.url()
  |> R.table()
Arch.statistics_kino(v)
v
|> Arch.filter_by_type("image")
|> Arch.min_length(15)
|> Arch.valid()
|> Arch.latest_version_by_url()
|> Arch.unique_by_digest()
|> HTML.img_table(page: 1)
ZIP files
zip =
  v
  |> Arch.filter_by_name("zip")
  |> Arch.valid()
  |> Arch.latest_version_by_url()
  |> Arch.unique_by_digest()
  |> Arch.urls_kino()
v
|> Arch.filter_by_name("/lenz")
|> Arch.excluding_name("index.php")
|> Arch.valid()
|> Arch.latest_version_by_url()
|> Arch.unique_by_digest()
|> Arch.urls_kino()