Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Exploration des données

screens/screens.livemd

Exploration des données

Ressources avec un “duplicate resource data gouv id”

# Ask the screen helper for its report in markdown form, then render it in the notebook.
duplicate_datagouv_id_report =
  Transport.Screens.resources_with_duplicate_datagouv_id(markdown: true)

Kino.Markdown.new(duplicate_datagouv_id_report)

Ressources jamais historisées

On commence par compter tous les datagouv_id des ressources (nombre total et nombre de valeurs uniques) :

import Ecto.Query

# Every non-nil datagouv_id found on a resource, sorted. Duplicates are kept on
# purpose so the total can be compared with the number of distinct values.
datagouv_ids =
  DB.Resource
  |> where([resource], not is_nil(resource.datagouv_id))
  |> select([resource], resource.datagouv_id)
  |> DB.Repo.all()
  |> Enum.sort()

distinct_datagouv_ids = Enum.uniq(datagouv_ids)

# A gap between the two figures means several resources share a datagouv_id.
[
  count: length(datagouv_ids),
  unique_count: length(distinct_datagouv_ids)
]
# Set of the datagouv_id values carried by resource history rows
# (each value appears once, whatever the number of rows using it).
used_datagouv_ids =
  DB.ResourceHistory
  |> select([history], history.datagouv_id)
  |> DB.Repo.all()
  |> MapSet.new()

[count: MapSet.size(used_datagouv_ids)]

Curieux !?

# datagouv_ids that appear exactly once in `datagouv_ids`, i.e. not shared by
# several resources.
non_duplicate_datagouv_ids =
  datagouv_ids
  |> Enum.frequencies()
  |> Enum.filter(fn {_datagouv_id, occurrences} -> occurrences == 1 end)
  |> MapSet.new(fn {datagouv_id, _occurrences} -> datagouv_id end)

# Among those, the ones that no resource history row refers to.
problematic_non_duplicate_datagouv_ids =
  non_duplicate_datagouv_ids
  |> MapSet.difference(used_datagouv_ids)

[count: MapSet.size(problematic_non_duplicate_datagouv_ids)]
# Break the problematic resources down by format, most frequent format first.
ids = MapSet.to_list(problematic_non_duplicate_datagouv_ids)

format_counts =
  DB.Resource
  |> where([resource], resource.datagouv_id in ^ids)
  |> group_by([resource], resource.format)
  |> select([resource], %{format: resource.format, count: count(resource.id)})
  |> DB.Repo.all()

format_counts
|> Enum.sort_by(& &1.count, :desc)
|> Kino.DataTable.new()
# Set of the `uuid` values stored in the payload of the resource history rows
# whose payload format is GTFS.
resource_history_uuids =
  DB.ResourceHistory
  |> where([history], fragment("payload->>'format' = 'GTFS'"))
  |> select([history], fragment("payload ->> 'uuid'"))
  |> DB.Repo.all()
  |> MapSet.new()

# Returns the set of `resource_history_uuid` values of the data conversions
# going from GTFS to `target_format` ("GeoJSON" or "NeTEx").
#
# Both conversion sets come from the same query and only differ by the target
# format, so the query lives in a single place and the format is interpolated
# with the pin operator.
gtfs_conversion_uuids = fn target_format ->
  DB.DataConversion
  |> where([dc], dc.convert_from == "GTFS" and dc.convert_to == ^target_format)
  |> select([dc], dc.resource_history_uuid)
  |> DB.Repo.all()
  |> MapSet.new()
end

geojson_conversion_uuids = gtfs_conversion_uuids.("GeoJSON")
netex_conversion_uuids = gtfs_conversion_uuids.("NeTEx")

# GTFS resource history uuids for which no conversion of the given kind is recorded.
missing_geojson =
  resource_history_uuids
  |> MapSet.difference(geojson_conversion_uuids)

missing_netex =
  resource_history_uuids
  |> MapSet.difference(netex_conversion_uuids)

[
  missing_netex_per_resource_history: MapSet.size(missing_netex),
  missing_geojson_per_resource_history: MapSet.size(missing_geojson)
]
# datagouv_ids of the resources currently in the database; used to ignore
# history rows pointing at resources that no longer exist.
existing_resource_datagouv_id =
  DB.Resource
  |> select([r], r.datagouv_id)
  |> DB.Repo.all()
  |> MapSet.new()

# One row per GTFS resource history entry: the resource it belongs to
# (datagouv_id) and the uuid that data conversions refer to.
gtfs_history_rows =
  DB.ResourceHistory
  |> where([rh], fragment("payload->>'format' = 'GTFS'"))
  |> select([rh], %{datagouv_id: rh.datagouv_id, uuid: fragment("payload ->> 'uuid'")})
  |> DB.Repo.all()

# Given the set of resource history uuids that have a conversion, returns the
# set of datagouv_ids of existing resources having at least one GTFS history
# entry without that conversion.
#
# The membership test is done on the history uuid (conversions are keyed by
# resource history uuid, not by datagouv_id), and plain datagouv_id values are
# collected so that the intersection with `existing_resource_datagouv_id`
# compares values of the same shape.
# NOTE(review): "at least one history entry without conversion" is the reading
# closest to the previous queries; confirm it is the intended definition.
gtfs_resources_missing_conversion = fn conversion_uuids ->
  gtfs_history_rows
  |> Enum.reject(fn row ->
    is_nil(row.datagouv_id) or MapSet.member?(conversion_uuids, row.uuid)
  end)
  |> MapSet.new(fn row -> row.datagouv_id end)
  # drop resources that are not in the database anymore
  |> MapSet.intersection(existing_resource_datagouv_id)
end

gtfs_resources_with_no_netex = gtfs_resources_missing_conversion.(netex_conversion_uuids)
gtfs_resources_with_no_geojson = gtfs_resources_missing_conversion.(geojson_conversion_uuids)

[
  gtfs_resources_with_no_netex: MapSet.size(gtfs_resources_with_no_netex),
  gtfs_resources_with_no_geojson: MapSet.size(gtfs_resources_with_no_geojson)
]