NHL Games
Load data
Data source:
Mix.install([
{:vega_lite, "~> 0.1.3"},
{:kino, "~> 0.5.0"},
{:jason, "~> 1.2"},
{:explorer, "~> 0.1.0-dev", github: "elixir-nx/explorer", branch: "main"}
])
Aliases to reduce typing:
alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: S
alias Explorer.Datasets, as: DS
Relationships across tables:
# Load the raw games table; a failed read raises a MatchError here.
{:ok, games} = DF.read_csv("~/Data/NHL/game.csv")

# Parse the ISO 8601 strings in "date_time_GMT" into NaiveDateTime values.
# NaiveDateTime.from_iso8601/1 only returns {:ok, dt} or {:error, reason}, so
# those are the only shapes matched. Missing or unparseable values are logged
# and become nil, so that the list handed to S.from_list/1 never mixes
# NaiveDateTime values with error tuples.
new_dt =
  games["date_time_GMT"]
  |> S.to_list()
  |> Enum.map(fn
    nil ->
      nil

    s ->
      case NaiveDateTime.from_iso8601(s) do
        {:ok, dt} ->
          dt

        {:error, _reason} = error ->
          IO.inspect(error, label: "unprocessable")
          nil
      end
  end)
  |> S.from_list()

# Replace the string column with the parsed one and display the result.
games = games |> DF.mutate(date_time_GMT: new_dt)
DF.table(games)
# Load the team reference table; a failed read raises a MatchError here.
{:ok, teams} = DF.read_csv("~/Data/NHL/team_info.csv")
# Display the table (33 is presumably the number of rows to print — confirm against DF.table).
DF.table(teams, 33)
# Convert the frame into a keyword list keyed by column name (DF.to_map, then Keyword.new).
kw_teams = teams |> DF.to_map() |> Keyword.new()
# Pair every abbreviation with "<shortName> <teamName>" and build one
# %{abbrev: ..., name: ...} map per team. The result is not bound to a variable.
kw_teams[:abbreviation]
|> Enum.zip(
Enum.zip([kw_teams[:shortName], kw_teams[:teamName]])
|> Enum.map(fn {x, y} -> x <> " " <> y end)
)
|> Enum.map(fn {abbrev, name} -> %{abbrev: abbrev, name: name} end)
DF.head(teams)
# Two-column lookup (team_id, abbreviation) used to label both sides of each game.
abbrvs = teams |> DF.select(["team_id", "abbreviation"])
abbrvs["abbreviation"]
# Rename the lookup columns so they line up with the home_team_id / away_team_id
# column names used for the joins below.
home_teams = abbrvs |> DF.rename(team_id: "home_team_id") |> DF.rename(abbreviation: "home_team")
away_teams = abbrvs |> DF.rename(team_id: "away_team_id") |> DF.rename(abbreviation: "away_team")
# Attach the away_team and home_team abbreviations to every game (no join columns
# are passed, so DF.join falls back to its default), then add
# home_goal_diff = home_goals - away_goals.
games =
games
|> DF.join(away_teams)
|> DF.join(home_teams)
|> DF.mutate(home_goal_diff: &S.subtract(&1["home_goals"], &1["away_goals"]))
# Per home team: min / mean / max of home_goal_diff and the number of distinct outcomes.
games
|> DF.group_by("home_team")
|> DF.summarise(home_goal_diff: [:min, :mean, :max], outcome: [:n_unique])
defmodule NHL.Games do
  @moduledoc """
  Builds the games table from the raw NHL CSV files and provides helpers for
  selecting a team's home or away games.

  Relies on the `DF` (`Explorer.DataFrame`) and `S` (`Explorer.Series`) aliases
  being in scope where the module is defined.
  """

  @game_data "~/Data/NHL/game.csv"
  @teams_data "~/Data/NHL/team_info.csv"

  @doc """
  Reads the games and teams CSV files and returns the joined games frame.

  The result has `date_time_GMT` parsed (see `convert_date_time_gmt/1`), a
  `date` string column, the `away_team` / `home_team` abbreviations joined in
  from the teams file, and `home_goal_diff` (`home_goals - away_goals`).

  Raises a `MatchError` if either file cannot be read.
  """
  def create_from_raw_data(game_data \\ @game_data, teams_data \\ @teams_data) do
    {:ok, games} = DF.read_csv(game_data)
    games = convert_date_time_gmt(games)

    {:ok, teams} = DF.read_csv(teams_data)
    abbrvs = DF.select(teams, ["team_id", "abbreviation"])

    games
    |> DF.join(team_lookup(abbrvs, "away"))
    |> DF.join(team_lookup(abbrvs, "home"))
    |> DF.mutate(home_goal_diff: &S.subtract(&1["home_goals"], &1["away_goals"]))
  end

  @doc """
  Replaces the string column `date_time_GMT` with parsed `NaiveDateTime` values
  and adds a `date` column holding the calendar date as a string.

  Missing values stay `nil`. Values that cannot be parsed are printed with the
  label `"unprocessable"` and also become `nil` in both columns.
  """
  def convert_date_time_gmt(games) do
    parsed =
      games["date_time_GMT"]
      |> S.to_list()
      |> Enum.map(&parse_naive_datetime/1)

    # Derive the date strings from the parsed values directly rather than
    # reading the column back out of the frame.
    dates = Enum.map(parsed, &date_string/1)

    games
    |> DF.mutate(date_time_GMT: S.from_list(parsed))
    |> DF.mutate(date: dates)
  end

  @doc """
  Returns a list with the result of comparing each `home_team` value of `df`
  to `team_abbrv`.
  """
  def home_mask(df, team_abbrv) do
    df["home_team"] |> S.equal(team_abbrv) |> S.to_list()
  end

  @doc """
  Returns a list with the result of comparing each `away_team` value of `df`
  to `team_abbrv`.
  """
  def away_mask(df, team_abbrv) do
    df["away_team"] |> S.equal(team_abbrv) |> S.to_list()
  end

  @doc """
  Keeps only the rows of `df` whose `home_team` equals `team_abbrv`.
  """
  def filter_home_games(df, team_abbrv) do
    DF.filter(df, &S.equal(&1["home_team"], team_abbrv))
  end

  @doc """
  Keeps only the rows of `df` whose `away_team` equals `team_abbrv`.
  """
  def filter_away_games(df, team_abbrv) do
    DF.filter(df, &S.equal(&1["away_team"], team_abbrv))
  end

  # Renames the (team_id, abbreviation) lookup to ("<side>_team_id", "<side>_team")
  # so it can be joined onto the games frame for that side.
  defp team_lookup(abbrvs, side) do
    abbrvs
    |> DF.rename(team_id: side <> "_team_id")
    |> DF.rename(abbreviation: side <> "_team")
  end

  # Parses one ISO 8601 string. NaiveDateTime.from_iso8601/1 returns only
  # {:ok, dt} or {:error, reason}; errors are logged and mapped to nil so the
  # resulting list stays homogeneous for S.from_list/1.
  defp parse_naive_datetime(nil), do: nil

  defp parse_naive_datetime(string) do
    case NaiveDateTime.from_iso8601(string) do
      {:ok, dt} ->
        dt

      {:error, _reason} = error ->
        IO.inspect(error, label: "unprocessable")
        nil
    end
  end

  # Formats the calendar date of a parsed timestamp; nil stays nil.
  defp date_string(nil), do: nil
  defp date_string(dt), do: dt |> NaiveDateTime.to_date() |> Date.to_string()
end
# Rebuild the joined games table from the module's default CSV paths.
games = NHL.Games.create_from_raw_data()
# bos_home_mask = games["home_team"] |> S.equal("BOS") |> S.to_list()
# bos_home_mask = games |> NHL.Games.home_mask("BOS")
# bos_home_games = games |> DF.filter(&S.equal(&1["home_team"], "BOS"))
# Rows where "BOS" is the home team, and rows where "BOS" is the away team.
bos_home_games = games |> NHL.Games.filter_home_games("BOS")
bos_away_games = games |> NHL.Games.filter_away_games("BOS")
alias VegaLite, as: Vl
# Combine the home rows and the away rows into a single frame.
bos_games = DF.concat_rows(bos_home_games, bos_away_games)
# Keep the two plotted columns, cast the timestamp column to :date, and convert
# the frame to a keyword list of columns for Vl.data_from_series/2.
bos =
bos_games
|> DF.select(["date_time_GMT", "home_goal_diff"])
|> DF.mutate(date_time_GMT: &S.cast(&1["date_time_GMT"], :date))
# |> DF.mutate(date_time_GMT: &S.cast(&1["date_time_GMT"], :string))
|> DF.to_map()
|> Keyword.new()
# Replace the dates with their string form (Date.to_string/1) before plotting.
bos =
bos
|> Keyword.put(
:date_time_GMT,
bos[:date_time_GMT]
|> Enum.map(&Date.to_string/1)
)
# Point chart of home_goal_diff over time for all BOS games.
# NOTE: home_goal_diff is home_goals - away_goals, so for the BOS away rows a
# positive value means the opposing (home) team outscored BOS.
Vl.new(height: 600, width: 1200)
|> Vl.data_from_series(bos)
|> Vl.mark(:point)
|> Vl.encode_field(:x, "date_time_GMT", type: :temporal)
|> Vl.encode_field(:y, "home_goal_diff", type: :quantitative)
# Same preparation restricted to rows whose "season" equals 20_182_019
# (the integer 20182019; presumably the 2018-19 season id — confirm against the data).
# All columns are kept here, which is what the tooltip below displays.
bos_201819 =
bos_games
# |> DF.select(["date_time_GMT", "home_goal_diff", "season"])
|> DF.filter(&S.equal(&1["season"], 20_182_019))
|> DF.mutate(date_time_GMT: &S.cast(&1["date_time_GMT"], :date))
# |> DF.mutate(date_time_GMT: &S.cast(&1["date_time_GMT"], :string))
|> DF.to_map()
|> Keyword.new()
# Replace the dates with their string form before plotting, as above.
bos_201819 =
bos_201819
|> Keyword.put(
:date_time_GMT,
bos_201819[:date_time_GMT]
|> Enum.map(&Date.to_string/1)
)
# Point chart for the filtered season, with a tooltip showing the full data row.
Vl.new(height: 600, width: 1200)
|> Vl.data_from_series(bos_201819)
|> Vl.mark(:point, tooltip: %{content: "data"})
|> Vl.encode_field(:x, "date_time_GMT", type: :temporal)
|> Vl.encode_field(:y, "home_goal_diff", type: :quantitative)
# buf_home_games = games |> DF.filter(&S.equal(&1["home_team"], "BUF"))
# buf_home_games = games |> NHL.Games.filter_home_games("BUF")
# mtl_home_games = games |> DF.filter(&S.equal(&1["home_team"], "MTL"))
# mtl_home_games = games |> NHL.Games.filter_home_games("MTL")
# bruins_canadiens_sabres = DF.concat_rows([mtl_home_games, bos_home_games, buf_home_games])
# dat = bruins_canadiens_sabres |> DF.to_map() |> Keyword.new()
First and last games:
# Sort the game timestamps and report the first and last element of the sorted
# list (earliest and latest game, assuming S.sort/1 sorts ascending — confirm).
sorted = games["date_time_GMT"] |> S.sort() |> S.to_list()
%{first_game: sorted |> List.first(), last_game: sorted |> List.last()}
# dat
# |> Keyword.fetch!(:date_time_GMT)
# |> Enum.sort()
# |> (fn list -> %{first_game: List.first(list), last_game: List.last(list)} end).()
# Single-column keyword list holding the "outcome" values of every game.
outcomes = games |> DF.select(["outcome"]) |> DF.to_map() |> Keyword.new()
Based on
# Bar chart: one bar per distinct outcome value, bar height = number of games.
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(outcomes)
|> Vl.mark(:bar, filled: true)
|> Vl.encode_field(:x, "outcome", type: :nominal)
|> Vl.encode_field(:y, "outcome", aggregate: :count)
# mtl_dat = mtl_home_games |> DF.to_map() |> Keyword.new()
# Vl.new(height: 600, width: 600)
# |> Vl.data_from_series(mtl_dat)
# |> Vl.mark(:point, filled: true, tooltip: %{content: "data"})
# |> Vl.encode_field(:x, "away_goals", type: :quantitative)
# |> Vl.encode_field(:y, "home_goals", type: :quantitative)
# |> Vl.encode(:size, aggregate: :count)
# |> Vl.encode(:color, field: "home_team", type: :nominal)
# Scores of all BOS games (home and away rows) as a keyword list of columns.
bos_dat =
bos_games
|> DF.select(["away_goals", "home_goals", "home_team"])
|> DF.to_map()
|> Keyword.new()
# Scatter of away_goals vs home_goals: point size is the number of games with
# that score, colour is the home team.
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(bos_dat)
|> Vl.mark(:point, filled: true, tooltip: %{content: "data"})
|> Vl.encode_field(:x, "away_goals", type: :quantitative)
|> Vl.encode_field(:y, "home_goals", type: :quantitative)
|> Vl.encode(:size, aggregate: :count)
|> Vl.encode(:color, field: "home_team", type: :nominal)
Based on :
# {:ok, game_teams_stats} = DF.read_csv("~/Data/NHL/game_teams_stats.csv")
# home_game_teams_stats = game_teams_stats
# |> NHL.GameTeam.filter_home_games()
# # |> DF.rename(team_id: "home_team_id")
# away_game_teams_stats = game_teams_stats
# |> NHL.GameTeam.filter_away_games()
# # |> DF.rename(team_id: "away_team_id")
# all_game_teams_stats = DF.concat_rows([away_game_teams_stats, home_game_teams_stats])
# all_vl = all_game_teams_stats |> DF.to_map() |> Keyword.new()
# home = home_game_teams_stats
# |> DF.rename(team_id: "home_team_id")
# |> DF.join(games, how: :inner)
# away = away_game_teams_stats
# |> DF.rename(team_id: "away_team_id")
# |> DF.join(games, how: :inner)
defmodule NHL.GameTeam do
  @moduledoc """
  Builds a per-team, per-game table by joining the team statistics CSV onto the
  games table produced by `NHL.Games.create_from_raw_data/0`.

  Relies on the `DF` (`Explorer.DataFrame`) and `S` (`Explorer.Series`) aliases
  being in scope where the module is defined.
  """

  @game_teams_stats_data "~/Data/NHL/game_teams_stats.csv"

  @doc """
  Reads the team statistics CSV and returns the combined table.

  Rows whose `HoA` is `"home"` have `team_id` renamed to `home_team_id`, rows
  whose `HoA` is `"away"` have it renamed to `away_team_id`; each half is then
  inner-joined onto the games table and the two halves are concatenated
  (home rows first).

  `game_teams_stats_data` is the path of the statistics CSV and defaults to the
  location the notebook has always used, so existing zero-argument calls keep
  working. Raises a `MatchError` if the file cannot be read.
  """
  def create_from_raw_data(game_teams_stats_data \\ @game_teams_stats_data) do
    games = NHL.Games.create_from_raw_data()
    raw_game_teams_stats = create_raw_table_from_raw_data(game_teams_stats_data)

    home =
      raw_game_teams_stats
      |> filter_home_games()
      |> join_games(games, "home_team_id")

    away =
      raw_game_teams_stats
      |> filter_away_games()
      |> join_games(games, "away_team_id")

    DF.concat_rows(home, away)
  end

  @doc """
  Returns the result of comparing each `HoA` value of `df` to `"home"`.
  """
  def home_mask(df) do
    df["HoA"] |> S.equal("home")
  end

  @doc """
  Returns the result of comparing each `HoA` value of `df` to `"away"`.
  """
  def away_mask(df) do
    df["HoA"] |> S.equal("away")
  end

  @doc """
  Keeps only the rows of `df` whose `HoA` equals `"home"`.
  """
  def filter_home_games(df) do
    DF.filter(df, &S.equal(&1["HoA"], "home"))
  end

  @doc """
  Keeps only the rows of `df` whose `HoA` equals `"away"`.
  """
  def filter_away_games(df) do
    DF.filter(df, &S.equal(&1["HoA"], "away"))
  end

  # Reads the statistics CSV; a failed read raises a MatchError.
  defp create_raw_table_from_raw_data(path) do
    {:ok, game_teams_stats} = DF.read_csv(path)
    game_teams_stats
  end

  # Renames team_id to the side-specific id column and inner-joins onto games.
  defp join_games(stats, games, team_id_column) do
    stats
    |> DF.rename(team_id: team_id_column)
    |> DF.join(games, how: :inner)
  end
end
# all = DF.concat_rows(home, away)
# Combined table: team statistics rows joined onto the games table, home rows
# followed by away rows (see NHL.GameTeam.create_from_raw_data).
all = NHL.GameTeam.create_from_raw_data()
# Some day, this shouldn't be necessary, but it seems necessary presently to work
# around an apparent limitation with DateTimes
# all = all
# |> DF.mutate(date_time_GMT: &S.cast(&1["date_time_GMT"], :date))
# dt = all
# |> DF.select(["date_time_GMT"])
# |> DF.to_map()
# |> Keyword.new()
# |> Keyword.get(:date_time_GMT)
# dt_as_string_series = dt
# |> Enum.map(&Date.to_string/1)
# |> S.from_list()
# all = all
# |> DF.mutate(date_time_GMT: dt_as_string_series)
# Keyword list of columns for VegaLite, with the :date_time_GMT column removed.
all_kw = all |> DF.to_map() |> Keyword.new() |> Keyword.drop([:date_time_GMT])
# pim vs goals: circle size is the number of rows at that point, colour is "won".
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(all_kw)
|> Vl.mark(:circle, opacity: 0.5)
|> Vl.encode_field(:x, "pim", type: :quantitative)
|> Vl.encode_field(:y, "goals", type: :quantitative)
|> Vl.encode(:size, aggregate: :count)
|> Vl.encode(:color, field: "won", type: :nominal)
# away_goals vs home_goals, coloured by home team.
# NOTE(review): `all` is built from home rows plus away rows, so each game
# presumably contributes two rows to the counts here — confirm.
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(all_kw)
|> Vl.mark(:circle, filled: true, tooltip: %{content: "data"})
|> Vl.encode_field(:x, "away_goals", type: :quantitative)
|> Vl.encode_field(:y, "home_goals", type: :quantitative)
|> Vl.encode(:size, aggregate: :count)
|> Vl.encode(:color, field: "home_team", type: :nominal)
# shots vs goals: circle size is the number of rows at that point, colour is "won".
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(all_kw)
|> Vl.mark(:circle, opacity: 0.5)
|> Vl.encode_field(:x, "shots", type: :quantitative)
|> Vl.encode_field(:y, "goals", type: :quantitative)
|> Vl.encode(:size, aggregate: :count)
|> Vl.encode(:color, field: "won", type: :nominal)
# Only the three columns needed for the heatmap below.
shots_pim_won = all |> DF.select(["shots", "pim", "won"]) |> DF.to_map() |> Keyword.new()
# Heatmap over (shots, pim) cells, coloured by the mean of "won" in each cell.
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(shots_pim_won)
|> Vl.mark(:rect)
|> Vl.encode_field(:x, "shots", type: :ordinal)
|> Vl.encode_field(:y, "pim", type: :ordinal, sort: :descending)
# |> Vl.encode(:size, aggregate: :count)
|> Vl.encode(:color, aggregate: :mean, field: "won")
# |> Vl.config(axis: [{:grid, true}, {:tickBand, :extent}])
# all |> DF.filter(&S.equal(&1["home_team"], "BOS")) |> DF.select(["goals", "home_goals"])
# Mean home_goal_diff per home team, sorted descending, as a keyword list of columns.
home_goal_diffs =
all
|> DF.group_by(["home_team"])
|> DF.summarise(home_goal_diff: [:mean])
|> DF.arrange(desc: :home_goal_diff_mean)
|> DF.to_map()
|> Keyword.new()
# Bar chart of the mean home goal difference for each home team.
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(home_goal_diffs)
|> Vl.mark(:bar, filled: true)
|> Vl.encode_field(:x, "home_team", type: :nominal)
|> Vl.encode_field(:y, "home_goal_diff_mean", type: :quantitative)
# Mean home_goal_diff grouped by the away team. The value is still
# home_goals - away_goals, i.e. measured from the hosting team's side.
away_goal_diffs =
all
|> DF.group_by(["away_team"])
|> DF.summarise(home_goal_diff: [:mean])
|> DF.arrange(desc: :home_goal_diff_mean)
|> DF.to_map()
|> Keyword.new()
# Bar chart of the mean home goal difference faced by each away team.
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(away_goal_diffs)
|> Vl.mark(:bar, filled: true)
|> Vl.encode_field(:x, "away_team", type: :nominal)
|> Vl.encode_field(:y, "home_goal_diff_mean", type: :quantitative)
# Rebuild the home summary as a frame with columns team / home_goal_diff_mean.
home =
home_goal_diffs
|> Enum.map(fn
{:home_team, ht} -> {:team, ht}
{:home_goal_diff_mean, diff} -> {:home_goal_diff_mean, diff}
end)
|> Enum.into(%{})
|> DF.from_map()
# Rebuild the away summary as a frame with columns team / away_goal_diff_mean.
# The sign is flipped so the value is expressed from the away team's side.
away =
away_goal_diffs
|> Enum.map(fn
{:away_team, at} -> {:team, at}
{:home_goal_diff_mean, diff} -> {:away_goal_diff_mean, diff |> Enum.map(fn v -> -v end)}
end)
|> Enum.into(%{})
|> DF.from_map()
# No join columns are given; "team" is the only column name the two frames share.
home_away = DF.join(home, away)
home_away_dat = home_away |> DF.to_map() |> Keyword.new()
# One point per team: mean goal difference at home (x) against away (y).
Vl.new(width: 600, height: 600)
|> Vl.data_from_series(home_away_dat)
|> Vl.mark(:point, tooltip: true)
|> Vl.encode_field(:x, "home_goal_diff_mean", type: :quantitative)
|> Vl.encode_field(:y, "away_goal_diff_mean", type: :quantitative)
|> Vl.encode_field(:color, "team", type: :nominal)
# Inspect the rows of a single game (game_id 2_016_020_045 is the integer 2016020045).
all |> DF.filter(&S.equal(&1["game_id"], 2_016_020_045))
# vl_all = all |> DF.to_map() |> Keyword.new()
defmodule Regression do
  @moduledoc """
  Least-squares regression of one response on an intercept and two regressors,
  computed with Nx.
  """

  @doc """
  Fits `y = b0 + b1 * x1 + b2 * x2` by solving the normal equations
  `b_hat = (X'X)^-1 X'y`, where `X` has a column of ones followed by `x1` and
  `x2`.

  `y`, `x1` and `x2` must be lists of the same length; otherwise no clause
  matches and a `FunctionClauseError` is raised. Observations where any of the
  three values is `nil` are dropped before fitting.

  Returns `%{b_hat: b_hat, y_hat: y_hat}`: the coefficient tensor (intercept
  first) and the fitted values for the observations that were kept.

  E.g.:

      y = [2, 4, 5, 9]
      x1 = [1, 2, 3, 4]
      x2 = [1, 2, 3, 5]
      Regression.bivariate(y, x1, x2)
  """
  def bivariate(y, x1, x2)
      when is_list(y) and is_list(x1) and is_list(x2) and
             length(y) == length(x1) and length(x1) == length(x2) do
    %{y: y, x1: x1, x2: x2} = remove_observations_with_nils(y, x1, x2)

    # Column of ones for the intercept term.
    x0 = List.duplicate(1, length(x1))

    # Rows of the list are variables; transpose so that rows are observations.
    x = Nx.tensor([x0, x1, x2]) |> Nx.transpose()
    y = Nx.tensor(y)

    x_prime_x = Nx.dot(Nx.transpose(x), x)
    x_prime_y = Nx.dot(Nx.transpose(x), y)
    b_hat = Nx.dot(Nx.LinAlg.invert(x_prime_x), x_prime_y)
    y_hat = Nx.dot(x, b_hat)

    %{b_hat: b_hat, y_hat: y_hat}
  end

  @doc """
  Drops every observation (position) at which `y`, `x1` or `x2` is `nil`.

  Returns `%{y: ys, x1: x1s, x2: x2s}` with the remaining values in their
  original order. The three lists are zipped, so positions beyond the shortest
  list are ignored.
  """
  def remove_observations_with_nils(y, x1, x2) do
    complete =
      [y, x1, x2]
      |> Enum.zip()
      |> Enum.reject(fn {y_val, x1_val, x2_val} ->
        is_nil(y_val) or is_nil(x1_val) or is_nil(x2_val)
      end)

    %{
      y: Enum.map(complete, &elem(&1, 0)),
      x1: Enum.map(complete, &elem(&1, 1)),
      x2: Enum.map(complete, &elem(&1, 2))
    }
  end
end
# Regress goals on shots and power-play opportunities, using the column lists
# taken from the combined table.
Regression.bivariate(all_kw[:goals], all_kw[:shots], all_kw[:powerPlayOpportunities])
# shots vs goals restricted (inside the chart, via a Vega filter expression) to
# rows whose home_team is 'BOS'; point size is the row count, colour is "won".
Vl.new(height: 600, width: 600)
|> Vl.data_from_series(all_kw)
|> Vl.transform(filter: "datum.home_team == 'BOS'")
|> Vl.mark(:point, filled: true)
|> Vl.encode_field(:x, "shots", type: :quantitative)
|> Vl.encode_field(:y, "goals", type: :quantitative)
|> Vl.encode(:size, aggregate: :count)
|> Vl.encode(:color, field: "won", type: :nominal)