Bavarian Election
Mix.install(
[
{:nx, "~>0.6.4"},
{:bumblebee, "~>0.4.2"},
{:explorer, "~>0.7.2"},
{:kino_vega_lite, "~> 0.1.11"},
{:httpoison, "~> 1.8"},
{:exla, "~> 0.6.4"},
{:adbc, "~> 0.2.2"},
{:kino_bumblebee, "~> 0.4.0"},
{:tucan, "~> 0.2.1"},
{:scholar, "~> 0.2.1"}
],
config: [
nx: [default_backend: EXLA.Backend, client: :cuda, device_id: 1]
]
)
Modules
require Explorer.DataFrame
alias VegaLite, as: Vl
alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: S
defmodule Graphs do
def create_poll_graph(data_source, title, party, start_date) do
Vl.new(width: 500, height: 300, title: title)
|> Vl.data_from_values(DF.filter(data_source, mid_date >= ^start_date),
only: ["end_date", "start_date", "mid_date", party, "institute"]
)
|> Vl.layers([
# data as bars
Vl.new()
|> Vl.mark(:bar, opacity: 0.5)
|> Vl.encode_field(:x, "start_date", type: :temporal, title: "poll dates")
|> Vl.encode_field(:x2, "end_date")
|> Vl.encode_field(:y, party,
type: :quantitative,
title: "percentage",
scale: [domain: [S.min(data_source[party]), S.max(data_source[party])]]
)
|> Vl.encode_field(:color, "institute", type: :nominal),
Vl.new()
# data with opacity 0, to be able to fit
|> Vl.mark(:bar, opacity: 0.0)
|> Vl.encode_field(:x, "mid_date", type: :temporal, title: "poll dates")
|> Vl.encode_field(:x2, "end_date")
|> Vl.encode_field(:y, party,
type: :quantitative,
title: "percentage",
scale: [domain: [S.min(data_source[party]), S.max(data_source[party])]]
)
|> Vl.encode_field(:color, "institute", type: :nominal),
# rule
Vl.new()
|> Vl.mark(:rule, color: "blue")
|> Vl.encode(:y,
datum: DF.filter(data_source, institute == "Landtagswahl")[party][0],
type: :quantitative
),
# fit
Vl.new()
|> Vl.mark(:line, color: "firebrick")
|> Vl.transform(loess: party, on: "mid_date", bandwidth: 0.5)
|> Vl.encode_field(:x, "mid_date", type: :temporal, title: "poll dates")
|> Vl.encode_field(:y, party, type: :quantitative, title: "percentage")
])
end
def create_poll_graph(data_source, title, party) do
create_poll_graph(data_source, title, party, ~N[2023-01-01 00:00:01])
end
def create_sentiment_graph(data_source, title, party, start_date) do
Vl.new(width: 500, height: 300, title: title)
|> Vl.data_from_values(
DF.filter(data_source, domentent_party == ^party and date > ^start_date),
only: ["date", "sentiment", "region"]
)
|> Vl.layers([
Vl.new()
|> Vl.mark(:point)
|> Vl.encode_field(:x, "date", type: :temporal)
|> Vl.encode_field(:y, "sentiment",
type: :quantitative,
scale: [domain: [-1, 1]]
)
|> Vl.encode_field(:color, "region", type: :nominal, title: "Region")
|> Vl.encode_field(:shape, "region", type: :nominal, title: "Region"),
Vl.new()
|> Vl.mark(:line, color: "firebrick", opacity: 0.5)
|> Vl.transform(loess: "sentiment", on: "date", bandwidth: 0.5)
|> Vl.encode_field(:x, "date", type: :temporal, title: "date")
|> Vl.encode_field(:y, "sentiment", type: :quantitative, title: "sentiment")
])
end
def create_daily_compare_graph(sentiment_data_source, poll_data_source, title, party) do
Tucan.layers([
Tucan.step(poll_data_source, "day", party, line_color: "red"),
Tucan.step(sentiment_data_source, "day", party, line_color: "green")
])
|> Tucan.set_width(500)
|> Tucan.set_title(title)
|> Tucan.Axes.set_x_title("Day of the year")
|> Tucan.Axes.set_y_title("Sentiments, Polls")
|> Tucan.Scale.set_y_domain(0, 1)
|> Tucan.Legend.set_enabled(:color, true)
|> Tucan.Legend.set_title(:color, "TimeLine")
|> Tucan.annotate(243, S.max(poll_data_source[party]) + 0.05, "Polls", color: "red", size: 18)
|> Tucan.annotate(243, S.max(sentiment_data_source[party]) + 0.05, "Sentiments",
color: "green",
size: 18
)
end
def create_weekly_compare_graph(sentiment_data_source, poll_data_source, title, party) do
Tucan.layers([
Tucan.step(poll_data_source, "week", party, line_color: "red"),
Tucan.step(sentiment_data_source, "week", party, line_color: "green")
])
|> Tucan.set_width(500)
|> Tucan.set_title(title)
|> Tucan.Axes.set_x_title("Calendar Week")
|> Tucan.Axes.set_y_title("Sentiments, Polls")
|> Tucan.Scale.set_y_domain(0, 1)
|> Tucan.Legend.set_enabled(:color, true)
|> Tucan.Legend.set_title(:color, "TimeLine")
|> Tucan.annotate(36, poll_data_source[party][1] + 0.05, "Polls", color: "red", size: 18)
|> Tucan.annotate(36, sentiment_data_source[party][1] + 0.05, "Sentiments",
color: "green",
size: 18
)
end
end
defmodule Names do
def append_variants(names) do
names
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, " ", "") end))
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, "ö", "o") end))
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, "ö", "oe") end))
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, "-", "") end))
|> Enum.uniq()
end
def family_name(y) do
String.split(y, " ", parts: 2) |> Enum.at(1)
end
end
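A quick illustration of the name helpers above (hypothetical inputs; expected results shown as comments):
# Hypothetical check of the name helpers.
Names.append_variants(["markus söder"])
# => ["markus söder", "markussöder", "markus soder", "markus soeder"]
Names.family_name("florian von brunn")
# => "von brunn"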
defmodule Bool do
def to_integer(true), do: 1
def to_integer(false), do: 0
def to_integer(nil), do: 0
end
defmodule SentimentScore do
def score(prediction) do
prediction
|> Enum.map(fn p ->
case p do
%{label: l} when l in ["POS", "positive"] -> p.score
%{label: l} when l in ["NEG", "negative"] -> -p.score
_ -> 0
end
end)
|> Enum.sum()
end
end
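An illustrative scoring of a single prediction list (hypothetical values):
SentimentScore.score([
  %{label: "positive", score: 0.8},
  %{label: "negative", score: 0.1},
  %{label: "neutral", score: 0.1}
])
# => ~0.7 (positive score minus negative score; other labels count as 0)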
defmodule User do
def image_exists?(body), do: byte_size(body) > 478
def download_image(save_stem, link) do
save_name = "./graphics/userimages/#{save_stem}.png"
if not File.exists?(save_name) do
%HTTPoison.Response{body: body} = HTTPoison.get!(link)
if User.image_exists?(body) do
File.write!(save_name, body)
end
end
end
end
defmodule Text do
def count_mentions(text, mentioned_words)
when is_list(mentioned_words) and is_binary(text) do
is_scanned =
Enum.map(mentioned_words, fn y ->
_scanned = Regex.scan(Regex.compile!("\\b#{y}\\b"), text)
end)
per_word_sum = Enum.map(is_scanned, &length(&1))
Enum.sum(per_word_sum)
end
def count_unique_mentions(text, mentioned_words)
when is_list(mentioned_words) and is_binary(text) do
is_scanned =
Enum.map(mentioned_words, fn y ->
scanned = Regex.run(Regex.compile!("\\b#{y}\\b"), text)
if is_nil(scanned), do: 0, else: 1
end)
Enum.sum(is_scanned)
end
def contains_mentions?(text, mentioned_words)
when is_list(mentioned_words) and is_binary(text) do
count =
text
|> count_unique_mentions(mentioned_words)
if(count > 0, do: true, else: false)
end
end
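Illustrative checks of the text helpers above (hypothetical inputs; expected results as comments):
Text.count_mentions("csu und csu, aber nicht fw", ["csu", "fw"])
# => 3 (two whole-word "csu" matches plus one "fw" match)
Text.count_unique_mentions("csu und csu, aber nicht fw", ["csu", "fw"])
# => 2 (each search word is counted at most once)
Text.contains_mentions?("nur spd", ["csu", "fw"])
# => false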
defmodule CrossCorrelation do
def cross_corr(t1, t2) do
Nx.conv(t1, t2)
end
def full_cross_corr(t1, t2) do
p1 = elem(Nx.shape(t1), 2) - 1
p2 = elem(Nx.shape(t2), 2) - 1
Nx.conv(t1, t2, padding: [{p1, p2}])
end
def find_offset(t1, t2) do
l1 = elem(Nx.shape(t1), 2)
coeffs = full_cross_corr(t1, t2)
index = Nx.argmax(coeffs)
Nx.to_number(index) - l1 + 1
end
end
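A minimal sanity check of find_offset/2 with toy series (shape {batch, channel, length}, as Nx.conv expects): the pattern in the first series lags the second by two steps, so the returned offset is 2.
# Toy example: the peak sits at index 2 in `a` and at index 0 in `b`.
a = Nx.tensor([[[0.0, 0.0, 1.0, 0.0]]])
b = Nx.tensor([[[1.0, 0.0, 0.0, 0.0]]])
CrossCorrelation.find_offset(a, b)
# => 2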
defmodule DataFrameDate do
def fill(df, timeline_int_col, timeline_range, data_cols) do
timeline =
df[timeline_int_col]
|> S.to_list()
|> MapSet.new()
measurement_timeline = timeline_range |> MapSet.new()
missing_time = MapSet.difference(measurement_timeline, timeline)
missing_data = List.duplicate(nil, length(Enum.to_list(missing_time)))
missing_df =
(Enum.map(data_cols, fn x -> %{x => missing_data} end) ++
[%{timeline_int_col => missing_time}])
|> Enum.map(&DF.new(&1))
|> DF.concat_columns()
DF.concat_rows([missing_df, df])
|> DF.arrange_with(& &1[timeline_int_col])
end
end
defmodule NilSeries do
def fb_fill(series) do
S.fill_missing(S.fill_missing(series, :forward), :backward)
end
end
Polls
start_date = ~N[2023-08-29 00:00:01]
start_day = start_date |> NaiveDateTime.to_date() |> Date.day_of_year()
Load the CSV with the polls from different institutions from the website wahlrecht.de. Show the timeline and trend for every party.
polls =
DF.from_csv!("polls.csv", delimiter: ";", parse_dates: true)
|> DF.mutate(
start_date: S.cast(start_date, {:datetime, :millisecond}),
end_date: S.cast(end_date, {:datetime, :millisecond})
)
polls_mdt =
S.to_list(polls["end_date"])
|> Enum.zip(S.to_list(polls["start_date"]))
|> Enum.map(
&NaiveDateTime.add(
elem(&1, 1),
round(NaiveDateTime.diff(elem(&1, 0), elem(&1, 1)) / 2)
)
)
polls = DF.put(polls, "mid_date", polls_mdt)
polls = DF.mutate(polls, week: S.cast(mid_date, :date) |> S.week_of_year())
DF.print(polls, limit: :infinity)
Graphs.create_poll_graph(polls, "Polls - CSU", "csu")
Graphs.create_poll_graph(polls, "Polls - Freie Waehler", "fw")
Graphs.create_poll_graph(polls, "Polls - Buendnis90-Gruene", "gruene")
Graphs.create_poll_graph(polls, "Polls - SPD", "spd")
Graphs.create_poll_graph(polls, "Polls - FDP", "fdp")
Graphs.create_poll_graph(polls, "Polls - AFD", "afd")
Graphs.create_poll_graph(polls, "Polls - Linke", "linke")
Regions
Names of subdistricts are removed, as they are often too general, for example Oder or Gern. Still, village names can be general terms such as Wald.
local_entities = DF.from_csv!("geodaten/VerwaltungsEinheit.csv", delimiter: ",")["name"]
# subdist_entities = DF.from_csv!("geodaten/KatasterBezirk.csv", delimiter: ",")["name"]
local_entities =
local_entities
|> S.downcase()
Parties
parties_df =
DF.new(
party: ["csu", "fw", "spd", "gruene", "fdp", "afd", "linke"],
candidate1: [
"Markus Söder",
"Hubert Aiwanger",
"Florian von Brunn",
"Ludwig Hartmann",
"Martin Hagen",
"Katrin Ebner-Steiner",
"Adelheid Rupp"
],
candidate2: [nil, nil, nil, "Katharina Schulze", nil, "Martin Böhm", nil]
)
DF.print(parties_df)
parties = S.to_list(parties_df["party"])
parties_regex =
parties
|> Enum.join("|")
|> Regex.compile!()
Analysis Tags and Toots
f = "mastodon_bayernwahl2023_20231119.db"
# f = "mastodon_bayernwahl2023_20230910.db"
p = Path.absname(f)
Adbc.download_driver!(:sqlite)
{:ok, db} = Kino.start_child({Adbc.Database, driver: :sqlite, uri: p})
{:ok, conn} = Kino.start_child({Adbc.Connection, database: db})
{:ok, tags_df} = Explorer.DataFrame.from_query(conn, "select * from tags", [])
tags_df["tag"]
|> S.downcase()
|> S.to_list()
|> Enum.filter(&Enum.member?(parties, &1))
|> Enum.frequencies()
From the originally tracked posts, only a small set of tags contains the names of the parties:
- Freie Waehler (82)
- Gruene (58)
- Linke (57)
The counts are much higher for the parties AFD, CSU and SPD.
{:ok, toots_df} = Explorer.DataFrame.from_query(conn, "select * from toots", [])
toots_df = DF.mutate(toots_df, date: S.strptime(date, "%Y-%m-%dT%H:%M:%S"))
toots_df =
toots_df
|> DF.mutate(
day: S.cast(date, :date) |> S.day_of_year(),
week: S.cast(date, :date) |> S.week_of_year(),
weekday: S.cast(date, :date) |> S.day_of_week(),
hour: S.hour(date)
)
post_number = S.size(toots_df["content"])
toots_df["content"]
|> S.fill_missing(" ")
|> S.downcase()
|> S.to_list()
|> Enum.filter(&Regex.match?(parties_regex, &1))
|> Enum.map(&Regex.scan(parties_regex, &1))
|> Enum.map(&Enum.uniq(&1))
|> List.flatten()
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {party, freq} -> {party, freq / post_number * 100.0} end)
This gets slightly better when the whole post content is taken into account:
- Freie Waehler (82 -> 155)
- Gruene (58 -> 169)
- Linke (57 -> 175)
TODO: use regions.
candidate_family_names =
parties_df["candiate1"]
|> S.concat(parties_df["candidate2"])
|> S.downcase()
|> S.to_list()
|> Enum.filter(&is_binary(&1))
|> Enum.map(&Names.family_name(&1))
candidate_family_names =
candidate_family_names
|> Names.append_variants()
bavaria_tags = S.to_list(local_entities) ++ ["csu"] ++ candidate_family_names
name_regex = Regex.compile!(Enum.join(candidate_family_names, "|"))
Attribution (Execute with caution)
- gender
- bavarian
- age (perhaps)
- sentiment -> Party
- date -> calendar week
flowchart TD;
A(User on Bavarian Instance?) -->|yes| B[Bavarian];
A-->|no| C(Bavarian Location in Field?);
C --> |yes| B;
C --> |no| D(Bavarian Location in User note?)
D --> |yes| B;
D --> |no| E(Inferred language in the toot is German?);
E --> |yes| F[German]
E --> |no| G[Foreign]
Mark Bavarian Instance
{:ok, person_df} = Explorer.DataFrame.from_query(conn, "select * from users", [])
bavarian_instances = ~w"muenchen.social augsburg.social mastodon.bayern nuernberg.social
ploen.social wue.social mastodon.dachgau.social sueden.social"
bavarian_instances_reg =
bavarian_instances
|> Enum.join("|")
|> Regex.compile!()
is_user_on_bavarian_instance =
person_df["user_name"]
|> S.downcase()
|> S.transform(&Enum.at(String.split(&1, "@"), 1, "chaos.social"))
|> S.transform(&is_list(Regex.run(bavarian_instances_reg, &1)))
person_df = DF.put(person_df, "bavarian_instance", is_user_on_bavarian_instance)
bavarian_person = DF.filter(person_df, bavarian_instance == true)
elem(DF.shape(bavarian_person), 0) / elem(DF.shape(person_df), 0) * 100
About 6 % of all users are on a Bavarian instance.
Bavarian Locations in Fields
{:ok, fields_df} = Explorer.DataFrame.from_query(conn, "select * from fields", [])
Remove:
- HTML
- #-Sign
- @-Sign
- _-Sign
- links
- excess white space
Left in:
- Smileys (language model might know them)
- numbers (language model converts them)
links =
"https?:\\/\\/(?:www\\.)?([-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b)*(\\/[\\/\\d\\w\\.-]*)*(?:[\\?])*(.+)*|="
html = "<[^>]*>"
excess_spaces_r = Regex.compile!("\\s\\s+")
clean_r = Regex.compile!("#{html}|#|@|_|#{links}|\"")
filter_field_regions_names = [
"adresse",
"born where",
"bundesland",
"city",
"country",
"heimat",
"heimathafen",
"heimatort",
"herkunft",
"home",
"location",
"ort",
"standort",
"wahlkreis",
"wo",
"wohnhaft",
"wohnort",
"wohnt in",
"zuhause",
"📍"
]
location_fields =
fields_df["field_name"]
|> S.downcase()
|> S.to_enum()
|> Enum.map(&Enum.member?(filter_field_regions_names, &1))
cleaned_field =
fields_df["field_value"]
|> S.downcase()
|> S.transform(&Regex.replace(clean_r, &1, " "))
|> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
|> S.transform(&String.trim(&1))
fields_df = DF.put(fields_df, "cleaned_field", cleaned_field)
DF.print(fields_df)
bavarian_fields =
fields_df["cleaned_field"]
|> S.to_list()
|> Enum.map(fn a ->
a
|> String.split([",", "-"])
|> Enum.map(&String.trim(&1))
|> Enum.map(&Enum.member?(S.to_list(local_entities), &1))
|> Enum.any?()
end)
fields_df =
fields_df
|> DF.put("locations", location_fields)
|> DF.put("is_bavarian", bavarian_fields)
fields_with_bavarian_locations = DF.filter(fields_df, is_bavarian == true and locations == true)
DF.print(fields_with_bavarian_locations)
elem(DF.shape(fields_with_bavarian_locations), 0) / elem(DF.shape(person_df), 0) * 100
About 5 % of users add locations. About 0.5 % of users give Bavarian locations.
Find Bavarian locations in user texts
TODO: use the local_entities function.
cleaned_note =
person_df["note"]
|> S.transform(&if(is_binary(&1), do: &1, else: ""))
|> S.transform(&Regex.replace(clean_r, &1, " "))
|> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
|> S.transform(&String.trim(&1))
person_df = DF.put(person_df, "cleaned_note", cleaned_note)
bavarian_location_in_note =
person_df["cleaned_note"]
|> S.transform(fn x ->
Text.contains_mentions?(x, S.to_list(local_entities))
end)
person_df = DF.put(person_df, "bavarian_location_in_note", bavarian_location_in_note)
person_with_bavarian_note_locations = DF.filter(person_df, bavarian_location_in_note == true)
elem(DF.shape(person_with_bavarian_note_locations), 0) / elem(DF.shape(person_df), 0) * 100
DF.print(person_with_bavarian_note_locations)
About 2 % of the users name a Bavarian location in their user note texts.
Person in Bavaria
OR together the Bavarian indicators from the note, the fields and the instance name.
persons_with_bavarian_fields =
DF.join(person_df, DF.select(fields_with_bavarian_locations, ["user_name", "is_bavarian"]),
how: :left,
on: [{"user_name", "user_name"}]
)
DF.print(persons_with_bavarian_fields)
DF.shape(persons_with_bavarian_fields)
bavarian_users =
Enum.reduce(
[
persons_with_bavarian_fields["is_bavarian"],
persons_with_bavarian_fields["bavarian_location_in_note"],
persons_with_bavarian_fields["bavarian_instance"]
],
&S.or(&1, &2)
)
|> S.to_enum()
|> Enum.map(&if(is_nil(&1), do: false, else: &1))
persons_with_bavarian_fields =
DF.put(persons_with_bavarian_fields, "bavarian_users", bavarian_users)
DF.print(persons_with_bavarian_fields)
person_from_bavaria = DF.filter(persons_with_bavarian_fields, bavarian_users == true)
bavarian_count = elem(DF.shape(person_from_bavaria), 0)
{bavarian_count, bavarian_count / elem(DF.shape(persons_with_bavarian_fields), 0) * 100}
About 8.14 % of the sample dataset's users are estimated to be from Bavaria. The sum of the percentages per feature is (6.14 % + 0.39 % + 2.00 % =) 8.53 %. Hence, there is little overlap.
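How much the three indicators overlap can be checked directly; a minimal sketch (not part of the original analysis) that counts users flagged by more than one of instance, fields and note:
# Count users flagged as Bavarian by more than one indicator.
[
  persons_with_bavarian_fields["bavarian_instance"],
  persons_with_bavarian_fields["is_bavarian"],
  persons_with_bavarian_fields["bavarian_location_in_note"]
]
|> Enum.map(&S.to_list/1)
|> Enum.zip()
|> Enum.count(fn flags ->
  flags |> Tuple.to_list() |> Enum.count(&(&1 == true)) > 1
end)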
Download user pics
In the test data set, 1390 of 1547 users have avatars, which is about 90 % of the users.
Enum.zip(
S.to_list(person_df["avatar"]),
S.to_list(person_df["user_id"])
)
|> Enum.map(fn {link, name} -> User.download_image(name, link) end)
flowchart TD;
A(gender in fields?) -->|male| B[male];
A -->|female| C[female];
A -->|no| D(gender in user text?);
D -->|male| B;
D -->|female| C;
D -->|no| E(user has image?)
E -->|yes| F(image segmentation);
E -->|no| H[no data]
F-->|single person| G(vgg_ilsvrc_16_gender_imdb_wiki);
F-->|others| H;
G-->|male| B;
G-->|female| C;
flowchart TD;
E(user has image?) -->|yes| F(image segmentation);
E-->|no| H[no data]
F-->|single person| G(vgg_ilsvrc_16_gender_imdb_wiki);
F-->|others| H[no data];
G--> B[age classes];
filter_field_age_names = ~w[Age Alter Born Geburtstag]
# compared against downcased field names below, so the list is kept lowercase
filter_field_gender_names =
~w[gender geschlecht pronom pronomen pronomina pronoms pronons pronoun pronouns sexualität wer]
gender_fields =
fields_df["field_name"]
|> S.downcase()
|> S.transform(&Enum.member?(filter_field_gender_names, &1))
fields_df = DF.put(fields_df, "gender_fields", gender_fields)
%{male: ["he", "him", "his", "er", "ihm", "ihn", "sein"], female: ["she", "her", "sie", "ihr"]}
DF.filter(fields_df, gender_fields)["cleaned_field"]
|> S.to_list()
|> Enum.uniq()
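One possible continuation (a sketch, not part of the original pipeline): use the pronoun map above to label the gender fields. The classify_gender helper is hypothetical.
# Label each cleaned gender field by matching pronouns (female checked first).
pronouns = %{
  male: ["he", "him", "his", "er", "ihm", "ihn", "sein"],
  female: ["she", "her", "sie", "ihr"]
}
classify_gender = fn field_value ->
  words = field_value |> String.downcase() |> String.split(~r/[^\p{L}]+/u, trim: true)
  cond do
    Enum.any?(words, &(&1 in pronouns.female)) -> :female
    Enum.any?(words, &(&1 in pronouns.male)) -> :male
    true -> :unknown
  end
end
DF.filter(fields_df, gender_fields)["cleaned_field"]
|> S.to_list()
|> Enum.map(classify_gender)
|> Enum.frequencies()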
Preprocessing posts
Before the sentiments can be read from the posts, it is necessary to remove the HTML tags, and the Mastodon tags need to be converted. The date column is converted from string to NaiveDateTime.
cleared_posts =
toots_df[:content]
|> S.fill_missing(" ")
|> S.transform(&Regex.replace(clean_r, &1, " "))
|> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
|> S.transform(&String.trim(&1))
toots_df = DF.put(toots_df, "cleared_content", cleared_posts)
has_content =
cleared_posts
|> S.transform(&String.length(&1))
|> S.greater(50)
toots_df = DF.put(toots_df, "has_content?", has_content)
toots_df = DF.filter(toots_df, has_content?)
post_length_s =
toots_df["cleared_content"]
|> S.transform(&String.length(&1))
post_length_s
|> S.mean()
S.standard_deviation(post_length_s)
S.median(post_length_s)
S.max(post_length_s)
The median length of the cleared posts is 248 characters in the first set (after removing zero-length posts). The mean is 217 +/- 189 characters.
toots_df =
DF.join(
toots_df,
DF.select(persons_with_bavarian_fields, ["user_name", "bavarian_users", "followers"]),
how: :left,
on: [{"user_name", "user_name"}]
)
DF.print(toots_df)
DF.filter(toots_df, bavarian_users == true)
Filter posts
- Filter: Topic of posts is really about Bavaria
- Attribution of Sentiment to a single party
- Filter: No party, multiple parties
Filter: contains at least a single candidate.
Sample data: 3503 of 4563 posts mention words in the context of Bavaria or a candidate.
bavarian_post_filter =
toots_df["cleared_content"]
|> S.downcase()
|> S.transform(&Text.contains_mentions?(&1, bavaria_tags))
bavarian_toots_df = DF.mask(toots_df, bavarian_post_filter)
bavarian_full_post_filter =
toots_df["content"]
|> S.downcase()
|> S.transform(&Text.contains_mentions?(&1, bavaria_tags))
full_bavarian_toots_df = DF.mask(toots_df, bavarian_full_post_filter)
bav_set =
bavarian_toots_df[:id]
|> S.to_list()
|> MapSet.new()
bav_full_set =
full_bavarian_toots_df[:id]
|> S.to_list()
|> MapSet.new()
differences = MapSet.difference(bav_full_set, bav_set) |> Enum.to_list()
tags_df
|> DF.filter(toot_id in ^differences)
|> DF.arrange(tag)
|> DF.print(limit: :infinity)
tags_df
|> DF.filter(toot_id == 165)
|> DF.print(limit: :infinity)
DF.filter(full_bavarian_toots_df, bavarian_users == true)
TODO: Rework from a single party to the most commonly mentioned party. Filter all posts so that each toot mentions only a single party or its candidates.
party_search_terms =
DF.to_rows(parties_df)
# one list of search terms per row in the DataFrame
|> Enum.map(fn x ->
x
# only takes the values
|> Map.values()
# remove nils
|> Enum.filter(&is_bitstring(&1))
# take the family name if the value is a candidate name
|> Enum.map(fn y ->
cond do
Names.family_name(y) == nil -> y
true -> Names.family_name(y)
end
end)
|> Names.append_variants()
|> Enum.map(&String.downcase(&1))
end)
contains_party =
full_bavarian_toots_df["cleared_content"]
|> S.downcase()
|> S.to_list()
|> Enum.map(fn text ->
Enum.map(party_search_terms, fn party -> Text.count_unique_mentions(text, party) > 0 end)
end)
parties_count =
contains_party
|> Enum.map(fn text -> Enum.sum(Enum.map(text, fn party -> Bool.to_integer(party) end)) end)
contains_single_party =
parties_count
|> Enum.map(&(&1 == 1))
party_frequency =
full_bavarian_toots_df["content"]
|> S.downcase()
|> S.to_list()
|> Enum.map(fn text ->
Enum.map(party_search_terms, fn party -> Text.count_mentions(text, party) end)
end)
|> Enum.map(fn party_counts ->
Enum.map(party_counts, fn party_count ->
if(Enum.sum(party_counts) === 0, do: 0, else: party_count / Enum.sum(party_counts))
end)
end)
has_dominant_party =
party_frequency
|> Enum.map(fn party_frequencies -> Enum.any?(party_frequencies, &(&1 > 0.5)) end)
Enum.sum(Enum.map(has_dominant_party, &Bool.to_integer(&1)))
domentent_party =
party_frequency
|> Enum.map(&Enum.zip(&1, S.to_list(parties_df["party"])))
|> Enum.map(fn text -> Enum.filter(text, fn party -> elem(party, 0) > 0.5 end) end)
|> Enum.map(fn text -> Enum.map(text, fn party -> elem(party, 1) end) end)
|> Enum.map(&if(length(&1) === 0, do: "", else: Enum.at(&1, 0)))
Most posts name a single party. Very often two parties are mentioned in a single post. The other cases are much less frequent.
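This can be read off directly from the per-post party counts computed above:
# Distribution of how many parties are mentioned per post.
parties_count
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 0))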
mentioned_parties =
contains_party
|> Enum.map(&Enum.zip(&1, S.to_list(parties_df["party"])))
|> Enum.map(fn text -> Enum.filter(text, fn party -> elem(party, 0) end) end)
|> Enum.map(fn text -> Enum.map(text, fn party -> elem(party, 1) end) end)
|> Enum.map(&Enum.join(&1, ", "))
full_bavarian_toots_df = DF.put(full_bavarian_toots_df, "mentioned_party", mentioned_parties)
full_bavarian_toots_df = DF.put(full_bavarian_toots_df, "domentent_party", domentent_party)
single_party_toots_df = DF.mask(full_bavarian_toots_df, has_dominant_party)
DF.print(single_party_toots_df |> DF.arrange(desc: date))
DF.filter(single_party_toots_df, bavarian_users == true)
Sentiment Analysis
Before the sentiment analysis, the language has to be checked, as the language attribute is very often not correct. Therefore, language detection is run first, before the sentiment analysis.
- XLM-RoBERTa - language detection
- german-sentiment_bert - Sentiment Analysis German
- RoBERTa (BERTtweet) - Sentiment - English language Sentiment analysis
Language Detection
The models accept different maximum input lengths. The language detection model takes up to 514 tokens, but its results, compared with the self-set languages, are similar when only about 100 characters are used. Therefore we restrict the sequence length to 100.
flowchart TD;
A(XLM-RoBERTa) -->|German| B[german-sentiment_bert];
A(XLM-RoBERTa) -->|English| C[RoBERTa BERTtweet - Sentiment];
B --> E[Bavarian?]
C --> F[Bavarian?]
E -->|no| H(German)
E -->|yes| G(Bavarian)
F -->|yes| G(Bavarian)
F -->|no| I(English)
{:ok, lang_detect_model_info} =
Bumblebee.load_model({:hf, "papluca/xlm-roberta-base-language-detection"})
{:ok, lang_detect_tokenizer} =
Bumblebee.load_tokenizer({:hf, "papluca/xlm-roberta-base-language-detection"})
lang_detect_serving =
Bumblebee.Text.text_classification(lang_detect_model_info, lang_detect_tokenizer,
compile: [batch_size: 128, sequence_length: 100],
defn_options: [compiler: EXLA]
)
lang_detect_model_info.spec.max_positions
Kino.start_child({
Nx.Serving,
serving: lang_detect_serving, name: LangDetectServer
})
p = Nx.Serving.batched_run(LangDetectServer, S.to_list(single_party_toots_df["cleared_content"]))
Each prediction is ordered by probability. Hence, always selecting the first label returns the most likely language.
detected_languages =
Enum.map(p, fn post ->
post
|> Enum.at(0)
|> elem(1)
|> Enum.at(0)
|> (& &1[:label]).()
end)
The majority, 96 %, of all sample posts are detected as German; 1.6 % are detected as English. Why 0.9 % are labeled as Hindi still has to be figured out. That 0.5 % are detected as Dutch is more plausible.
detected_languages
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / length(detected_languages) * 100.0} end)
single_party_toots_df = DF.put(single_party_toots_df, "detected_languages", detected_languages)
In contrast, the manually set languages are 92 % German, 6 % another language (often the instance default) and 1.3 % nil (not specified).
single_party_toots_df["language"]
|> S.to_list()
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / length(detected_languages) * 100.0} end)
From visual analysis, the language attribute is often set wrong, as it is set manually with a given default. Often the language was set to English when the post was German, or it was set to nil. Therefore the language has been re-evaluated with the language detection model, which changed the language for 5.6 % of all posts.
reassigned_language =
single_party_toots_df["language"]
|> S.not_equal(single_party_toots_df["detected_languages"])
|> S.transform(&Bool.to_integer(&1))
|> S.sum()
reassigned_language / S.size(single_party_toots_df["language"]) * 100.0
German Sentiments
The German sentiment analysis works with up to 512 tokens. By default, Mastodon limits posts to 500 characters. As services other than Mastodon and some Mastodon instances use other cut-offs, we select 512 tokens. We assume 5.99 characters per word; OpenAI estimates 4 characters per token for English text.
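A rough capacity check with these assumptions (a sketch, not part of the original analysis): a 500-character post corresponds to roughly 125 tokens or 84 words, both well below the 512-token limit.
# Back-of-the-envelope token and word estimates for a maximum-length post.
max_chars = 500       # default Mastodon post limit
chars_per_token = 4   # OpenAI's rule of thumb for English text
chars_per_word = 5.99 # assumed average word length
{div(max_chars, chars_per_token), Float.round(max_chars / chars_per_word, 1)}
# => {125, 83.5}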
{:ok, ger_sent_model_info} = Bumblebee.load_model({:hf, "oliverguhr/german-sentiment-bert"})
{:ok, ger_sent_tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-german-cased"})
ger_sent_model_info.spec.max_positions
ger_sent_serving =
Bumblebee.Text.text_classification(ger_sent_model_info, ger_sent_tokenizer,
compile: [batch_size: 128, sequence_length: 512],
defn_options: [compiler: EXLA]
)
Kino.start_child({
Nx.Serving,
serving: ger_sent_serving, name: GerSentimentServer
})
german_toots_df = DF.filter(single_party_toots_df, detected_languages == "de")
german_toots = S.to_list(german_toots_df["cleared_content"])
ger_predictions = Nx.Serving.batched_run(GerSentimentServer, german_toots)
ger_sentiments = Enum.map(ger_predictions, fn x -> SentimentScore.score(x.predictions) end)
german_toots_df = DF.put(german_toots_df, "sentiment", ger_sentiments)
English Sentiment
The sentiment analysis for English text works with only 130 tokens.
english_toots_df = DF.filter(single_party_toots_df, detected_languages == "en")
english_toots = S.to_list(english_toots_df["cleared_content"])
{:ok, model_info} = Bumblebee.load_model({:hf, "finiteautomata/bertweet-base-sentiment-analysis"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "vinai/bertweet-base"})
english_sentiment_serving =
Bumblebee.Text.text_classification(model_info, tokenizer,
compile: [batch_size: 128, sequence_length: 130],
defn_options: [compiler: EXLA]
)
Kino.start_child({
Nx.Serving,
serving: english_sentiment_serving, name: EngSentimentServer
})
eng_predictions = Nx.Serving.batched_run(EngSentimentServer, english_toots)
eng_sentiments = Enum.map(eng_predictions, fn x -> SentimentScore.score(x.predictions) end)
english_toots_df = DF.put(english_toots_df, "sentiment", eng_sentiments)
ger_eng_toots = DF.concat_rows(german_toots_df, english_toots_df)
bav_ger_eng_lang =
ger_eng_toots["detected_languages"]
|> S.to_list()
|> Enum.zip(S.to_list(ger_eng_toots["bavarian_users"]))
|> Enum.map(&if(elem(&1, 1), do: "bav", else: elem(&1, 0)))
bav_ger_toots =
DF.put(ger_eng_toots, "region", bav_ger_eng_lang)
|> DF.filter(region != "en")
DF.print(ger_eng_toots)
S.frequencies(bav_ger_toots["region"])
h =
bav_ger_toots
|> DF.filter(region == "bav")
S.distinct(h["user_id"])
S.distinct(bav_ger_toots["user_id"])
DF.filter(bav_ger_toots, date > ^start_date)[:domentent_party]
|> S.to_list()
|> Enum.frequencies()
DF.filter(bav_ger_toots, date > ^start_date)[:region]
|> S.to_list()
|> Enum.frequencies()
Sentiment Graphs
avg_sentiment = S.mean(bav_ger_toots["sentiment"])
party_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
total_count =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.shape()
|> elem(0)
follower_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(domentent_party == ^x)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
end
total_followers =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
party_bav_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
bavaria_total =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.shape()
|> elem(0)
# Most positive post per user in the weeks before the election
pos_party_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
pos_total_count =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.shape()
|> elem(0)
pos_party_bav_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
pos_follower_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.filter(domentent_party == ^x)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
end
pos_total_followers =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
pos_bavaria_total =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.shape()
|> elem(0)
DF.new(%{
party: [
"afd",
"csu",
"fdp",
"fw",
"gruene",
"linke",
"spd"
],
german_ratio: [
party_count.("afd") / total_count,
party_count.("csu") / total_count,
party_count.("fdp") / total_count,
party_count.("fw") / total_count,
party_count.("gruene") / total_count,
party_count.("linke") / total_count,
party_count.("spd") / total_count
],
german_followers: [
follower_count.("afd") / total_followers,
follower_count.("csu") / total_followers,
follower_count.("fdp") / total_followers,
follower_count.("fw") / total_followers,
follower_count.("gruene") / total_followers,
follower_count.("linke") / total_followers,
follower_count.("spd") / total_followers
],
bavarian_support: [
party_bav_count.("afd"),
party_bav_count.("csu"),
party_bav_count.("fdp"),
party_bav_count.("fw"),
party_bav_count.("gruene"),
party_bav_count.("linke"),
party_bav_count.("spd")
],
bavarian_ratio: [
party_bav_count.("afd") / bavaria_total,
party_bav_count.("csu") / bavaria_total,
party_bav_count.("fdp") / bavaria_total,
party_bav_count.("fw") / bavaria_total,
party_bav_count.("gruene") / bavaria_total,
party_bav_count.("linke") / bavaria_total,
party_bav_count.("spd") / bavaria_total
],
positiv_german_ratio: [
pos_party_count.("afd") / pos_total_count,
pos_party_count.("csu") / pos_total_count,
pos_party_count.("fdp") / pos_total_count,
pos_party_count.("fw") / pos_total_count,
pos_party_count.("gruene") / pos_total_count,
pos_party_count.("linke") / pos_total_count,
pos_party_count.("spd") / pos_total_count
],
positiv_bavarian_support: [
pos_party_bav_count.("afd"),
pos_party_bav_count.("csu"),
pos_party_bav_count.("fdp"),
pos_party_bav_count.("fw"),
pos_party_bav_count.("gruene"),
pos_party_bav_count.("linke"),
pos_party_bav_count.("spd")
],
positiv_bavarian_ratio: [
pos_party_bav_count.("afd") / pos_bavaria_total,
pos_party_bav_count.("csu") / pos_bavaria_total,
pos_party_bav_count.("fdp") / pos_bavaria_total,
pos_party_bav_count.("fw") / pos_bavaria_total,
pos_party_bav_count.("gruene") / pos_bavaria_total,
pos_party_bav_count.("linke") / pos_bavaria_total,
pos_party_bav_count.("spd") / pos_bavaria_total
],
positive_german_followers: [
pos_follower_count.("afd") / pos_total_followers,
pos_follower_count.("csu") / pos_total_followers,
pos_follower_count.("fdp") / pos_total_followers,
pos_follower_count.("fw") / pos_total_followers,
pos_follower_count.("gruene") / pos_total_followers,
pos_follower_count.("linke") / pos_total_followers,
pos_follower_count.("spd") / pos_total_followers
]
})
|> DF.print(limit: :infinity)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the CSU", "csu", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the Freie Waehler", "fw", start_date)
Graphs.create_sentiment_graph(
bav_ger_toots,
"Sentiments for the Buendnis90/Gruene",
"gruene",
start_date
)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the SPD", "spd", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the FDP", "fdp", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the AFD", "afd", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the Linke", "linke", start_date)
TODO:
- cross correlation
- emojis?
- add min/max dates for sentiment graphs
4 nl posts -> error, 1 fr post -> correct
Comparison of Polls and Sentiments
normalized_timeline =
bav_ger_toots
|> DF.filter(date >= ^start_date)
|> DF.select(["domentent_party", "sentiment", "day", "id"])
|> DF.mutate(sentiment: (sentiment + 1) / 2)
|> DF.pivot_wider("domentent_party", "sentiment", id_columns: ["id", "day"])
|> DF.discard(["id"])
sentiment_timeline =
normalized_timeline
|> DF.group_by("day")
|> DF.mutate(
csu: mean(csu),
spd: mean(spd),
fw: mean(fw),
fdp: mean(fdp),
afd: mean(afd),
gruene: mean(gruene),
linke: mean(linke)
)
|> DF.select(["day", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.distinct()
|> DF.arrange(asc: day)
DF.print(sentiment_timeline)
filtered_polls = DF.filter(polls, start_date >= ^start_date)
polls_timeline =
DF.mutate(filtered_polls,
start_day: S.day_of_year(start_date),
end_day: S.day_of_year(end_date)
)
polls_timeline =
polls_timeline
|> DF.to_rows()
|> Enum.with_index()
|> Enum.flat_map(fn {row, index} ->
row["start_day"]..row["end_day"]
|> Enum.map(&Map.merge(row, %{"day" => &1, "index" => index}))
end)
|> DF.new()
|> DF.select(["day", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.group_by("day")
|> DF.mutate(
csu: mean(csu) / 100,
spd: mean(spd) / 100,
fw: mean(fw) / 100,
fdp: mean(fdp) / 100,
afd: mean(afd) / 100,
gruene: mean(gruene) / 100,
linke: mean(linke) / 100
)
|> DF.arrange(asc: day)
|> DF.distinct()
DF.print(polls_timeline, limit: :infinity)
Align the End dates of Polls and Sentiments.
end_day = Enum.min([S.max(polls_timeline["day"]), S.max(sentiment_timeline["day"])])
sentiment_timeline = DF.filter(sentiment_timeline, day <= ^end_day)
polls_timeline = DF.filter(polls_timeline, day <= ^end_day)
end_sampling_doy = 323
lang_toots = DF.filter(bav_ger_toots, day <= ^end_sampling_doy) |> DF.filter(day >= ^start_day)
Tucan.concat([
Tucan.histogram(lang_toots, "day", relative: true, step: 1) |> Tucan.Axes.set_x_title("Day"),
Tucan.histogram(lang_toots, "weekday", relative: true) |> Tucan.Axes.set_x_title("Weekday"),
Tucan.histogram(lang_toots, "hour", relative: true, step: 1) |> Tucan.Axes.set_x_title("Hour")
])
|> Tucan.set_title("Post frequencies on different time scales")
Fill in missing data.
- Dates not yet included
- Dates included but no data measured.
days_range = start_day..end_day
parties = ["csu", "spd", "fw", "fdp", "afd", "gruene", "linke"]
polls_timeline = DataFrameDate.fill(DF.ungroup(polls_timeline), "day", days_range, parties)
sentiment_timeline =
DataFrameDate.fill(DF.ungroup(sentiment_timeline), "day", days_range, parties)
DF.print(sentiment_timeline, limit: :infinity)
DF.print(polls_timeline, limit: :infinity)
polls_timeline =
polls_timeline
|> DF.ungroup()
|> DF.arrange(day)
|> DF.mutate(
csu: NilSeries.fb_fill(csu),
spd: NilSeries.fb_fill(spd),
fw: NilSeries.fb_fill(fw),
fdp: NilSeries.fb_fill(fdp),
afd: NilSeries.fb_fill(afd),
gruene: NilSeries.fb_fill(gruene),
linke: NilSeries.fb_fill(linke)
)
sentiment_timeline =
sentiment_timeline
|> DF.ungroup()
|> DF.arrange(day)
|> DF.mutate(
csu: NilSeries.fb_fill(csu),
spd: NilSeries.fb_fill(spd),
fw: NilSeries.fb_fill(fw),
fdp: NilSeries.fb_fill(fdp),
afd: NilSeries.fb_fill(afd),
gruene: NilSeries.fb_fill(gruene),
linke: NilSeries.fb_fill(linke)
)
|> DF.mutate(
csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
)
Parties with enough Sentiment samples
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - CSU", "csu")
{S.standard_deviation(polls_timeline["csu"]), S.standard_deviation(polls_timeline["fw"]),
S.standard_deviation(polls_timeline["afd"])}
{S.standard_deviation(sentiment_timeline["csu"]), S.standard_deviation(sentiment_timeline["fw"]),
S.standard_deviation(sentiment_timeline["afd"])}
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - FW", "fw")
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - AFD", "afd")
Parties with few Sentiment Samples
Graphs.create_daily_compare_graph(
sentiment_timeline,
polls_timeline,
"Sentiment - Buendis90 Gruene",
"gruene"
)
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - SPD", "spd")
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - FDP", "fdp")
Graphs.create_daily_compare_graph(
sentiment_timeline,
polls_timeline,
"Sentiment - Linke",
"linke"
)
Cross Correlation
party_sentiment_offset = fn sentiment_df, poll_df, party_str ->
sentiments_t = S.to_tensor(sentiment_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
polls_t = S.to_tensor(poll_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
CrossCorrelation.find_offset(sentiments_t, polls_t)
end
party_sentiment_offset.(sentiment_timeline, polls_timeline, "csu")
party_sentiment_offset.(sentiment_timeline, polls_timeline, "fw")
party_sentiment_offset.(sentiment_timeline, polls_timeline, "afd")
Correlation Sentiment -> Polls
alias Scholar.Linear.LinearRegression, as: LR
alias Scholar.Metrics.Regression, as: RegMetrics
alias Scholar.Preprocessing
longer_sentiment_timeline =
sentiment_timeline
|> DF.mutate(
csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
)
|> DF.select(["day", "afd", "csu", "fw"])
|> DF.pivot_longer(["afd", "csu", "fw"])
# |> DF.print(limit: :infinity)
longer_polls_timeline =
polls_timeline
|> DF.select(["day", "afd", "csu", "fw"])
|> DF.pivot_longer(["afd", "csu", "fw"])
daily_polls_sentiment_df =
DF.new(
day: longer_sentiment_timeline["day"],
sentiment: longer_sentiment_timeline["value"],
poll: longer_polls_timeline["value"],
party: longer_polls_timeline["variable"]
)
daily_polls_sentiment_df = DF.mutate(daily_polls_sentiment_df, party: S.cast(party, :category))
Tucan.scatter(daily_polls_sentiment_df, "sentiment", "poll", color_by: "party")
|> Tucan.set_width(500)
|> Tucan.set_height(500)
|> Tucan.set_title("Dependency of Poll results from Sentiment.")
|> Tucan.Axes.set_x_title("Sentiment per Average of Sentiments")
|> Tucan.Axes.set_y_title("Poll")
|> Tucan.Scale.set_y_domain(0.1, 0.4)
|> Tucan.Scale.set_x_domain(0, 0.3)
split = round(elem(DF.shape(daily_polls_sentiment_df), 0) * 0.7)
daily_polls_sentiment_shuffled_df = DF.shuffle(daily_polls_sentiment_df)
polls_sentiment_df_train = DF.slice(daily_polls_sentiment_shuffled_df, 0..split)
polls_sentiment_df_test = DF.slice(daily_polls_sentiment_shuffled_df, split..-1)
get_one_hot = fn x, y ->
x
|> S.cast(:category)
|> S.to_tensor()
|> Preprocessing.one_hot_encode(num_classes: y)
end
get_sentiment = fn x ->
x
|> S.to_tensor()
|> Nx.reshape({:auto, 1})
end
party_train = get_one_hot.(polls_sentiment_df_train["party"], 3)
party_test = get_one_hot.(polls_sentiment_df_test["party"], 3)
sentiments_train = get_sentiment.(polls_sentiment_df_train["sentiment"])
sentiments_test = get_sentiment.(polls_sentiment_df_test["sentiment"])
x_train = Nx.concatenate([party_train, sentiments_train], axis: 1)
x_test = Nx.concatenate([party_test, sentiments_test], axis: 1)
y_train = polls_sentiment_df_train["poll"] |> S.to_tensor()
y_test = polls_sentiment_df_test["poll"] |> S.to_tensor()
model = LR.fit(x_train, y_train)
y_hat = LR.predict(model, x_test)
RegMetrics.r2_score(y_test, y_hat)
Summary:
- intercept: 0.2182
- afd: -0.0749
- csu: 0.1321
- fw: -0.0573
- sentiment: 0.0013
Meaning:
- Almost no dependence on the (daily) sentiment. Better sentiment, worse poll results!?
- afd: 14.9 %
- csu: 35.3 %
- fw: 16.2 %
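The numbers above were read off the fitted Scholar model; a minimal sketch of how to extract them (mapping the first three coefficients to afd, csu and fw follows the one-hot category order, which is an assumption here):
# Extract intercept and coefficients from the fitted linear regression.
%{coefficients: coefficients, intercept: intercept} = model
{Nx.to_flat_list(intercept), Nx.to_flat_list(coefficients)}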
Comparison: Weekly Sentiments and Polls
normalized_weeks =
bav_ger_toots
|> DF.filter(date >= ^start_date)
|> DF.select(["domentent_party", "sentiment", "week", "id"])
|> DF.mutate(sentiment: (sentiment + 1) / 2)
|> DF.pivot_wider("domentent_party", "sentiment", id_columns: ["id", "week"])
|> DF.discard(["id"])
sentiment_weeks =
normalized_weeks
|> DF.group_by("week")
|> DF.mutate(
csu: mean(csu),
spd: mean(spd),
fw: mean(fw),
fdp: mean(fdp),
afd: mean(afd),
gruene: mean(gruene),
linke: mean(linke)
)
|> DF.select(["week", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.distinct()
|> DF.arrange(asc: week)
DF.print(sentiment_weeks)
poll_weeks =
polls
|> DF.filter(mid_date > ^start_date)
|> DF.select(["week", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.group_by("week")
|> DF.mutate(
afd: mean(afd) / 100,
csu: mean(csu) / 100,
fdp: mean(fdp) / 100,
fw: mean(fw) / 100,
gruene: mean(gruene) / 100,
linke: mean(linke) / 100,
spd: mean(spd) / 100
)
|> DF.distinct()
|> DF.ungroup()
DF.print(poll_weeks, limit: :infinity)
end_week = Enum.min([S.max(poll_weeks["week"]), S.max(sentiment_weeks["week"])])
start_week = Enum.max([S.min(poll_weeks["week"]), S.min(sentiment_weeks["week"])])
poll_weeks = DF.filter(poll_weeks, week <= ^end_week)
DF.print(poll_weeks, limit: :infinity)
sentiment_weeks = DF.filter(sentiment_weeks, week <= ^end_week)
DF.print(sentiment_weeks, limit: :infinity)
filled_poll_weeks =
DataFrameDate.fill(poll_weeks, "week", start_week..end_week, [
"afd",
"csu",
"fdp",
"fw",
"gruene",
"linke",
"spd"
])
filled_poll_weeks =
DF.mutate(filled_poll_weeks,
afd: NilSeries.fb_fill(afd),
csu: NilSeries.fb_fill(csu),
fdp: NilSeries.fb_fill(fdp),
fw: NilSeries.fb_fill(fw),
gruene: NilSeries.fb_fill(gruene),
linke: NilSeries.fb_fill(linke),
spd: NilSeries.fb_fill(spd)
)
filled_sentiment_weeks =
DataFrameDate.fill(sentiment_weeks, "week", start_week..end_week, [
"afd",
"csu",
"fdp",
"fw",
"gruene",
"linke",
"spd"
])
|> DF.mutate(gruene: NilSeries.fb_fill(gruene))
|> DF.mutate(
csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
)
DF.print(filled_sentiment_weeks, limit: :infinity)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - CSU",
"csu"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - FW",
"fw"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - AFD",
"afd"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - SPD",
"spd"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - Buendnis/ Gruene",
"gruene"
)
Cross Correlation
party_sentiment_offset = fn sentiment_df, poll_df, party_str ->
sentiments_t = S.to_tensor(sentiment_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
polls_t = S.to_tensor(poll_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
CrossCorrelation.find_offset(sentiments_t, polls_t)
end
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "afd") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "csu") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "fdp") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "fw") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "gruene") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "spd") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "linke") |> dbg()
Linear Fit
alias Scholar.Linear.LinearRegression, as: LR
alias Scholar.Linear.RidgeRegression, as: Ridge
alias Scholar.Metrics.Regression, as: RegMetrics
alias Scholar.Preprocessing
DF.print(filled_sentiment_weeks, limit: :infinity)
longer_sentiment_weekly_timeline =
filled_sentiment_weeks
|> DF.pivot_longer(["afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
DF.print(longer_sentiment_weekly_timeline, limit: :infinity)
longer_polls_weekly_timeline =
filled_poll_weeks
|> DF.pivot_longer(["afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
DF.print(longer_polls_weekly_timeline, limit: :infinity)
weekly_polls_sentiment_df =
DF.new(
week: longer_sentiment_weekly_timeline["week"],
sentiment: longer_sentiment_weekly_timeline["value"],
poll: longer_polls_weekly_timeline["value"],
party: longer_sentiment_weekly_timeline["variable"]
)
weekly_polls_sentiment_df = DF.mutate(weekly_polls_sentiment_df, party: S.cast(party, :category))
Tucan.scatter(weekly_polls_sentiment_df, "sentiment", "poll", color_by: "party", shape_by: "party")
|> Tucan.set_width(500)
|> Tucan.set_height(500)
|> Tucan.set_title("Dependency of Polling Results From Sentiment.")
|> Tucan.Axes.set_x_title("Sentiment")
|> Tucan.Axes.set_y_title("Poll")
|> Tucan.Scale.set_y_domain(0, 0.4)
|> Tucan.Scale.set_x_domain(0, 0.25)
split = round(elem(DF.shape(weekly_polls_sentiment_df), 0) * 0.7)
weekly_polls_sentiment_shuffled_df = DF.shuffle(weekly_polls_sentiment_df)
polls_sentiment_df_train = DF.slice(weekly_polls_sentiment_shuffled_df, 0..split)
polls_sentiment_df_test = DF.slice(weekly_polls_sentiment_shuffled_df, split..-1)
get_one_hot = fn x, y ->
x
|> S.cast(:category)
|> S.to_tensor()
|> Preprocessing.one_hot_encode(num_classes: y)
end
get_sentiment = fn x ->
x
|> S.to_tensor()
|> Nx.reshape({:auto, 1})
end
party_train = get_one_hot.(polls_sentiment_df_train["party"], 7)
party_test = get_one_hot.(polls_sentiment_df_test["party"], 7)
sentiments_train = get_sentiment.(polls_sentiment_df_train["sentiment"])
sentiments_test = get_sentiment.(polls_sentiment_df_test["sentiment"])
x_train = Nx.concatenate([party_train, sentiments_train], axis: 1)
x_test = Nx.concatenate([party_test, sentiments_test], axis: 1)
y_train = polls_sentiment_df_train["poll"] |> S.to_tensor()
y_test = polls_sentiment_df_test["poll"] |> S.to_tensor()
model = LR.fit(x_train, y_train, fit_intercept?: false)
y_hat = LR.predict(model, x_test)
RegMetrics.r2_score(y_test, y_hat)
Summary:
- intercept: 0.131
- afd: 0.003
- csu: 0.229
- fdp: -0.103
- fw: 0.026
- gruene: 0.008
- linke: -0.118
- spd: -0.046
- sentiment: 0.003
defmodule GridSearch do
def ridge_single(point, x_train, x_test, y_train, y_test) do
model_ridge = Ridge.fit(x_train, y_train, fit_intercept?: false, alpha: point)
y_hat_ridge = Ridge.predict(model_ridge, x_test)
Nx.to_number(RegMetrics.r2_score(y_test, y_hat_ridge))
end
def ridge_1d(points, x_train, x_test, y_train, y_test)
when is_list(points) do
points
|> Enum.map(&ridge_single(&1, x_train, x_test, y_train, y_test))
end
end
GridSearch.ridge_1d([0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], x_train, x_test, y_train, y_test)
model_ridge = Ridge.fit(x_train, y_train, fit_intercept?: false, alpha: 0.1) |> dbg
y_hat_ridge = Ridge.predict(model_ridge, x_test)
Nx.to_number(RegMetrics.r2_score(y_test, y_hat_ridge))
Compare with the average of that time frame.
filled_poll_weeks
|> DF.mutate(
afd: S.mean(afd),
csu: S.mean(csu),
fdp: S.mean(fdp),
fw: S.mean(fw),
gruene: S.mean(gruene),
linke: S.mean(linke),
spd: S.mean(spd)
)
|> DF.discard(["week"])
|> DF.distinct()
|> DF.print()
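For context, a possible baseline (a sketch, not part of the original notebook): how much of the weekly poll variation is explained by the per-party averages alone, using the same R² metric as above.
# R² when every weekly poll value is predicted by its party's mean poll value.
party_mean_df =
  weekly_polls_sentiment_df
  |> DF.group_by("party")
  |> DF.mutate(poll_mean: mean(poll))
  |> DF.ungroup()
RegMetrics.r2_score(
  S.to_tensor(party_mean_df["poll"]),
  S.to_tensor(party_mean_df["poll_mean"])
)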