

Bavarian Election

Mix.install(
  [
    {:nx, "~>0.6.4"},
    {:bumblebee, "~>0.4.2"},
    {:explorer, "~>0.7.2"},
    {:kino_vega_lite, "~> 0.1.11"},
    {:httpoison, "~> 1.8"},
    {:exla, "~> 0.6.4"},
    {:adbc, "~> 0.2.2"},
    {:kino_bumblebee, "~> 0.4.0"},
    {:tucan, "~> 0.2.1"},
    {:scholar, "~> 0.2.1"}
  ],
  config: [
    nx: [default_backend: EXLA.Backend, client: :cuda, device_id: 1]
  ]
)

Modules

require Explorer.DataFrame

alias VegaLite, as: Vl
alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: S
defmodule Graphs do
  def create_poll_graph(data_source, title, party, start_date) do
    Vl.new(width: 500, height: 300, title: title)
    |> Vl.data_from_values(DF.filter(data_source, mid_date >= ^start_date),
      only: ["end_date", "start_date", "mid_date", party, "institute"]
    )
    |> Vl.layers([
      # data as bars
      Vl.new()
      |> Vl.mark(:bar, opacity: 0.5)
      |> Vl.encode_field(:x, "start_date", type: :temporal, title: "poll dates")
      |> Vl.encode_field(:x2, "end_date")
      |> Vl.encode_field(:y, party,
        type: :quantitative,
        title: "percentage",
        scale: [domain: [S.min(data_source[party]), S.max(data_source[party])]]
      )
      |> Vl.encode_field(:color, "institute", type: :nominal),
      Vl.new()
      # same data drawn invisibly against mid_date, so the loess fit below lines up with the bars
      |> Vl.mark(:bar, opacity: 0.0)
      |> Vl.encode_field(:x, "mid_date", type: :temporal, title: "poll dates")
      |> Vl.encode_field(:x2, "end_date")
      |> Vl.encode_field(:y, party,
        type: :quantitative,
        title: "percentage",
        scale: [domain: [S.min(data_source[party]), S.max(data_source[party])]]
      )
      |> Vl.encode_field(:color, "institute", type: :nominal),
      # rule
      Vl.new()
      |> Vl.mark(:rule, color: "blue")
      |> Vl.encode(:y,
        datum: DF.filter(data_source, institute == "Landtagswahl")[party][0],
        type: :quantitative
      ),
      # fit
      Vl.new()
      |> Vl.mark(:line, color: "firebrick")
      |> Vl.transform(loess: party, on: "mid_date", bandwidth: 0.5)
      |> Vl.encode_field(:x, "mid_date", type: :temporal, title: "poll dates")
      |> Vl.encode_field(:y, party, type: :quantitative, title: "percentage")
    ])
  end

  def create_poll_graph(data_source, title, party) do
    create_poll_graph(data_source, title, party, ~N[2023-01-01 00:00:01])
  end

  def create_sentiment_graph(data_source, title, party, start_date) do
    Vl.new(width: 500, height: 300, title: title)
    |> Vl.data_from_values(
      DF.filter(data_source, domentent_party == ^party and date > ^start_date),
      only: ["date", "sentiment", "region"]
    )
    |> Vl.layers([
      Vl.new()
      |> Vl.mark(:point)
      |> Vl.encode_field(:x, "date", type: :temporal)
      |> Vl.encode_field(:y, "sentiment",
        type: :quantitative,
        scale: [domain: [-1, 1]]
      )
      |> Vl.encode_field(:color, "region", type: :nominal, title: "Region")
      |> Vl.encode_field(:shape, "region", type: :nominal, title: "Region"),
      Vl.new()
      |> Vl.mark(:line, color: "firebrick", opacity: 0.5)
      |> Vl.transform(loess: "sentiment", on: "date", bandwidth: 0.5)
      |> Vl.encode_field(:x, "date", type: :temporal, title: "date")
      |> Vl.encode_field(:y, "sentiment", type: :quantitative, title: "sentiment")
    ])
  end

  def create_daily_compare_graph(sentiment_data_source, poll_data_source, title, party) do
    Tucan.layers([
      Tucan.step(poll_data_source, "day", party, line_color: "red"),
      Tucan.step(sentiment_data_source, "day", party, line_color: "green")
    ])
    |> Tucan.set_width(500)
    |> Tucan.set_title(title)
    |> Tucan.Axes.set_x_title("Day of the year")
    |> Tucan.Axes.set_y_title("Sentiments, Polls")
    |> Tucan.Scale.set_y_domain(0, 1)
    |> Tucan.Legend.set_enabled(:color, true)
    |> Tucan.Legend.set_title(:color, "TimeLine")
    |> Tucan.annotate(243, S.max(poll_data_source[party]) + 0.05, "Polls", color: "red", size: 18)
    |> Tucan.annotate(243, S.max(sentiment_data_source[party]) + 0.05, "Sentiments",
      color: "green",
      size: 18
    )
  end

  def create_weekly_compare_graph(sentiment_data_source, poll_data_source, title, party) do
    Tucan.layers([
      Tucan.step(poll_data_source, "week", party, line_color: "red"),
      Tucan.step(sentiment_data_source, "week", party, line_color: "green")
    ])
    |> Tucan.set_width(500)
    |> Tucan.set_title(title)
    |> Tucan.Axes.set_x_title("Calendar Week")
    |> Tucan.Axes.set_y_title("Sentiments, Polls")
    |> Tucan.Scale.set_y_domain(0, 1)
    |> Tucan.Legend.set_enabled(:color, true)
    |> Tucan.Legend.set_title(:color, "TimeLine")
    |> Tucan.annotate(36, poll_data_source[party][1] + 0.05, "Polls", color: "red", size: 18)
    |> Tucan.annotate(36, sentiment_data_source[party][1] + 0.05, "Sentiments",
      color: "green",
      size: 18
    )
  end
end
defmodule Names do
  def append_variants(names) do
    names
    |> Enum.concat(Enum.map(names, fn x -> String.replace(x, " ", "") end))
    |> Enum.concat(Enum.map(names, fn x -> String.replace(x, "ö", "o") end))
    |> Enum.concat(Enum.map(names, fn x -> String.replace(x, "ö", "oe") end))
    |> Enum.concat(Enum.map(names, fn x -> String.replace(x, "-", "") end))
    |> Enum.uniq()
  end

  def family_name(y) do
    String.split(y, " ", parts: 2) |> Enum.at(1)
  end
end
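
A quick usage sketch for the helpers above (the name is one of the candidates defined later in this notebook):

Names.family_name("Markus Söder")
# => "Söder"

Names.append_variants(["Markus Söder"])
# => ["Markus Söder", "MarkusSöder", "Markus Soder", "Markus Soeder"]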
defmodule Bool do
  def to_integer(true), do: 1
  def to_integer(false), do: 0
  def to_integer(nil), do: 0
end
defmodule SentimentScore do
  def score(prediction) do
    prediction
    |> Enum.map(fn p ->
      case p do
        %{label: l} when l in ["POS", "positive"] -> p.score
        %{label: l} when l in ["NEG", "negative"] -> -p.score
        _ -> 0
      end
    end)
    |> Enum.sum()
  end
end
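
A small sketch of how a prediction list is turned into a score; the maps mirror the shape of the Bumblebee predictions used further below, and the numbers are invented for illustration:

SentimentScore.score([
  %{label: "positive", score: 0.7},
  %{label: "negative", score: 0.2},
  %{label: "neutral", score: 0.1}
])
# => 0.7 - 0.2 = 0.5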
defmodule User do
  # assumption: a response of 478 bytes or fewer is treated as a placeholder / missing avatar
  def image_exists?(body), do: byte_size(body) > 478

  def download_image(save_stem, link) do
    save_name = "./graphics/userimages/#{save_stem}.png"

    if not File.exists?(save_name) do
      %HTTPoison.Response{body: body} = HTTPoison.get!(link)

      if User.image_exists?(body) do
        File.write!(save_name, body)
      end
    end
  end
end
defmodule Text do
  def count_mentions(text, mentioned_words)
      when is_list(mentioned_words) and is_binary(text) do
    mentioned_words
    |> Enum.map(fn y -> Regex.scan(Regex.compile!("\\b#{y}\\b"), text) end)
    |> Enum.map(&length/1)
    |> Enum.sum()
  end

  def count_unique_mentions(text, mentioned_words)
      when is_list(mentioned_words) and is_binary(text) do
    mentioned_words
    |> Enum.map(fn y ->
      if is_nil(Regex.run(Regex.compile!("\\b#{y}\\b"), text)), do: 0, else: 1
    end)
    |> Enum.sum()
  end

  def contains_mentions?(text, mentioned_words)
      when is_list(mentioned_words) and is_binary(text) do
    count_unique_mentions(text, mentioned_words) > 0
  end
end
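
A usage sketch with a made-up sentence:

Text.count_mentions("die csu, die csu und die spd", ["csu", "spd"])
# => 3

Text.count_unique_mentions("die csu, die csu und die spd", ["csu", "spd"])
# => 2

Text.contains_mentions?("servus", ["csu", "spd"])
# => false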
defmodule CrossCorrelation do
  # Expects tensors of shape {1, 1, n}, i.e. batch and channel axes added in front.
  def cross_corr(t1, t2) do
    Nx.conv(t1, t2)
  end

  # Pad so that every possible shift of t2 over t1 is evaluated ("full" correlation).
  def full_cross_corr(t1, t2) do
    p1 = elem(Nx.shape(t1), 2) - 1
    p2 = elem(Nx.shape(t2), 2) - 1

    Nx.conv(t1, t2, padding: [{p1, p2}])
  end

  # The lag (in samples) at which the correlation peaks; 0 means the series are aligned.
  def find_offset(t1, t2) do
    l1 = elem(Nx.shape(t1), 2)

    coeffs = full_cross_corr(t1, t2)
    index = Nx.argmax(coeffs)
    Nx.to_number(index) - l1 + 1
  end
end
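
A minimal sketch with toy tensors of shape {1, 1, n}, as the module expects: the peak in t1 occurs two samples after the peak in t2, so the offset should come out as 2.

t1 = Nx.tensor([0.0, 0.0, 1.0, 0.0, 0.0]) |> Nx.new_axis(0) |> Nx.new_axis(0)
t2 = Nx.tensor([1.0, 0.0, 0.0, 0.0, 0.0]) |> Nx.new_axis(0) |> Nx.new_axis(0)

CrossCorrelation.find_offset(t1, t2)
# => 2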
defmodule DataFrameDate do
  def fill(df, timeline_int_col, timeline_range, data_cols) do
    timeline =
      df[timeline_int_col]
      |> S.to_list()
      |> MapSet.new()

    measurement_timeline = timeline_range |> MapSet.new()

    missing_time =
      MapSet.difference(measurement_timeline, timeline)
      |> Enum.to_list()

    missing_data = List.duplicate(nil, length(missing_time))

    missing_df =
      (Enum.map(data_cols, fn x -> %{x => missing_data} end) ++
         [%{timeline_int_col => missing_time}])
      |> Enum.map(&DF.new(&1))
      |> DF.concat_columns()

    DF.concat_rows([missing_df, df])
    |> DF.arrange_with(& &1[timeline_int_col])
  end
end
defmodule NilSeries do
  def fb_fill(series) do
    S.fill_missing(S.fill_missing(series, :forward), :backward)
  end
end
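
A quick sketch of the forward/backward fill on a hand-made series:

NilSeries.fb_fill(S.from_list([nil, 1, nil, 3, nil]))
# forward fill gives [nil, 1, 1, 3, 3], the backward pass then fills the leading nil:
# => [1, 1, 1, 3, 3]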

Polls

start_date = ~N[2023-08-29 00:00:01]
start_day = start_date |> NaiveDateTime.to_date() |> Date.day_of_year()

Load the CSV with the polls from the different institutes from the website wahlrecht.de. Show the timeline and the trend for every party.

polls =
  DF.from_csv!("polls.csv", delimiter: ";", parse_dates: true)
  |> DF.mutate(
    start_date: S.cast(start_date, {:datetime, :millisecond}),
    end_date: S.cast(end_date, {:datetime, :millisecond})
  )
polls_mdt =
  S.to_list(polls["end_date"])
  |> Enum.zip(S.to_list(polls["start_date"]))
  |> Enum.map(
    &NaiveDateTime.add(
      elem(&1, 1),
      round(NaiveDateTime.diff(elem(&1, 0), elem(&1, 1)) / 2)
    )
  )

polls = DF.put(polls, "mid_date", polls_mdt)
polls = DF.mutate(polls, week: S.cast(mid_date, :date) |> S.week_of_year())
DF.print(polls, limit: :infinity)
Graphs.create_poll_graph(polls, "Polls - CSU", "csu")
Graphs.create_poll_graph(polls, "Polls - Freie Waehler", "fw")
Graphs.create_poll_graph(polls, "Polls - Buendnis90-Gruene", "gruene")
Graphs.create_poll_graph(polls, "Polls - SPD", "spd")
Graphs.create_poll_graph(polls, "Polls - FDP", "fdp")
Graphs.create_poll_graph(polls, "Polls - AFD", "afd")
Graphs.create_poll_graph(polls, "Polls - Linke", "linke")

Regions

Names of subdistricts are removed, as they are often too general, for example Oder or Gern. Still, village names can be general terms such as Wald.

local_entities = DF.from_csv!("geodaten/VerwaltungsEinheit.csv", delimiter: ",")["name"]
# subdist_entities = DF.from_csv!("geodaten/KatasterBezirk.csv", delimiter: ",")["name"]

local_entities =
  local_entities
  |> S.downcase()

Parties

parties_df =
  DF.new(
    party: ["csu", "fw", "spd", "gruene", "fdp", "afd", "linke"],
    candidate1: [
      "Markus Söder",
      "Hubert Aiwanger",
      "Florian von Brunn",
      "Ludwig Hartmann",
      "Martin Hagen",
      "Katrin Ebner-Steiner",
      "Adelheid Rupp"
    ],
    candidate2: [nil, nil, nil, "Katharina Schulze", nil, "Martin Böhm", nil]
  )

DF.print(parties_df)
parties = S.to_list(parties_df["party"])

parties_regex =
  parties
  |> Enum.join("|")
  |> Regex.compile!()

Analysis Tags and Toots

f = "mastodon_bayernwahl2023_20231119.db"
# f = "mastodon_bayernwahl2023_20230910.db"
p = Path.absname(f)
Adbc.download_driver!(:sqlite)
{:ok, db} = Kino.start_child({Adbc.Database, driver: :sqlite, uri: p})
{:ok, conn} = Kino.start_child({Adbc.Connection, database: db})
{:ok, tags_df} = Explorer.DataFrame.from_query(conn, "select * from tags", [])

tags_df["tag"]
|> S.downcase()
|> S.to_list()
|> Enum.filter(&Enum.member?(parties, &1))
|> Enum.frequencies()

Of the originally tracked posts, only a small set has tags containing the names of the parties:

  • Freie Waehler (82)
  • Gruene (58)
  • Linke (57)

The tag counts are much higher for AFD, CSU, and SPD.

{:ok, toots_df} = Explorer.DataFrame.from_query(conn, "select * from toots", [])
toots_df = DF.mutate(toots_df, date: S.strptime(date, "%Y-%m-%dT%H:%M:%S"))

toots_df =
  toots_df
  |> DF.mutate(
    day: S.cast(date, :date) |> S.day_of_year(),
    week: S.cast(date, :date) |> S.week_of_year(),
    weekday: S.cast(date, :date) |> S.day_of_week(),
    hour: S.hour(date)
  )
post_number = S.size(toots_df["content"])
toots_df["content"]
|> S.fill_missing(" ")
|> S.downcase()
|> S.to_list()
|> Enum.filter(&Regex.match?(parties_regex, &1))
|> Enum.map(&Regex.scan(parties_regex, &1))
|> Enum.map(&Enum.uniq(&1))
|> List.flatten()
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / post_number * 100.0} end)

This gets slightly better when the whole post content is taken into account:

  • Freie Waehler (82 -> 155)
  • Gruene (58 -> 169)
  • Linke (57 -> 175)

TODO: use regions.

candidate_family_names =
  parties_df["candiate1"]
  |> S.concat(parties_df["candidate2"])
  |> S.downcase()
  |> S.to_list()
  |> Enum.filter(&is_binary(&1))
  |> Enum.map(&Names.family_name(&1))

candidate_family_names =
  candidate_family_names
  |> Names.append_variants()

bavaria_tags = S.to_list(local_entities) ++ ["csu"] ++ candidate_family_names
name_regex = Regex.compile!(Enum.join(candidate_family_names, "|"))

Attribution (execute with caution)

  • gender

  • bavarian

  • age (perhaps)

  • sentiment -> Party

  • date -> calendar week

flowchart TD;
  A(User on Bavarian Instance?) -->|yes| B[Bavarian];
  A-->|no| C(Bavarian Location in Field?);
  C --> |yes| B;
  C --> |no| D(Bavarian Location in User note?)
  D --> |yes| B;
  D --> |no| E(Inferred language of toot is German?);
  E --> |yes| F[German]
  E --> |no| G[Foreign]

Mark Bavarian Instance

{:ok, person_df} = Explorer.DataFrame.from_query(conn, "select * from users", [])
bavarian_instances = ~w"muenchen.social augsburg.social mastodon.bayern nuernberg.social 
ploen.social wue.social mastodon.dachgau.social  sueden.social"

bavarian_instances_reg =
  bavarian_instances
  |> Enum.join("|")
  |> Regex.compile!()
is_user_on_bavarian_instance =
  person_df["user_name"]
  |> S.downcase()
  |> S.transform(&Enum.at(String.split(&1, "@"), 1, "chaos.social"))
  |> S.transform(&is_list(Regex.run(bavarian_instances_reg, &1)))

person_df = DF.put(person_df, "bavarian_instance", is_user_on_bavarian_instance)

bavarian_person = DF.filter(person_df, bavarian_instance == true)
elem(DF.shape(bavarian_person), 0) / elem(DF.shape(person_df), 0) * 100

About 6 % of all the users are on a Bavarian instance.

Bavarian Locations in Fields

{:ok, fields_df} = Explorer.DataFrame.from_query(conn, "select * from fields", [])

Remove:

  • HTML
  • #-Sign
  • @-Sign
  • _-Sign
  • links
  • excess white space

Left in:

  • Smileys (the language model might know them)
  • numbers (the language model converts them)
links =
  "https?:\/\/(?:www\.)?([-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b)*(\/[\/\d\w\.-]*)*(?:[\?])*(.+)*|="

html = "<[^>]*>"
excess_spaces_r = Regex.compile!("\\s\\s+")

clean_r = Regex.compile!("#{html}|#|@|_|#{links}|\"")
filter_field_regions_names = [
  "adresse",
  "born where",
  "bundesland",
  "city",
  "country",
  "heimat",
  "heimathafen",
  "heimatort",
  "herkunft",
  "home",
  "location",
  "ort",
  "standort",
  "wahlkreis",
  "wo",
  "wohnhaft",
  "wohnort",
  "wohnt in",
  "zuhause",
  "📍"
]

location_fields =
  fields_df["field_name"]
  |> S.downcase()
  |> S.to_enum()
  |> Enum.map(&Enum.member?(filter_field_regions_names, &1))

cleaned_field =
  fields_df["field_value"]
  |> S.downcase()
  |> S.transform(&Regex.replace(clean_r, &1, " "))
  |> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
  |> S.transform(&String.trim(&1))

fields_df = DF.put(fields_df, "cleaned_field", cleaned_field)
DF.print(fields_df)
bavarian_fields =
  fields_df["cleaned_field"]
  |> S.to_list()
  |> Enum.map(fn a ->
    a
    |> String.split([",", "-"])
    |> Enum.map(&String.trim(&1))
    |> Enum.map(&Enum.member?(S.to_list(local_entities), &1))
    |> Enum.any?()
  end)

fields_df =
  fields_df
  |> DF.put("locations", location_fields)
  |> DF.put("is_bavarian", bavarian_fields)

fields_with_bavarian_locations = DF.filter(fields_df, is_bavarian == true and locations == true)
DF.print(fields_with_bavarian_locations)
elem(DF.shape(fields_with_bavarian_locations), 0) / elem(DF.shape(person_df), 0) * 100

About 5 % of the users add locations. About 0.5 % of the users give a Bavarian location.

Find Bavarian Locations in User Texts

TODO: use a local_entities function.

cleaned_note =
  person_df["note"]
  |> S.transform(&if(is_binary(&1), do: &1, else: ""))
  |> S.transform(&Regex.replace(clean_r, &1, " "))
  |> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
  |> S.transform(&String.trim(&1))

person_df = DF.put(person_df, "cleaned_note", cleaned_note)

bavarian_location_in_note =
  person_df["cleaned_note"]
  |> S.transform(fn x ->
    Text.contains_mentions?(x, S.to_list(local_entities))
  end)

person_df = DF.put(person_df, "bavarian_location_in_note", bavarian_location_in_note)
person_with_bavarian_note_locations = DF.filter(person_df, bavarian_location_in_note == true)

elem(DF.shape(person_with_bavarian_note_locations), 0) / elem(DF.shape(person_df), 0) * 100
DF.print(person_with_bavarian_note_locations)

About 2 % of the users name a Bavarian location in their user note texts.

Person in Bavaria

OR together the Bavarian indicators from the note, the fields, and the instance name.

persons_with_bavarian_fields =
  DF.join(person_df, DF.select(fields_with_bavarian_locations, ["user_name", "is_bavarian"]),
    how: :left,
    on: [{"user_name", "user_name"}]
  )

DF.print(persons_with_bavarian_fields)
DF.shape(persons_with_bavarian_fields)
bavarian_users =
  Enum.reduce(
    [
      persons_with_bavarian_fields["is_bavarian"],
      persons_with_bavarian_fields["bavarian_location_in_note"],
      persons_with_bavarian_fields["bavarian_instance"]
    ],
    &S.or(&1, &2)
  )
  |> S.to_enum()
  |> Enum.map(&if(is_nil(&1), do: false, else: &1))

persons_with_bavarian_fields =
  DF.put(persons_with_bavarian_fields, "bavarian_users", bavarian_users)

DF.print(persons_with_bavarian_fields)
person_from_bavaria = DF.filter(persons_with_bavarian_fields, bavarian_users == true)

bavarian_count = elem(DF.shape(person_from_bavaria), 0)
{bavarian_count, bavarian_count / elem(DF.shape(persons_with_bavarian_fields), 0) * 100}

About 8.14 % of the users in the sample dataset are estimated to be from Bavaria. The sum of the percentages per feature is (6.14 % + 0.39 % + 2.00 % =) 8.53 %, so the features overlap for only about 0.4 percentage points of users.

Download user pics

In the test data set, 1390 of 1547 users have avatars, which is about 90 % of the users.

Enum.zip(
  S.to_list(person_df["avatar"]),
  S.to_list(person_df["user_id"])
)
|> Enum.map(fn {link, name} -> User.download_image(name, link) end)
flowchart TD;
  A(gender in fields?) -->|male| B[male];
  A -->|female| C[female];
  A -->|no| D(gender in user text?);

  D -->|male| B;
  D -->|female| C;

  D -->|no| E(user has image?)

  E -->|yes| F(image segmentation);
  E -->|no| H[no data]

  F-->|single person| G(vgg_ilsvrc_16_gender_imdb_wiki);
  F-->|others| H;

  G-->|male| B;
  G-->|female| C;
flowchart TD;
  E(user has image?) -->|yes| F(image segmentation);
  E-->|no| H[no data]

  F-->|single person| G(vgg_ilsvrc_16_gender_imdb_wiki);
  F-->|others| H[no data];

  G--> B[age classes];
filter_field_age_names = ~w[Age Alter Born Geburtstag]

filter_field_gender_names =
  ~w[Gender Geschlecht Pronom Pronomen pronouns Pronouns Pronomina Pronoms Pronons Pronoun Sexulatität Wer pronomen pronouns]
gender_fields =
  fields_df["field_name"]
  |> S.downcase()
  |> S.transform(&Enum.member?(filter_field_gender_names, &1))

fields_df = DF.put(fields_df, "gender_fields", gender_fields)
%{male: ["he", "him", "his", "er", "ihm", "ihn", "sein"], female: ["she", "her", "sie", "ihr"]}
DF.filter(fields_df, gender_fields)["cleaned_field"]
|> S.to_list()
|> Enum.uniq()

Preprocessing posts

Before the sentiments can be read from the posts, it is necessary to remove HTML tags. The Mastodon tags need to be converted.

Convert the date column from string to naive datetime.

cleared_posts =
  toots_df[:content]
  |> S.fill_missing(" ")
  |> S.transform(&Regex.replace(clean_r, &1, " "))
  |> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
  |> S.transform(&String.trim(&1))

toots_df = DF.put(toots_df, "cleared_content", cleared_posts)
has_content =
  cleared_posts
  |> S.transform(&String.length(&1))
  |> S.greater(50)

toots_df = DF.put(toots_df, "has_content?", has_content)

toots_df = DF.filter(toots_df, has_content?)
post_length_s =
  toots_df["cleared_content"]
  |> S.transform(&String.length(&1))

post_length_s
|> S.mean()
S.standard_deviation(post_length_s)
S.median(post_length_s)
S.max(post_length_s)

The median length of the cleared posts is 248 characters in the first set (after removing posts of length zero). The mean is 217 +/- 189 characters.

toots_df =
  DF.join(
    toots_df,
    DF.select(persons_with_bavarian_fields, ["user_name", "bavarian_users", "followers"]),
    how: :left,
    on: [{"user_name", "user_name"}]
  )

DF.print(toots_df)
DF.filter(toots_df, bavarian_users == true)

Filter posts

  1. Filter: Topic of posts is really about Bavaria
  2. Attribution of Sentiment to a single party
  3. Filter: No party, multiple parties

Filter: contains at least one candidate.

Sample data: 3503 of 4563 posts mention words in the context of Bavaria or a candidate.

bavarian_post_filter =
  toots_df["cleared_content"]
  |> S.downcase()
  |> S.transform(&Text.contains_mentions?(&1, bavaria_tags))

bavarian_toots_df = DF.mask(toots_df, bavarian_post_filter)
bavarian_full_post_filter =
  toots_df["content"]
  |> S.downcase()
  |> S.transform(&Text.contains_mentions?(&1, bavaria_tags))

full_bavarian_toots_df = DF.mask(toots_df, bavarian_full_post_filter)
bav_set =
  bavarian_toots_df[:id]
  |> S.to_list()
  |> MapSet.new()

bav_full_set =
  full_bavarian_toots_df[:id]
  |> S.to_list()
  |> MapSet.new()

differences = MapSet.difference(bav_full_set, bav_set) |> Enum.to_list()

tags_df
|> DF.filter(toot_id in ^differences)
|> DF.arrange(tag)
|> DF.print(limit: :infinity)

tags_df
|> DF.filter(toot_id == 165)
|> DF.print(limit: :infinity)
DF.filter(full_bavarian_toots_df, bavarian_users == true)

TODO: Rework the single-party attribution (single -> most commonly mentioned). Filter all posts so that each toot contains only a single party or its candidates.

party_search_terms =
  DF.to_rows(parties_df)
  # one list of search terms per row of the DataFrame
  |> Enum.map(fn x ->
    x
    # only take the values
    |> Map.values()
    # remove nils
    |> Enum.filter(&is_bitstring(&1))
    # use the family name for candidate names, the value itself otherwise
    |> Enum.map(fn y ->
      cond do
        Names.family_name(y) == nil -> y
        true -> Names.family_name(y)
      end
    end)
    |> Names.append_variants()
    |> Enum.map(&String.downcase(&1))
  end)
contains_party =
  full_bavarian_toots_df["cleared_content"]
  |> S.downcase()
  |> S.to_list()
  |> Enum.map(fn text ->
    Enum.map(party_search_terms, fn party -> Text.count_unique_mentions(text, party) > 0 end)
  end)

parties_count =
  contains_party
  |> Enum.map(fn text -> Enum.sum(Enum.map(text, fn party -> Bool.to_integer(party) end)) end)

contains_single_party =
  parties_count
  |> Enum.map(&(&1 == 1))
party_frequency =
  full_bavarian_toots_df["content"]
  |> S.downcase()
  |> S.to_list()
  |> Enum.map(fn text ->
    Enum.map(party_search_terms, fn party -> Text.count_mentions(text, party) end)
  end)
  |> Enum.map(fn party_counts ->
    Enum.map(party_counts, fn party_count ->
      if(Enum.sum(party_counts) === 0, do: 0, else: party_count / Enum.sum(party_counts))
    end)
  end)

has_dominant_party =
  party_frequency
  |> Enum.map(fn party_frequencies -> Enum.any?(party_frequencies, &(&1 > 0.5)) end)

Enum.sum(Enum.map(has_dominant_party, &Bool.to_integer(&1)))
domentent_party =
  party_frequency
  |> Enum.map(&Enum.zip(&1, S.to_list(parties_df["party"])))
  |> Enum.map(fn text -> Enum.filter(text, fn party -> elem(party, 0) > 0.5 end) end)
  |> Enum.map(fn text -> Enum.map(text, fn party -> elem(party, 1) end) end)
  |> Enum.map(&if(length(&1) === 0, do: "", else: Enum.at(&1, 0)))

Most posts name a single party. Very often two parties are mentioned in a single post. The other cases are much less frequent.

mentioned_parties =
  contains_party
  |> Enum.map(&Enum.zip(&1, S.to_list(parties_df["party"])))
  |> Enum.map(fn text -> Enum.filter(text, fn party -> elem(party, 0) end) end)
  |> Enum.map(fn text -> Enum.map(text, fn party -> elem(party, 1) end) end)
  |> Enum.map(&Enum.join(&1, ", "))

full_bavarian_toots_df = DF.put(full_bavarian_toots_df, "mentioned_party", mentioned_parties)
full_bavarian_toots_df = DF.put(full_bavarian_toots_df, "domentent_party", domentent_party)

single_party_toots_df = DF.mask(full_bavarian_toots_df, has_dominant_party)
DF.print(single_party_toots_df |> DF.arrange(desc: date))
DF.filter(single_party_toots_df, bavarian_users == true)

Sentiment Analysis

Before the sentiment analysis, the language has to be checked, as the language attribute is very often not correct. Therefore, language detection is run before the sentiment analysis:

  1. XLM-RoBERTa - language detection
  2. german-sentiment-bert - sentiment analysis for German
  3. RoBERTa (BERTweet) - sentiment analysis for English

Language Detection

The models accept different maximum input lengths. The language detection takes up to 514 characters, but the resulting corrections of the self-set languages are similar when only 100 characters are used. Therefore we restrict the input to 100 characters.

flowchart TD;
  A(XLM-RoBERTa) -->|German| B[german-sentiment_bert];
  A(XLM-RoBERTa) -->|English| C[RoBERTa BERTtweet - Sentiment];


  B -->  E[Bavarian?]
  C -->  F[Bavarian?]

  E -->|no| H(German)
  E -->|yes| G(Bavarian)

  F -->|yes| G(Bavarian)
  F -->|no| I(English)
{:ok, lang_detect_model_info} =
  Bumblebee.load_model({:hf, "papluca/xlm-roberta-base-language-detection"})

{:ok, lang_detect_tokenizer} =
  Bumblebee.load_tokenizer({:hf, "papluca/xlm-roberta-base-language-detection"})

lang_detect_serving =
  Bumblebee.Text.text_classification(lang_detect_model_info, lang_detect_tokenizer,
    compile: [batch_size: 128, sequence_length: 100],
    defn_options: [compiler: EXLA]
  )

lang_detect_model_info.spec.max_positions
Kino.start_child({
  Nx.Serving,
  serving: lang_detect_serving, name: LangDetectServer
})
p = Nx.Serving.batched_run(LangDetectServer, S.to_list(single_party_toots_df["cleared_content"]))

Each prediction is ordered by probability. Hence, selecting the label of the first entry always returns the most likely language.

detected_languages =
  Enum.map(p, fn post ->
    post.predictions
    |> List.first()
    |> Map.fetch!(:label)
  end)

The majority, 96 % of all sample posts, are detected as German. 1.6 % are detected as English. Why 0.9 % are labeled as Hindi still has to be figured out. That 0.5 % are detected as Dutch is more plausible.

detected_languages
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / length(detected_languages) * 100.0} end)
single_party_toots_df = DF.put(single_party_toots_df, "detected_languages", detected_languages)

In contrast, of the manually set languages, 92 % are German, 6 % are another language (often the instance default), and 1.3 % are nil (not specified).

single_party_toots_df["language"]
|> S.to_list()
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / length(detected_languages) * 100.0} end)

Visual inspection shows that the language attribute is often set wrong, since it is set manually with a given default. Often the language was set to English or to nil when the post was actually German. Therefore the language has been re-evaluated with the language detection model, which changed the language in 5.6 % of all posts.

reassigned_language =
  single_party_toots_df["language"]
  |> S.not_equal(single_party_toots_df["detected_languages"])
  |> S.transform(&Bool.to_integer(&1))
  |> S.sum()

reassigned_language / S.size(single_party_toots_df["language"]) * 100.0

German Sentiments

The German sentiment analysis works with up to 512 tokens. By default, Mastodon limits posts to 500 characters. Since services other than Mastodon and some Mastodon instances use other cut-offs, we select 512 tokens. We assume about 5.99 characters per word; OpenAI estimates about 4 characters per token for English text, so 500 characters fit comfortably within 512 tokens.

{:ok, ger_sent_model_info} = Bumblebee.load_model({:hf, "oliverguhr/german-sentiment-bert"})
{:ok, ger_sent_tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-german-cased"})
ger_sent_model_info.spec.max_positions
ger_sent_serving =
  Bumblebee.Text.text_classification(ger_sent_model_info, ger_sent_tokenizer,
    compile: [batch_size: 128, sequence_length: 512],
    defn_options: [compiler: EXLA]
  )

Kino.start_child({
  Nx.Serving,
  serving: ger_sent_serving, name: GerSentimentServer
})
german_toots_df = DF.filter(single_party_toots_df, detected_languages == "de")
german_toots = S.to_list(german_toots_df["cleared_content"])
ger_predictions = Nx.Serving.batched_run(GerSentimentServer, german_toots)
ger_sentiments = Enum.map(ger_predictions, fn x -> SentimentScore.score(x.predictions) end)
german_toots_df = DF.put(german_toots_df, "sentiment", ger_sentiments)

English Sentiment

The sentiment analysis for the English texts works with only 130 tokens.

english_toots_df = DF.filter(single_party_toots_df, detected_languages == "en")
english_toots = S.to_list(english_toots_df["cleared_content"])
{:ok, model_info} = Bumblebee.load_model({:hf, "finiteautomata/bertweet-base-sentiment-analysis"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "vinai/bertweet-base"})

english_sentiment_serving =
  Bumblebee.Text.text_classification(model_info, tokenizer,
    compile: [batch_size: 128, sequence_length: 130],
    defn_options: [compiler: EXLA]
  )
Kino.start_child({
  Nx.Serving,
  serving: english_sentiment_serving, name: EngSentimentServer
})
eng_predictions = Nx.Serving.batched_run(EngSentimentServer, english_toots)
eng_sentiments = Enum.map(eng_predictions, fn x -> SentimentScore.score(x.predictions) end)
english_toots_df = DF.put(english_toots_df, "sentiment", eng_sentiments)
ger_eng_toots = DF.concat_rows(german_toots_df, english_toots_df)
bav_ger_eng_lang =
  ger_eng_toots["detected_languages"]
  |> S.to_list()
  |> Enum.zip(S.to_list(ger_eng_toots["bavarian_users"]))
  |> Enum.map(&if(elem(&1, 1), do: "bav", else: elem(&1, 0)))

bav_ger_toots =
  DF.put(ger_eng_toots, "region", bav_ger_eng_lang)
  |> DF.filter(region != "en")

DF.print(ger_eng_toots)
S.frequencies(bav_ger_toots["region"])
h =
  bav_ger_toots
  |> DF.filter(region == "bav")

S.distinct(h["user_id"])
S.distinct(bav_ger_toots["user_id"])
DF.filter(bav_ger_toots, date > ^start_date)[:domentent_party]
|> S.to_list()
|> Enum.frequencies()
DF.filter(bav_ger_toots, date > ^start_date)[:region]
|> S.to_list()
|> Enum.frequencies()

Sentiment Graphs

avg_sentiment = S.mean(bav_ger_toots["sentiment"])

party_count = fn x ->
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.filter(domentent_party == ^x)
  |> DF.shape()
  |> elem(0)
end

total_count =
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.shape()
  |> elem(0)

follower_count = fn x ->
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.filter(domentent_party == ^x)
  |> DF.to_columns()
  |> Map.fetch!("followers")
  |> Enum.sum()
end

total_followers =
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.to_columns()
  |> Map.fetch!("followers")
  |> Enum.sum()

party_bav_count = fn x ->
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.filter(region == "bav")
  |> DF.filter(domentent_party == ^x)
  |> DF.shape()
  |> elem(0)
end

bavaria_total =
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.filter(region == "bav")
  |> DF.shape()
  |> elem(0)

#  Most positive post per user before the election
pos_party_count = fn x ->
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.arrange(desc: sentiment)
  |> DF.distinct(["user_id"], keep_all: true)
  |> DF.filter(domentent_party == ^x)
  |> DF.shape()
  |> elem(0)
end

pos_total_count =
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.arrange(desc: sentiment)
  |> DF.distinct(["user_id"], keep_all: true)
  |> DF.shape()
  |> elem(0)

pos_party_bav_count = fn x ->
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.filter(region == "bav")
  |> DF.arrange(desc: sentiment)
  |> DF.distinct(["user_id"], keep_all: true)
  |> DF.filter(domentent_party == ^x)
  |> DF.shape()
  |> elem(0)
end

pos_follower_count = fn x ->
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.arrange(desc: sentiment)
  |> DF.distinct(["user_id"], keep_all: true)
  |> DF.filter(domentent_party == ^x)
  |> DF.to_columns()
  |> Map.fetch!("followers")
  |> Enum.sum()
end

pos_total_followers =
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.arrange(desc: sentiment)
  |> DF.distinct(["user_id"], keep_all: true)
  |> DF.to_columns()
  |> Map.fetch!("followers")
  |> Enum.sum()

pos_bavaria_total =
  bav_ger_toots
  |> DF.filter(day >= ^260)
  |> DF.filter(day < ^281)
  |> DF.filter(region == "bav")
  |> DF.arrange(desc: sentiment)
  |> DF.distinct(["user_id"], keep_all: true)
  |> DF.shape()
  |> elem(0)

DF.new(%{
  party: [
    "afd",
    "csu",
    "fdp",
    "fw",
    "gruene",
    "linke",
    "spd"
  ],
  german_ratio: [
    party_count.("afd") / total_count,
    party_count.("csu") / total_count,
    party_count.("fdp") / total_count,
    party_count.("fw") / total_count,
    party_count.("gruene") / total_count,
    party_count.("linke") / total_count,
    party_count.("spd") / total_count
  ],
  german_followers: [
    follower_count.("afd") / total_followers,
    follower_count.("csu") / total_followers,
    follower_count.("fdp") / total_followers,
    follower_count.("fw") / total_followers,
    follower_count.("gruene") / total_followers,
    follower_count.("linke") / total_followers,
    follower_count.("spd") / total_followers
  ],
  bavarian_support: [
    party_bav_count.("afd"),
    party_bav_count.("csu"),
    party_bav_count.("fdp"),
    party_bav_count.("fw"),
    party_bav_count.("gruene"),
    party_bav_count.("linke"),
    party_bav_count.("spd")
  ],
  bavarian_ratio: [
    party_bav_count.("afd") / bavaria_total,
    party_bav_count.("csu") / bavaria_total,
    party_bav_count.("fdp") / bavaria_total,
    party_bav_count.("fw") / bavaria_total,
    party_bav_count.("gruene") / bavaria_total,
    party_bav_count.("linke") / bavaria_total,
    party_bav_count.("spd") / bavaria_total
  ],
  positiv_german_ratio: [
    pos_party_count.("afd") / pos_total_count,
    pos_party_count.("csu") / pos_total_count,
    pos_party_count.("fdp") / pos_total_count,
    pos_party_count.("fw") / pos_total_count,
    pos_party_count.("gruene") / pos_total_count,
    pos_party_count.("linke") / pos_total_count,
    pos_party_count.("spd") / pos_total_count
  ],
  positiv_bavarian_support: [
    pos_party_bav_count.("afd"),
    pos_party_bav_count.("csu"),
    pos_party_bav_count.("fdp"),
    pos_party_bav_count.("fw"),
    pos_party_bav_count.("gruene"),
    pos_party_bav_count.("linke"),
    pos_party_bav_count.("spd")
  ],
  positiv_bavarian_ratio: [
    pos_party_bav_count.("afd") / pos_bavaria_total,
    pos_party_bav_count.("csu") / pos_bavaria_total,
    pos_party_bav_count.("fdp") / pos_bavaria_total,
    pos_party_bav_count.("fw") / pos_bavaria_total,
    pos_party_bav_count.("gruene") / pos_bavaria_total,
    pos_party_bav_count.("linke") / pos_bavaria_total,
    pos_party_bav_count.("spd") / pos_bavaria_total
  ],
  positive_german_followers: [
    pos_follower_count.("afd") / pos_total_followers,
    pos_follower_count.("csu") / pos_total_followers,
    pos_follower_count.("fdp") / pos_total_followers,
    pos_follower_count.("fw") / pos_total_followers,
    pos_follower_count.("gruene") / pos_total_followers,
    pos_follower_count.("linke") / pos_total_followers,
    pos_follower_count.("spd") / pos_total_followers
  ]
})
|> DF.print(limit: :infinity)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the CSU", "csu", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the Freie Waehler", "fw", start_date)
Graphs.create_sentiment_graph(
  bav_ger_toots,
  "Sentiments for the Buendnis90/Gruene",
  "gruene",
  start_date
)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the SPD", "spd", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the FDP", "fdp", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the AFD", "afd", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the Linke", "linke", start_date)

TODO:

  1. cross correlation
  2. emojis?
  3. add min/max dates for sentiment graphs

4 nl posts -> error, 1 fr post -> correct

Comparison of Polls and Sentiments

normalized_timeline =
  bav_ger_toots
  |> DF.filter(date >= ^start_date)
  |> DF.select(["domentent_party", "sentiment", "day", "id"])
  |> DF.mutate(sentiment: (sentiment + 1) / 2)
  |> DF.pivot_wider("domentent_party", "sentiment", id_columns: ["id", "day"])
  |> DF.discard(["id"])

sentiment_timeline =
  normalized_timeline
  |> DF.group_by("day")
  |> DF.mutate(
    csu: mean(csu),
    spd: mean(spd),
    fw: mean(fw),
    fdp: mean(fdp),
    afd: mean(afd),
    gruene: mean(gruene),
    linke: mean(linke)
  )
  |> DF.select(["day", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
  |> DF.distinct()
  |> DF.arrange(asc: day)

DF.print(sentiment_timeline)
filtered_polls = DF.filter(polls, start_date >= ^start_date)

polls_timeline =
  DF.mutate(filtered_polls,
    start_day: S.day_of_year(start_date),
    end_day: S.day_of_year(end_date)
  )
polls_timeline =
  polls_timeline
  |> DF.to_rows()
  |> Enum.with_index()
  |> Enum.flat_map(fn {row, index} ->
    row["start_day"]..row["end_day"]
    |> Enum.map(&Map.merge(row, %{"day" => &1, "index" => index}))
  end)
  |> DF.new()
  |> DF.select(["day", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
  |> DF.group_by("day")
  |> DF.mutate(
    csu: mean(csu) / 100,
    spd: mean(spd) / 100,
    fw: mean(fw) / 100,
    fdp: mean(fdp) / 100,
    afd: mean(afd) / 100,
    gruene: mean(gruene) / 100,
    linke: mean(linke) / 100
  )
  |> DF.arrange(asc: day)
  |> DF.distinct()

DF.print(polls_timeline, limit: :infinity)

Align the End dates of Polls and Sentiments.

end_day = Enum.min([S.max(polls_timeline["day"]), S.max(sentiment_timeline["day"])])

sentiment_timeline = DF.filter(sentiment_timeline, day <= ^end_day)
polls_timeline = DF.filter(polls_timeline, day <= ^end_day)
end_sampling_doy = 323
lang_toots = DF.filter(bav_ger_toots, day <= ^end_sampling_doy) |> DF.filter(day >= ^start_day)

Tucan.concat([
  Tucan.histogram(lang_toots, "day", relative: true, step: 1) |> Tucan.Axes.set_x_title("Day"),
  Tucan.histogram(lang_toots, "weekday", relative: true) |> Tucan.Axes.set_x_title("Weekday"),
  Tucan.histogram(lang_toots, "hour", relative: true, step: 1) |> Tucan.Axes.set_x_title("Hour")
])
|> Tucan.set_title("Post frequencies on different time scales")

Fill in missing data.

  • Dates not yet included
  • Dates included but no data measured.
days_range = start_day..end_day
parties = ["csu", "spd", "fw", "fdp", "afd", "gruene", "linke"]
polls_timeline = DataFrameDate.fill(DF.ungroup(polls_timeline), "day", days_range, parties)

sentiment_timeline =
  DataFrameDate.fill(DF.ungroup(sentiment_timeline), "day", days_range, parties)

DF.print(sentiment_timeline, limit: :infinity)
DF.print(polls_timeline, limit: :infinity)
polls_timeline =
  polls_timeline
  |> DF.ungroup()
  |> DF.arrange(day)
  |> DF.mutate(
    csu: NilSeries.fb_fill(csu),
    spd: NilSeries.fb_fill(spd),
    fw: NilSeries.fb_fill(fw),
    fdp: NilSeries.fb_fill(fdp),
    afd: NilSeries.fb_fill(afd),
    gruene: NilSeries.fb_fill(gruene),
    linke: NilSeries.fb_fill(linke)
  )

sentiment_timeline =
  sentiment_timeline
  |> DF.ungroup()
  |> DF.arrange(day)
  |> DF.mutate(
    csu: NilSeries.fb_fill(csu),
    spd: NilSeries.fb_fill(spd),
    fw: NilSeries.fb_fill(fw),
    fdp: NilSeries.fb_fill(fdp),
    afd: NilSeries.fb_fill(afd),
    gruene: NilSeries.fb_fill(gruene),
    linke: NilSeries.fb_fill(linke)
  )
  |> DF.mutate(
    csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
    spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
    fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
    fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
    afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
    gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
    linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
  )

Parties with enough Sentiment samples

Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - CSU", "csu")
{S.standard_deviation(polls_timeline["csu"]), S.standard_deviation(polls_timeline["fw"]),
 S.standard_deviation(polls_timeline["afd"])}
{S.standard_deviation(sentiment_timeline["csu"]), S.standard_deviation(sentiment_timeline["fw"]),
 S.standard_deviation(sentiment_timeline["afd"])}
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - FW", "fw")
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - AFD", "afd")

Parties with few Sentiment Samples

Graphs.create_daily_compare_graph(
  sentiment_timeline,
  polls_timeline,
  "Sentiment - Buendis90 Gruene",
  "gruene"
)
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment -  SPD", "spd")
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - FDP", "fdp")
Graphs.create_daily_compare_graph(
  sentiment_timeline,
  polls_timeline,
  "Sentiment - Linke",
  "linke"
)

Cross Correlation

party_sentiment_offset = fn sentiment_df, poll_df, party_str ->
  sentiments_t = S.to_tensor(sentiment_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
  polls_t = S.to_tensor(poll_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)

  CrossCorrelation.find_offset(sentiments_t, polls_t)
end
party_sentiment_offset.(sentiment_timeline, polls_timeline, "csu")
party_sentiment_offset.(sentiment_timeline, polls_timeline, "fw")
party_sentiment_offset.(sentiment_timeline, polls_timeline, "afd")

Correlation Sentiment -> Polls

alias Scholar.Linear.LinearRegression, as: LR
alias Scholar.Metrics.Regression, as: RegMetrics
alias Scholar.Preprocessing
longer_sentiment_timeline =
  sentiment_timeline
  |> DF.mutate(
    csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
    spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
    fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
    fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
    afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
    gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
    linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
  )
  |> DF.select(["day", "afd", "csu", "fw"])
  |> DF.pivot_longer(["afd", "csu", "fw"])

# |> DF.print(limit: :infinity)
longer_polls_timeline =
  polls_timeline
  |> DF.select(["day", "afd", "csu", "fw"])
  |> DF.pivot_longer(["afd", "csu", "fw"])
daily_polls_sentiment_df =
  DF.new(
    day: longer_sentiment_timeline["day"],
    sentiment: longer_sentiment_timeline["value"],
    poll: longer_polls_timeline["value"],
    party: longer_polls_timeline["variable"]
  )

daily_polls_sentiment_df = DF.mutate(daily_polls_sentiment_df, party: S.cast(party, :category))
Tucan.scatter(daily_polls_sentiment_df, "sentiment", "poll", color_by: "party")
|> Tucan.set_width(500)
|> Tucan.set_height(500)
|> Tucan.set_title("Dependency of Poll results from Sentiment.")
|> Tucan.Axes.set_x_title("Sentiment per Average of Sentiments")
|> Tucan.Axes.set_y_title("Poll")
|> Tucan.Scale.set_y_domain(0.1, 0.4)
|> Tucan.Scale.set_x_domain(0, 0.3)
split = round(elem(DF.shape(daily_polls_sentiment_df), 0) * 0.7)
daily_polls_sentiment_shuffled_df = DF.shuffle(daily_polls_sentiment_df)
polls_sentiment_df_train = DF.slice(daily_polls_sentiment_shuffled_df, 0..split)
polls_sentiment_df_test = DF.slice(daily_polls_sentiment_shuffled_df, split..-1)
get_one_hot = fn x, y ->
  x
  |> S.cast(:category)
  |> S.to_tensor()
  |> Preprocessing.one_hot_encode(num_classes: y)
end

get_sentiment = fn x ->
  x
  |> S.to_tensor()
  |> Nx.reshape({:auto, 1})
end
party_train = get_one_hot.(polls_sentiment_df_train["party"], 3)
party_test = get_one_hot.(polls_sentiment_df_test["party"], 3)

sentiments_train = get_sentiment.(polls_sentiment_df_train["sentiment"])
sentiments_test = get_sentiment.(polls_sentiment_df_test["sentiment"])

x_train = Nx.concatenate([party_train, sentiments_train], axis: 1)
x_test = Nx.concatenate([party_test, sentiments_test], axis: 1)
y_train = polls_sentiment_df_train["poll"] |> S.to_tensor()
y_test = polls_sentiment_df_test["poll"] |> S.to_tensor()

model = LR.fit(x_train, y_train)
y_hat = LR.predict(model, x_test)
RegMetrics.r2_score(y_test, y_hat)

Summary:

  • intercept: 0.2182
  • afd: -0.0749
  • csu: 0.1321
  • fw: -0.0573
  • sentiment: 0.0013

Meaning:

  • Almost no dependence on the (daily) sentiment! Better sentiment, worse poll results!?
  • afd: 14.9 %
  • csu: 35.3 %
  • fw: 16.2 %
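
As a sanity check, the quoted coefficients can be combined by hand. The sketch below only uses the rounded numbers from the summary above (intercept + party dummy + the tiny sentiment term) and an arbitrary sentiment value; it roughly reproduces the party-level percentages listed under Meaning.

intercept = 0.2182
party_coeff = %{"afd" => -0.0749, "csu" => 0.1321, "fw" => -0.0573}
sentiment_coeff = 0.0013

predict = fn party, sentiment ->
  # prediction = intercept + party dummy + sentiment coefficient * sentiment
  intercept + Map.fetch!(party_coeff, party) + sentiment_coeff * sentiment
end

# 0.15 is an arbitrary normalized sentiment within the observed range
predict.("csu", 0.15)
# => ~0.35, close to the ~35 % quoted for the CSU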

Comparison: Weekly Sentiments and Polls

normalized_weeks =
  bav_ger_toots
  |> DF.filter(date >= ^start_date)
  |> DF.select(["domentent_party", "sentiment", "week", "id"])
  |> DF.mutate(sentiment: (sentiment + 1) / 2)
  |> DF.pivot_wider("domentent_party", "sentiment", id_columns: ["id", "week"])
  |> DF.discard(["id"])

sentiment_weeks =
  normalized_weeks
  |> DF.group_by("week")
  |> DF.mutate(
    csu: mean(csu),
    spd: mean(spd),
    fw: mean(fw),
    fdp: mean(fdp),
    afd: mean(afd),
    gruene: mean(gruene),
    linke: mean(linke)
  )
  |> DF.select(["week", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
  |> DF.distinct()
  |> DF.arrange(asc: week)

DF.print(sentiment_weeks)
poll_weeks =
  polls
  |> DF.filter(mid_date > ^start_date)
  |> DF.select(["week", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
  |> DF.group_by("week")
  |> DF.mutate(
    afd: mean(afd) / 100,
    csu: mean(csu) / 100,
    fdp: mean(fdp) / 100,
    fw: mean(fw) / 100,
    gruene: mean(gruene) / 100,
    linke: mean(linke) / 100,
    spd: mean(spd) / 100
  )
  |> DF.distinct()
  |> DF.ungroup()

DF.print(poll_weeks, limit: :infinity)
end_week = Enum.min([S.max(poll_weeks["week"]), S.max(sentiment_weeks["week"])])
start_week = Enum.max([S.min(poll_weeks["week"]), S.min(sentiment_weeks["week"])])

poll_weeks = DF.filter(poll_weeks, week <= ^end_week)
DF.print(poll_weeks, limit: :infinity)
sentiment_weeks = DF.filter(sentiment_weeks, week <= ^end_week)
DF.print(sentiment_weeks, limit: :infinity)
filled_poll_weeks =
  DataFrameDate.fill(poll_weeks, "week", start_week..end_week, [
    "afd",
    "csu",
    "fdp",
    "fw",
    "gruene",
    "linke",
    "spd"
  ])

filled_poll_weeks =
  DF.mutate(filled_poll_weeks,
    afd: NilSeries.fb_fill(afd),
    csu: NilSeries.fb_fill(csu),
    fdp: NilSeries.fb_fill(fdp),
    fw: NilSeries.fb_fill(fw),
    gruene: NilSeries.fb_fill(gruene),
    linke: NilSeries.fb_fill(linke),
    spd: NilSeries.fb_fill(spd)
  )

filled_sentiment_weeks =
  DataFrameDate.fill(sentiment_weeks, "week", start_week..end_week, [
    "afd",
    "csu",
    "fdp",
    "fw",
    "gruene",
    "linke",
    "spd"
  ])
  |> DF.mutate(gruene: NilSeries.fb_fill(gruene))
  |> DF.mutate(
    csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
    spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
    fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
    fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
    afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
    gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
    linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
  )

DF.print(filled_sentiment_weeks, limit: :infinity)
Graphs.create_weekly_compare_graph(
  filled_sentiment_weeks,
  filled_poll_weeks,
  "Sentiment - CSU",
  "csu"
)
Graphs.create_weekly_compare_graph(
  filled_sentiment_weeks,
  filled_poll_weeks,
  "Sentiment - FW",
  "fw"
)
Graphs.create_weekly_compare_graph(
  filled_sentiment_weeks,
  filled_poll_weeks,
  "Sentiment - AFD",
  "afd"
)
Graphs.create_weekly_compare_graph(
  filled_sentiment_weeks,
  filled_poll_weeks,
  "Sentiment - SPD",
  "spd"
)
Graphs.create_weekly_compare_graph(
  filled_sentiment_weeks,
  filled_poll_weeks,
  "Sentiment - Buendnis/ Gruene",
  "gruene"
)

Cross Correlation

party_sentiment_offset = fn sentiment_df, poll_df, party_str ->
  sentiments_t = S.to_tensor(sentiment_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
  polls_t = S.to_tensor(poll_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)

  CrossCorrelation.find_offset(sentiments_t, polls_t)
end
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "afd") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "csu") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "fdp") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "fw") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "gruene") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "spd") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "linke") |> dbg()

Linear Fit

alias Scholar.Linear.LinearRegression, as: LR
alias Scholar.Linear.RidgeRegression, as: Ridge
alias Scholar.Metrics.Regression, as: RegMetrics
alias Scholar.Preprocessing
DF.print(filled_sentiment_weeks, limit: :infinity)
longer_sentiment_weekly_timeline =
  filled_sentiment_weeks
  |> DF.pivot_longer(["afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])

DF.print(longer_sentiment_weekly_timeline, limit: :infinity)
longer_polls_weekly_timeline =
  filled_poll_weeks
  |> DF.pivot_longer(["afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])

DF.print(longer_polls_weekly_timeline, limit: :infinity)
weekly_polls_sentiment_df =
  DF.new(
    week: longer_sentiment_weekly_timeline["week"],
    sentiment: longer_sentiment_weekly_timeline["value"],
    poll: longer_polls_weekly_timeline["value"],
    party: longer_sentiment_weekly_timeline["variable"]
  )

weekly_polls_sentiment_df = DF.mutate(weekly_polls_sentiment_df, party: S.cast(party, :category))
Tucan.scatter(weekly_polls_sentiment_df, "sentiment", "poll", color_by: "party", shape_by: "party")
|> Tucan.set_width(500)
|> Tucan.set_height(500)
|> Tucan.set_title("Dependency of Polling Results From Sentiment.")
|> Tucan.Axes.set_x_title("Sentiment")
|> Tucan.Axes.set_y_title("Poll")
|> Tucan.Scale.set_y_domain(0, 0.4)
|> Tucan.Scale.set_x_domain(0, 0.25)
split = round(elem(DF.shape(weekly_polls_sentiment_df), 0) * 0.7)

weekly_polls_sentiment_shuffled_df = DF.shuffle(weekly_polls_sentiment_df)
polls_sentiment_df_train = DF.slice(weekly_polls_sentiment_shuffled_df, 0..split)
polls_sentiment_df_test = DF.slice(weekly_polls_sentiment_shuffled_df, split..-1)
get_one_hot = fn x, y ->
  x
  |> S.cast(:category)
  |> S.to_tensor()
  |> Preprocessing.one_hot_encode(num_classes: y)
end

get_sentiment = fn x ->
  x
  |> S.to_tensor()
  |> Nx.reshape({:auto, 1})
end
party_train = get_one_hot.(polls_sentiment_df_train["party"], 7)
party_test = get_one_hot.(polls_sentiment_df_test["party"], 7)

sentiments_train = get_sentiment.(polls_sentiment_df_train["sentiment"])
sentiments_test = get_sentiment.(polls_sentiment_df_test["sentiment"])

x_train = Nx.concatenate([party_train, sentiments_train], axis: 1)
x_test = Nx.concatenate([party_test, sentiments_test], axis: 1)
y_train = polls_sentiment_df_train["poll"] |> S.to_tensor()
y_test = polls_sentiment_df_test["poll"] |> S.to_tensor()

model = LR.fit(x_train, y_train, fit_intercept?: false)
y_hat = LR.predict(model, x_test)
RegMetrics.r2_score(y_test, y_hat)

Summary:

  • intercept: 0.131

  • afd: 0.003

  • csu: 0.229

  • fdp: -0.103

  • fw: 0.026

  • gruene 0.008

  • linke: -0.118

  • spd: -0.046

  • sentiment: 0.003

defmodule GridSearch do
  def ridge_single(point, x_train, x_test, y_train, y_test) do
    model_ridge = Ridge.fit(x_train, y_train, fit_intercept?: false, alpha: point)
    y_hat_ridge = Ridge.predict(model_ridge, x_test)
    Nx.to_number(RegMetrics.r2_score(y_test, y_hat_ridge))
  end

  def ridge_1d(points, x_train, x_test, y_train, y_test)
      when is_list(points) do
    points
    |> Enum.map(&ridge_single(&1, x_train, x_test, y_train, y_test))
  end
end
GridSearch.ridge_1d([0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], x_train, x_test, y_train, y_test)
model_ridge = Ridge.fit(x_train, y_train, fit_intercept?: false, alpha: 0.1) |> dbg
y_hat_ridge = Ridge.predict(model_ridge, x_test)
Nx.to_number(RegMetrics.r2_score(y_test, y_hat_ridge))

Compare with average of that time frame

filled_poll_weeks
|> DF.mutate(
  afd: S.mean(afd),
  csu: S.mean(csu),
  fdp: S.mean(fdp),
  fw: S.mean(fw),
  gruene: S.mean(gruene),
  linke: S.mean(linke),
  spd: S.mean(spd)
)
|> DF.discard(["week"])
|> DF.distinct()
|> DF.print()