Bavarian Election
Mix.install(
[
{:nx, "~>0.6.4"},
{:bumblebee, "~>0.4.2"},
{:explorer, "~>0.7.2"},
{:kino_vega_lite, "~> 0.1.11"},
{:httpoison, "~> 1.8"},
{:exla, "~> 0.6.4"},
{:adbc, "~> 0.2.2"},
{:kino_bumblebee, "~> 0.4.0"},
{:tucan, "~> 0.2.1"},
{:scholar, "~> 0.2.1"}
],
config: [
nx: [default_backend: EXLA.Backend, client: :cuda, device_id: 1]
]
)
Modules
require Explorer.DataFrame
alias VegaLite, as: Vl
alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: S
defmodule Graphs do
def create_poll_graph(data_source, title, party, start_date) do
Vl.new(width: 500, height: 300, title: title)
|> Vl.data_from_values(DF.filter(data_source, mid_date >= ^start_date),
only: ["end_date", "start_date", "mid_date", party, "institute"]
)
|> Vl.layers([
# data as bars
Vl.new()
|> Vl.mark(:bar, opacity: 0.5)
|> Vl.encode_field(:x, "start_date", type: :temporal, title: "poll dates")
|> Vl.encode_field(:x2, "end_date")
|> Vl.encode_field(:y, party,
type: :quantitative,
title: "percentage",
scale: [domain: [S.min(data_source[party]), S.max(data_source[party])]]
)
|> Vl.encode_field(:color, "institute", type: :nominal),
Vl.new()
# data with opacity 0, to be able to fit
|> Vl.mark(:bar, opacity: 0.0)
|> Vl.encode_field(:x, "mid_date", type: :temporal, title: "poll dates")
|> Vl.encode_field(:x2, "end_date")
|> Vl.encode_field(:y, party,
type: :quantitative,
title: "percentage",
scale: [domain: [S.min(data_source[party]), S.max(data_source[party])]]
)
|> Vl.encode_field(:color, "institute", type: :nominal),
# rule
Vl.new()
|> Vl.mark(:rule, color: "blue")
|> Vl.encode(:y,
datum: DF.filter(data_source, institute == "Landtagswahl")[party][0],
type: :quantitative
),
# fit
Vl.new()
|> Vl.mark(:line, color: "firebrick")
|> Vl.transform(loess: party, on: "mid_date", bandwidth: 0.5)
|> Vl.encode_field(:x, "mid_date", type: :temporal, title: "poll dates")
|> Vl.encode_field(:y, party, type: :quantitative, title: "percentage")
])
end
def create_poll_graph(data_source, title, party) do
create_poll_graph(data_source, title, party, ~N[2023-01-01 00:00:01])
end
def create_sentiment_graph(data_source, title, party, start_date) do
Vl.new(width: 500, height: 300, title: title)
|> Vl.data_from_values(
DF.filter(data_source, domentent_party == ^party and date > ^start_date),
only: ["date", "sentiment", "region"]
)
|> Vl.layers([
Vl.new()
|> Vl.mark(:point)
|> Vl.encode_field(:x, "date", type: :temporal)
|> Vl.encode_field(:y, "sentiment",
type: :quantitative,
scale: [domain: [-1, 1]]
)
|> Vl.encode_field(:color, "region", type: :nominal, title: "Region")
|> Vl.encode_field(:shape, "region", type: :nominal, title: "Region"),
Vl.new()
|> Vl.mark(:line, color: "firebrick", opacity: 0.5)
|> Vl.transform(loess: "sentiment", on: "date", bandwidth: 0.5)
|> Vl.encode_field(:x, "date", type: :temporal, title: "date")
|> Vl.encode_field(:y, "sentiment", type: :quantitative, title: "sentiment")
])
end
def create_daily_compare_graph(sentiment_data_source, poll_data_source, title, party) do
Tucan.layers([
Tucan.step(poll_data_source, "day", party, line_color: "red"),
Tucan.step(sentiment_data_source, "day", party, line_color: "green")
])
|> Tucan.set_width(500)
|> Tucan.set_title(title)
|> Tucan.Axes.set_x_title("Day of the year")
|> Tucan.Axes.set_y_title("Sentiments, Polls")
|> Tucan.Scale.set_y_domain(0, 1)
|> Tucan.Legend.set_enabled(:color, true)
|> Tucan.Legend.set_title(:color, "TimeLine")
|> Tucan.annotate(243, S.max(poll_data_source[party]) + 0.05, "Polls", color: "red", size: 18)
|> Tucan.annotate(243, S.max(sentiment_data_source[party]) + 0.05, "Sentiments",
color: "green",
size: 18
)
end
def create_weekly_compare_graph(sentiment_data_source, poll_data_source, title, party) do
Tucan.layers([
Tucan.step(poll_data_source, "week", party, line_color: "red"),
Tucan.step(sentiment_data_source, "week", party, line_color: "green")
])
|> Tucan.set_width(500)
|> Tucan.set_title(title)
|> Tucan.Axes.set_x_title("Calendar Week")
|> Tucan.Axes.set_y_title("Sentiments, Polls")
|> Tucan.Scale.set_y_domain(0, 1)
|> Tucan.Legend.set_enabled(:color, true)
|> Tucan.Legend.set_title(:color, "TimeLine")
|> Tucan.annotate(36, poll_data_source[party][1] + 0.05, "Polls", color: "red", size: 18)
|> Tucan.annotate(36, sentiment_data_source[party][1] + 0.05, "Sentiments",
color: "green",
size: 18
)
end
end
defmodule Names do
def append_variants(names) do
names
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, " ", "") end))
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, "ö", "o") end))
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, "ö", "oe") end))
|> Enum.concat(Enum.map(names, fn x -> String.replace(x, "-", "") end))
|> Enum.uniq()
end
def family_name(y) do
String.split(y, " ", parts: 2) |> Enum.at(1)
end
end
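A quick illustration of the name helpers above (hypothetical inputs; expected results shown as comments):
# Hypothetical check of the name helpers.
Names.append_variants(["markus söder"])
# => ["markus söder", "markussöder", "markus soder", "markus soeder"]
Names.family_name("florian von brunn")
# => "von brunn"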
defmodule Bool do
def to_integer(true), do: 1
def to_integer(false), do: 0
def to_integer(nil), do: 0
end
defmodule SentimentScore do
def score(prediction) do
prediction
|> Enum.map(fn p ->
case p do
%{label: l} when l in ["POS", "positive"] -> p.score
%{label: l} when l in ["NEG", "negative"] -> -p.score
_ -> 0
end
end)
|> Enum.sum()
end
end
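An illustrative scoring of a single prediction list (hypothetical values):
SentimentScore.score([
  %{label: "positive", score: 0.8},
  %{label: "negative", score: 0.1},
  %{label: "neutral", score: 0.1}
])
# => ~0.7 (positive score minus negative score; other labels count as 0)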
defmodule User do
def image_exists?(body), do: byte_size(body) > 478
def download_image(save_stem, link) do
save_name = "./graphics/userimages/#{save_stem}.png"
if not File.exists?(save_name) do
%HTTPoison.Response{body: body} = HTTPoison.get!(link)
if User.image_exists?(body) do
File.write!(save_name, body)
end
end
end
end
defmodule Text do
def count_mentions(text, mentioned_words)
when is_list(mentioned_words) and is_binary(text) do
is_scanned =
Enum.map(mentioned_words, fn y ->
_scanned = Regex.scan(Regex.compile!("\\b#{y}\\b"), text)
end)
per_word_sum = Enum.map(is_scanned, &length(&1))
Enum.sum(per_word_sum)
end
def count_unique_mentions(text, mentioned_words)
when is_list(mentioned_words) and is_binary(text) do
is_scanned =
Enum.map(mentioned_words, fn y ->
scanned = Regex.run(Regex.compile!("\\b#{y}\\b"), text)
if is_nil(scanned), do: 0, else: 1
end)
Enum.sum(is_scanned)
end
def contains_mentions?(text, mentioned_words)
when is_list(mentioned_words) and is_binary(text) do
count =
text
|> count_unique_mentions(mentioned_words)
if(count > 0, do: true, else: false)
end
end
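Illustrative checks of the text helpers above (hypothetical inputs; expected results as comments):
Text.count_mentions("csu und csu, aber nicht fw", ["csu", "fw"])
# => 3 (two whole-word "csu" matches plus one "fw" match)
Text.count_unique_mentions("csu und csu, aber nicht fw", ["csu", "fw"])
# => 2 (each search word is counted at most once)
Text.contains_mentions?("nur spd", ["csu", "fw"])
# => false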
defmodule CrossCorrelation do
def cross_corr(t1, t2) do
Nx.conv(t1, t2)
end
def full_cross_corr(t1, t2) do
p1 = elem(Nx.shape(t1), 2) - 1
p2 = elem(Nx.shape(t2), 2) - 1
Nx.conv(t1, t2, padding: [{p1, p2}])
end
def find_offset(t1, t2) do
l1 = elem(Nx.shape(t1), 2)
coeffs = full_cross_corr(t1, t2)
index = Nx.argmax(coeffs)
Nx.to_number(index) - l1 + 1
end
end
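A minimal sanity check of find_offset/2 with toy series (shape {batch, channel, length}, as Nx.conv expects): the pattern in the first series lags the second by two steps, so the returned offset is 2.
# Toy example: the peak sits at index 2 in `a` and at index 0 in `b`.
a = Nx.tensor([[[0.0, 0.0, 1.0, 0.0]]])
b = Nx.tensor([[[1.0, 0.0, 0.0, 0.0]]])
CrossCorrelation.find_offset(a, b)
# => 2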
defmodule DataFrameDate do
def fill(df, timeline_int_col, timeline_range, data_cols) do
timeline =
df[timeline_int_col]
|> S.to_list()
|> MapSet.new()
measurement_timeline = timeline_range |> MapSet.new()
missing_time = MapSet.difference(measurement_timeline, timeline)
missing_data = List.duplicate(nil, length(Enum.to_list(missing_time)))
missing_df =
(Enum.map(data_cols, fn x -> %{x => missing_data} end) ++
[%{timeline_int_col => missing_time}])
|> Enum.map(&DF.new(&1))
|> DF.concat_columns()
DF.concat_rows([missing_df, df])
|> DF.arrange_with(& &1[timeline_int_col])
end
end
defmodule NilSeries do
def fb_fill(series) do
S.fill_missing(S.fill_missing(series, :forward), :backward)
end
end
Polls
start_date = ~N[2023-08-29 00:00:01]
start_day = start_date |> NaiveDateTime.to_date() |> Date.day_of_year()
Load the CSV with the polls from different institutions from the website wahlrecht.de. Show the timeline and trend for every party.
polls =
DF.from_csv!("polls.csv", delimiter: ";", parse_dates: true)
|> DF.mutate(
start_date: S.cast(start_date, {:datetime, :millisecond}),
end_date: S.cast(end_date, {:datetime, :millisecond})
)
polls_mdt =
S.to_list(polls["end_date"])
|> Enum.zip(S.to_list(polls["start_date"]))
|> Enum.map(
&NaiveDateTime.add(
elem(&1, 1),
round(NaiveDateTime.diff(elem(&1, 0), elem(&1, 1)) / 2)
)
)
polls = DF.put(polls, "mid_date", polls_mdt)
polls = DF.mutate(polls, week: S.cast(mid_date, :date) |> S.week_of_year())
DF.print(polls, limit: :infinity)
Graphs.create_poll_graph(polls, "Polls - CSU", "csu")
Graphs.create_poll_graph(polls, "Polls - Freie Waehler", "fw")
Graphs.create_poll_graph(polls, "Polls - Buendnis90-Gruene", "gruene")
Graphs.create_poll_graph(polls, "Polls - SPD", "spd")
Graphs.create_poll_graph(polls, "Polls - FDP", "fdp")
Graphs.create_poll_graph(polls, "Polls - AFD", "afd")
Graphs.create_poll_graph(polls, "Polls - Linke", "linke")
Regions
Names of subdistricts are removed, as they are often too general, for example Oder or Gern. Still, village names can be general terms such as Wald.
local_entities = DF.from_csv!("geodaten/VerwaltungsEinheit.csv", delimiter: ",")["name"]
# subdist_entities = DF.from_csv!("geodaten/KatasterBezirk.csv", delimiter: ",")["name"]
local_entities =
local_entities
|> S.downcase()
Parties
parties_df =
DF.new(
party: ["csu", "fw", "spd", "gruene", "fdp", "afd", "linke"],
candidate1: [
"Markus Söder",
"Hubert Aiwanger",
"Florian von Brunn",
"Ludwig Hartmann",
"Martin Hagen",
"Katrin Ebner-Steiner",
"Adelheid Rupp"
],
candidate2: [nil, nil, nil, "Katharina Schulze", nil, "Martin Böhm", nil]
)
DF.print(parties_df)
parties = S.to_list(parties_df["party"])
parties_regex =
parties
|> Enum.join("|")
|> Regex.compile!()
Analysis Tags and Toots
f = "mastodon_bayernwahl2023_20231119.db"
# f = "mastodon_bayernwahl2023_20230910.db"
p = Path.absname(f)
Adbc.download_driver!(:sqlite)
{:ok, db} = Kino.start_child({Adbc.Database, driver: :sqlite, uri: p})
{:ok, conn} = Kino.start_child({Adbc.Connection, database: db})
{:ok, tags_df} = Explorer.DataFrame.from_query(conn, "select * from tags", [])
tags_df["tag"]
|> S.downcase()
|> S.to_list()
|> Enum.filter(&Enum.member?(parties, &1))
|> Enum.frequencies()
From the originally tracked posts, only a small set of tags contains the names of the parties:
- Freie Waehler (82)
- Gruene (58)
- Linke (57)
The counts are much higher for the parties AFD, CSU and SPD.
{:ok, toots_df} = Explorer.DataFrame.from_query(conn, "select * from toots", [])
toots_df = DF.mutate(toots_df, date: S.strptime(date, "%Y-%m-%dT%H:%M:%S"))
toots_df =
toots_df
|> DF.mutate(
day: S.cast(date, :date) |> S.day_of_year(),
week: S.cast(date, :date) |> S.week_of_year(),
weekday: S.cast(date, :date) |> S.day_of_week(),
hour: S.hour(date)
)
post_number = S.size(toots_df["content"])
toots_df["content"]
|> S.fill_missing(" ")
|> S.downcase()
|> S.to_list()
|> Enum.filter(&Regex.match?(parties_regex, &1))
|> Enum.map(&Regex.scan(parties_regex, &1))
|> Enum.map(&Enum.uniq(&1))
|> List.flatten()
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {party, freq} -> {party, freq / post_number * 100.0} end)
This gets slightly better when the whole post content is taken into account:
- Freie Waehler (82 -> 155)
- Gruene (58 -> 169)
- Linke (57 -> 175)
TODO: use regions.
candidate_family_names =
parties_df["candiate1"]
|> S.concat(parties_df["candidate2"])
|> S.downcase()
|> S.to_list()
|> Enum.filter(&is_binary(&1))
|> Enum.map(&Names.family_name(&1))
candidate_family_names =
candidate_family_names
|> Names.append_variants()
bavaria_tags = S.to_list(local_entities) ++ ["csu"] ++ candidate_family_names
name_regex = Regex.compile!(Enum.join(candidate_family_names, "|"))
Attribution (Execute with caution)
- gender
- bavarian
- age (perhaps)
- sentiment -> Party
- date -> calendar week
flowchart TD;
A(User on Bavarian Instance?) -->|yes| B[Bavarian];
A-->|no| C(Bavarian Location in Field?);
C --> |yes| B;
C --> |no| D(Bavarian Location in User note?)
D --> |yes| B;
D --> |no| E(Inferred language in the toot is German?);
E --> |yes| F[German]
E --> |no| G[Foreign]
Mark Bavarian Instance
{:ok, person_df} = Explorer.DataFrame.from_query(conn, "select * from users", [])
bavarian_instances = ~w"muenchen.social augsburg.social mastodon.bayern nuernberg.social
ploen.social wue.social mastodon.dachgau.social sueden.social"
bavarian_instances_reg =
bavarian_instances
|> Enum.join("|")
|> Regex.compile!()
is_user_on_bavarian_instance =
person_df["user_name"]
|> S.downcase()
|> S.transform(&Enum.at(String.split(&1, "@"), 1, "chaos.social"))
|> S.transform(&is_list(Regex.run(bavarian_instances_reg, &1)))
person_df = DF.put(person_df, "bavarian_instance", is_user_on_bavarian_instance)
bavarian_person = DF.filter(person_df, bavarian_instance == true)
elem(DF.shape(bavarian_person), 0) / elem(DF.shape(person_df), 0) * 100
About 6 % of all users are on a Bavarian instance.
Bavarian Locations in Fields
{:ok, fields_df} = Explorer.DataFrame.from_query(conn, "select * from fields", [])
Remove:
- HTML
- #-Sign
- @-Sign
- _-Sign
- links
- excess white space
Left in:
- Smileys (language model might know them)
- numbers (language model converts them)
links =
"https?:\\/\\/(?:www\\.)?([-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b)*(\\/[\\/\\d\\w\\.-]*)*(?:[\\?])*(.+)*|="
html = "<[^>]*>"
excess_spaces_r = Regex.compile!("\\s\\s+")
clean_r = Regex.compile!("#{html}|#|@|_|#{links}|\"")
filter_field_regions_names = [
"adresse",
"born where",
"bundesland",
"city",
"country",
"heimat",
"heimathafen",
"heimatort",
"herkunft",
"home",
"location",
"ort",
"standort",
"wahlkreis",
"wo",
"wohnhaft",
"wohnort",
"wohnt in",
"zuhause",
"📍"
]
location_fields =
fields_df["field_name"]
|> S.downcase()
|> S.to_enum()
|> Enum.map(&Enum.member?(filter_field_regions_names, &1))
cleaned_field =
fields_df["field_value"]
|> S.downcase()
|> S.transform(&Regex.replace(clean_r, &1, " "))
|> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
|> S.transform(&String.trim(&1))
fields_df = DF.put(fields_df, "cleaned_field", cleaned_field)
DF.print(fields_df)
bavarian_fields =
fields_df["cleaned_field"]
|> S.to_list()
|> Enum.map(fn a ->
a
|> String.split([",", "-"])
|> Enum.map(&String.trim(&1))
|> Enum.map(&Enum.member?(S.to_list(local_entities), &1))
|> Enum.any?()
end)
fields_df =
fields_df
|> DF.put("locations", location_fields)
|> DF.put("is_bavarian", bavarian_fields)
fields_with_bavarian_locations = DF.filter(fields_df, is_bavarian == true and locations == true)
DF.print(fields_with_bavarian_locations)
elem(DF.shape(fields_with_bavarian_locations), 0) / elem(DF.shape(person_df), 0) * 100
About 5 % of users add locations. About 0.5 % of users give Bavarian locations.
Find Bavarian locations in user texts
TODO: use the local_entities function.
cleaned_note =
person_df["note"]
|> S.transform(&if(is_binary(&1), do: &1, else: ""))
|> S.transform(&Regex.replace(clean_r, &1, " "))
|> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
|> S.transform(&String.trim(&1))
person_df = DF.put(person_df, "cleaned_note", cleaned_note)
bavarian_location_in_note =
person_df["cleaned_note"]
|> S.transform(fn x ->
Text.contains_mentions?(x, S.to_list(local_entities))
end)
person_df = DF.put(person_df, "bavarian_location_in_note", bavarian_location_in_note)
person_with_bavarian_note_locations = DF.filter(person_df, bavarian_location_in_note == true)
elem(DF.shape(person_with_bavarian_note_locations), 0) / elem(DF.shape(person_df), 0) * 100
DF.print(person_with_bavarian_note_locations)
About 2 % of the users name a Bavarian location in their user note texts.
Person in Bavaria
OR together the Bavarian indicators from the note, the fields and the instance name.
persons_with_bavarian_fields =
DF.join(person_df, DF.select(fields_with_bavarian_locations, ["user_name", "is_bavarian"]),
how: :left,
on: [{"user_name", "user_name"}]
)
DF.print(persons_with_bavarian_fields)
DF.shape(persons_with_bavarian_fields)
bavarian_users =
Enum.reduce(
[
persons_with_bavarian_fields["is_bavarian"],
persons_with_bavarian_fields["bavarian_location_in_note"],
persons_with_bavarian_fields["bavarian_instance"]
],
&S.or(&1, &2)
)
|> S.to_enum()
|> Enum.map(&if(is_nil(&1), do: false, else: &1))
persons_with_bavarian_fields =
DF.put(persons_with_bavarian_fields, "bavarian_users", bavarian_users)
DF.print(persons_with_bavarian_fields)
person_from_bavaria = DF.filter(persons_with_bavarian_fields, bavarian_users == true)
bavarian_count = elem(DF.shape(person_from_bavaria), 0)
{bavarian_count, bavarian_count / elem(DF.shape(persons_with_bavarian_fields), 0) * 100}
About 8.14 % of the sample dataset's users are estimated to be from Bavaria. The sum of the percentages per feature is (6.14 % + 0.39 % + 2.00 % =) 8.53 %. Hence, there is little overlap.
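How much the three indicators overlap can be checked directly; a minimal sketch (not part of the original analysis) that counts users flagged by more than one of instance, fields and note:
# Count users flagged as Bavarian by more than one indicator.
[
  persons_with_bavarian_fields["bavarian_instance"],
  persons_with_bavarian_fields["is_bavarian"],
  persons_with_bavarian_fields["bavarian_location_in_note"]
]
|> Enum.map(&S.to_list/1)
|> Enum.zip()
|> Enum.count(fn flags ->
  flags |> Tuple.to_list() |> Enum.count(&(&1 == true)) > 1
end)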
Download user pics
In the test data set, 1390 of 1547 users have avatars, which is about 90 % of the users.
Enum.zip(
S.to_list(person_df["avatar"]),
S.to_list(person_df["user_id"])
)
|> Enum.map(fn {link, name} -> User.download_image(name, link) end)
flowchart TD;
A(gender in fields?) -->|male| B[male];
A -->|female| C[female];
A -->|no| D(gender in user text?);
D -->|male| B;
D -->|female| C;
D -->|no| E(user has image?)
E -->|yes| F(image segmentation);
E -->|no| H[no data]
F-->|single person| G(vgg_ilsvrc_16_gender_imdb_wiki);
F-->|others| H;
G-->|male| B;
G-->|female| C;
flowchart TD;
E(user has image?) -->|yes| F(image segmentation);
E-->|no| H[no data]
F-->|single person| G(vgg_ilsvrc_16_gender_imdb_wiki);
F-->|others| H[no data];
G--> B[age classes];
filter_field_age_names = ~w[Age Alter Born Geburtstag]
# compared against downcased field names below, so the list is kept lowercase
filter_field_gender_names =
~w[gender geschlecht pronom pronomen pronomina pronoms pronons pronoun pronouns sexualität wer]
gender_fields =
fields_df["field_name"]
|> S.downcase()
|> S.transform(&Enum.member?(filter_field_gender_names, &1))
fields_df = DF.put(fields_df, "gender_fields", gender_fields)
%{male: ["he", "him", "his", "er", "ihm", "ihn", "sein"], female: ["she", "her", "sie", "ihr"]}
DF.filter(fields_df, gender_fields)["cleaned_field"]
|> S.to_list()
|> Enum.uniq()
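One possible continuation (a sketch, not part of the original pipeline): use the pronoun map above to label the gender fields. The classify_gender helper is hypothetical.
# Label each cleaned gender field by matching pronouns (female checked first).
pronouns = %{
  male: ["he", "him", "his", "er", "ihm", "ihn", "sein"],
  female: ["she", "her", "sie", "ihr"]
}
classify_gender = fn field_value ->
  words = field_value |> String.downcase() |> String.split(~r/[^\p{L}]+/u, trim: true)
  cond do
    Enum.any?(words, &(&1 in pronouns.female)) -> :female
    Enum.any?(words, &(&1 in pronouns.male)) -> :male
    true -> :unknown
  end
end
DF.filter(fields_df, gender_fields)["cleaned_field"]
|> S.to_list()
|> Enum.map(classify_gender)
|> Enum.frequencies()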
Preprocessing posts
Before the sentiments can be read from the posts, it is necessary to remove the HTML tags, and the Mastodon tags need to be converted. The date column is converted from string to NaiveDateTime.
cleared_posts =
toots_df[:content]
|> S.fill_missing(" ")
|> S.transform(&Regex.replace(clean_r, &1, " "))
|> S.transform(&Regex.replace(excess_spaces_r, &1, " "))
|> S.transform(&String.trim(&1))
toots_df = DF.put(toots_df, "cleared_content", cleared_posts)
has_content =
cleared_posts
|> S.transform(&String.length(&1))
|> S.greater(50)
toots_df = DF.put(toots_df, "has_content?", has_content)
toots_df = DF.filter(toots_df, has_content?)
post_length_s =
toots_df["cleared_content"]
|> S.transform(&String.length(&1))
post_length_s
|> S.mean()
S.standard_deviation(post_length_s)
S.median(post_length_s)
S.max(post_length_s)
The median length of the cleared posts is 248 characters in the first set (after removing zero-length posts). The mean is 217 +/- 189 characters.
toots_df =
DF.join(
toots_df,
DF.select(persons_with_bavarian_fields, ["user_name", "bavarian_users", "followers"]),
how: :left,
on: [{"user_name", "user_name"}]
)
DF.print(toots_df)
DF.filter(toots_df, bavarian_users == true)
Filter posts
- Filter: Topic of posts is really about Bavaria
- Attribution of Sentiment to a single party
- Filter: No party, multiple parties
Filter: contains at least a single candidate.
Sample data: 3503 of 4563 posts mention words in the context of Bavaria or a candidate.
bavarian_post_filter =
toots_df["cleared_content"]
|> S.downcase()
|> S.transform(&Text.contains_mentions?(&1, bavaria_tags))
bavarian_toots_df = DF.mask(toots_df, bavarian_post_filter)
bavarian_full_post_filter =
toots_df["content"]
|> S.downcase()
|> S.transform(&Text.contains_mentions?(&1, bavaria_tags))
full_bavarian_toots_df = DF.mask(toots_df, bavarian_full_post_filter)
bav_set =
bavarian_toots_df[:id]
|> S.to_list()
|> MapSet.new()
bav_full_set =
full_bavarian_toots_df[:id]
|> S.to_list()
|> MapSet.new()
differences = MapSet.difference(bav_full_set, bav_set) |> Enum.to_list()
tags_df
|> DF.filter(toot_id in ^differences)
|> DF.arrange(tag)
|> DF.print(limit: :infinity)
tags_df
|> DF.filter(toot_id == 165)
|> DF.print(limit: :infinity)
DF.filter(full_bavarian_toots_df, bavarian_users == true)
TODO: Rework from a single party to the most commonly mentioned party. Filter all posts so that each toot mentions only a single party or its candidates.
party_search_terms =
DF.to_rows(parties_df)
# one list of search terms per row in the DataFrame
|> Enum.map(fn x ->
x
# only takes the values
|> Map.values()
# remove nils
|> Enum.filter(&is_bitstring(&1))
# take the family name if the value is a candidate name
|> Enum.map(fn y ->
cond do
Names.family_name(y) == nil -> y
true -> Names.family_name(y)
end
end)
|> Names.append_variants()
|> Enum.map(&String.downcase(&1))
end)
contains_party =
full_bavarian_toots_df["cleared_content"]
|> S.downcase()
|> S.to_list()
|> Enum.map(fn text ->
Enum.map(party_search_terms, fn party -> Text.count_unique_mentions(text, party) > 0 end)
end)
parties_count =
contains_party
|> Enum.map(fn text -> Enum.sum(Enum.map(text, fn party -> Bool.to_integer(party) end)) end)
contains_single_party =
parties_count
|> Enum.map(&(&1 == 1))
party_frequency =
full_bavarian_toots_df["content"]
|> S.downcase()
|> S.to_list()
|> Enum.map(fn text ->
Enum.map(party_search_terms, fn party -> Text.count_mentions(text, party) end)
end)
|> Enum.map(fn party_counts ->
Enum.map(party_counts, fn party_count ->
if(Enum.sum(party_counts) === 0, do: 0, else: party_count / Enum.sum(party_counts))
end)
end)
has_dominant_party =
party_frequency
|> Enum.map(fn party_frequencies -> Enum.any?(party_frequencies, &(&1 > 0.5)) end)
Enum.sum(Enum.map(has_dominant_party, &Bool.to_integer(&1)))
domentent_party =
party_frequency
|> Enum.map(&Enum.zip(&1, S.to_list(parties_df["party"])))
|> Enum.map(fn text -> Enum.filter(text, fn party -> elem(party, 0) > 0.5 end) end)
|> Enum.map(fn text -> Enum.map(text, fn party -> elem(party, 1) end) end)
|> Enum.map(&if(length(&1) === 0, do: "", else: Enum.at(&1, 0)))
Most posts name a single party. Very often two parties are mentioned in a single post. The other cases are much less frequent.
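This can be read off directly from the per-post party counts computed above:
# Distribution of how many parties are mentioned per post.
parties_count
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 0))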
mentioned_parties =
contains_party
|> Enum.map(&Enum.zip(&1, S.to_list(parties_df["party"])))
|> Enum.map(fn text -> Enum.filter(text, fn party -> elem(party, 0) end) end)
|> Enum.map(fn text -> Enum.map(text, fn party -> elem(party, 1) end) end)
|> Enum.map(&Enum.join(&1, ", "))
full_bavarian_toots_df = DF.put(full_bavarian_toots_df, "mentioned_party", mentioned_parties)
full_bavarian_toots_df = DF.put(full_bavarian_toots_df, "domentent_party", domentent_party)
single_party_toots_df = DF.mask(full_bavarian_toots_df, has_dominant_party)
DF.print(single_party_toots_df |> DF.arrange(desc: date))
DF.filter(single_party_toots_df, bavarian_users == true)
Sentiment Analysis
Before the sentiment analysis, the language has to be checked, as the language attribute is very often not correct. Therefore, language detection is run first, before the sentiment analysis.
- XLM-RoBERTa - language detection
- german-sentiment_bert - Sentiment Analysis German
- RoBERTa (BERTtweet) - Sentiment - English language Sentiment analysis
Language Detection
The models accept different maximum input lengths. The language detection model takes up to 514 tokens, but its results, compared with the self-set languages, are similar when only about 100 characters are used. Therefore we restrict the sequence length to 100.
flowchart TD;
A(XLM-RoBERTa) -->|German| B[german-sentiment_bert];
A(XLM-RoBERTa) -->|English| C[RoBERTa BERTtweet - Sentiment];
B --> E[Bavarian?]
C --> F[Bavarian?]
E -->|no| H(German)
E -->|yes| G(Bavarian)
F -->|yes| G(Bavarian)
F -->|no| I(English)
{:ok, lang_detect_model_info} =
Bumblebee.load_model({:hf, "papluca/xlm-roberta-base-language-detection"})
{:ok, lang_detect_tokenizer} =
Bumblebee.load_tokenizer({:hf, "papluca/xlm-roberta-base-language-detection"})
lang_detect_serving =
Bumblebee.Text.text_classification(lang_detect_model_info, lang_detect_tokenizer,
compile: [batch_size: 128, sequence_length: 100],
defn_options: [compiler: EXLA]
)
lang_detect_model_info.spec.max_positions
Kino.start_child({
Nx.Serving,
serving: lang_detect_serving, name: LangDetectServer
})
p = Nx.Serving.batched_run(LangDetectServer, S.to_list(single_party_toots_df["cleared_content"]))
Each prediction is ordered by probability. Hence, always selecting the first label returns the most likely language.
detected_languages =
Enum.map(p, fn post ->
post
|> Enum.at(0)
|> elem(1)
|> Enum.at(0)
|> (& &1[:label]).()
end)
The majority, 96 %, of all sample posts are detected as German; 1.6 % are detected as English. Why 0.9 % are labeled as Hindi still has to be figured out. That 0.5 % are detected as Dutch is more plausible.
detected_languages
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / length(detected_languages) * 100.0} end)
single_party_toots_df = DF.put(single_party_toots_df, "detected_languages", detected_languages)
In contrast, the manually set languages are 92 % German, 6 % another language (often the instance default) and 1.3 % nil (not specified).
single_party_toots_df["language"]
|> S.to_list()
|> Enum.frequencies()
|> Enum.sort_by(&elem(&1, 1), :desc)
|> Enum.map(fn {lang, freq} -> {lang, freq / length(detected_languages) * 100.0} end)
From visual analysis, the language attribute is often set wrong, as it is set manually with a given default. Often the language was set to English when the post was German, or it was set to nil. Therefore the language has been re-evaluated with the language detection model, which changed the language for 5.6 % of all posts.
reassigned_language =
single_party_toots_df["language"]
|> S.not_equal(single_party_toots_df["detected_languages"])
|> S.transform(&Bool.to_integer(&1))
|> S.sum()
reassigned_language / S.size(single_party_toots_df["language"]) * 100.0
German Sentiments
The German sentiment analysis works with up to 512 tokens. By default, Mastodon limits posts to 500 characters. As services other than Mastodon and some Mastodon instances use other cut-offs, we select 512 tokens. We assume 5.99 characters per word; OpenAI estimates 4 characters per token for English text.
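A rough capacity check with these assumptions (a sketch, not part of the original analysis): a 500-character post corresponds to roughly 125 tokens or 84 words, both well below the 512-token limit.
# Back-of-the-envelope token and word estimates for a maximum-length post.
max_chars = 500       # default Mastodon post limit
chars_per_token = 4   # OpenAI's rule of thumb for English text
chars_per_word = 5.99 # assumed average word length
{div(max_chars, chars_per_token), Float.round(max_chars / chars_per_word, 1)}
# => {125, 83.5}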
{:ok, ger_sent_model_info} = Bumblebee.load_model({:hf, "oliverguhr/german-sentiment-bert"})
{:ok, ger_sent_tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-german-cased"})
ger_sent_model_info.spec.max_positions
ger_sent_serving =
Bumblebee.Text.text_classification(ger_sent_model_info, ger_sent_tokenizer,
compile: [batch_size: 128, sequence_length: 512],
defn_options: [compiler: EXLA]
)
Kino.start_child({
Nx.Serving,
serving: ger_sent_serving, name: GerSentimentServer
})
german_toots_df = DF.filter(single_party_toots_df, detected_languages == "de")
german_toots = S.to_list(german_toots_df["cleared_content"])
ger_predictions = Nx.Serving.batched_run(GerSentimentServer, german_toots)
ger_sentiments = Enum.map(ger_predictions, fn x -> SentimentScore.score(x.predictions) end)
german_toots_df = DF.put(german_toots_df, "sentiment", ger_sentiments)
English Sentiment
The sentiment analysis for English text works with only 130 tokens.
english_toots_df = DF.filter(single_party_toots_df, detected_languages == "en")
english_toots = S.to_list(english_toots_df["cleared_content"])
{:ok, model_info} = Bumblebee.load_model({:hf, "finiteautomata/bertweet-base-sentiment-analysis"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "vinai/bertweet-base"})
english_sentiment_serving =
Bumblebee.Text.text_classification(model_info, tokenizer,
compile: [batch_size: 128, sequence_length: 130],
defn_options: [compiler: EXLA]
)
Kino.start_child({
Nx.Serving,
serving: english_sentiment_serving, name: EngSentimentServer
})
eng_predictions = Nx.Serving.batched_run(EngSentimentServer, english_toots)
eng_sentiments = Enum.map(eng_predictions, fn x -> SentimentScore.score(x.predictions) end)
english_toots_df = DF.put(english_toots_df, "sentiment", eng_sentiments)
ger_eng_toots = DF.concat_rows(german_toots_df, english_toots_df)
bav_ger_eng_lang =
ger_eng_toots["detected_languages"]
|> S.to_list()
|> Enum.zip(S.to_list(ger_eng_toots["bavarian_users"]))
|> Enum.map(&if(elem(&1, 1), do: "bav", else: elem(&1, 0)))
bav_ger_toots =
DF.put(ger_eng_toots, "region", bav_ger_eng_lang)
|> DF.filter(region != "en")
DF.print(ger_eng_toots)
S.frequencies(bav_ger_toots["region"])
h =
bav_ger_toots
|> DF.filter(region == "bav")
S.distinct(h["user_id"])
S.distinct(bav_ger_toots["user_id"])
DF.filter(bav_ger_toots, date > ^start_date)[:domentent_party]
|> S.to_list()
|> Enum.frequencies()
DF.filter(bav_ger_toots, date > ^start_date)[:region]
|> S.to_list()
|> Enum.frequencies()
Sentiment Graphs
avg_sentiment = S.mean(bav_ger_toots["sentiment"])
party_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
total_count =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.shape()
|> elem(0)
follower_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(domentent_party == ^x)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
end
total_followers =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
party_bav_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
bavaria_total =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.shape()
|> elem(0)
# Most positive post per user in the weeks before the election
pos_party_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
pos_total_count =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.shape()
|> elem(0)
pos_party_bav_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.filter(domentent_party == ^x)
|> DF.shape()
|> elem(0)
end
pos_follower_count = fn x ->
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.filter(domentent_party == ^x)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
end
pos_total_followers =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.to_columns()
|> Map.fetch!("followers")
|> Enum.sum()
pos_bavaria_total =
bav_ger_toots
|> DF.filter(day >= ^260)
|> DF.filter(day < ^281)
|> DF.filter(region == "bav")
|> DF.arrange(desc: sentiment)
|> DF.distinct(["user_id"], keep_all: true)
|> DF.shape()
|> elem(0)
DF.new(%{
party: [
"afd",
"csu",
"fdp",
"fw",
"gruene",
"linke",
"spd"
],
german_ratio: [
party_count.("afd") / total_count,
party_count.("csu") / total_count,
party_count.("fdp") / total_count,
party_count.("fw") / total_count,
party_count.("gruene") / total_count,
party_count.("linke") / total_count,
party_count.("spd") / total_count
],
german_followers: [
follower_count.("afd") / total_followers,
follower_count.("csu") / total_followers,
follower_count.("fdp") / total_followers,
follower_count.("fw") / total_followers,
follower_count.("gruene") / total_followers,
follower_count.("linke") / total_followers,
follower_count.("spd") / total_followers
],
bavarian_support: [
party_bav_count.("afd"),
party_bav_count.("csu"),
party_bav_count.("fdp"),
party_bav_count.("fw"),
party_bav_count.("gruene"),
party_bav_count.("linke"),
party_bav_count.("spd")
],
bavarian_ratio: [
party_bav_count.("afd") / bavaria_total,
party_bav_count.("csu") / bavaria_total,
party_bav_count.("fdp") / bavaria_total,
party_bav_count.("fw") / bavaria_total,
party_bav_count.("gruene") / bavaria_total,
party_bav_count.("linke") / bavaria_total,
party_bav_count.("spd") / bavaria_total
],
positiv_german_ratio: [
pos_party_count.("afd") / pos_total_count,
pos_party_count.("csu") / pos_total_count,
pos_party_count.("fdp") / pos_total_count,
pos_party_count.("fw") / pos_total_count,
pos_party_count.("gruene") / pos_total_count,
pos_party_count.("linke") / pos_total_count,
pos_party_count.("spd") / pos_total_count
],
positiv_bavarian_support: [
pos_party_bav_count.("afd"),
pos_party_bav_count.("csu"),
pos_party_bav_count.("fdp"),
pos_party_bav_count.("fw"),
pos_party_bav_count.("gruene"),
pos_party_bav_count.("linke"),
pos_party_bav_count.("spd")
],
positiv_bavarian_ratio: [
pos_party_bav_count.("afd") / pos_bavaria_total,
pos_party_bav_count.("csu") / pos_bavaria_total,
pos_party_bav_count.("fdp") / pos_bavaria_total,
pos_party_bav_count.("fw") / pos_bavaria_total,
pos_party_bav_count.("gruene") / pos_bavaria_total,
pos_party_bav_count.("linke") / pos_bavaria_total,
pos_party_bav_count.("spd") / pos_bavaria_total
],
positive_german_followers: [
pos_follower_count.("afd") / pos_total_followers,
pos_follower_count.("csu") / pos_total_followers,
pos_follower_count.("fdp") / pos_total_followers,
pos_follower_count.("fw") / pos_total_followers,
pos_follower_count.("gruene") / pos_total_followers,
pos_follower_count.("linke") / pos_total_followers,
pos_follower_count.("spd") / pos_total_followers
]
})
|> DF.print(limit: :infinity)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the CSU", "csu", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the Freie Waehler", "fw", start_date)
Graphs.create_sentiment_graph(
bav_ger_toots,
"Sentiments for the Buendnis90/Gruene",
"gruene",
start_date
)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the SPD", "spd", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the FDP", "fdp", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the AFD", "afd", start_date)
Graphs.create_sentiment_graph(bav_ger_toots, "Sentiments for the Linke", "linke", start_date)
TODO:
- cross correlation
- emojis?
- add min/max dates for sentiment graphs
4 nl posts -> error, 1 fr post -> correct
Comparison of Polls and Sentiments
normalized_timeline =
bav_ger_toots
|> DF.filter(date >= ^start_date)
|> DF.select(["domentent_party", "sentiment", "day", "id"])
|> DF.mutate(sentiment: (sentiment + 1) / 2)
|> DF.pivot_wider("domentent_party", "sentiment", id_columns: ["id", "day"])
|> DF.discard(["id"])
sentiment_timeline =
normalized_timeline
|> DF.group_by("day")
|> DF.mutate(
csu: mean(csu),
spd: mean(spd),
fw: mean(fw),
fdp: mean(fdp),
afd: mean(afd),
gruene: mean(gruene),
linke: mean(linke)
)
|> DF.select(["day", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.distinct()
|> DF.arrange(asc: day)
DF.print(sentiment_timeline)
filtered_polls = DF.filter(polls, start_date >= ^start_date)
polls_timeline =
DF.mutate(filtered_polls,
start_day: S.day_of_year(start_date),
end_day: S.day_of_year(end_date)
)
polls_timeline =
polls_timeline
|> DF.to_rows()
|> Enum.with_index()
|> Enum.flat_map(fn {row, index} ->
row["start_day"]..row["end_day"]
|> Enum.map(&Map.merge(row, %{"day" => &1, "index" => index}))
end)
|> DF.new()
|> DF.select(["day", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.group_by("day")
|> DF.mutate(
csu: mean(csu) / 100,
spd: mean(spd) / 100,
fw: mean(fw) / 100,
fdp: mean(fdp) / 100,
afd: mean(afd) / 100,
gruene: mean(gruene) / 100,
linke: mean(linke) / 100
)
|> DF.arrange(asc: day)
|> DF.distinct()
DF.print(polls_timeline, limit: :infinity)
Align the End dates of Polls and Sentiments.
end_day = Enum.min([S.max(polls_timeline["day"]), S.max(sentiment_timeline["day"])])
sentiment_timeline = DF.filter(sentiment_timeline, day <= ^end_day)
polls_timeline = DF.filter(polls_timeline, day <= ^end_day)
end_sampling_doy = 323
lang_toots = DF.filter(bav_ger_toots, day <= ^end_sampling_doy) |> DF.filter(day >= ^start_day)
Tucan.concat([
Tucan.histogram(lang_toots, "day", relative: true, step: 1) |> Tucan.Axes.set_x_title("Day"),
Tucan.histogram(lang_toots, "weekday", relative: true) |> Tucan.Axes.set_x_title("Weekday"),
Tucan.histogram(lang_toots, "hour", relative: true, step: 1) |> Tucan.Axes.set_x_title("Hour")
])
|> Tucan.set_title("Post frequencies on different time scales")
Fill in missing data.
- Dates not yet included
- Dates included but no data measured.
days_range = start_day..end_day
parties = ["csu", "spd", "fw", "fdp", "afd", "gruene", "linke"]
polls_timeline = DataFrameDate.fill(DF.ungroup(polls_timeline), "day", days_range, parties)
sentiment_timeline =
DataFrameDate.fill(DF.ungroup(sentiment_timeline), "day", days_range, parties)
DF.print(sentiment_timeline, limit: :infinity)
DF.print(polls_timeline, limit: :infinity)
polls_timeline =
polls_timeline
|> DF.ungroup()
|> DF.arrange(day)
|> DF.mutate(
csu: NilSeries.fb_fill(csu),
spd: NilSeries.fb_fill(spd),
fw: NilSeries.fb_fill(fw),
fdp: NilSeries.fb_fill(fdp),
afd: NilSeries.fb_fill(afd),
gruene: NilSeries.fb_fill(gruene),
linke: NilSeries.fb_fill(linke)
)
sentiment_timeline =
sentiment_timeline
|> DF.ungroup()
|> DF.arrange(day)
|> DF.mutate(
csu: NilSeries.fb_fill(csu),
spd: NilSeries.fb_fill(spd),
fw: NilSeries.fb_fill(fw),
fdp: NilSeries.fb_fill(fdp),
afd: NilSeries.fb_fill(afd),
gruene: NilSeries.fb_fill(gruene),
linke: NilSeries.fb_fill(linke)
)
|> DF.mutate(
csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
)
Parties with enough Sentiment samples
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - CSU", "csu")
{S.standard_deviation(polls_timeline["csu"]), S.standard_deviation(polls_timeline["fw"]),
S.standard_deviation(polls_timeline["afd"])}
{S.standard_deviation(sentiment_timeline["csu"]), S.standard_deviation(sentiment_timeline["fw"]),
S.standard_deviation(sentiment_timeline["afd"])}
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - FW", "fw")
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - AFD", "afd")
Parties with few Sentiment Samples
Graphs.create_daily_compare_graph(
sentiment_timeline,
polls_timeline,
"Sentiment - Buendis90 Gruene",
"gruene"
)
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - SPD", "spd")
Graphs.create_daily_compare_graph(sentiment_timeline, polls_timeline, "Sentiment - FDP", "fdp")
Graphs.create_daily_compare_graph(
sentiment_timeline,
polls_timeline,
"Sentiment - Linke",
"linke"
)
Cross Correlation
party_sentiment_offset = fn sentiment_df, poll_df, party_str ->
sentiments_t = S.to_tensor(sentiment_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
polls_t = S.to_tensor(poll_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
CrossCorrelation.find_offset(sentiments_t, polls_t)
end
party_sentiment_offset.(sentiment_timeline, polls_timeline, "csu")
party_sentiment_offset.(sentiment_timeline, polls_timeline, "fw")
party_sentiment_offset.(sentiment_timeline, polls_timeline, "afd")
Correlation Sentiment -> Polls
alias Scholar.Linear.LinearRegression, as: LR
alias Scholar.Metrics.Regression, as: RegMetrics
alias Scholar.Preprocessing
longer_sentiment_timeline =
sentiment_timeline
|> DF.mutate(
csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
)
|> DF.select(["day", "afd", "csu", "fw"])
|> DF.pivot_longer(["afd", "csu", "fw"])
# |> DF.print(limit: :infinity)
longer_polls_timeline =
polls_timeline
|> DF.select(["day", "afd", "csu", "fw"])
|> DF.pivot_longer(["afd", "csu", "fw"])
daily_polls_sentiment_df =
DF.new(
day: longer_sentiment_timeline["day"],
sentiment: longer_sentiment_timeline["value"],
poll: longer_polls_timeline["value"],
party: longer_polls_timeline["variable"]
)
daily_polls_sentiment_df = DF.mutate(daily_polls_sentiment_df, party: S.cast(party, :category))
Tucan.scatter(daily_polls_sentiment_df, "sentiment", "poll", color_by: "party")
|> Tucan.set_width(500)
|> Tucan.set_height(500)
|> Tucan.set_title("Dependency of Poll results from Sentiment.")
|> Tucan.Axes.set_x_title("Sentiment per Average of Sentiments")
|> Tucan.Axes.set_y_title("Poll")
|> Tucan.Scale.set_y_domain(0.1, 0.4)
|> Tucan.Scale.set_x_domain(0, 0.3)
split = round(elem(DF.shape(daily_polls_sentiment_df), 0) * 0.7)
daily_polls_sentiment_shuffled_df = DF.shuffle(daily_polls_sentiment_df)
polls_sentiment_df_train = DF.slice(daily_polls_sentiment_shuffled_df, 0..split)
polls_sentiment_df_test = DF.slice(daily_polls_sentiment_shuffled_df, split..-1)
get_one_hot = fn x, y ->
x
|> S.cast(:category)
|> S.to_tensor()
|> Preprocessing.one_hot_encode(num_classes: y)
end
get_sentiment = fn x ->
x
|> S.to_tensor()
|> Nx.reshape({:auto, 1})
end
party_train = get_one_hot.(polls_sentiment_df_train["party"], 3)
party_test = get_one_hot.(polls_sentiment_df_test["party"], 3)
sentiments_train = get_sentiment.(polls_sentiment_df_train["sentiment"])
sentiments_test = get_sentiment.(polls_sentiment_df_test["sentiment"])
x_train = Nx.concatenate([party_train, sentiments_train], axis: 1)
x_test = Nx.concatenate([party_test, sentiments_test], axis: 1)
y_train = polls_sentiment_df_train["poll"] |> S.to_tensor()
y_test = polls_sentiment_df_test["poll"] |> S.to_tensor()
model = LR.fit(x_train, y_train)
y_hat = LR.predict(model, x_test)
RegMetrics.r2_score(y_test, y_hat)
Summary:
- intercept: 0.2182
- afd: -0.0749
- csu: 0.1321
- fw: -0.0573
- sentiment: 0.0013
Meaning:
- Almost no dependence on the (daily) sentiment. Better sentiment, worse poll results!?
- afd: 14.9 %
- csu: 35.3 %
- fw: 16.2 %
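The numbers above were read off the fitted Scholar model; a minimal sketch of how to extract them (mapping the first three coefficients to afd, csu and fw follows the one-hot category order, which is an assumption here):
# Extract intercept and coefficients from the fitted linear regression.
%{coefficients: coefficients, intercept: intercept} = model
{Nx.to_flat_list(intercept), Nx.to_flat_list(coefficients)}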
Comparison: Weekly Sentiments and Polls
normalized_weeks =
bav_ger_toots
|> DF.filter(date >= ^start_date)
|> DF.select(["domentent_party", "sentiment", "week", "id"])
|> DF.mutate(sentiment: (sentiment + 1) / 2)
|> DF.pivot_wider("domentent_party", "sentiment", id_columns: ["id", "week"])
|> DF.discard(["id"])
sentiment_weeks =
normalized_weeks
|> DF.group_by("week")
|> DF.mutate(
csu: mean(csu),
spd: mean(spd),
fw: mean(fw),
fdp: mean(fdp),
afd: mean(afd),
gruene: mean(gruene),
linke: mean(linke)
)
|> DF.select(["week", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.distinct()
|> DF.arrange(asc: week)
DF.print(sentiment_weeks)
poll_weeks =
polls
|> DF.filter(mid_date > ^start_date)
|> DF.select(["week", "afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
|> DF.group_by("week")
|> DF.mutate(
afd: mean(afd) / 100,
csu: mean(csu) / 100,
fdp: mean(fdp) / 100,
fw: mean(fw) / 100,
gruene: mean(gruene) / 100,
linke: mean(linke) / 100,
spd: mean(spd) / 100
)
|> DF.distinct()
|> DF.ungroup()
DF.print(poll_weeks, limit: :infinity)
end_week = Enum.min([S.max(poll_weeks["week"]), S.max(sentiment_weeks["week"])])
start_week = Enum.max([S.min(poll_weeks["week"]), S.min(sentiment_weeks["week"])])
poll_weeks = DF.filter(poll_weeks, week <= ^end_week)
DF.print(poll_weeks, limit: :infinity)
sentiment_weeks = DF.filter(sentiment_weeks, week <= ^end_week)
DF.print(sentiment_weeks, limit: :infinity)
filled_poll_weeks =
DataFrameDate.fill(poll_weeks, "week", start_week..end_week, [
"afd",
"csu",
"fdp",
"fw",
"gruene",
"linke",
"spd"
])
filled_poll_weeks =
DF.mutate(filled_poll_weeks,
afd: NilSeries.fb_fill(afd),
csu: NilSeries.fb_fill(csu),
fdp: NilSeries.fb_fill(fdp),
fw: NilSeries.fb_fill(fw),
gruene: NilSeries.fb_fill(gruene),
linke: NilSeries.fb_fill(linke),
spd: NilSeries.fb_fill(spd)
)
filled_sentiment_weeks =
DataFrameDate.fill(sentiment_weeks, "week", start_week..end_week, [
"afd",
"csu",
"fdp",
"fw",
"gruene",
"linke",
"spd"
])
|> DF.mutate(gruene: NilSeries.fb_fill(gruene))
|> DF.mutate(
csu: csu / (csu + spd + fw + fdp + afd + gruene + linke),
spd: spd / (csu + spd + fw + fdp + afd + gruene + linke),
fw: fw / (csu + spd + fw + fdp + afd + gruene + linke),
fdp: fdp / (csu + spd + fw + fdp + afd + gruene + linke),
afd: afd / (csu + spd + fw + fdp + afd + gruene + linke),
gruene: gruene / (csu + spd + fw + fdp + afd + gruene + linke),
linke: linke / (csu + spd + fw + fdp + afd + gruene + linke)
)
DF.print(filled_sentiment_weeks, limit: :infinity)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - CSU",
"csu"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - FW",
"fw"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - AFD",
"afd"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - SPD",
"spd"
)
Graphs.create_weekly_compare_graph(
filled_sentiment_weeks,
filled_poll_weeks,
"Sentiment - Buendnis/ Gruene",
"gruene"
)
Cross Correlation
party_sentiment_offset = fn sentiment_df, poll_df, party_str ->
sentiments_t = S.to_tensor(sentiment_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
polls_t = S.to_tensor(poll_df[party_str]) |> Nx.new_axis(0) |> Nx.new_axis(0)
CrossCorrelation.find_offset(sentiments_t, polls_t)
end
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "afd") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "csu") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "fdp") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "fw") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "gruene") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "spd") |> dbg()
party_sentiment_offset.(filled_sentiment_weeks, filled_poll_weeks, "linke") |> dbg()
Linear Fit
alias Scholar.Linear.LinearRegression, as: LR
alias Scholar.Linear.RidgeRegression, as: Ridge
alias Scholar.Metrics.Regression, as: RegMetrics
alias Scholar.Preprocessing
DF.print(filled_sentiment_weeks, limit: :infinity)
longer_sentiment_weekly_timeline =
filled_sentiment_weeks
|> DF.pivot_longer(["afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
DF.print(longer_sentiment_weekly_timeline, limit: :infinity)
longer_polls_weekly_timeline =
filled_poll_weeks
|> DF.pivot_longer(["afd", "csu", "fdp", "fw", "gruene", "linke", "spd"])
DF.print(longer_polls_weekly_timeline, limit: :infinity)
weekly_polls_sentiment_df =
DF.new(
week: longer_sentiment_weekly_timeline["week"],
sentiment: longer_sentiment_weekly_timeline["value"],
poll: longer_polls_weekly_timeline["value"],
party: longer_sentiment_weekly_timeline["variable"]
)
weekly_polls_sentiment_df = DF.mutate(weekly_polls_sentiment_df, party: S.cast(party, :category))
Tucan.scatter(weekly_polls_sentiment_df, "sentiment", "poll", color_by: "party", shape_by: "party")
|> Tucan.set_width(500)
|> Tucan.set_height(500)
|> Tucan.set_title("Dependency of Polling Results From Sentiment.")
|> Tucan.Axes.set_x_title("Sentiment")
|> Tucan.Axes.set_y_title("Poll")
|> Tucan.Scale.set_y_domain(0, 0.4)
|> Tucan.Scale.set_x_domain(0, 0.25)
split = round(elem(DF.shape(weekly_polls_sentiment_df), 0) * 0.7)
weekly_polls_sentiment_shuffled_df = DF.shuffle(weekly_polls_sentiment_df)
polls_sentiment_df_train = DF.slice(weekly_polls_sentiment_shuffled_df, 0..split)
polls_sentiment_df_test = DF.slice(weekly_polls_sentiment_shuffled_df, split..-1)
get_one_hot = fn x, y ->
x
|> S.cast(:category)
|> S.to_tensor()
|> Preprocessing.one_hot_encode(num_classes: y)
end
get_sentiment = fn x ->
x
|> S.to_tensor()
|> Nx.reshape({:auto, 1})
end
party_train = get_one_hot.(polls_sentiment_df_train["party"], 7)
party_test = get_one_hot.(polls_sentiment_df_test["party"], 7)
sentiments_train = get_sentiment.(polls_sentiment_df_train["sentiment"])
sentiments_test = get_sentiment.(polls_sentiment_df_test["sentiment"])
x_train = Nx.concatenate([party_train, sentiments_train], axis: 1)
x_test = Nx.concatenate([party_test, sentiments_test], axis: 1)
y_train = polls_sentiment_df_train["poll"] |> S.to_tensor()
y_test = polls_sentiment_df_test["poll"] |> S.to_tensor()
model = LR.fit(x_train, y_train, fit_intercept?: false)
y_hat = LR.predict(model, x_test)
RegMetrics.r2_score(y_test, y_hat)
Summary:
- intercept: 0.131
- afd: 0.003
- csu: 0.229
- fdp: -0.103
- fw: 0.026
- gruene: 0.008
- linke: -0.118
- spd: -0.046
- sentiment: 0.003
defmodule GridSearch do
def ridge_single(point, x_train, x_test, y_train, y_test) do
model_ridge = Ridge.fit(x_train, y_train, fit_intercept?: false, alpha: point)
y_hat_ridge = Ridge.predict(model_ridge, x_test)
Nx.to_number(RegMetrics.r2_score(y_test, y_hat_ridge))
end
def ridge_1d(points, x_train, x_test, y_train, y_test)
when is_list(points) do
points
|> Enum.map(&ridge_single(&1, x_train, x_test, y_train, y_test))
end
end
GridSearch.ridge_1d([0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], x_train, x_test, y_train, y_test)
model_ridge = Ridge.fit(x_train, y_train, fit_intercept?: false, alpha: 0.1) |> dbg
y_hat_ridge = Ridge.predict(model_ridge, x_test)
Nx.to_number(RegMetrics.r2_score(y_test, y_hat_ridge))
Compare with the average of that time frame.
filled_poll_weeks
|> DF.mutate(
afd: S.mean(afd),
csu: S.mean(csu),
fdp: S.mean(fdp),
fw: S.mean(fw),
gruene: S.mean(gruene),
linke: S.mean(linke),
spd: S.mean(spd)
)
|> DF.discard(["week"])
|> DF.distinct()
|> DF.print()
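For context, a possible baseline (a sketch, not part of the original notebook): how much of the weekly poll variation is explained by the per-party averages alone, using the same R² metric as above.
# R² when every weekly poll value is predicted by its party's mean poll value.
party_mean_df =
  weekly_polls_sentiment_df
  |> DF.group_by("party")
  |> DF.mutate(poll_mean: mean(poll))
  |> DF.ungroup()
RegMetrics.r2_score(
  S.to_tensor(party_mean_df["poll"]),
  S.to_tensor(party_mean_df["poll_mean"])
)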