Powered by AppSignal & Oban Pro

Data Preprocessing

dataPreprocessing .livemd

Data Preprocessing

Mix.install([
  {:req, "~> 0.4.8"},
  {:explorer, "~> 0.8.0"},
  {:kino_explorer, "~> 0.1.18"},
  {:vega_lite, "~> 0.1.8"},
  {:kino_vega_lite, "~> 0.1.3"},
  {:tzdata, "~> 1.1"},
  {:nx, "~> 0.7.0"},
  {:scholar, "~> 0.3.0"}
])

Section

require Explorer.DataFrame, as: DF
require Explorer.Series, as: Series
require VegaLite, as: Vl

For simplicity and flexibility purposes, let’s define the file paths to all the csv files that is needed.

defmodule Constants do
  @answers_file_path "/Users/qianqian/TPCDataAnalysis/RealData/answers-aaf7180-2025-04-22 14_17_28.csv"
  @events_file_path "/Users/qianqian/TPCDataAnalysis/RealData/events-aaf7180-2025-04-22 14_26_18.csv"
  @participation_file_path "/Users/qianqian/TPCDataAnalysis/RealData/participation-aaf7180-2025-04-22 14_17_51.csv"
  @clustering_label_file_path "/Users/qianqian/profile-clustering-filterCol-combCol-2.csv"
  @clustering_label2_file_path "/Users/qianqian/profile-sub-clustering-2.csv"
  
  def get_answers_file_path, do: @answers_file_path
  def get_events_file_path, do: @events_file_path
  def get_participation_file_path, do: @participation_file_path
  def get_clustering_label_file_path, do: @clustering_label_file_path
  def get_clustering_label2_file_path, do: @clustering_label2_file_path
end

1. Answers.csv

Initial inspection into answers.csv.

answers_df = DF.from_csv!(Constants.get_answers_file_path())
IO.inspect(answers_df)

Throughout the online course, students were asked to answer some questionnaires. Initial questionnaire to understand the basic information of the participants (such as the country of origin, gender, knowledge level of technology, etc). Other questionnaire are also included halfway through the course, to understand how they have being doing it. Lastly, a final questionnaire is included to ask regarding their final experience. This csv file contains all the answers responded by the participants.

Different questions are divided into different categories, such as:

  1. Profile - Contains some initial questions that are asked at the beginning of the course to gather some basic information of the students
  2. Access - The level of access to the technology. These are asked at the beginning of the course
  3. selfpre - The level of knowledge on the technology at the beginning of the course.
  4. Feedback - Contains information for feedback after finishing up a unit. This can be after watching a video, etc.
  5. Selfpost - The level of knowledge after finishing the course
  6. Satisfaction - The level of satisfaccion after finishing the course
  7. Usability - Usability level of the entire course, hence it is asked after finishing the course
questions2category = DF.distinct(answers_df, ["Categoría de la pregunta", "Texto de la pregunta"])
questions2type = DF.distinct(answers_df, ["Tipo de pregunta", "Texto de la pregunta"])
values = DF.pull(questions2category, "Categoría de la pregunta") |> Explorer.Series.to_list()
keys = DF.pull(questions2category, "Texto de la pregunta") |> Explorer.Series.to_list()
# Combining into a map
question_category_map = Enum.zip(keys, values) |> Map.new()

values = DF.pull(questions2type, "Tipo de pregunta") |> Explorer.Series.to_list()
keys = DF.pull(questions2type, "Texto de la pregunta") |> Explorer.Series.to_list()
# Combining into a map
question_type_map = Enum.zip(keys, values) |> Map.new()

1.1 Gathering student profiles

Just like we mentioned, we have different questionnaires throughout the course. What we are interested are mainly the student profile, where it includes all the academic backgrounds, etc. Especially when we want to perform a clustering technique. Therefore, I will retrieve those information independently, forming a unique dataset. By analyzing this dataset individually can also provide us an insight on the type of students that we have.

#This is a module used to generate a row where it represents the participant's answers
defmodule ParticipantRowBuilder do
  alias Explorer.DataFrame, as: DF
  alias Explorer.Series, as: Series

  def build_participant_row_data(participant, answers_df, new_df_col_names) do
    participant_code = participant["ID de participante"]

    # Filter the answers for this participant
    participant_answers =
      DF.filter_with(answers_df, fn answer ->
        Series.equal(answer["ID de participante"], participant_code)
      end)
      |> DF.to_rows()

    Enum.reduce(new_df_col_names, %{}, fn col, acc ->
      cond do
        col == "Participant ID" ->
          Map.put(acc, col, participant_code)

        true ->
          answer_row =
            Enum.find(participant_answers, fn q ->
              q["Texto de la pregunta"] == col
            end)

          answer =
            if answer_row do
              answer_row["Selección en la respuesta"]
            else
              -1
            end
          Map.put(acc, col, answer)
      end
    end)
  end
end
profile_questions_list = 
  answers_df
  |> DF.filter(col("Categoría de la pregunta") == "profile")
  |> DF.select(["Texto de la pregunta"])
  |> DF.distinct(["Texto de la pregunta"])
  |> DF.to_rows()

profile_questions_list = Enum.map(profile_questions_list, & &1["Texto de la pregunta"])


#IO.inspect(profile_questions_list)# We have 13 questions that belong to the profile questions 
column_values = ["Participant ID" | profile_questions_list]
#Generate a profile dataset, where each row represents the profile of the participants


# Get all unique participants - 878 in total
participants_list =
  answers_df
  |> DF.select(["ID de participante"])
  |> DF.distinct(["ID de participante"])
  |> DF.to_rows()

participants_profile_rows =
  Enum.map(participants_list, fn row ->
    ParticipantRowBuilder.build_participant_row_data(row, answers_df, column_values)
  end)

student_profile_df = DF.new(participants_profile_rows)

1.2 Gathering answers for initial questionnaire

init_questions_list = 
  answers_df
  |> DF.filter(col("Categoría de la pregunta") == "profile" 
    or col("Categoría de la pregunta") == "access" or col("Categoría de la pregunta") == "selfpre" )
  |> DF.select(["Texto de la pregunta"])
  |> DF.distinct(["Texto de la pregunta"])
  |> DF.to_rows()

init_questions_list = Enum.map(init_questions_list, & &1["Texto de la pregunta"])

IO.inspect(length(init_questions_list)) # We have 29 questions that were asked in the initial questionnaire questions 
column_values = ["Participant ID" | init_questions_list]
#Generate a profile dataset, where each row represents the profile of the participants

participants_profile_rows =
  Enum.map(participants_list, fn row ->
    ParticipantRowBuilder.build_participant_row_data(row, answers_df, column_values)
  end)

init_answers_df = DF.new(participants_profile_rows)

1.3 Gather answers for all questions in the csv files

# Get all unique questions
questions_df = DF.distinct(answers_df, ["Texto de la pregunta"])
questions_list = DF.to_series(questions_df) |> then(&Series.to_list(&1["Texto de la pregunta"]))
IO.inspect(length(questions_list)) #55 questions in total
column_values = ["Participant ID" | questions_list]

# Build a list of maps for each participant
participant_rows =
  Enum.map(participants_list, fn row ->
    ParticipantRowBuilder.build_participant_row_data(row, answers_df, column_values)
  end)

# Build the final DataFrame all at once
questions_df = DF.new(participant_rows)

#export dataframe to CSV file
#DF.to_csv!(questions_df, "all-questions.csv")

1.4 Checking for questions that are not answered by the students

Questions not answered by the student could potentially means either the student didn’t wants to answer, or the student having difficulties of answering the questions. By having this in mind, we will not discard them. In fact we will make use of them and consider them as a variable resource when predicting which questions will more likely to be not responded by the student.

profile_questions_list
question_categories = DF.distinct(answers_df, ["Categoría de la pregunta"])
question_types = DF.distinct(answers_df, ["Tipo de pregunta"])
keys = DF.pull(question_categories, "Categoría de la pregunta") |> Explorer.Series.to_list()
values = Enum.to_list(0..length(keys)-1)
# Zip into a map
category_index_map = Enum.zip(keys, values) |> Map.new()

keys = DF.pull(question_types, "Tipo de pregunta") |> Explorer.Series.to_list()
values = Enum.to_list(0..length(keys)-1)
# Zip into a map
type_index_map = Enum.zip(keys, values) |> Map.new()
#We generate a data frame called is_answered_df, for our model addressing our first research question
is_answered_df =
  DF.to_rows(questions_df)
  |> Enum.flat_map(fn row ->
    student_id = row["Participant ID"] 

    row
    |> Enum.reject(fn {key, _} -> key == "Participant ID" end)  # Remove "Participant ID"
    |> Enum.map(fn {question_id, answer} ->
      %{
        "StudentID" => student_id,
        "Question" => question_id,
        "Answer" => answer,
        "Answered" => if answer == -1 do 0 else 1 end #1 for did answered, 0 for not answered
      }
      end)
  end)
  |> DF.new()


is_answered_df =
  DF.join(is_answered_df, answers_df,
    how: :left,
    on: [
      {"StudentID", "ID de participante"},
      {"Question", "Texto de la pregunta"}
    ]
  ) |> DF.select(["Answer", "Answered", "Question", "StudentID", "ID Actividad", "Categoría de la pregunta", "Tipo de pregunta"])

is_answered_df =
  DF.to_rows(is_answered_df)
  |> Enum.map(fn row ->
  category = row["Categoría de la pregunta"]
  type = row["Tipo de pregunta"]

  # Step 3: Convert the category to numerical index
  category_index = case category do
    "profile" -> 1
    "selfpre" -> 4
    "access" -> 0
    "feedback" -> 3
    "satisfaction" -> 5 
    "selfpost" -> 4
    "usability" -> 6
    nil -> Map.get(category_index_map, Map.get(question_category_map, row["Question"]))  # -1 incase the student did not reply
  end

  type_index = case type do
    "mcq" -> 0
    "likert" -> 1
    nil -> Map.get(type_index_map , Map.get(question_type_map, row["Question"])) # -1 incase the student did not reply
  end


  # Step 4: Add the new 'Category_Index' to the row
  row
    |> Map.put("Category_Index", category_index)
    |> Map.put("Type_Index", type_index)
    end) |> DF.new() |> DF.discard(["Categoría de la pregunta", "Tipo de pregunta"])

We observe that the “Texto de la respuesta” + “Seleccion en la respuesta” represents the same piece of information, that is, the option that the students selected. Moreover, we are using machine learning models later on where “Texto de la respuesta” seems to be more useful. We remove “Seleccion en la respuesta”

answers_df = DF.discard(answers_df, ["Texto de la respuesta"])

2. Events.csv

events_df = DF.from_csv!(Constants.get_events_file_path(), dtypes: %{"Participante" => :string})

In events.csv, there are some interesting time component to be consider when addressing our research questions.

DF.dtypes(events_df)
different_event_df = DF.select(events_df, ["Tipo de evento"]) |> DF.distinct()

We can see that the “Ocurrío en” column contains a string datatype.

In Elixir, there is an interesting module DateTime that represents a special data structure for date and time. More interestingly, there is a built-in function that directly computes the time differences in seconds. To generate this data structure, we need different information such as year, month, date, time, timezone, etc. So what we need to do first is to gather all this information from the string. Hence, a special module is created, called “TimeParser”

defmodule TimeParser do
  @monthRegex ~r/^\d{4}-(\d{2})-\d{2}/
  @dateRegex ~r/^\d{4}-\d{2}-(\d{2})/
  @yearRegex ~r/^(\d{4})-\d{2}-\d{2}/
  @hourRegex ~r/(\d{2}):\d{2}:\d{2}/
  @minuteRegex ~r/\d{2}:(\d{2}):\d{2}/
  @secondRegex ~r/\d{2}:\d{2}:(\d{2})/
  @microsecondRegex ~r/\d{2}:\d{2}:\d{2}\.(\d{6})/
  @timezoneRegex ~r/\s[A-Za-z]+\s([\w\/]+)$/

  defp safe_extract_integer(regex, text) do
    with true <- is_binary(text),
         [_, value] <- Regex.run(regex, text),
         {int, _} <- Integer.parse(value) do
      int
    else
      _ -> 0
    end
  end

  def get_hour(text), do: safe_extract_integer(@hourRegex, text)
  def get_minute(text), do: safe_extract_integer(@minuteRegex, text)
  def get_second(text), do: safe_extract_integer(@secondRegex, text)
  def get_microsecond(text), do: safe_extract_integer(@microsecondRegex, text)
  def get_month(text), do: safe_extract_integer(@monthRegex, text)
  def get_date(text), do: safe_extract_integer(@dateRegex, text)
  def get_year(text), do: safe_extract_integer(@yearRegex, text)

  def get_timezone(text) when is_binary(text) do
    case Regex.run(@timezoneRegex, text) do
      [_, tz] -> tz
      _ -> nil
    end
  end

  def to_seconds(nil), do: 0

  def to_seconds(time_str) do
    h = get_hour(time_str)
    m = get_minute(time_str)
    s = get_second(time_str)
    h * 3600 + m * 60 + s
  end

  def get_timezone(_), do: nil
end
#define a function to determine the time differences in seconds
generate_dateTime_diff_from_str = fn str1, str2 ->
  entered_time = Time.new!(TimeParser.get_hour(str1), TimeParser.get_minute(str1), TimeParser.get_second(str1), TimeParser.get_microsecond(str1))  
  entered_date = Date.new!(TimeParser.get_year(str1), TimeParser.get_month(str1), TimeParser.get_date(str1))
  entered_dateTime = DateTime.new!(entered_date, entered_time, TimeParser.get_timezone(str1), Tzdata.TimeZoneDatabase)
  #get the exited date
  exited_date = Date.new!(TimeParser.get_year(str2), TimeParser.get_month(str2), TimeParser.get_date(str2))
  exited_time = Time.new!(TimeParser.get_hour(str2), TimeParser.get_minute(str2), TimeParser.get_second(str2), TimeParser.get_microsecond(str2))
  exited_dateTime = DateTime.new!(exited_date, exited_time, TimeParser.get_timezone(str2), Tzdata.TimeZoneDatabase)

  DateTime.diff(entered_dateTime, exited_dateTime)
end

enter_exit_df =
  events_df
|> DF.filter_with( fn df ->
    col = df["Tipo de actividad"]
    cond_a = Explorer.Series.equal(col, "likert")
    cond_b = Explorer.Series.equal(col, "mcq")
    Explorer.Series.or(cond_a, cond_b)
  end)
|> DF.filter_with(fn df ->
    col = df["Tipo de evento"]
    cond_a = Explorer.Series.equal(col, "entered")
    cond_b = Explorer.Series.equal(col, "exited")
    Explorer.Series.or(cond_a, cond_b)
  end)

distinct_student_activities = 
  enter_exit_df
  |> DF.distinct(["ID Actividad","ID de participante"])
  |> DF.to_rows()
  |> Enum.map(fn %{"ID Actividad" => current_id, "ID de participante" => current_participant} ->
    df = 
      enter_exit_df
      |> DF.filter_with(fn enter_exit_df -> Explorer.Series.equal(enter_exit_df["ID Actividad"], current_id) end)
      |> DF.filter_with(fn enter_exit_df -> Explorer.Series.equal(enter_exit_df["ID de participante"], current_participant) end)

    entered_time = 
      df
      |> DF.filter_with(fn df -> Explorer.Series.equal(df["Tipo de evento"], "entered") end)
      |> DF.sort_with(&[&1["Ocurrió en"]])
      |> DF.head(1)
      |> DF.to_rows()
      |> List.first()
      |> Map.get("Ocurrió en")

    exited_time =
      df
      |> DF.filter_with(fn df -> Explorer.Series.equal(df["Tipo de evento"], "exited") end)
      |> DF.sort_with(&[desc: &1["Ocurrió en"]])
      |> DF.head(1)
      |> DF.to_rows()
      |> List.first()
      |> case do
        nil -> 
          current_time = DateTime.now!("Etc/UTC") 
          #convert current time to CET
          current_time = DateTime.shift_zone!(current_time, "Europe/Madrid", Tzdata.TimeZoneDatabase)
          DateTime.to_string(current_time)
        row -> Map.get(row, "Ocurrió en")  
      end

    time_diff_sec = generate_dateTime_diff_from_str.(exited_time, entered_time)

    %{
      id: current_id,
      participant: current_participant,
      total_time_video_sec: time_diff_sec,
      total_time_video_days: round(time_diff_sec / 86400)
    }
end)

total_video_time_df =DF.new(distinct_student_activities)
#store the "total_video_time_df" in a CSV file
#DF.to_csv!(total_video_time_df, "questions-event.csv")


total_video_time_df = DF.from_csv!("/Users/qianqian/questions-event.csv")
#left join
is_answered_df =
  DF.join(total_video_time_df, is_answered_df,
    how: :right,
    on: [
      {"id", "ID Actividad"},
      {"participant", "StudentID"}
    ]
  )

As we can see here, some row contains null days and sec. This is because there is no time recorded for the specific activity. In this case, we are handling questions activity ID, hence those rows represent those unreponded questions

is_answered_df =
  DF.to_rows(is_answered_df)
  |> Enum.map(fn row ->
  day = row["total_time_video_days"]
  sec = row["total_time_video_sec"]

  # Generating zero value when day and sec row data equal null
  final_total_time_video_days = if day == nil do
    0
  else day
  end

  final_total_time_video_sec = if sec == nil do
    0
  else sec
  end


  # Step 4: Add the new 'Category_Index' to the row
  row
    |> Map.put("final_total_time_video_days", final_total_time_video_days)
    |> Map.put("final_total_time_video_sec", final_total_time_video_sec)
    end) |> DF.new() |> DF.discard(["total_time_video_days", "total_time_video_sec"])

#DF.to_csv!(is_answered_df, "answers-questions-event.csv")

We can also computer the “entered”, “replayed”, “move_backward” and “finished” frequency. We will be generating a new dataset to store these pieces of information.

participants_list = DF.select(events_df, ["Participante"])
participants_list = DF.distinct(participants_list, ["Participante"])
participants_list = DF.to_rows(participants_list)
#Process each row
new_rows = 
  participants_list
  |> Enum.map(fn
    participant-> 
      participant_name = participant["Participante"]
      filtered_df = DF.filter_with(events_df, fn events_df -> Explorer.Series.equal(events_df["Participante"], participant_name) end)
      
      # filter the database based on "entered" event
      entered_df = DF.filter_with(filtered_df, fn events_df -> Explorer.Series.equal(events_df["Tipo de evento"], "entered") end)
      # filter the database based on "moved_backward" event
      move_backward_df = DF.filter_with(filtered_df, fn events_df -> Explorer.Series.equal(events_df["Tipo de evento"], "moved_backward") end)
      replayed_df = DF.filter_with(filtered_df, fn events_df -> Explorer.Series.equal(events_df["Tipo de evento"], "replayed") end)
      finished_df = DF.filter_with(filtered_df, fn events_df -> Explorer.Series.equal(events_df["Tipo de evento"], "finished") end)
      
      entered_freq = DF.n_rows(entered_df)
      move_backward_freq = DF.n_rows(move_backward_df)
      replayed_freq = DF.n_rows(replayed_df)
      finished_freq = DF.n_rows(finished_df)
      
      %{
        participant: participant_name,
        total_entered_frequency: entered_freq,
        total_playback_frequency: move_backward_freq,
        total_replayed_frequency: replayed_freq,
        total_finished_frequency: finished_freq
      }
  end)

frequency_df = DF.new(new_rows)

3. participation.csv

Initial inspection into participation.csv

participation_df = DF.from_csv!(Constants.get_participation_file_path())
IO.inspect(participation_df)

This dataset contains very important information that we could use directly for our second research question. Such as, time following the course, number of video reproduced, boolean variable to represent whether a course has been studied at least 75%, etc. However, there is one more information that we could calculate. That is, how long a student has been enrolled up until their latest activity. This information can be usefull for the following insight:

  1. If a student has been enrolled for a long time but has very low participation (e.g. low number of video has been watched), they might be at risk of dropping out.
  2. Students who have been enrolled longer might show better engagement.
  3. If most students who stay enrolled for a long time don’t complete the course, it may indicate the course is not engaging or is too difficult.
# Convert DataFrame to a list of maps
participation_map_list = DF.to_rows(participation_df)
# Process every rows 
new_rows =
  participation_map_list
  |> Enum.map(fn
    row -> 
      # compute duration in second from the first enrollment date until now 
      enrolment_duration = generate_dateTime_diff_from_str.(row["Hora ultima actividad"], row["Hora comienzo curso"])
      # recalculate the duration into n days for better interpretation
      n_days = round(enrolment_duration / 86400)
      
      row
      |>Map.put("Enrolment duration (sec)", enrolment_duration)
      |>Map.put("Enrolment duration (days)", n_days)
  end)

# Convert back to DataFrame
participation_df = DF.new(new_rows)
participation_df = DF.discard(participation_df, ["Al menos ha visto un 75%", "Centro", "Curso", "Hora ultima actividad", "Hora ultimo actividad", "Participante", "Temario", "Hora comienzo curso"])
DF.dtypes(participation_df)
participation_df =
  DF.to_rows(participation_df)
  |> Enum.map(fn row ->
  at_least_flag = row["Al menos 7h 30'"]
  last_topic_finished = row["Último tema terminado"]

  at_least_flag = case at_least_flag do
    true -> 1
    false -> 0
    _ -> -1
  end

  last_topic_finished = case last_topic_finished do
    true -> 1
    false -> 0
    _ -> -1
  end

  # Step 4: Add the new 'Category_Index' to the row
  row
    |> Map.put("at least 7.5h", at_least_flag)
    |> Map.put("last topic finished", last_topic_finished)
    end) 
    |>DF.new()
    |> DF.discard(["Al menos 7h 30'", "Último tema terminado"])
t_v =
  participation_df["Tiempo mirando videos"]
  |> Explorer.Series.to_list()
  |> Enum.map(&TimeParser.to_seconds/1)

t_c = 
  participation_df["Tiempo siguiendo curso"]
  |> Explorer.Series.to_list()
  |> Enum.map(&TimeParser.to_seconds/1)

participation_df =
  participation_df
  |> DF.put("time_watching_video_second", t_v)
  |> DF.put("time_following_course_second", t_c)
is_finished =
  participation_df["time_watching_video_second"]
  |> Explorer.Series.to_list()
  |> Enum.map(fn time ->
    if time >= 4320 do
      1
    else
      0
    end
  end)

participation_df =
  participation_df
  |> DF.put("course_is_finished", is_finished)
participation_df = DF.discard(participation_df, ["time_watching_video_second", "Tiempo mirando videos", "Tiempo siguiendo curso", "Enrolment duration (days)"])
                                                                                                                                                                                                                                                                                                                                                                                                              #ººººº1ºº11qDF.to_csv!(participation_df, "student-performance.csv")

4. Clustering - Profile questions

Clustering algorithms (like K-Means, DBSCAN, Hierarchical Clustering) and most ML models (like neural networks, linear regression, etc.) operate on mathematical distances, gradients, and matrix operations. Furthermore, all ML and clustering libraries in Elixir work with Nx.Tensor, hence we will be using the NX library.

We will apply cluster algorithm based on the answers provided to the profile questions of the participants. Important, this includes both the answers that are provided or not provided by the students. With the aim of gathering a better overview of all the students enrolled in the course, even when some data is incomplete.

Later on, the labels will be used as a feature parameter for the future predictive models in the supervised learning strategy. Among all the questions that we are planning to address, the profile data of the customer seems to be an important ingredient. And this approach helps address the issue of missing data: If we rely solely on the provided answers to the profile questions, we risk losing valuable information and potentially degrading model performance. By assigning students to clusters based on the available patterns, we can capture underlying similarities and trends, allowing us to enrich the dataset and build more robust models.

When building a clustering model, we have two main questions to ask ourself:

  • How many cluster do we need to create to meaningfully differentiate betwwen distinct groups of students?
  • Which cluster algorithm should we use? As there are many, including K-means, DBSCAN, Hierarchical clustering.
defmodule KModes do
  @moduledoc """
  A simplified KModes clustering algorithm for categorical data.
  """

  # Public entry point
  def cluster(tensor, k, max_iter \\ 50) do
    # Convert Nx.Tensor to a list of rows
    data = Nx.to_list(tensor)
    
    # Randomly initialize k centroids
    centroids = Enum.take_random(data, k)
    
    iterate(data, centroids, k, max_iter)
  end

  # Main loop
  defp iterate(data, centroids, k, 0), do: assign_clusters(data, centroids)

  defp iterate(data, centroids, k, iter) do
    assignments = assign_clusters(data, centroids)
    new_centroids = recompute_centroids(assignments, k)

    if new_centroids == centroids do
      assignments
    else
      iterate(data, new_centroids, k, iter - 1)
    end
  end

  # Assign each item to the closest centroid using matching dissimilarity
  defp assign_clusters(data, centroids) do
    Enum.map(data, fn row ->
      {_, label} =
        centroids
        |> Enum.with_index()
        |> Enum.min_by(fn {centroid, _idx} -> matching_dissimilarity(row, centroid) end)
      
      {row, label}
    end)
  end

  # Recompute centroids as mode (most frequent value) per column
  defp recompute_centroids(assignments, k) do
    assignments
    # group by label and extract only the row from each pair to use in computing the centroids
    |> Enum.group_by(fn {_row, label} -> label end, fn {row, _label} -> row end)
    |> Enum.map(fn {_label, rows} -> column_modes(rows) end)
    |> pad_missing_centroids(k)
  end

  # Just in case some clusters were empty, it ensures k centroids are returned
  # Pad centroids if fewer than k clusters were populated
  defp pad_missing_centroids(centroids, k) do
    if length(centroids) < k do
      centroids ++ List.duplicate(List.first(centroids), k - length(centroids))
    else
      centroids
    end
  end

  # Mode for each column
  defp column_modes(rows) do
    rows
    |> Enum.zip()
    |> Enum.map(fn column ->
      column
      |> Tuple.to_list()
      |> Enum.frequencies()
      |> Enum.max_by(fn {_val, freq} -> freq end)
      |> elem(0)
    end)
  end

  # Count how many values differ (matching dissimilarity)
  defp matching_dissimilarity(row1, row2) do
    Enum.zip(row1, row2)
    |> Enum.count(fn {a, b} -> a != b end)
  end

  def silhouette_score(assignments) do
  clusters =
    assignments
    |> Enum.group_by(fn {_row, label} -> label end, fn {row, _label} -> row end)

  # Calculate silhouette score for each point
  scores =
    Enum.map(assignments, fn {point, label} ->
      a = average_dissimilarity(point, clusters[label])
      b =
        clusters
        |> Map.delete(label)
        |> Enum.map(fn {_other_label, points} -> average_dissimilarity(point, points) end)
        |> Enum.min()

      if a == b do
        0.0
      else
        (b - a) / max(a, b)
      end
    end)

  Enum.sum(scores) / length(scores)
end

# Helper function to calculate average dissimilarity to a list of points
defp average_dissimilarity(point, others) do
  others
  |> Enum.reject(&(&1 == point)) # Exclude self
  |> Enum.map(&matching_dissimilarity(point, &1))
  |> case do
    [] -> 0.0
    dissimilarities -> Enum.sum(dissimilarities) / length(dissimilarities)
  end
end

end
student_profile_df
df = DF.select(student_profile_df, ["¿Cuál es su país de nacimiento?"])

For machine learning purposes, we will be using tensor as a data structure for the model inputs.

tensor = Nx.stack(student_profile_df, axis: -1)
result = KModes.cluster(tensor,2, 500) #this returns a list of tuple {data, label}

data_list = Enum.map(result, fn {list, _label} -> list end) 
tensor_data = Nx.tensor(data_list)
IO.inspect(tensor_data)

label_list = Enum.map(result, fn {_list, label} -> label end)
tensor_label = Nx.tensor(label_list)
score = KModes.silhouette_score(result)
IO.puts("Silhouette Score: #{score}")
labels_list = Nx.to_flat_list(tensor_label)
cluster_df = DF.new(cluster: labels_list)
df_with_labels = DF.concat_columns(student_profile_df, cluster_df)

#DF.to_csv!(df_with_labels, "profile-clustering-2.csv")
#File.cwd!()
  • 2 clusters -> Silhouette Score: 0.4453125125238959
  • 3 clusters-> Silhouette Score: 0.07989377306790536
  • 4 clusters-> Silhouette Score: 0.1313027585955213
  • 5 clusters-> Silhouette Score: 0.14019041511784322
  • 6 clusters-> Silhouette Score: 0.11184457767867173
  • 10 clusters-> Silhouette Score: 0.048957542856680476

From this we can see that All other options (3–10) show much lower cohesion and separation, suggesting the data does not naturally support more clusters. Even though 0.445 isn’t extremely high, for categorical data using K-Modes, it’s a reasonable score, especially if the dataset is sparse or imbalanced. If the clusters are too broad or unhelpful, you might:

  • Try sub-clustering within each.
  • Use feature selection to refine input variables (remove irrelevant ones).

4.1 Feature selection

  • 2-> Silhouette Score: 0.4453125125238959
  • 3-> Silhouette Score: 0.07989377306790536
  • 4 -> Silhouette Score: 0.1313027585955213
  • 5 -> Silhouette Score: 0.14019041511784322
  • 6 -> Silhouette Score: 0.11184457767867173
  • 10 -> Silhouette Score: 0.048957542856680476

From this we can see that All other options (3–10) show much lower cohesion and separation, suggesting the data does not naturally support more clusters. Even though 0.445 isn’t extremely high, for categorical data using K-Modes, it’s a reasonable score, especially if the dataset is sparse or imbalanced. Let’s try out feature selection.

  • Use feature selection to refine input variables (remove irrelevant ones).

  • ¿Cual es el país de nacimiento” -> removed, since only a couple are answered hence does not provide meaningful information

  • ¿Cuál fue el nivel más alto de estudios que cursó su madre? & ¿Cuál fue el nivel más alto de estudios que cursó su padre? -> These two feature can be combinad using max. These features are semantically related — both reflect household education background. If treated separately, they may add redundant information. Combining them reduces dimensionality and simplifies clustering.

  • Chi-squared test -> for correlation. In case high correlation -> keep one that contains fewer missing values.

#Discard "¿Cual es el país de nacimiento” column
filtered_student_col_df = DF.discard(student_profile_df, ["¿Cuál es su país de nacimiento?"])
tensor = Nx.stack(student_profile_df, axis: -1)
filtered_student_result = KModes.cluster(tensor,2, 500) #this returns a list of tuple {data, label}

data_list = Enum.map(filtered_student_result, fn {list, _label} -> list end) 
tensor_data = Nx.tensor(data_list)
IO.inspect(tensor_data)

label_list = Enum.map(filtered_student_result, fn {_list, label} -> label end)
tensor_label = Nx.tensor(label_list)
IO.inspect(tensor_label)

score = KModes.silhouette_score(filtered_student_result)
IO.puts("Silhouette Score: #{score}")
  • 2-> Silhouette Score: 0.4453125125238959
  • 3-> Silhouette Score: 0.25224026519341247
  • 4 -> Silhouette Score: 0.1318984987189088
  • 5 -> Silhouette Score: 0.13588691520357365
  • 6 -> Silhouette Score: 0.12828607767352374
  • 10 -> Silhouette Score: 0.11067767090068065

After dropping a non-informative column, the clustering algorithm showed better convergence—especially for k=3—as evidenced by the improved Silhouette Score. This suggests that the removed column was likely adding noise or diluting meaningful distance relationships, a classic issue when dealing with high-dimensional or irrelevant features in clustering. -> A bit of improvement but somehow moderate

# We now compute the highest parental education level.

# Pull each column as lists
mum_col = DF.pull(filtered_student_col_df, "¿Cuál fue el nivel más alto de estudios que cursó su madre?") |> Series.to_list() 
dad_col = DF.pull(filtered_student_col_df, "¿Cuál fue el nivel más alto de estudios que cursó su padre?") |> Series.to_list()

# Compute element-wise max
max_col = Enum.zip(mum_col, dad_col) |> Enum.map(fn {m, p} -> max(m, p) end)

# Add new column to the DataFrame
combCol_filtered_student_df = 
  filtered_student_col_df
  |> DF.put("Educación más alta de padres", max_col)
  |> DF.discard(["¿Cuál fue el nivel más alto de estudios que cursó su madre?", "¿Cuál fue el nivel más alto de estudios que cursó su padre?"])

tensor = Nx.stack(combCol_filtered_student_df, axis: -1)
combCol_filtered_student_result = KModes.cluster(tensor,2, 500) #this returns a list of tuple {data, label}

data_list = Enum.map(combCol_filtered_student_result, fn {list, _label} -> list end) 
tensor_data = Nx.tensor(data_list)
IO.inspect(tensor_data)

label_list = Enum.map(combCol_filtered_student_result, fn {_list, label} -> label end)
tensor_label = Nx.tensor(label_list)
IO.inspect(tensor_label)

score = KModes.silhouette_score(combCol_filtered_student_result)
IO.puts("Silhouette Score: #{score}")
  • 2-> Silhouette Score: 0.4535754484319116
  • 3-> Silhouette Score: 0.2462278684874294
  • 4 -> Silhouette Score: 0.15134027451958038
  • 5 -> Silhouette Score: 0.13334222537704885
  • 6 -> Silhouette Score: 0.12239520690271541
  • 10 -> Silhouette Score: 0.1488067659185638

After dropping a non-informative column, the clustering algorithm showed better convergence—especially for k=3—as evidenced by the improved Silhouette Score. This suggests that the removed column was likely adding noise or diluting meaningful distance relationships, a classic issue when dealing with high-dimensional or irrelevant features in clustering. -> A bit of improvement but somehow moderate

labels_list = Nx.to_flat_list(tensor_label)
cluster_df = DF.new(cluster: labels_list)
df_with_labels = DF.concat_columns(combCol_filtered_student_df, cluster_df)

#DF.to_csv!(df_with_labels, "profile-clustering-filterCol-combCol-2.csv")
#File.cwd!()
cluster_label_df = DF.from_csv!("/Users/qianqian/profile-clustering-filterCol-combCol-2.csv")
cluster_1_df = DF.filter_with(cluster_label_df, fn events_df -> Explorer.Series.equal(events_df["cluster"], 0) end) |> DF.discard(["cluster"])

Removing the group of people who are inactive (in cluster 0). And perform sub-clustering

#Remove cluster 1 data
cluster_label_df = DF.from_csv!(Constants.get_clustering_label_file_path())

#inactive_student_df = DF.filter_with(cluster_label_df, fn events_df -> Explorer.Series.equal(events_df["cluster"], 0) end)
cluster_1_df = DF.filter_with(cluster_label_df, fn events_df -> Explorer.Series.equal(events_df["cluster"], 1) end) |> DF.discard(["cluster"])

tensor = Nx.stack(cluster_1_df, axis: -1)
combCol_filtered_student_result = KModes.cluster(tensor,2, 500) #this returns a list of tuple {data, label}

data_list = Enum.map(combCol_filtered_student_result, fn {list, _label} -> list end) 
tensor_data = Nx.tensor(data_list)
IO.inspect(tensor_data)

label_list = Enum.map(combCol_filtered_student_result, fn {_list, label} -> label end)
tensor_label = Nx.tensor(label_list)
IO.inspect(tensor_label)

score = KModes.silhouette_score(combCol_filtered_student_result)
IO.puts("Silhouette Score: #{score}")
inactive_student_df = DF.filter_with(cluster_label_df, fn events_df -> Explorer.Series.equal(events_df["cluster"], 0) end)
labels_list = Nx.to_flat_list(tensor_label)
cluster_df = DF.new(cluster: labels_list)
df_with_labels = DF.concat_columns(cluster_1_df, cluster_df)

#DF.to_csv!(df_with_labels, "profile-sub-clustering-2.csv")
#File.cwd!()
defmodule DataSummariser do
  alias Explorer.DataFrame, as: DF
  alias Explorer.Series
  require VegaLite, as: Vl

  def generate_frequency_per_group(df, col_name) do
    df
    |> DF.group_by(col_name)
    |> DF.summarise_with(&[
      count: Series.count(&1[col_name])
    ])

  end
end
#generate new df with new cluster label after second clustering
cluster_label_df2 = DF.from_csv!("/Users/qianqian/profile-sub-clustering-2.csv")
cluster_label_df2 = DF.mutate(cluster_label_df2, cluster: cluster + 1)
cluster_label_df2 = DF.filter_with(cluster_label_df2, fn events_df -> Explorer.Series.equal(events_df["cluster"], 2) end)
summary = DataSummariser.generate_frequency_per_group(cluster_label_df2, "¿Entiende la lengua española?")
#get the inactive participants after the first clustering
cluster_label_df1 = DF.from_csv!(Constants.get_clustering_label_file_path())
cluster_label_df1 = DF.filter_with(cluster_label_df1, fn events_df -> Explorer.Series.equal(events_df["cluster"], 0) end)

#generate new df with new cluster label after second clustering
cluster_label_df2 = DF.from_csv!(Constants.get_clustering_label2_file_path())
cluster_label_df2 = DF.mutate(cluster_label_df2, cluster: cluster + 1)

#combine two dataframes - row-wise
final_cluster_df = DF.concat_rows([cluster_label_df1, cluster_label_df2]) |> DF.select(["Participant ID", "cluster"])
#DF.to_csv!(final_cluster_df, "final-profile-label.csv")