Data understanding
Mix.install([
{:req, "~> 0.4.8"},
{:explorer, "~> 0.8.0"},
{:kino_explorer, "~> 0.1.18"},
{:vega_lite, "~> 0.1.8"},
{:kino_vega_lite, "~> 0.1.3"},
{:tzdata, "~> 1.1"},
{:scholar, "~> 0.3.0"},
{:tucan, "~> 0.5.0"}
])
1. Exploring “respuesta.csv”
require Explorer.DataFrame, as: DF
require Explorer.Series, as: Series
require VegaLite, as: Vl
answers_df = DF.from_csv!("/Users/qianqian/TPCDataAnalysis/RealData/answers-aaf7180-2025-04-22 14_17_28.csv")
DF.dtypes(answers_df)
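Before filtering, it can help to check the size of the answers table and peek at a few rows; a quick sketch using only the dataframe loaded above:

# Number of rows and columns in the answers export
{DF.n_rows(answers_df), DF.n_columns(answers_df)}

# First few rows, to see what the Spanish column names contain
DF.head(answers_df, 5)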
1.1 Student age distribution
student_age_df = DF.filter_with(answers_df, fn answers_df -> Explorer.Series.equal(answers_df["ID de pregunta"], "a30f78cc-f230-4851-a98d-5aaf2bf8bc19") end)
grouped_age_df =
student_age_df
|> DF.group_by("Texto de la respuesta")
|> DF.summarise_with(&[count: Series.count(&1["Selección en la respuesta"])])
# Draw a bar chart of the age distribution
bar_chart =
  Vl.new(width: 600, height: 400)
  |> Vl.data_from_values(grouped_age_df)
  |> Vl.mark(:bar, clip: true)
  |> Vl.encode_field(:x, "Texto de la respuesta", type: :nominal, title: "Age Range", axis: [label_angle: 0])
  |> Vl.encode_field(:y, "count",
    type: :quantitative,
    title: "Count",
    scale: [domain: [0, 280]]
  )
Kino.VegaLite.new(bar_chart)
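To read the distribution as shares rather than raw counts, a percentage column can be added before plotting. A minimal sketch reusing grouped_age_df (the "percent" column name is illustrative):

total = Series.sum(grouped_age_df["count"])

grouped_age_df =
  DF.put(
    grouped_age_df,
    "percent",
    grouped_age_df["count"] |> Series.divide(total) |> Series.multiply(100.0)
  )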
1.2 Student satisfaction level
student_age_df = DF.filter_with(answers_df, fn answers_df -> Explorer.Series.equal(answers_df["Categoría de la pregunta"], "satisfaction") end)
grouped_satisfaction_df = DF.filter_with(answers_df, fn answers_df -> Explorer.Series.equal(answers_df["ID de pregunta"], "eeec01ae-5b90-4d63-b400-5fd5ecd862b1") end)
grouped_satisfaction_df =
grouped_satisfaction_df
|> DF.group_by("Texto de la respuesta")
|> DF.summarise_with(&[count: Series.count(&1["Selección en la respuesta"])])
# Draw a bar chart of the satisfaction distribution
bar_chart =
  Vl.new(width: 600, height: 400)
  |> Vl.data_from_values(grouped_satisfaction_df)
  |> Vl.mark(:bar, clip: true)
  |> Vl.encode_field(:x, "Texto de la respuesta", type: :nominal, title: "Satisfaction", axis: [label_angle: 0])
  |> Vl.encode_field(:y, "count",
    type: :quantitative,
    title: "Count",
    scale: [domain: [0, 280]]
  )
Kino.VegaLite.new(bar_chart)
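Vega-Lite orders a nominal x axis alphabetically by default; if a count-ordered chart is preferred, the x encoding can be told to sort by the y value. A sketch using Vega-Lite's "-y" sort shorthand:

sorted_chart =
  Vl.new(width: 600, height: 400)
  |> Vl.data_from_values(grouped_satisfaction_df)
  |> Vl.mark(:bar, clip: true)
  |> Vl.encode_field(:x, "Texto de la respuesta",
    type: :nominal,
    title: "Satisfaction",
    sort: "-y",
    axis: [label_angle: 0]
  )
  |> Vl.encode_field(:y, "count", type: :quantitative, title: "Count")

Kino.VegaLite.new(sorted_chart)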
2. Exploring “participacion.csv”
participation_df = DF.from_csv!("/Users/qianqian/TPCDataAnalysis/RealData/participation-aaf7180-2025-04-22 14_17_51.csv")
DF.dtypes(participation_df)
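The two duration columns appear to come through as plain strings in this export, which is why a small regex parser is defined in the next subsection. Peeking at a few raw values shows the format being parsed (sketch only):

participation_df["Tiempo mirando videos"]
|> Series.head(5)
|> Series.to_list()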
2.1 Average Time Spent on Videos
defmodule TimeParser do
  # Extracts date/time components from strings of the form
  # "YYYY-MM-DD HH:MM:SS[.ffffff] [abbrev timezone]" using regular expressions.
  @month_regex ~r/^\d{4}-(\d{2})-\d{2}/
  @day_regex ~r/^\d{4}-\d{2}-(\d{2})/
  @year_regex ~r/^(\d{4})-\d{2}-\d{2}/
  @hour_regex ~r/(\d{2}):\d{2}:\d{2}/
  @minute_regex ~r/\d{2}:(\d{2}):\d{2}/
  @second_regex ~r/\d{2}:\d{2}:(\d{2})/
  @microsecond_regex ~r/\d{2}:\d{2}:\d{2}\.(\d{6})/
  @timezone_regex ~r/\s[A-Za-z]+\s([\w\/]+)$/

  # Returns the captured group as an integer, or 0 when the input is not a
  # string, the regex does not match, or the capture is not a number.
  defp safe_extract_integer(regex, text) do
    with true <- is_binary(text),
         [_, value] <- Regex.run(regex, text),
         {int, _} <- Integer.parse(value) do
      int
    else
      _ -> 0
    end
  end

  def get_hour(text), do: safe_extract_integer(@hour_regex, text)
  def get_minute(text), do: safe_extract_integer(@minute_regex, text)
  def get_second(text), do: safe_extract_integer(@second_regex, text)
  def get_microsecond(text), do: safe_extract_integer(@microsecond_regex, text)
  def get_month(text), do: safe_extract_integer(@month_regex, text)
  # Day of month
  def get_date(text), do: safe_extract_integer(@day_regex, text)
  def get_year(text), do: safe_extract_integer(@year_regex, text)

  def get_timezone(text) when is_binary(text) do
    case Regex.run(@timezone_regex, text) do
      [_, tz] -> tz
      _ -> nil
    end
  end

  def get_timezone(_), do: nil

  # Converts the "HH:MM:SS" part of a duration string to seconds; nil maps to 0.
  def to_seconds(nil), do: 0

  def to_seconds(time_str) do
    h = get_hour(time_str)
    m = get_minute(time_str)
    s = get_second(time_str)
    h * 3600 + m * 60 + s
  end
end
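A quick sanity check of the parser. The exact timestamp format in the export is not shown here, so these sample strings are illustrative:

TimeParser.to_seconds("01:02:03")
#=> 3723

TimeParser.to_seconds(nil)
#=> 0

TimeParser.get_timezone("2025-04-22 00:10:30 CEST Europe/Madrid")
#=> "Europe/Madrid"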
# Convert the duration columns to integer seconds and add them as new columns
t_v =
  participation_df["Tiempo mirando videos"]
  |> Series.to_list()
  |> Enum.map(&TimeParser.to_seconds/1)

t_c =
  participation_df["Tiempo siguiendo curso"]
  |> Series.to_list()
  |> Enum.map(&TimeParser.to_seconds/1)

participation_df =
  participation_df
  |> DF.put("time_watching_video_second", t_v)
  |> DF.put("time_following_course_second", t_c)
# Label each participant as "finished"/"not finished" using a threshold of
# 4320 seconds (1 h 12 min) of watched video
is_finished =
  participation_df["time_watching_video_second"]
  |> Series.to_list()
  |> Enum.map(fn time ->
    if time >= 4320 do
      "finished"
    else
      "not finished"
    end
  end)
participation_df =
participation_df
|> DF.put("course_is_finished", is_finished)
participation_df = DF.rename(participation_df, %{"Al menos 7h 30'" => "At least 7h 30"})
plot = Tucan.boxplot(participation_df, "time_watching_video_second", color_by: "At least 7h 30", height: 100, width: 1000)
3. Exploring “event.csv”
events_df = DF.from_csv!("/Users/qianqian/TPCDataAnalysis/RealData/events-aaf7180-2025-04-22 14_26_18.csv", dtypes: %{"Participante" => :string})
DF.dtypes(events_df)
df = DF.distinct(events_df, ["Tipo de actividad"])
df = DF.distinct(events_df, ["Tipo de evento"])
df = DF.distinct(events_df, ["Tipo de evento", "Tipo de actividad"])
3.1 Video rewinding behaviour
# Events where a participant moved backward while watching a video
moved_backward_df =
  events_df
  |> DF.filter_with(fn df -> Series.equal(df["Tipo de actividad"], "video") end)
  |> DF.filter_with(fn df -> Series.equal(df["Tipo de evento"], "moved_backward") end)

participants =
  moved_backward_df
  |> DF.distinct(["ID de participante"])
  |> DF.pull("ID de participante")
  |> Series.to_list()
# Compute counts for each participant
counts =
  for id <- participants do
    count =
      moved_backward_df
      |> DF.filter_with(fn df -> Series.equal(df["ID de participante"], id) end)
      |> DF.n_rows()

    %{participant_id: id, move_backward_video_count: count}
  end
# Create a new DataFrame from the list of maps
result_df = DF.new(counts)
plot = Tucan.boxplot(result_df, "move_backward_video_count", height: 100, width: 1000)
3.2 Student engagement
participants =
  events_df
  |> DF.distinct(["ID de participante"])
  |> DF.pull("ID de participante")
  |> Series.to_list()
# Compute counts for each participant
counts =
  for id <- participants do
    count =
      events_df
      |> DF.filter_with(fn df -> Series.equal(df["ID de participante"], id) end)
      |> DF.n_rows()

    %{participant_id: id, total_event_count: count}
  end
# Create a new DataFrame from the list of maps
result_df = DF.new(counts)
plot = Tucan.boxplot(result_df, "total_event_count", height: 100, width: 1000)