# LcSeg
# Define a module to parse manually annotated topic changes
defmodule Test.LcSeg.TopicChange do
  @moduledoc """
  A single annotated topic change.

  Holds three fields, all defaulting to `nil`: `:start`, `:end` and `:id`.
  """

  defstruct start: nil, end: nil, id: nil
end
defmodule Test.LcSeg.TopicChangeParser do
  @moduledoc """
  Parses manually annotated topic-change files into `Test.LcSeg.TopicChange` structs.
  """

  alias Test.LcSeg.TopicChange

  @doc """
  Reads the file at `topic_metadata_file_path`, parses it with
  `Floki.parse_document/1` and builds one `TopicChange` per `top` element found.

  For each element, `id` comes from its `nite:id` attribute, while `start` and
  `end` come from its `starttime` and `endtime` attributes converted to floats.
  Integer-formatted times such as `"12"` are accepted and returned as `12.0`.

  Returns `{:ok, topic_changes}`.

  Raises if the file cannot be read, if the document does not parse, or if a
  `top` element lacks one of the attributes above. A time attribute that is not
  a number raises `ArgumentError`.
  """
  def parse(topic_metadata_file_path) do
    raw_topic_metadata = File.read!(topic_metadata_file_path)
    {:ok, parsed} = Floki.parse_document(raw_topic_metadata)

    # No debug printing here: this script writes its final JSON to stdout, so
    # anything else printed along the way would end up mixed into that output.
    topic_changes =
      parsed
      |> Floki.find("top")
      |> Enum.map(&to_topic_change/1)

    {:ok, topic_changes}
  end

  # Builds a TopicChange from one `top` element.
  defp to_topic_change(segment) do
    %TopicChange{
      id: first_attribute!(segment, "nite:id"),
      start: segment |> first_attribute!("starttime") |> parse_time!(),
      end: segment |> first_attribute!("endtime") |> parse_time!()
    }
  end

  # Returns the first value of attribute `name`; raises when it is absent.
  defp first_attribute!(segment, name) do
    segment
    |> Floki.attribute(name)
    |> hd()
  end

  # Converts a time attribute to a float. `Float.parse/1` is used rather than
  # `String.to_float/1` because the latter rejects values without a decimal
  # point (e.g. "12").
  defp parse_time!(raw) do
    case Float.parse(raw) do
      {value, ""} -> value
      _ -> raise ArgumentError, "expected a numeric time, got: #{inspect(raw)}"
    end
  end
end
# Base names (without extension) of the transcripts that have topic-change
# annotations available.
transcript_files_with_annotations = ~w(
  Bed003 Bed004 Bed011
  Bmr001 Bmr002 Bmr005 Bmr007 Bmr008 Bmr009 Bmr010
  Bmr011 Bmr012 Bmr013 Bmr014 Bmr018 Bmr021 Bmr022
  Bmr024 Bmr025 Bmr026 Bmr027 Bmr029
  Bro004 Bro007 Bro015
)
# Parse every transcript that has topic-change annotations
# Map of file name => %{transcript: ..., annotated_topic_changes: ...}, built by
# parsing each transcript fixture and its matching topic-change fixture.
# Either parser returning anything other than {:ok, _} aborts with a MatchError.
transcripts_with_changes =
  Map.new(transcript_files_with_annotations, fn file_name ->
    transcript_path = "test/fixtures/transcripts/#{file_name}.mrt"
    topic_changes_path = "test/fixtures/topic_changes/#{file_name}.tops.xml"

    {:ok, transcript} = Test.LcSeg.MrtTranscriptParser.parse(transcript_path)
    {:ok, topic_changes} = Test.LcSeg.TopicChangeParser.parse(topic_changes_path)

    {file_name, %{transcript: transcript, annotated_topic_changes: topic_changes}}
  end)
# Define graphing utilities
defmodule Graphing do
  @moduledoc """
  Helpers for smoothing series of data points and turning them into VegaLite charts.
  """

  @doc """
  Smooths `data_points` with a moving average over `y_field`.

  The points are grouped into windows of up to `factor` consecutive points,
  advancing one point at a time. Each window yields a map with two keys:
  `x_field`, taken from the first point of the window, and `y_field`, the mean
  of the window's `y_field` values (a window shorter than `factor` is averaged
  over the points it actually contains).

  Raises `KeyError` if a point has no `y_field` key.
  """
  def smooth(data_points, factor, x_field, y_field) do
    for [first_point | _] = window <- Enum.chunk_every(data_points, factor, 1) do
      y_values = Enum.map(window, &Map.fetch!(&1, y_field))
      mean = Enum.sum(y_values) / length(y_values)

      Map.new([{x_field, first_point[x_field]}, {y_field, mean}])
    end
  end

  @doc """
  Builds a 750x400 VegaLite chart of `data` using mark `type`, with `x_field`
  on the x axis and `y_field` on the y axis, both on a linear scale.
  """
  def graph(data, type, x_field, y_field) do
    base_chart =
      [width: 750, height: 400]
      |> VegaLite.new()
      |> VegaLite.mark(type)
      |> VegaLite.data_from_values(data)

    Enum.reduce([x: x_field, y: y_field], base_chart, fn {channel, field}, chart ->
      VegaLite.encode_field(chart, channel, field, scale: [type: :linear])
    end)
  end
end
# For every parsed transcript, compute the smoothed cohesion series and the
# topic-change probabilities derived from it, and attach both to the
# transcript's map under :cohesion_over_time and :probabilities.
# The result is a list with one map per transcript; the file name key is not
# needed past this point, hence the underscore-prefixed binding.
graphables =
  Enum.map(transcripts_with_changes, fn {_file_name,
                                         %{transcript: transcript} = annotated_transcript} ->
    cohesion_over_time =
      transcript
      |> LcSeg.calculate_lexical_chains(h: 1)
      |> LcSeg.cohesion_over_time(k: 3)
      # Moving average over 5 consecutive points to even out the raw series.
      |> Graphing.smooth(5, :playback_time, :cohesion)

    probabilities = LcSeg.topic_change_probabilities(cohesion_over_time)

    Map.merge(annotated_transcript, %{
      cohesion_over_time: cohesion_over_time,
      probabilities: probabilities
    })
  end)
# Build three charts per transcript (cohesion line, top topic-change candidates,
# manually annotated changes), stack all of them vertically and print the
# resulting VegaLite spec as JSON on stdout.
chart_rows =
  Enum.flat_map(graphables, fn %{
                                 cohesion_over_time: cohesion_points,
                                 probabilities: scored_points,
                                 annotated_topic_changes: manual_changes
                               } ->
    change_markers =
      for topic_change <- manual_changes do
        %{playback_time: topic_change.start, change: 1}
      end

    # Keep only the five highest-probability candidates.
    # (An earlier variant, since disabled, instead filtered candidates against
    # the mean and standard deviation computed with Statistex.)
    top_candidates =
      scored_points
      |> Enum.sort_by(fn point -> -point.probability end)
      |> Enum.take(5)

    # Both point series get one extra point at the first and at the last
    # playback time of the cohesion series.
    first_time = hd(cohesion_points).playback_time
    last_time = List.last(cohesion_points).playback_time

    candidate_series =
      [%{playback_time: first_time, probability: 1} | top_candidates] ++
        [%{playback_time: last_time, probability: 1}]

    change_series =
      [%{playback_time: first_time, change: 0} | change_markers] ++
        [%{playback_time: last_time, change: 0}]

    cohesion_chart =
      VegaLite.new(width: 750, height: 400)
      |> VegaLite.data_from_values(cohesion_points)
      |> VegaLite.mark(:line, tooltip: true)
      |> VegaLite.encode_field(:x, "playback_time", scale: [type: :linear])
      |> VegaLite.encode_field(:y, "cohesion", scale: [type: :linear])

    [
      cohesion_chart,
      Graphing.graph(candidate_series, :point, "playback_time", "probability"),
      Graphing.graph(change_series, :point, "playback_time", "change")
    ]
  end)

VegaLite.new(width: 750, height: 800)
|> VegaLite.concat(chart_rows, :vertical)
|> VegaLite.to_spec()
|> Jason.encode!()
|> IO.puts()