Spectrogram Plotting
Mix.install([
{:nx_signal, "~> 0.2"},
{:vega_lite, "~> 0.1.4"},
{:kino_vega_lite, "~> 0.1.1"}
])
Generating the audio data
# You can load an audio file here. For this example,
# we're producing 3 seconds of 220Hz, 440Hz, 1kHz and 3kHz sine waves
fs = 44.1e3
t_max = 3
full_n = ceil(fs * t_max)
half_n = div(full_n, 2)
# samples/sec * sec = samples
n = Nx.iota({half_n})
sin = fn freq, n ->
Nx.sin(Nx.multiply(2 * :math.pi() * freq / fs, n))
end
sin220 = sin.(220, n)
sin440 = sin.(440, n)
sin1000 = sin.(1000, n)
sin3000 = sin.(3000, n)
data = Nx.concatenate([Nx.add(sin440, sin1000), Nx.add(sin220, sin3000)])
n = Nx.iota({full_n})
d = %{data: Nx.to_flat_list(data[[1000..1250]]), n: Nx.to_flat_list(n[[1000..1250]])}
VegaLite.new(width: 600, height: 600, title: "Audio Sample")
|> VegaLite.data_from_values(d, only: ["n", "data"])
|> VegaLite.mark(:line)
|> VegaLite.encode_field(:x, "n", type: :ordinal)
|> VegaLite.encode_field(:y, "data", type: :quantitative)
defmodule Spectrogram do
alias VegaLite, as: Vl
import Nx.Defn
def calculate_stft_and_plot_spectrogram(
input,
fs,
window_duration_ms,
plot_cutoff_frequency \\ 4000
) do
n_window = ceil(fs * window_duration_ms)
{spectrogram, f, t, max_f} = stft(input, fs: fs, n_window: n_window)
max_f = Nx.to_number(max_f)
spectrogram = Nx.slice(spectrogram, [0, 0], [Nx.size(t), max_f])
f = Nx.slice(f, [0], [max_f])
spectrogram
|> to_plot_data(f, t, plot_cutoff_frequency)
|> plot()
end
defn stft(input, opts) do
fs = opts[:fs]
n_window = opts[:n_window]
# ms to samples
window = NxSignal.Windows.hann(n: n_window, is_periodic: true)
# use the default overlap of 50%
{s, t, f} = NxSignal.stft(input, window, sampling_rate: fs, fft_length: 1024)
max_f =
Nx.select(f >= fs / 2, Nx.iota(f.shape), Nx.size(f) + 1)
|> Nx.argmin()
spectrogram = Nx.abs(s)
# to dBFS
spectrogram = 20 * Nx.log(spectrogram / Nx.reduce_max(spectrogram)) / Nx.log(10)
{spectrogram, f, t, max_f}
end
defp to_plot_data(s, f, t, plot_cutoff_frequency) do
for t_idx <- 0..(Nx.size(t) - 1),
f_idx <- 0..(Nx.size(f) - 1),
Nx.to_number(f[[f_idx]]) <= plot_cutoff_frequency,
reduce: %{"t" => [], "f" => [], "s" => []} do
%{"t" => t_acc, "f" => f_acc, "s" => s_acc} ->
%{
"t" => [Nx.to_number(t[[t_idx]]) | t_acc],
"f" => [Float.round(Nx.to_number(f[[f_idx]]), 3) | f_acc],
"s" => [Nx.to_number(s[[t_idx, f_idx]]) | s_acc]
}
end
end
defp plot(dataset) do
Vl.new(title: "Spectrogram", width: 500, height: 500)
|> Vl.mark(:rect)
|> Vl.data_from_values(dataset)
|> Vl.encode_field(:x, "t",
type: :quantitative,
title: "Time (seconds)",
axis: [tick_min_step: 0.1],
grid: false
)
|> Vl.encode_field(:y, "f",
type: :quantitative,
sort: "-x",
title: "Frequency (Hz)",
axis: [tick_count: 25],
grid: false
)
|> Vl.encode_field(:color, "s",
aggregate: :max,
type: :quantitative,
scale: [scheme: "viridis"],
legend: [title: "dBFS"]
)
|> Vl.config(view: [stroke: nil])
end
end
Spectrogram.calculate_stft_and_plot_spectrogram(data, fs, 50.0e-3)
Notice how the first half of the spectrogram looks cleaner than the second one. This is due to the window length, that also interferes in how we can observe both our time and frequency resolutions.
Below we can see what happens if we use different window durations (150ms, 100ms and 25ms respectively).
Spectrogram.calculate_stft_and_plot_spectrogram(data, fs, 150.0e-3)
Spectrogram.calculate_stft_and_plot_spectrogram(data, fs, 100.0e-3)
Spectrogram.calculate_stft_and_plot_spectrogram(data, fs, 25.0e-3)