Audio classification with TensorFlow Lite
Mix.install([
{:nx_signal, "~> 0.1"},
{:tflite_elixir, "~> 0.3.0"},
{:nx, "~> 0.5"},
{:kino, "~> 0.9.0"},
{:req, "~> 0.3.6"}
])
Introduction
The task of identifying what an audio clip represents is called audio classification. An audio classification model is trained to recognize various audio events.
For example, you may train a model to recognize sounds representing three different events: clapping, finger snapping, and typing. TensorFlow Lite provides optimized pre-trained models that you can deploy in your mobile applications.
https://www.tensorflow.org/lite/examples/audio_classification/overview
Download model file
# Downloaded files are cached in the system temporary directory.
downloads_dir = System.tmp_dir!()

# Nerves demo users: switch to a directory with write permission, for example
# downloads_dir = "/data/livebook"

# Fetches `url` into `downloads_dir` unless a file for that URL is already
# present, and returns the local path in either case. The file name is the
# URL itself, form-encoded so it is usable as a single path segment.
download = fn url ->
  local_path = Path.join(downloads_dir, URI.encode_www_form(url))

  if not File.exists?(local_path) do
    Req.get!(url, output: local_path)
  end

  local_path
end

# Map of asset key => local file path, downloading each asset on first use.
data_files =
  for {key, url} <- [
        cpu_model:
          "https://tfhub.dev/google/lite-model/yamnet/classification/tflite/1?lite-format=tflite"
      ],
      into: %{} do
    {key, download.(url)}
  end
Load model and embedded labels
# Read the whole .tflite file into memory; the same buffer is used both to
# extract the label list and to build the interpreter.
model_buffer = File.read!(data_files.cpu_model)

# The class names are read from the "yamnet_label_list.txt" file associated
# with the model, one label per line.
labels =
  model_buffer
  |> TFLiteElixir.FlatBufferModel.get_associated_file("yamnet_label_list.txt")
  |> String.split("\n")

# Crashes with a MatchError if the interpreter cannot be created from the buffer.
{:ok, interpreter} = TFLiteElixir.Interpreter.new_from_buffer(model_buffer)
Record audio
# Audio input widget labelled "Audio".
input = Kino.Input.audio("Audio")

# Current value of the widget; later cells read the recording from `value.data`.
value = input |> Kino.Input.read()
Downsampling to 16 kHz
# Interpret the recorded bytes as a flat (1-D) tensor of 32-bit floats.
recording = value.data |> Nx.from_binary(:f32)
recording_length = Nx.axis_size(recording, 0)

# Keep every third sample of the recording (plain decimation, no filtering).
# NOTE(review): a stride of 3 reaches 16 kHz only if the recording is
# single-channel at 48 kHz -- confirm against the audio input's settings.
downsampled = Nx.slice(recording, [0], [recording_length], strides: [3])
downsampled_length = Nx.axis_size(downsampled, 0)
Audio classification with TensorFlow Lite
# Number of highest-scoring classes printed for each window.
top_k = 1
# Number of samples fed to the model per inference (one window).
downsample_rate = 15600
# Seconds covered by one window (15600 samples at 16 kHz); used only for the
# timestamps printed below.
sample_duration = 0.975
# Only complete windows are classified; a trailing partial window is dropped.
num_samples = div(downsampled_length, downsample_rate)

# The explicit `//1` step keeps the range ascending, so the loop body is
# skipped when the recording is shorter than one window. A bare `0..-1` would
# count down through 0 and -1 and slice past the end of the tensor.
for sample_index <- 0..(num_samples - 1)//1 do
  # Cut the window of `downsample_rate` samples for this iteration.
  sample =
    Nx.slice(
      downsampled,
      [sample_index * downsample_rate],
      [downsample_rate]
    )

  [out_tensor] = TFLiteElixir.Interpreter.predict(interpreter, sample)

  # Flatten the scores, then order class indices from highest to lowest score.
  out_tensor = Nx.reshape(out_tensor, {:auto})
  sorted = Nx.argsort(out_tensor, direction: :desc)
  top_k_pred = Nx.to_flat_list(Nx.take(sorted, Nx.iota({top_k})))

  # The time span is the same for every prediction of this window.
  start_time = Float.round(sample_index * sample_duration, 3)
  end_time = Float.round((sample_index + 1) * sample_duration, 3)

  # Printing is a side effect only, so `Enum.each/2` is used rather than `Enum.map/2`.
  Enum.each(top_k_pred, fn pred_index ->
    IO.puts("[#{start_time}-#{end_time}]: #{Enum.at(labels, pred_index)}")
  end)
end

:ok