Whisper Demos
# Notebook setup: installs the packages referenced by the cells below and
# makes EXLA the default Nx backend.
# NOTE(review): Bumblebee is called later in this notebook but is not listed
# here; presumably it is pulled in transitively by one of these packages - confirm.
Mix.install(
[
# Kino.Frame / Kino.Markdown are used to display the transcript.
{:kino, "~> 0.13"},
# Membrane.Whisper.TranscriberFilter and Membrane.Whisper.TranscriptEvent.
{:membrane_whisper_plugin, "~> 0.1.0"},
# Membrane.PortAudio.Source (microphone input in the live demo).
{:membrane_portaudio_plugin, "~> 0.19.4"},
# Membrane.Transcoder (MP4 demo).
{:membrane_transcoder_plugin, "~> 0.3.2"},
# Membrane.RawAudio stream format.
{:membrane_raw_audio_format, "~> 0.12.0"},
# Membrane.FFmpeg.SWResample.Converter (MP4 demo).
{:membrane_ffmpeg_swresample_plugin, "~> 0.20.5"},
# Boombox.Bin (source element of the MP4 demo).
{:boombox, "~> 0.2.8"},
# EXLA.Backend, configured below as the default Nx backend.
{:exla, "~> 0.10"}
],
config: [
nx: [default_backend: EXLA.Backend]
]
)
# Log at :info level for the rest of the notebook.
Logger.configure(level: :info)

# Hugging Face repository from which every Whisper component below is loaded.
hf_repo = "openai/whisper-tiny"
repository = {:hf, hf_repo}

# Each loader is matched against {:ok, _}, so a failed load raises a
# MatchError right here instead of surfacing later in a pipeline.
{:ok, whisper} = Bumblebee.load_model(repository)
{:ok, featurizer} = Bumblebee.load_featurizer(repository)
{:ok, tokenizer} = Bumblebee.load_tokenizer(repository)
{:ok, generation_config} = Bumblebee.load_generation_config(repository)
Transcript widget
Let’s write a simple Membrane Sink that receives the stream coming out of Membrane.Whisper.TranscriberFilter, collects the transcription events it carries, and displays the transcribed text in a Kino.Frame.
defmodule TranscriptSink do
  @moduledoc """
  Sink that shows a growing transcript in a `Kino.Frame`.

  Incoming buffers are ignored. Every `Membrane.Whisper.TranscriptEvent`
  received on the `:input` pad has its text appended to the transcript
  collected so far, and the whole transcript is re-rendered into the frame
  as Markdown. Any other event is passed to the default implementation.

  ## Options

    * `:frame` - the `Kino.Frame` the transcript is rendered into
  """
  use Membrane.Sink

  alias Membrane.Whisper.TranscriptEvent

  def_input_pad :input, accepted_format: _any, flow_control: :auto

  def_options frame: [spec: Kino.Frame.t()]

  @impl true
  def handle_init(_ctx, opts) do
    # The transcript starts out empty and grows with each transcript event.
    initial_state = %{frame: opts.frame, text: ""}
    {[], initial_state}
  end

  @impl true
  def handle_buffer(:input, _buffer, _ctx, state) do
    # Buffers are dropped; only transcript events are of interest here.
    {[], state}
  end

  @impl true
  def handle_event(
        :input,
        %TranscriptEvent{} = event,
        _ctx,
        %{frame: frame, text: transcript_so_far} = state
      ) do
    transcript = transcript_so_far <> event.text
    markdown = Kino.Markdown.new(transcript)
    Kino.Frame.render(frame, markdown)

    {[], %{state | text: transcript}}
  end

  def handle_event(pad, event, ctx, state) do
    # Everything that is not a transcript event keeps the default handling.
    super(pad, event, ctx, state)
  end
end
Live microphone demo
Then, we will write a Membrane Pipeline that takes the audio from the computer microphone, passes it to the Membrane.Whisper.TranscriberFilter and displays the resulting transcription using the Sink defined in the previous cell.
Membrane.Whisper.TranscriberFilter expects a Bumblebee.Audio serving as a parameter, so we will pass one in from the pipeline options.
defmodule MicPipeline do
  @moduledoc """
  Pipeline that transcribes live audio from the microphone.

  Audio is captured by `Membrane.PortAudio.Source` (`:f32le`, 1 channel,
  16 000 Hz), passed to `Membrane.Whisper.TranscriberFilter` and finally to
  `TranscriptSink`.

  ## Options

  The options are given as the second argument of
  `Membrane.Pipeline.start_link/2`. Both are required; `handle_init/2` raises
  a `KeyError` when one of them is missing.

    * `:serving` - passed to `Membrane.Whisper.TranscriberFilter` as `serving`
    * `:frame` - passed to `TranscriptSink` as `frame`
  """
  use Membrane.Pipeline

  @impl true
  def handle_init(_ctx, opts) do
    # Fetch assertively: with `opts[:key]` a forgotten option silently turned
    # into `nil` and was handed to the children, failing far from the cause.
    # `Access.fetch!/2` accepts keyword lists as well as maps.
    serving = Access.fetch!(opts, :serving)
    frame = Access.fetch!(opts, :frame)

    spec =
      child(:mic, %Membrane.PortAudio.Source{
        sample_format: :f32le,
        channels: 1,
        sample_rate: 16_000,
        latency: :low
      })
      # NOTE(review): the transcriber's input toilet is sized at 1_000,
      # presumably so that audio pushed by the microphone while the
      # transcriber is busy does not overflow it - confirm.
      |> via_in(:input, toilet_capacity: 1_000)
      |> child(:whisper, %Membrane.Whisper.TranscriberFilter{serving: serving})
      |> child(:sink, %TranscriptSink{frame: frame})

    {[spec: spec], %{}}
  end
end
Now, we have to set up a serving that will perform the speech-to-text operation and start the Membrane Pipeline. The serving is passed to the pipeline via opts and then forwarded to the Membrane.Whisper.TranscriberFilter.
# Create the frame the transcript is rendered into and render it.
frame = Kino.render(Kino.Frame.new())

# Whisper serving for the live demo, built from the components loaded
# earlier, with `stream: true` and `chunk_num_seconds: 10`.
serving_opts = [stream: true, chunk_num_seconds: 10]

serving =
  Bumblebee.Audio.speech_to_text_whisper(
    whisper,
    featurizer,
    tokenizer,
    generation_config,
    serving_opts
  )

# Start the microphone pipeline; a failed start raises a MatchError here.
pipeline_opts = [frame: frame, serving: serving]
{:ok, _supervisor, _pipeline} = Membrane.Pipeline.start_link(MicPipeline, pipeline_opts)

# Block this cell indefinitely.
# NOTE(review): presumably this keeps the evaluation running so the pipeline
# can be stopped by interrupting the cell - confirm.
Process.sleep(:infinity)
MP4 file transcript demo
This demo pulls an MP4 from a URL, extracts the audio track, and transcribes it. Membrane.Realtimer ensures that the stream will flow through Membrane.Whisper.TranscriberFilter at a realtime pace.
Let’s define a pipeline module that performs these operations…
defmodule MP4Pipeline do
  @moduledoc """
  Pipeline that transcribes the audio of a sample MP4 file fetched from a URL.

  The elements are linked in this order:

    1. `Boombox.Bin` reads `sherlock_librivox.mp4` from the samples URL; only
       its `:audio` output is linked
    2. `Membrane.Transcoder` outputs `Membrane.RawAudio`
    3. `Membrane.FFmpeg.SWResample.Converter` converts it to `:f32le`,
       1 channel, 16 000 Hz
    4. `Membrane.Realtimer`
    5. `Membrane.Whisper.TranscriberFilter`
    6. `TranscriptSink`

  ## Options

  The options are given as the second argument of
  `Membrane.Pipeline.start_link/2`. Both are required; `handle_init/2` raises
  a `KeyError` when one of them is missing.

    * `:serving` - passed to `Membrane.Whisper.TranscriberFilter` as `serving`
    * `:frame` - passed to `TranscriptSink` as `frame`
  """
  use Membrane.Pipeline

  @samples_url "https://raw.githubusercontent.com/membraneframework/static/gh-pages/samples"

  @impl true
  def handle_init(_ctx, opts) do
    # Fetch assertively: with `opts[:key]` a forgotten option silently turned
    # into `nil` and was handed to the children, failing far from the cause.
    # `Access.fetch!/2` accepts keyword lists as well as maps.
    serving = Access.fetch!(opts, :serving)
    frame = Access.fetch!(opts, :frame)

    spec =
      child(:source, %Boombox.Bin{input: "#{@samples_url}/sherlock_librivox.mp4"})
      |> via_out(:output, options: [kind: :audio])
      |> child(:transcoder, %Membrane.Transcoder{output_stream_format: Membrane.RawAudio})
      |> child(:resampler, %Membrane.FFmpeg.SWResample.Converter{
        output_stream_format: %Membrane.RawAudio{
          sample_format: :f32le,
          channels: 1,
          sample_rate: 16_000
        }
      })
      |> child(:realtimer, Membrane.Realtimer)
      |> child(:whisper, %Membrane.Whisper.TranscriberFilter{serving: serving})
      |> child(:sink, %TranscriptSink{frame: frame})

    {[spec: spec], %{}}
  end
end
…load the serving and start the pipeline.
# Create the frame the transcript is rendered into and render it.
frame = Kino.render(Kino.Frame.new())

# Whisper serving for the MP4 demo, built from the components loaded
# earlier, with `stream: true` and `chunk_num_seconds: 8`.
serving_opts = [stream: true, chunk_num_seconds: 8]

serving =
  Bumblebee.Audio.speech_to_text_whisper(
    whisper,
    featurizer,
    tokenizer,
    generation_config,
    serving_opts
  )

# Start the MP4 pipeline; a failed start raises a MatchError here.
pipeline_opts = [frame: frame, serving: serving]
{:ok, _supervisor, _pipeline} = Membrane.Pipeline.start_link(MP4Pipeline, pipeline_opts)

# Block this cell indefinitely.
# NOTE(review): presumably this keeps the evaluation running so the pipeline
# can be stopped by interrupting the cell - confirm.
Process.sleep(:infinity)