Powered by AppSignal & Oban Pro

Whisper Demos

whisper/whisper.livemd

Whisper Demos

Mix.install(
  [
    {:kino, "~> 0.13"},
    {:membrane_whisper_plugin, "~> 0.1.0"},
    {:membrane_portaudio_plugin, "~> 0.19.4"},
    {:membrane_transcoder_plugin, "~> 0.3.2"},
    {:membrane_raw_audio_format, "~> 0.12.0"},
    {:membrane_ffmpeg_swresample_plugin, "~> 0.20.5"},
    {:boombox, "~> 0.2.8"},
    {:exla, "~> 0.10"}
  ],
  config: [
    nx: [default_backend: EXLA.Backend]
  ]
)

# Set the Logger level to :info for the rest of the notebook.
Logger.configure(level: :info)

# Hugging Face repository that every Whisper artifact below is loaded from.
hf_repo = "openai/whisper-tiny"

# Load the model, featurizer, tokenizer and generation config from the same
# repository. Each line pattern-matches on {:ok, _}, so a failed load raises a
# MatchError right here instead of surfacing later in a pipeline.
{:ok, whisper} = Bumblebee.load_model({:hf, hf_repo})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, hf_repo})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, hf_repo})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, hf_repo})

Transcript widget

Let’s write a simple Membrane Sink that receives the stream coming out of Membrane.Whisper.TranscriberFilter, picks up the transcript events emitted along with it, and displays the accumulated text in a Kino.Frame.

defmodule TranscriptSink do
  @moduledoc """
  Membrane sink that shows a growing transcript in a `Kino.Frame`.

  Buffers arriving on the `:input` pad are ignored. Each
  `Membrane.Whisper.TranscriptEvent` received on that pad has its text appended
  to the transcript collected so far, and the whole transcript is re-rendered
  in the frame as Markdown. Any other event is passed to the default handler.
  """
  use Membrane.Sink

  alias Membrane.Whisper.TranscriptEvent

  def_input_pad :input, accepted_format: _any, flow_control: :auto

  # `frame` is the Kino frame the transcript is rendered into.
  def_options frame: [spec: Kino.Frame.t()]

  @impl true
  def handle_init(_ctx, %{frame: frame}) do
    # The transcript starts out empty and grows with every transcript event.
    {[], %{frame: frame, text: ""}}
  end

  @impl true
  def handle_buffer(:input, _buffer, _ctx, state) do
    # Only transcript events are of interest; buffer contents are not used.
    {[], state}
  end

  @impl true
  def handle_event(
        :input,
        %TranscriptEvent{text: chunk},
        _ctx,
        %{frame: frame, text: so_far} = state
      ) do
    transcript = so_far <> chunk
    Kino.Frame.render(frame, Kino.Markdown.new(transcript))
    {[], %{state | text: transcript}}
  end

  def handle_event(pad, event, ctx, state) do
    # Not a transcript event: defer to the default implementation.
    super(pad, event, ctx, state)
  end
end

Live microphone demo

Next, we will write a Membrane Pipeline that captures audio from the computer’s microphone, passes it to Membrane.Whisper.TranscriberFilter, and displays the resulting transcript using the sink defined above.

Membrane.Whisper.TranscriberFilter expects a Bumblebee.Audio serving as a parameter, so we will pass one in from the pipeline options.

defmodule MicPipeline do
  @moduledoc """
  Pipeline that transcribes live microphone input.

  Audio is captured with `Membrane.PortAudio.Source` (mono, 16 kHz, `:f32le`),
  sent through `Membrane.Whisper.TranscriberFilter` and finally handed to
  `TranscriptSink`.

  ## Options

  Both options are required; `handle_init/2` raises a `KeyError` when either
  one is missing:

    * `:serving` - the serving given to `Membrane.Whisper.TranscriberFilter`
    * `:frame` - the `Kino.Frame` given to `TranscriptSink`
  """
  use Membrane.Pipeline

  @impl true
  def handle_init(_ctx, opts) do
    # Assertive lookups: a missing option fails here with a KeyError naming the
    # key, instead of handing `nil` to a child that crashes later on.
    # `Access.fetch!/2` accepts the same containers as `opts[...]` did.
    serving = Access.fetch!(opts, :serving)
    frame = Access.fetch!(opts, :frame)

    spec =
      child(:mic, %Membrane.PortAudio.Source{
        sample_format: :f32le,
        channels: 1,
        sample_rate: 16_000,
        latency: :low
      })
      # NOTE(review): the enlarged toilet presumably absorbs audio that piles
      # up while a chunk is being transcribed - confirm the intended capacity.
      |> via_in(:input, toilet_capacity: 1_000)
      |> child(:whisper, %Membrane.Whisper.TranscriberFilter{serving: serving})
      |> child(:sink, %TranscriptSink{frame: frame})

    {[spec: spec], %{}}
  end
end

Now, we have to set up a serving that will perform the speech-to-text operation and start the Membrane Pipeline. The serving is passed to the pipeline via opts and then forwarded to the Membrane.Whisper.TranscriberFilter.

# Frame that TranscriptSink renders the transcript into; show it as cell output.
frame = Kino.Frame.new()
Kino.render(frame)

# Streaming Whisper serving built from the model artifacts loaded earlier in
# the notebook.
serving =
  Bumblebee.Audio.speech_to_text_whisper(whisper, featurizer, tokenizer, generation_config,
    stream: true,
    chunk_num_seconds: 10
  )

# Start the pipeline linked to this process; the match asserts a successful start.
{:ok, _supervisor_pid, _pipeline_pid} =
  Membrane.Pipeline.start_link(MicPipeline, frame: frame, serving: serving)

# Block the cell indefinitely while the pipeline runs.
Process.sleep(:infinity)

MP4 file transcript demo

This demo pulls an MP4 from a URL, extracts the audio track, and transcribes it. Membrane.Realtimer ensures that the stream will flow through Membrane.Whisper.TranscriberFilter at a realtime pace.

Let’s define a pipeline module that performs these operations…

defmodule MP4Pipeline do
  @moduledoc """
  Pipeline that transcribes the audio track of a remote MP4 file.

  `Boombox.Bin` reads the sample file from `@samples_url`, its audio output is
  converted to `Membrane.RawAudio`, resampled to mono 16 kHz `:f32le`, passed
  through `Membrane.Realtimer` and `Membrane.Whisper.TranscriberFilter`, and
  finally handed to `TranscriptSink`.

  ## Options

  Both options are required; `handle_init/2` raises a `KeyError` when either
  one is missing:

    * `:serving` - the serving given to `Membrane.Whisper.TranscriberFilter`
    * `:frame` - the `Kino.Frame` given to `TranscriptSink`
  """
  use Membrane.Pipeline

  # Base URL of the directory holding the sample media file.
  @samples_url "https://raw.githubusercontent.com/membraneframework/static/gh-pages/samples"

  @impl true
  def handle_init(_ctx, opts) do
    # Assertive lookups: a missing option fails here with a KeyError naming the
    # key, instead of handing `nil` to a child that crashes later on.
    # `Access.fetch!/2` accepts the same containers as `opts[...]` did.
    serving = Access.fetch!(opts, :serving)
    frame = Access.fetch!(opts, :frame)

    spec =
      child(:source, %Boombox.Bin{input: "#{@samples_url}/sherlock_librivox.mp4"})
      # Only the audio output of the bin is linked.
      |> via_out(:output, options: [kind: :audio])
      |> child(:transcoder, %Membrane.Transcoder{
        output_stream_format: Membrane.RawAudio
      })
      # Same audio format as the microphone demo: mono, 16 kHz, :f32le.
      |> child(:resampler, %Membrane.FFmpeg.SWResample.Converter{
        output_stream_format: %Membrane.RawAudio{
          sample_format: :f32le,
          channels: 1,
          sample_rate: 16_000
        }
      })
      |> child(:realtimer, Membrane.Realtimer)
      |> child(:whisper, %Membrane.Whisper.TranscriberFilter{serving: serving})
      |> child(:sink, %TranscriptSink{frame: frame})

    {[spec: spec], %{}}
  end
end

…load the serving and start the pipeline.

# Frame that TranscriptSink renders the transcript into; show it as cell output.
frame = Kino.Frame.new()
Kino.render(frame)

# Streaming Whisper serving built from the model artifacts loaded earlier in
# the notebook.
serving =
  Bumblebee.Audio.speech_to_text_whisper(whisper, featurizer, tokenizer, generation_config,
    stream: true,
    chunk_num_seconds: 8
  )

# Start the pipeline linked to this process; the match asserts a successful start.
{:ok, _supervisor_pid, _pipeline_pid} =
  Membrane.Pipeline.start_link(MP4Pipeline, frame: frame, serving: serving)

# Block the cell indefinitely while the pipeline runs.
Process.sleep(:infinity)