Powered by AppSignal & Oban Pro

DiffSingerElixirPoC

simple_run.livemd

DiffSingerElixirPoC

Mix.install([
  {:diff_singer, git: "https://github.com/GES233/DiffSinger.git"},
  {:kino, "~> 0.19.0"},
  {:vega_lite, "~> 0.1.8"},
  {:kino_vega_lite, "~> 0.1.11"}
])

:telemetry.attach(
  "orchid-step-exception-logger",
  [:orchid, :step, :exception],
  &Orchid.Runner.Hooks.Telemetry.error_handler/4,
  %{}
)

Fetch Model’s Metadata

Obtain the metadata and basic information of the sound library’s models as the skeleton for subsequent dependency graph construction.

Here, we take Qixuan v2.5.0 (for OpenUTAU), maintained by OpenVPI, as an example.

# If you have your own local copy of the voicebank,
# change the path below to yours.
model_root_path = "E:/ProgramAssets/OpenUTAUSingers/Qixuan_v2.5.0_DiffSinger_OpenUtau"
model_config = DiffSinger.VoiceBank.Config.fetch_overview(model_root_path)

:ok
:ok

Prelude Steps

This is a node that converts lyrics/MIDI into corresponding phoneme IDs and pitches (MIDI) based on a phoneme dictionary.

A primary reason for needing MIDI is to use its duration information as a reference for subsequent phoneme duration prediction.

A rasterized node will be implemented later to accommodate the processing of notes and phonemes.

defmodule CommonEncoder do
  @moduledoc """
  The lyrics and pitch are initially encoded for use in subsequent models.

  Shared front-end for the duration/pitch/variance encoders: turns a word
  list into the parallel id lists the ONNX models consume.
  """

  @doc """
  Encodes a word list into `{languages, tokens, word_divisions, word_durations, midis}`.

  `param.payload` is a list of `{phonemes, duration, midi_note}` tuples, where
  `phonemes` is a list of `{language, phoneme}` string pairs. Required options:

    * `:lang_dict` - maps language strings to integer ids
    * `:phoneme_dict` - maps phoneme strings to integer ids

  The returned lists keep word order; languages/tokens/midis carry one entry
  per phoneme, word_divisions one phoneme-count per word, word_durations one
  duration per word.
  """
  def run_partial(%Orchid.Param{payload: words}, opts) do
    lang_dict = Keyword.fetch!(opts, :lang_dict)
    phoneme_dict = Keyword.fetch!(opts, :phoneme_dict)

    # Accumulate by prepending (O(1) per word) and reverse once at the end,
    # instead of appending with `++` inside the reduce (quadratic in the
    # total phoneme count).
    {langs, toks, divs, durs, midis} =
      Enum.reduce(
        words,
        {[], [], [], [], []},
        fn {phonemes, duration, midi_note}, {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} ->
          ph_count = length(phonemes)

          {curr_langs, curr_toks} =
            phonemes
            |> Enum.map(fn {lang, phone} -> {lang_dict[lang], phoneme_dict[phone]} end)
            |> Enum.unzip()

          {
            Enum.reverse(curr_langs, acc_l),
            Enum.reverse(curr_toks, acc_t),
            [ph_count | acc_wdiv],
            [duration | acc_wdur],
            # The word's MIDI note is repeated for each of its phonemes;
            # order within the word is irrelevant (all entries identical).
            List.duplicate(midi_note, ph_count) ++ acc_midi
          }
        end
      )

    {
      Enum.reverse(langs),
      Enum.reverse(toks),
      Enum.reverse(divs),
      Enum.reverse(durs),
      Enum.reverse(midis)
    }
  end
end
{:module, CommonEncoder, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}
defmodule DurationPredictEncoder do
  use Orchid.Step

  # Encoder feeding the duration-prediction branch: wraps the five lists
  # produced by CommonEncoder into 1-batch s64 tensors, one Orchid.Param
  # each, emitted in a fixed order.
  def run(param, opts) do
    {langs, toks, w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    params =
      for {name, data} <- [
            lang_map: langs,
            phoneme_map: toks,
            word_division: w_div,
            word_duration: w_dur,
            ph_midi: ph_midis
          ] do
        Orchid.Param.new(name, :payload, Nx.tensor([data], type: :s64))
      end

    {:ok, params}
  end
end
{:module, DurationPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
defmodule PitchPredictEncoder do
  use Orchid.Step

  # Encoder feeding the pitch-prediction branch. Same encoding as the
  # duration branch, except the word-division list is discarded.
  def run(%Orchid.Param{} = param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    to_param = fn name, data ->
      Orchid.Param.new(name, :payload, Nx.tensor([data], type: :s64))
    end

    {:ok,
     [
       to_param.(:lang_map, langs),
       to_param.(:phoneme_map, toks),
       to_param.(:word_duration, w_dur),
       to_param.(:ph_midi, ph_midis)
     ]}
  end
end
{:module, PitchPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
# Used for Variance model
defmodule VarianceEncoder do
  use Orchid.Step

  # Encoder feeding the variance branch (breathiness/voicing). The encoding
  # mirrors PitchPredictEncoder: four 1-batch s64 tensors.
  def run(param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    encoded = [
      lang_map: langs,
      phoneme_map: toks,
      word_duration: w_dur,
      ph_midi: ph_midis
    ]

    {:ok,
     Enum.map(encoded, fn {name, data} ->
       Orchid.Param.new(name, :payload, Nx.tensor([data], type: :s64))
     end)}
  end
end
{:module, VarianceEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}

Variance Model

Predict Phoneme Duration:

defmodule PredictDuration do
  @behaviour Orchid.Symbiont.Step

  # Two-stage duration inference: a linguistic encoder followed by the
  # duration predictor, both served through Orchid.Symbiont handlers.
  def required, do: [:duration_linguistic, :duration_predict]

  def run_with_model(
    [
      %Orchid.Param{payload: lang_map},
      %Orchid.Param{payload: phoneme_map},
      %Orchid.Param{payload: word_division},
      %Orchid.Param{payload: word_duration},
      %Orchid.Param{payload: phoneme_midi}
    ],
    handlers,
    _opts
  ) do
    # Stage 1: encode tokens/languages plus the word layout.
    {:ok, {encoder_out, mask}} =
      Orchid.Symbiont.call(
        handlers.duration_linguistic,
        {:infer, {phoneme_map, lang_map, word_division, word_duration}}
      )

    # Stage 2: per-phoneme durations from the encoded sequence + MIDI hints.
    {:ok, {predicted}} =
      Orchid.Symbiont.call(
        handlers.duration_predict,
        {:infer, {encoder_out, mask, phoneme_midi}}
      )

    {:ok, [Orchid.Param.new(:ph_dur_pred, :encoder_out, predicted)]}
  end
end
{:module, PredictDuration, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}

Predict Pitch:

defmodule PredictPitch do
  @behaviour Orchid.Symbiont.Step

  def required, do: [:pitch_linguistic, :pitch_predict]

  # Predicts a per-frame pitch curve (in MIDI units) from phonemes, the
  # predicted phoneme durations, and note-MIDI hints, via a linguistic
  # encoder followed by the pitch predictor.
  def run_with_model(
    [
      %Orchid.Param{payload: languages},
      %Orchid.Param{payload: phonemes},
      %Orchid.Param{payload: phoneme_duration},
      %Orchid.Param{payload: note_midi}
    ], handlers, opts) do

    # Durations arrive as floats from the duration model; round them to
    # integral frame counts on the binary backend.
    ph_dur =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.pitch_linguistic,
        {:infer, {phonemes, languages, ph_dur}}
      )

    # Total frame count = sum of all phoneme durations; it sizes every
    # frame-aligned input built below.
    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # NOTE(review): seeding the RNG with wall-clock time makes the sampled
    # noise — and thus the prediction — nondeterministic between runs.
    key = Nx.Random.key(System.system_time())

    {pitch_noise, _key} =
      Nx.Random.normal(
        key,
        0.0,
        1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Neutral expressiveness (1.0) everywhere.
    expr =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, total_frames})

    # Retake mask of all ones: re-predict pitch on every frame.
    retake =
      Nx.broadcast(Nx.tensor(1, type: :u8), {1, total_frames})

    # Frames whose note MIDI is 0 are marked as rests.
    note_rest =
      note_midi
      |> Nx.equal(0)
      |> Nx.as_type(:u8)

    note_midi = note_midi |> Nx.as_type(:f32)

    # Note durations reuse the rounded phoneme durations here.
    note_dur = ph_dur

    # Sampling step count, overridable via opts (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    {:ok, {pitch_pred}} =
      Orchid.Symbiont.call(
        handlers.pitch_predict,
        {:infer,
          {
            encoder_out,
            ph_dur,
            note_midi,
            note_rest,
            note_dur,
            pitch_noise,
            expr,
            retake,
            steps
          }}
      )

    # NOTE(review): returns a bare param while sibling steps return a list —
    # confirm the runner accepts both shapes.
    {:ok, Orchid.Param.new(:pitch_pred, :payload, pitch_pred)}
  end
end
{:module, PredictPitch, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}
defmodule MIDIToPitch do
  use Orchid.Step

  # Converts a predicted MIDI-pitch tensor to fundamental frequency in Hz
  # using f0 = 440 * 2^((midi - 69) / 12), i.e. A4 (MIDI 69) = 440 Hz.
  # Frames with a negative MIDI value are mapped to 0 Hz.
  def run(%Orchid.Param{payload: midi_pred}, _opts) do
    # Transfer to the binary backend once; the original transferred the same
    # tensor twice (once in the pipeline, once inside the select).
    midi = Nx.backend_transfer(midi_pred, Nx.BinaryBackend)

    f0 =
      midi
      |> Nx.add(-69.0)
      |> Nx.divide(12.0)
      |> then(&Nx.pow(2.0, &1))
      |> Nx.multiply(440)

    masked = Nx.select(Nx.less(midi, 0.0), Nx.tensor(0.0), f0)

    {:ok, Orchid.Param.new(:f0, :tensor, masked)}
  end
end
{:module, MIDIToPitch, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}

Other Variance Params:

defmodule VarianceModel do
  @behaviour Orchid.Symbiont.Step

  def required, do: [:variance_linguistic, :variance]

  # Predicts per-frame breathiness and voicing curves from phonemes,
  # durations and the predicted pitch, via a linguistic encoder followed by
  # the variance model.
  def run_with_model(
    [
      %Orchid.Param{payload: languages},
      %Orchid.Param{payload: phonemes},
      %Orchid.Param{payload: ph_dur},
      %Orchid.Param{payload: pitch}
    ],
    handlers,
    opts
  ) do

    # Round float durations to integral frame counts on the binary backend.
    ph_dur =
      ph_dur
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.variance_linguistic,
        {:infer, {phonemes, languages, ph_dur}}
      )

    # Frame-aligned input length = sum of all phoneme durations.
    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # NOTE(review): wall-clock RNG seed makes the sampled noise (and the
    # output) nondeterministic across runs.
    key = Nx.Random.key(System.system_time())

    # Independent noise per predicted parameter; the updated key from the
    # first draw is threaded into the second.
    {breath_noise, key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    {voice_noise, _key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Retake mask of all ones over both parameters (last axis = 2):
    # re-predict breathiness and voicing on every frame.
    retake =
      Nx.broadcast(
        Nx.tensor([1,1], type: :u8),
        {1, total_frames, 2}
      )

    # Sampling step count, overridable via opts (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    # :infinity timeout — this inference can run for seconds on CPU.
    {:ok, {breath_pred, voice_pred}} =
      Orchid.Symbiont.call(
        handlers.variance,
        {:infer,
          {
            encoder_out,
            ph_dur,
            pitch,
            breath_noise,
            voice_noise,
            retake,
            steps
          }},
        :infinity
      )

    {:ok, [
      Orchid.Param.new(:breathiness, :payload, breath_pred),
      Orchid.Param.new(:voicing, :payload, voice_pred)
    ]}
  end
end
{:module, VarianceModel, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}

Acoustic Model and Vocoder

Acoustic Model:

defmodule Acoustic do
  @behaviour Orchid.Symbiont.Step

  # Acoustic model: renders a mel spectrogram from the linguistic inputs
  # plus the predicted pitch and variance (breathiness/voicing) curves.
  def required, do: [:acoustic]

  def run_with_model(
    [
      %Orchid.Param{payload: languages},
      %Orchid.Param{payload: phonemes},
      %Orchid.Param{payload: phoneme_duration},
      %Orchid.Param{payload: pitch},
      %Orchid.Param{payload: breathiness},
      %Orchid.Param{payload: voicing}
    ],
    handlers,
    opts
  ) do
    # Durations arrive as floats; round to integral frame counts on the
    # binary backend.
    durations =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    # Frame-aligned control curves with neutral defaults: gender shift 0.0
    # and velocity 1.0 on every frame.
    frame_count = Nx.axis_size(pitch, 1)
    gender = Nx.broadcast(Nx.tensor(0.0, type: :f32), {1, frame_count})
    velocity = Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, frame_count})

    # Sampling step count and diffusion depth, overridable via opts.
    steps = Nx.tensor(Keyword.get(opts, :steps, 20), type: :s64)
    depth = Nx.tensor(Keyword.get(opts, :depth, 1.0), type: :f32)

    model_inputs =
      {phonemes, languages, durations, pitch, breathiness, voicing, gender,
       velocity, depth, steps}

    # :infinity timeout — acoustic inference is the slowest stage on CPU.
    {:ok, {mel}} =
      Orchid.Symbiont.call(handlers.acoustic, {:infer, model_inputs}, :infinity)

    {:ok, Orchid.Param.new(:mel, :payload, mel)}
  end
end
{:module, Acoustic, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}

Vocoder:

defmodule NSFHifiGAN_Vocoder do
  @behaviour Orchid.Symbiont.Step

  # Vocoder step: synthesizes the waveform tensor from a mel spectrogram and
  # an f0 (pitch in Hz) curve.
  def required, do: [:vocoder]

  def run_with_model(
    [%Orchid.Param{payload: mel}, %Orchid.Param{payload: f0}],
    handlers,
    _opts
  ) do
    {:ok, {waveform}} =
      handlers.vocoder
      |> Orchid.Symbiont.call({:infer, {mel, f0}})

    {:ok, Orchid.Param.new(:audio, :payload, waveform)}
  end
end
{:module, NSFHifiGAN_Vocoder, <<70, 79, 82, 49, 0, 0, 11, ...>>, ...}

Post-process (tensor to audio)

defmodule TensorToWave do
  use Orchid.Step

  # Converts the float waveform tensor into a mono 16-bit PCM WAV binary.
  #
  # Options:
  #   * :sample_rate - output sample rate in Hz (default 44_100)
  #   * :output_path - where a copy of the WAV file is written
  #     (default "E:/final.wav", kept for backward compatibility)
  def run(%Orchid.Param{payload: wave_tensor}, opts) do
    pcm_data =
      wave_tensor
      |> Nx.flatten()
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.multiply(32767.0)
      # Clip before the integer cast so samples beyond +/-1.0 cannot wrap.
      |> Nx.clip(-32768.0, 32767.0)
      |> Nx.as_type(:s16)
      |> Nx.to_binary()

    sample_rate = Keyword.get(opts, :sample_rate, 44_100)
    # Build the full WAV binary once (the original concatenated twice).
    wav = wav_header(byte_size(pcm_data), sample_rate) <> pcm_data

    # Side effect: persist a copy to disk. Path now configurable instead of
    # hard-coded.
    File.write!(Keyword.get(opts, :output_path, "E:/final.wav"), wav)

    {:ok, Orchid.Param.new(:final, :audio, wav)}
  end

  # Builds the canonical 44-byte RIFF/WAVE header for mono 16-bit PCM.
  defp wav_header(data_size, sample_rate) do
    byte_rate = sample_rate * 1 * 2  # sample_rate * channels * bytes_per_sample (Int16=2)
    file_size = 36 + data_size

    <<
      "RIFF", file_size::little-integer-size(32), "WAVE",
      "fmt ", 16::little-integer-size(32),       # Subchunk1Size
      1::little-integer-size(16),                # AudioFormat (1 = PCM)
      1::little-integer-size(16),                # NumChannels (1 = Mono)
      sample_rate::little-integer-size(32),      # SampleRate
      byte_rate::little-integer-size(32),        # ByteRate
      2::little-integer-size(16),                # BlockAlign (channels * 2)
      16::little-integer-size(16),               # BitsPerSample (16 bits)
      "data", data_size::little-integer-size(32) # Subchunk2Size
    >>
  end
end
{:module, TensorToWave, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}

Build Pipeline and Prepare to Demo

defmodule Orchid.Livebook.GanttTracker do
  @moduledoc """
  Collects `[:orchid, :step, *]` telemetry events into an Agent and pairs
  start/finish events into spans for a Gantt-style timeline chart.
  """
  use Agent

  def start_link(_) do
    Agent.start_link(fn -> [] end, name: __MODULE__)
  end

  @doc """
  Telemetry handler: records `{run_id, status, time, meta}` tuples.
  `:start` events carry `measurements.system_time`; all other statuses carry
  `measurements.duration`.
  """
  def handle_event([:orchid, :step, status], measurements, meta, _config) do
    # `$callers` identifies the task/pipeline that spawned this step. It is
    # nil when the handler runs outside a Task, so fall back to the current
    # pid instead of crashing on `List.first(nil)`.
    run_id = List.first(Process.get(:"$callers") || []) || self()
    run_id_str = inspect(run_id)

    time_val = if status == :start, do: measurements.system_time, else: measurements.duration

    Agent.update(__MODULE__, fn state -> [{run_id_str, status, time_val, meta} | state] end)
  end

  @doc """
  Pairs recorded `:start` events with their `:done`/`:exception`/`:special`
  counterparts and returns span maps (times in milliseconds, most recently
  completed span first).
  """
  def get_spans() do
    events = Agent.get(__MODULE__, & &1) |> Enum.reverse()

    {_pending, spans} =
      Enum.reduce(events, {%{}, []}, fn
        {run_id, :start, sys_time, meta}, {pending, spans} ->
          start_ms = System.convert_time_unit(sys_time, :native, :microsecond) / 1000.0
          # Key pending spans by run + step identity so the same step in
          # concurrent pipelines does not collide.
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}
          {Map.put(pending, sig, start_ms), spans}

        {run_id, status, duration, meta}, {pending, spans}
        when status in [:done, :exception, :special] ->
          duration_ms = System.convert_time_unit(duration, :native, :microsecond) / 1000.0
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}

          case Map.pop(pending, sig) do
            {start_ms, new_pending} when not is_nil(start_ms) ->
              impl_name =
                if is_function(meta.impl),
                  do: "Anonymous",
                  else: inspect(meta.impl) |> String.split(".") |> List.last()

              out_keys = inspect(meta.out_keys) |> String.slice(0..14) |> Kernel.<>("...")

              span = %{
                run_id: run_id,
                step: "#{impl_name}(#{out_keys})",
                start: start_ms,
                end: start_ms + duration_ms,
                # Give very short steps a visible minimum width on the chart.
                visual_end: start_ms + max(duration_ms, 10.0),
                duration: duration_ms,
                status: status
              }

              {new_pending, [span | spans]}

            # Finish event without a matching start: drop it.
            {nil, ^pending} ->
              {pending, spans}
          end
      end)

    spans
  end

  def clear(), do: Agent.update(__MODULE__, fn _ -> [] end)
end
{:module, Orchid.Livebook.GanttTracker, <<70, 79, 82, 49, 0, 0, 29, ...>>, ...}
Kino.start_child({Orchid.Livebook.GanttTracker, []})

# One telemetry attachment per step lifecycle event, all routed to the
# Gantt tracker. (The exported notebook had the capture operator HTML-escaped
# as `&amp;`; restored here.)
events = [
  {[:orchid, :step, :start], "start"},
  {[:orchid, :step, :done], "done"},
  {[:orchid, :step, :exception], "exception"},
  {[:orchid, :step, :special], "special"}
]

for {event, suffix} <- events do
  :telemetry.attach(
    "orchid-gantt-#{suffix}",
    event,
    &Orchid.Livebook.GanttTracker.handle_event/4,
    nil
  )
end
[:ok, :ok, :ok, :ok]

Run Workflow

defmodule QixuanPipeline do
  @moduledoc """
  Wires the encoder and model steps above into a single Orchid recipe for
  the Qixuan voicebank: duration -> pitch -> variance -> acoustic -> vocoder.
  """
  require Logger

  # Registers one Orchid.Symbiont runner (ONNX via Ortex) per model file
  # listed in the voicebank config. Raises (match error) if any registration
  # fails.
  def load_models(model_root_path, model_config) do
    Logger.info("Loading DiffSinger models...")

    models =[
      {:duration_linguistic, model_config.predict_map.maybe_duration.linguistic.path},
      {:duration_predict,    model_config.predict_map.maybe_duration.duration.path},
      {:pitch_linguistic,    model_config.predict_map.maybe_pitch.linguistic.path},
      {:pitch_predict,       model_config.predict_map.maybe_pitch.predict.path},
      {:variance_linguistic, model_config.variance.linguistic.path},
      {:variance,            model_config.variance.variance.path},
      {:acoustic,            model_config.acoustic.infer.path},
      {:vocoder,             model_config.vocoder.path}
    ]

    for {name, rel_path} <- models do
      # rel_path is a list of path segments relative to the voicebank root.
      path = Path.join([model_root_path] ++ rel_path)
      :ok = Orchid.Symbiont.register(name, {Orchid.Symbiont.OrtexRunner, [name: name, path: path]})
    end

    Logger.info("All models loaded successfully.")
    :ok
  end

  # Builds the step graph. Steps are {impl, in_keys, out_keys, opts}; data
  # flows by key name, e.g. :phoneme_duration_predict links the duration
  # branch into the pitch, variance and acoustic branches.
  def build_recipe(model_config) do
    injector = [extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]]

    # Each branch carries its own phoneme/language dictionaries.
    dur_dict   = model_config.predict_map.maybe_duration.phonemes
    pitch_dict = model_config.predict_map.maybe_pitch.phonemes
    var_dict   = model_config.variance.phonemes
    sample_rate = model_config.vocoder.maybe_config["sample_rate"]

    duration_steps =[
      {DurationPredictEncoder, :words,[:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi],[lang_dict: dur_dict.maybe_lang_dict, phoneme_dict: dur_dict.phoneme_dict]},
      {PredictDuration,[:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi], 
        :phoneme_duration_predict, injector}
    ]

    pitch_steps = [
      {PitchPredictEncoder, :words,[:pitch_lang, :pitch_phoneme, :word_duration_from_pitch, :pitch_ph_midi],[lang_dict: pitch_dict.maybe_lang_dict, phoneme_dict: pitch_dict.phoneme_dict]},
      {PredictPitch,[:pitch_lang, :pitch_phoneme, :phoneme_duration_predict, :pitch_ph_midi], 
        :pitch_pred_midi, injector},
      {MIDIToPitch, :pitch_pred_midi, :pitch_pred}
    ]

    variance_steps = [
      {VarianceEncoder, :words,[:variance_lang, :variance_phoneme, :word_duration_from_variance, :variance_ph_midi],[lang_dict: var_dict.maybe_lang_dict, phoneme_dict: var_dict.phoneme_dict]},
      {VarianceModel,[:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred_midi],[:breathiness_pred, :voice_pred], injector}
    ]

    acoustic_step =[
      {Acoustic,[:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred, :breathiness_pred, :voice_pred], 
        :mel, injector}
    ]

    vocoder_step = [
      {NSFHifiGAN_Vocoder,[:mel, :pitch_pred], :wave_tensor, injector},
      {TensorToWave, :wave_tensor, :audio,[sample_rate: sample_rate]}
    ]

    all_steps = duration_steps ++ pitch_steps ++ variance_steps ++ acoustic_step ++ vocoder_step

    Orchid.Recipe.new(all_steps)
  end
end


QixuanPipeline.load_models(model_root_path, model_config)

recipe = QixuanPipeline.build_recipe(model_config)

# Input score: each word is {phoneme list, duration, MIDI note}; a phoneme
# is a {language, symbol} pair. "AP" with MIDI 0 marks a breath/rest.
# (Fixed: the exported cell had a trailing comma after the last list element,
# which is invalid Elixir syntax.)
inputs = [
  %Orchid.Param{name: :words, payload: [
    {[{"zh", "AP"}], 10, 0},
    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60},

    {[{"zh", "AP"}], 1, 0},

    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60}
  ]}
]

Orchid.Livebook.GanttTracker.clear()

# Time the full synchronous pipeline run in microseconds.
{elapse, {:ok, results}} = :timer.tc(&Orchid.run/2, [recipe, inputs], :microsecond)

require Logger
Logger.info "Used #{elapse / 1000}ms."


14:42:57.019 [info] Loading DiffSinger models...

14:42:57.023 [info] All models loaded successfully.

14:42:57.024 [info] init got unexpected: {:io_request, #PID<0.92.0>, #Reference<0.2798130424.2264662018.125731>,
 {:put_chars, :unicode,
  "Failed to write log message to stdout, trying stderr\n"}}

14:42:57.024 [debug] ** (RuntimeError) bad return value from Logger formatter Logger.Formatter, got [<<185, 220, 181, 192, 213, 253, 212, 218, 177, 187, 185, 216, 177, 213, 161, 163, 92, 114, 92, 110, 34, 41>>, "\e[0m", 10] after "\e[31m\n14:42:57.022 [error] Writer crashed (:\""
    (kernel 10.4.1) logger_h_common.erl:433: :logger_h_common.string_to_binary/1
    (kernel 10.4.1) logger_h_common.erl:399: :logger_h_common.do_log_to_binary/2
    (kernel 10.4.1) logger_h_common.erl:180: :logger_h_common.log/2
    (kernel 10.4.1) logger_backend.erl:54: :logger_backend.call_handlers/3
    (kernel 10.4.1) user_drv.erl:581: :user_drv.server/3
    (stdlib 7.1) gen_statem.erl:3748: :gen_statem.loop_state_callback/11
    (stdlib 7.1) proc_lib.erl:333: :proc_lib.init_p_do_apply/3


14:42:57.236 [debug] [Symbiont] Inference took 9.728ms via provider [:cpu]

14:42:57.238 [debug] [Symbiont] Inference took 1.536ms via provider [:cpu]

14:42:57.592 [debug] [Symbiont] Inference took 8.806ms via provider [:cpu]

14:42:58.660 [debug] [Symbiont] Inference took 986.214ms via provider [:cpu]

14:42:58.818 [debug] [Symbiont] Inference took 4.71ms via provider [:cpu]

14:42:59.445 [debug] [Symbiont] Inference took 612.352ms via provider [:cpu]

14:43:02.839 [debug] [Symbiont] Inference took 2934.272ms via provider [:cpu]

14:43:04.373 [debug] [Symbiont] Inference took 1445.068ms via provider [:cpu]

14:43:04.441 [info] Used 7415.5ms.
:ok

Execution Timeline:

alias VegaLite, as: Vl

spans = Orchid.Livebook.GanttTracker.get_spans()
# Offset every span so the chart starts at t = 0 (falls back to 0 when no
# spans were recorded). The capture operator was HTML-escaped (`&amp;`) in
# the export; restored here.
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "step" => span.step,
      "start" => span.start - min_start,
      # visual_end gives very short steps a visible minimum bar width.
      "end" => span.visual_end - min_start,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 650, height: 250, title: "Orchid Workflow Execution Timeline")
|> Vl.data_from_values(chart_data)
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step", type: :nominal, title: "Steps", sort: [field: "start", op: "min"], axis: [labelLimit: 400])
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Time Offset (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, title: "Status", scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Kino.VegaLite.new()

Prepare for the Spectrum demo:

alias VegaLite, as: Vl


# Mel spectrogram: drop the batch axis and move to the binary backend so it
# can be converted to plain lists below.
mel_tensor =
  results.mel.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)

# One heatmap rect per (frame, mel-bin) cell.
mel_data =
  mel_tensor
  |> Nx.to_batched(1)
  |> Enum.with_index()
  |> Enum.flat_map(fn {row_tensor, frame_idx} ->
    row_tensor
    |> Nx.to_flat_list()
    |> Enum.with_index()
    |> Enum.map(fn {val, bin_idx} ->
      %{
        "x1" => frame_idx, "x2" => frame_idx + 1,
        "y1" => bin_idx, "y2" => bin_idx + 1,
        "value" => val
      }
    end)
  end)

# Per-frame f0 (Hz) curve from the pitch branch.
f0_data =
  results.pitch_pred.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()
  |> Enum.with_index()
  |> Enum.map(fn {f0, frame} -> %{"frame" => frame, "f0" => f0} end)

durations = results.phoneme_duration_predict.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()

# Running sums of phoneme durations give the phoneme boundary frames.
boundaries = Enum.scan(durations, 0, fn dur, acc -> dur + acc end)
boundary_data = Enum.map(boundaries, fn b -> %{"frame" => b} end)


# Greyscale mel heatmap layer.
mel_layer =
  Vl.new()
  |> Vl.data_from_values(mel_data)
  |> Vl.mark(:rect, tooltip: false, stroke: nil)
  |> Vl.encode_field(:x, "x1", type: :quantitative, title: "Time (Frames)")
  |> Vl.encode_field(:x2, "x2")
  |> Vl.encode_field(:y, "y1", type: :quantitative, title: "Mel Frequency Bin")
  |> Vl.encode_field(:y2, "y2")
  |> Vl.encode_field(:color, "value",
    type: :quantitative,
    scale: [scheme: "greys", reverse: true, domain: [-11.0, 2.0]],
    legend: false
  )

# Blue pitch curve on an independent right-hand axis.
f0_layer =
  Vl.new()
  |> Vl.data_from_values(f0_data)
  |> Vl.mark(:line, color: "#007bff", strokeWidth: 2)
  |> Vl.encode_field(:x, "frame", type: :quantitative)
  |> Vl.encode_field(:y, "f0",
    type: :quantitative,
    title: "Pitch (Hz)",
    axis: [orient: "right", titleColor: "#007bff"],
    scale: [domain: [50, 800]]
  )

# Dashed red rules marking phoneme boundaries.
boundary_layer =
  Vl.new()
  |> Vl.data_from_values(boundary_data)
  |> Vl.mark(:rule, color: "red", strokeDash: [4, 4], strokeWidth: 1.5)
  |> Vl.encode_field(:x, "frame", type: :quantitative)

:ok
:ok

Demonstration

# Write the rendered audio to disk and play it inline.
File.write!("E:/final.wav", results.audio.payload)

Kino.Audio.new(results.audio.payload, :wav)

# (Alternative, kept from the original notebook but disabled): render a
# custom HTML <audio> player via Kino.HTML instead of Kino.Audio. The
# exported markup was garbled, so only the intent is preserved here:
#
# audio_html = """
# ...custom <audio> element titled "Qixuan DiffSinger Output /
# Two Tigers (Liang Zhi Lao Hu)", with a fallback message for browsers
# without audio support...
# """
# Kino.HTML.new(audio_html)

# Layered analysis chart: mel heatmap + f0 line + phoneme boundary rules,
# with independent y scales for the mel bins and the Hz axis.
Vl.new(width: 800, height: 400, title: "Output Analysis (Mel Spectrum + F0 + Phoneme Duration)")
|> Vl.resolve(:scale, y: :independent)
|> Vl.layers([mel_layer, f0_layer, boundary_layer])
|> Kino.VegaLite.new()

Multiple Pipeline

# Restart the tracker and clear previously collected spans.
# NOTE(review): the Agent is registered under a fixed name; if it is still
# running from the earlier cell this start may report already-started —
# confirm Kino.start_child tolerates that.
Kino.start_child({Orchid.Livebook.GanttTracker,[]})
Orchid.Livebook.GanttTracker.clear()

# Launch three identical pipeline runs concurrently via the async executor.
pipeline_tasks = 
  for _ <- 1..3 do
    Task.async(fn ->
      Orchid.run(recipe, inputs, executor_and_opts: {Orchid.Executor.Async,[]})
    end)
  end


# Wait without a timeout; CPU inference can take many seconds per run.
Task.await_many(pipeline_tasks, :infinity)

14:43:05.899 [debug] [Symbiont] Inference took 6.348ms via provider [:cpu]

14:43:05.906 [debug] [Symbiont] Inference took 6.348ms via provider [:cpu]

14:43:05.906 [debug] [Symbiont] Inference took 6.86ms via provider [:cpu]

14:43:05.910 [debug] [Symbiont] Inference took 3.481ms via provider [:cpu]

14:43:05.914 [debug] [Symbiont] Inference took 7.987ms via provider [:cpu]

14:43:05.924 [debug] [Symbiont] Inference took 17.92ms via provider [:cpu]

14:43:05.926 [debug] [Symbiont] Inference took 11.98ms via provider [:cpu]

14:43:05.932 [debug] [Symbiont] Inference took 7.884ms via provider [:cpu]

14:43:05.955 [debug] [Symbiont] Inference took 15.667ms via provider [:cpu]

14:43:06.920 [debug] [Symbiont] Inference took 996.454ms via provider [:cpu]

14:43:06.930 [debug] [Symbiont] Inference took 10.547ms via provider [:cpu]

14:43:07.877 [debug] [Symbiont] Inference took 935.219ms via provider [:cpu]

14:43:08.243 [debug] [Symbiont] Inference took 1324.032ms via provider [:cpu]

14:43:08.252 [debug] [Symbiont] Inference took 7.27ms via provider [:cpu]

14:43:09.842 [debug] [Symbiont] Inference took 1576.345ms via provider [:cpu]

14:43:10.317 [debug] [Symbiont] Inference took 2072.678ms via provider [:cpu]

14:43:10.326 [debug] [Symbiont] Inference took 9.216ms via provider [:cpu]

14:43:11.038 [debug] [Symbiont] Inference took 697.753ms via provider [:cpu]

14:43:11.625 [debug] [Symbiont] Inference took 3751.628ms via provider [:cpu]

14:43:12.882 [debug] [Symbiont] Inference took 1251.635ms via provider [:cpu]

14:43:14.437 [debug] [Symbiont] Inference took 2812.518ms via provider [:cpu]

14:43:15.677 [debug] [Symbiont] Inference took 1235.968ms via provider [:cpu]

14:43:17.236 [debug] [Symbiont] Inference took 2796.748ms via provider [:cpu]

14:43:18.365 [debug] [Symbiont] Inference took 1125.785ms via provider [:cpu]
[
  ok: %{
    words: %Orchid.Param{
      name: :words,
      type: nil,
      payload: [
        {[{"zh", "AP"}], 10, 0},
        {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
        {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
        {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
        {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60},
        {[{"zh", "AP"}], 1, 0},
        {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
        {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
        {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
        {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60}
      ],
      metadata: %{}
    },
    ...
  },
  ...
]
alias VegaLite, as: Vl

spans = Orchid.Livebook.GanttTracker.get_spans()
# Offset all spans to a shared t = 0 (0 when no spans recorded); the capture
# operator was HTML-escaped (`&amp;`) in the export and is restored here.
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "run_id" => span.run_id,
      "step" => span.step,
      "start" => span.start - min_start,
      # `end` is a reserved word in Elixir, so read the key via Access
      # instead of dot syntax.
      "end" => span[:end] - min_start,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 650, height: 250, title: "Concurrent Orchid Workflows")
|> Vl.data_from_values(chart_data)
|> Vl.encode_field(:row, "run_id", type: :nominal, title: "Pipeline Instance (PID)")
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step",
     type: :nominal,
     title: nil,
     sort: [field: "start", op: "min"],
     axis: [labelLimit: 300]
   )
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Global Time (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Vl.resolve(:scale, x: :shared)
|> Kino.VegaLite.new()