Powered by AppSignal & Oban Pro

DiffSingerElixirPoC

simple_run.livemd

DiffSingerElixirPoC

# Environment setup: reset the local Hex cache and raise the HTTP timeout so
# the git/hex dependency downloads below are less likely to fail.
System.shell("mix hex.clean --all")
System.shell("mix hex.config http_timeout 120")

# If the download fails, you can comment out all the git repositories first,
# and then install them incrementally one by one
# after the hex packages are installed.
Mix.install([
  {:orchid, "~> 0.5", override: true},
  {:diff_singer, git: "https://github.com/GES233/DiffSinger.git"},
  {:orchid_stratum, git: "https://github.com/SynapticStrings/OrchidStratum.git"},
  {:kino, "~> 0.19.0"},
  {:vega_lite, "~> 0.1.8"},
  {:kino_vega_lite, "~> 0.1.11"},
  {:kino_benchee, "~> 0.1.0"}
])

# Log any exception raised inside an Orchid step via telemetry.
:telemetry.attach(
  "orchid-step-exception-logger",
  [:orchid, :step, :exception],
  &Orchid.Runner.Hooks.Telemetry.error_handler/4,
  %{}
)

Fetch Model’s Metadata

Obtain the metadata and basic information of the sound library’s models as the skeleton for subsequent dependency graph construction.

Here, we take Qixuan v2.5.0 (for OpenUTAU), maintained by OpenVPI, as an example.

# If you have your local version,
# change the path below to point at your own voice bank root.
model_root_path = "E:/ProgramAssets/OpenUTAUSingers/Qixuan_v2.5.0_DiffSinger_OpenUtau"
# Reads the voice bank's configuration and returns the model metadata
# (model paths, dictionaries, vocoder config) used to build the pipeline.
model_config = DiffSinger.VoiceBank.Config.fetch_overview(model_root_path)

:ok
:ok

Prelude Steps

This is a node that converts lyrics/MIDI into corresponding phoneme IDs and pitches (MIDI) based on a phoneme dictionary.

A primary reason for needing MIDI is to use its duration information as a reference for subsequent phoneme duration prediction.

A rasterized node will be implemented later to accommodate the processing of notes and phonemes.

defmodule CommonEncoder do
  @moduledoc """
  The lyrics and pitch are initially encoded for use in subsequent models.

  Turns a list of `{phonemes, duration, midi_note}` words into five parallel
  lists: language ids, phoneme token ids, phonemes-per-word counts, word
  durations, and a per-phoneme copy of the word's MIDI note.
  """

  @doc """
  Encodes `words` (the `:payload` of an `Orchid.Param`) using the
  `:lang_dict` and `:phoneme_dict` lookup tables given in `opts`.

  Returns `{langs, tokens, word_divisions, word_durations, ph_midis}`: the
  1st, 2nd and 5th lists have one entry per phoneme, the 3rd and 4th one
  entry per word.
  """
  def run_partial(%Orchid.Param{payload: words}, opts) do
    lang_dict = Keyword.fetch!(opts, :lang_dict)
    phoneme_dict = Keyword.fetch!(opts, :phoneme_dict)

    # Accumulate by prepending (O(1)) and reverse once at the end, instead
    # of appending with `++` on every word (quadratic over the whole input).
    {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} =
      Enum.reduce(
        words,
        {[], [], [], [], []},
        fn {phonemes, duration, midi_note}, {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} ->
          ph_count = length(phonemes)

          {curr_langs, curr_toks} =
            phonemes
            |> Enum.map(fn {lang, phone} -> {lang_dict[lang], phoneme_dict[phone]} end)
            |> Enum.unzip()

          {
            Enum.reverse(curr_langs, acc_l),
            Enum.reverse(curr_toks, acc_t),
            [ph_count | acc_wdiv],
            [duration | acc_wdur],
            # Every phoneme of a word shares the word's MIDI note.
            List.duplicate(midi_note, ph_count) ++ acc_midi
          }
        end
      )

    {
      Enum.reverse(acc_l),
      Enum.reverse(acc_t),
      Enum.reverse(acc_wdiv),
      Enum.reverse(acc_wdur),
      Enum.reverse(acc_midi)
    }
  end
end
{:module, CommonEncoder, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}
defmodule DurationPredictEncoder do
  use Orchid.Step

  # Encodes a `:words` payload into the five s64 tensors consumed by the
  # phoneme-duration predictor (languages, tokens, word divisions, word
  # durations, per-phoneme MIDI), each wrapped in an `Orchid.Param`.
  def run(param, opts) do
    {langs, toks, w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    params =
      [
        {:lang_map, langs},
        {:phoneme_map, toks},
        {:word_division, w_div},
        {:word_duration, w_dur},
        {:ph_midi, ph_midis}
      ]
      |> Enum.map(fn {name, values} ->
        # Each tensor gets a leading batch dimension of 1.
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end)

    {:ok, params}
  end
end
{:module, DurationPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
defmodule PitchPredictEncoder do
  use Orchid.Step

  # Encodes a `:words` payload into the four s64 tensors consumed by the
  # pitch predictor. Same encoding as DurationPredictEncoder, except the
  # per-word phoneme counts (`_w_div`) are not needed and are discarded.
  def run(%Orchid.Param{} = param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    # Each tensor gets a leading batch dimension of 1.
    languages = Orchid.Param.new(:lang_map, :payload, Nx.tensor([langs], type: :s64))
    tokens = Orchid.Param.new(:phoneme_map, :payload, Nx.tensor([toks], type: :s64))
    word_dur = Orchid.Param.new(:word_duration, :payload, Nx.tensor([w_dur], type: :s64))
    ph_midi = Orchid.Param.new(:ph_midi, :payload, Nx.tensor([ph_midis], type: :s64))

    {:ok, [languages, tokens, word_dur, ph_midi]}
  end
end
{:module, PitchPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
# Used for Variance model
defmodule VarianceEncoder do
  use Orchid.Step

  # Identical encoding to PitchPredictEncoder (languages, tokens, word
  # durations, per-phoneme MIDI as s64 tensors with a batch dim of 1);
  # kept as a separate step so the variance branch can use its own
  # phoneme/language dictionaries via `opts`.
  def run(param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    languages = Orchid.Param.new(:lang_map, :payload, Nx.tensor([langs], type: :s64))
    tokens = Orchid.Param.new(:phoneme_map, :payload, Nx.tensor([toks], type: :s64))
    word_dur = Orchid.Param.new(:word_duration, :payload, Nx.tensor([w_dur], type: :s64))
    ph_midi = Orchid.Param.new(:ph_midi, :payload, Nx.tensor([ph_midis], type: :s64))

    {:ok, [languages, tokens, word_dur, ph_midi]}
  end
end
{:module, VarianceEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}

Variance Model

Predict Phoneme Duration:

defmodule PredictDuration do
  @behaviour Orchid.Symbiont.Step

  # The two ONNX sessions this step needs from the Symbiont registry.
  def required, do: [:duration_linguistic, :duration_predict]

  # Runs the linguistic encoder over the encoded lyrics, then feeds its
  # output (plus per-phoneme MIDI) into the duration predictor. Returns the
  # predicted per-phoneme durations as the `:ph_dur_pred` param.
  def run_with_model(
    [
      %Orchid.Param{payload: lang_map},
      %Orchid.Param{payload: phoneme_map},
      %Orchid.Param{payload: word_division},
      %Orchid.Param{payload: word_duration},
      %Orchid.Param{payload: phoneme_midi}
    ],
    handlers,
    _opts
  ) do
    duration_linguistic = handlers.duration_linguistic
    duration_predict = handlers.duration_predict

    # NOTE(review): tuple order differs from the param order above —
    # presumably the ONNX model's input signature; confirm against it.
    inputs = {phoneme_map, lang_map, word_division, word_duration}

    # `:infinity` — inference can exceed the default call timeout.
    {:ok, {encoder_out_tensor, mask_tensor}} = Orchid.Symbiont.call(duration_linguistic, {:infer, inputs}, :infinity)

    {:ok, {result}} = Orchid.Symbiont.call(duration_predict, {:infer, {encoder_out_tensor, mask_tensor, phoneme_midi}}, :infinity)

    {:ok, [
      Orchid.Param.new(:ph_dur_pred, :encoder_out, result)
    ]}
  end
end
{:module, PredictDuration, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}

Predict Pitch:

defmodule PredictPitch do
  @behaviour Orchid.Symbiont.Step

  # The two ONNX sessions this step needs from the Symbiont registry.
  def required, do: [:pitch_linguistic, :pitch_predict]

  # Predicts a per-frame MIDI pitch curve from the encoded lyrics, the
  # predicted phoneme durations and the input note MIDI values.
  def run_with_model(
    [
      %Orchid.Param{payload: languages},
      %Orchid.Param{payload: phonemes},
      %Orchid.Param{payload: phoneme_duration},
      %Orchid.Param{payload: note_midi}
    ], handlers, opts) do

    # The duration predictor emits fractional values; round them to integer
    # (s64) frame counts on the CPU backend.
    ph_dur =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.pitch_linguistic,
        {:infer, {phonemes, languages, ph_dur}},
        :infinity
      )

    # Total frame count determines the length of every per-frame input below.
    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # Fresh PRNG seed per call, so sampling differs between runs.
    key = Nx.Random.key(System.system_time())

    # Gaussian noise consumed by the sampling-based pitch predictor.
    {pitch_noise, _key} =
      Nx.Random.normal(
        key,
        0.0,
        1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Expressiveness fixed to 1.0 for every frame.
    expr =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, total_frames})

    # Retake mask set to 1 everywhere (re-predict all frames).
    retake =
      Nx.broadcast(Nx.tensor(1, type: :u8), {1, total_frames})

    # MIDI value 0 marks a rest in the input encoding.
    note_rest =
      note_midi
      |> Nx.equal(0)
      |> Nx.as_type(:u8)

    note_midi = note_midi |> Nx.as_type(:f32)

    # NOTE(review): note durations are reused from the phoneme durations
    # here — confirm this matches the model's expectation.
    note_dur = ph_dur

    # Number of sampling steps passed to the model (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    {:ok, {pitch_pred}} =
      Orchid.Symbiont.call(
        handlers.pitch_predict,
        {:infer,
          {
            encoder_out,
            ph_dur,
            note_midi,
            note_rest,
            note_dur,
            pitch_noise,
            expr,
            retake,
            steps
          }},
        :infinity
      )

    # NOTE(review): returns a bare param while PredictDuration returns a
    # list — the runner apparently accepts both shapes; verify.
    {:ok, Orchid.Param.new(:pitch_pred, :payload, pitch_pred)}
  end
end
{:module, PredictPitch, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}
defmodule MIDIToPitch do
  use Orchid.Step

  @doc """
  Converts predicted MIDI note values to fundamental frequency (f0) in Hz
  using the equal-temperament formula `440 * 2^((midi - 69) / 12)`.

  Frames with a negative MIDI prediction are mapped to 0.0 Hz (unvoiced).
  Returns the result as the `:f0` param.
  """
  def run(%Orchid.Param{payload: midi_pred}, _opts) do
    # Transfer once to the binary backend and reuse for both the conversion
    # and the unvoiced mask (the original transferred the tensor twice and
    # contained HTML-escaped `&amp;` capture operators, which is invalid
    # Elixir in a plain source file).
    midi = Nx.backend_transfer(midi_pred, Nx.BinaryBackend)

    converted_f0 =
      midi
      |> Nx.add(-69.0)
      |> Nx.divide(12.0)
      |> then(&Nx.pow(2.0, &1))
      |> Nx.multiply(440)

    masked = Nx.select(Nx.less(midi, 0.0), Nx.tensor(0.0), converted_f0)

    {:ok, Orchid.Param.new(:f0, :tensor, masked)}
  end
end
{:module, MIDIToPitch, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}

Other Variance Params:

defmodule VarianceModel do
  @behaviour Orchid.Symbiont.Step

  # The two ONNX sessions this step needs from the Symbiont registry.
  def required, do: [:variance_linguistic, :variance]

  @doc """
  Predicts the breathiness and voicing curves from the encoded lyrics, the
  predicted phoneme durations and the pitch curve. Returns them as the
  `:breathiness` and `:voicing` params.
  """
  def run_with_model(
    [
      %Orchid.Param{payload: languages},
      %Orchid.Param{payload: phonemes},
      %Orchid.Param{payload: ph_dur},
      %Orchid.Param{payload: pitch}
    ],
    handlers,
    opts
  ) do
    # Round fractional predicted durations to integer (s64) frame counts.
    ph_dur =
      ph_dur
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    # FIX: pass `:infinity` like every other inference call in this notebook;
    # the original omitted it, so a slow inference could hit the default
    # call timeout and crash the step.
    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.variance_linguistic,
        {:infer, {phonemes, languages, ph_dur}},
        :infinity
      )

    # Total frame count determines the length of the per-frame inputs below.
    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # Fresh PRNG seed per call; the two noise tensors are chained off it.
    key = Nx.Random.key(System.system_time())

    {breath_noise, key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    {voice_noise, _key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Retake both variance channels on every frame.
    retake =
      Nx.broadcast(
        Nx.tensor([1,1], type: :u8),
        {1, total_frames, 2}
      )

    # Number of sampling steps passed to the model (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    {:ok, {breath_pred, voice_pred}} =
      Orchid.Symbiont.call(
        handlers.variance,
        {:infer,
          {
            encoder_out,
            ph_dur,
            pitch,
            breath_noise,
            voice_noise,
            retake,
            steps
          }},
        :infinity
      )

    {:ok, [
      Orchid.Param.new(:breathiness, :payload, breath_pred),
      Orchid.Param.new(:voicing, :payload, voice_pred)
    ]}
  end
end
{:module, VarianceModel, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}

Acoustic Model and Vocoder

Acoustic Model:

defmodule Acoustic do
  @behaviour Orchid.Symbiont.Step

  # The acoustic ONNX session required from the Symbiont registry.
  def required, do: [:acoustic]

  # Renders a mel spectrogram from the encoded lyrics, phoneme durations,
  # pitch curve and the predicted variance (breathiness/voicing) curves.
  def run_with_model(
    [
      %Orchid.Param{payload: languages},
      %Orchid.Param{payload: phonemes},
      %Orchid.Param{payload: phoneme_duration},
      %Orchid.Param{payload: pitch},
      %Orchid.Param{payload: breathiness},
      %Orchid.Param{payload: voicing}
    ],
    handlers,
    opts
  ) do

    # Round fractional predicted durations to integer (s64) frame counts.
    durations =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    # Per-frame inputs are sized from the pitch curve's frame axis.
    frames = Nx.axis_size(pitch, 1)

    # Gender parameter fixed to 0.0 per frame — presumably neutral; confirm
    # against the acoustic model's documentation.
    gender =
      Nx.broadcast(Nx.tensor(0.0, type: :f32), {1, frames})

    # Velocity parameter fixed to 1.0 per frame.
    velocity =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, frames})

    # Number of sampling steps passed to the model (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    # Model-specific depth parameter (default 1.0).
    depth =
      opts
      |> Keyword.get(:depth, 1.0)
      |> Nx.tensor(type: :f32)

    {:ok, {mel}} =
      Orchid.Symbiont.call(
        handlers.acoustic,
        {:infer,
          {
            phonemes,
            languages,
            durations,
            pitch,
            breathiness,
            voicing,
            gender,
            velocity,
            depth,
            steps
          }},
        :infinity
      )

    {:ok, Orchid.Param.new(:mel, :payload, mel)}
  end
end
{:module, Acoustic, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}

Vocoder:

defmodule NSFHifiGAN_Vocoder do
  @behaviour Orchid.Symbiont.Step

  # The vocoder ONNX session required from the Symbiont registry.
  def required, do: [:vocoder]

  @doc """
  Renders the mel spectrogram and f0 curve into a waveform tensor via the
  NSF-HiFiGAN vocoder, returned as the `:audio` param.
  """
  def run_with_model(
    [%Orchid.Param{payload: mel}, %Orchid.Param{payload: f0}],
    handlers,
    _opts
  ) do
    # FIX: pass `:infinity` like every other inference call in this
    # notebook. The original used the default call timeout, and the logged
    # vocoder inferences (up to ~5.4 s) can exceed a 5 s default.
    {:ok, {audio}} = Orchid.Symbiont.call(handlers.vocoder, {:infer, {mel, f0}}, :infinity)

    {:ok, Orchid.Param.new(:audio, :payload, audio)}
  end
end
{:module, NSFHifiGAN_Vocoder, <<70, 79, 82, 49, 0, 0, 11, ...>>, ...}

Post-process (tensor to audio)

defmodule TensorToWave do
  use Orchid.Step

  @doc """
  Converts a float waveform tensor (values expected in [-1.0, 1.0]) into a
  mono 16-bit PCM WAV binary.

  Options:

    * `:sample_rate` - sample rate written into the WAV header (default 44100)
    * `:out_path` - file the WAV is also written to (default "E:/final.wav",
      preserving the previously hard-coded path)

  Returns the WAV binary as the `:final` param.
  """
  def run(%Orchid.Param{payload: wave_tensor}, opts) do
    pcm_data =
      wave_tensor
      |> Nx.flatten()
      |> Nx.backend_transfer(Nx.BinaryBackend)
      # Scale [-1.0, 1.0] floats to the s16 range, clamping any overshoot.
      |> Nx.multiply(32767.0)
      |> Nx.clip(-32768.0, 32767.0)
      |> Nx.as_type(:s16)
      |> Nx.to_binary()

    sample_rate = Keyword.get(opts, :sample_rate, 44100)
    byte_rate = sample_rate * 1 * 2  # sample_rate * channels * bytes_per_sample (Int16=2)
    data_size = byte_size(pcm_data)
    # RIFF chunk size = 36 header bytes after "RIFF<size>" + PCM data size.
    file_size = 36 + data_size

    header = <<
      "RIFF", file_size::little-integer-size(32), "WAVE",
      "fmt ", 16::little-integer-size(32),       # Subchunk1Size
      1::little-integer-size(16),                # AudioFormat (1 = PCM)
      1::little-integer-size(16),                # NumChannels (1 = Mono)
      sample_rate::little-integer-size(32),      # SampleRate
      byte_rate::little-integer-size(32),        # ByteRate
      2::little-integer-size(16),                # BlockAlign (channels * 2)
      16::little-integer-size(16),               # BitsPerSample (16 bits)
      "data", data_size::little-integer-size(32) # Subchunk2Size
    >>

    # The output path was hard-coded; keep the old default but allow callers
    # to override it via `:out_path`.
    out_path = Keyword.get(opts, :out_path, "E:/final.wav")
    File.write!(out_path, header <> pcm_data)

    {:ok, Orchid.Param.new(:final, :audio, header <> pcm_data)}
  end
end
{:module, TensorToWave, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}

Build Pipeline and Prepare to Demo

defmodule QixuanPipeline do
  @moduledoc """
  Wires the encoder/predictor steps above into a complete DiffSinger
  pipeline for the Qixuan voice bank: registers each ONNX model with the
  Symbiont registry and builds the Orchid recipe connecting the steps.
  """

  require Logger

  # Registers every ONNX session under a well-known name. Paths come from
  # the voice bank's model_config; `rel_path` is a list of path segments.
  def load_models(model_root_path, model_config) do
    Logger.info("Loading DiffSinger models...")

    models =[
      {:duration_linguistic, model_config.predict_map.maybe_duration.linguistic.path},
      {:duration_predict,    model_config.predict_map.maybe_duration.duration.path},
      {:pitch_linguistic,    model_config.predict_map.maybe_pitch.linguistic.path},
      {:pitch_predict,       model_config.predict_map.maybe_pitch.predict.path},
      {:variance_linguistic, model_config.variance.linguistic.path},
      {:variance,            model_config.variance.variance.path},
      {:acoustic,            model_config.acoustic.infer.path},
      {:vocoder,             model_config.vocoder.path}
    ]

    for {name, rel_path} <- models do
      path = Path.join([model_root_path] ++ rel_path)
      :ok = Orchid.Symbiont.register(name, {Orchid.Symbiont.OrtexRunner, [name: name, path: path]})
    end

    Logger.info("All models loaded successfully.")
    :ok
  end

  # Builds the step graph: duration -> pitch -> variance -> acoustic ->
  # vocoder. Each branch gets its own phoneme/language dictionaries.
  def build_recipe(model_config) do
    injector = [extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]]

    dur_dict   = model_config.predict_map.maybe_duration.phonemes
    pitch_dict = model_config.predict_map.maybe_pitch.phonemes
    var_dict   = model_config.variance.phonemes
    sample_rate = model_config.vocoder.maybe_config["sample_rate"]

    duration_steps =[
      {DurationPredictEncoder, :words,[:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi],[lang_dict: dur_dict.maybe_lang_dict, phoneme_dict: dur_dict.phoneme_dict]},
      {PredictDuration,[:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi], 
        :phoneme_duration_predict, injector}
    ]

    pitch_steps = [
      {PitchPredictEncoder, :words,[:pitch_lang, :pitch_phoneme, :word_duration_from_pitch, :pitch_ph_midi],[lang_dict: pitch_dict.maybe_lang_dict, phoneme_dict: pitch_dict.phoneme_dict]},
      {PredictPitch,[:pitch_lang, :pitch_phoneme, :phoneme_duration_predict, :pitch_ph_midi], 
        :pitch_pred_midi, injector},
      {MIDIToPitch, :pitch_pred_midi, :pitch_pred}
    ]

    variance_steps = [
      {VarianceEncoder, :words,[:variance_lang, :variance_phoneme, :word_duration_from_variance, :variance_ph_midi],[lang_dict: var_dict.maybe_lang_dict, phoneme_dict: var_dict.phoneme_dict]},
      {VarianceModel,[:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred_midi],[:breathiness_pred, :voice_pred], injector}
    ]

    acoustic_step =[
      {Acoustic,[:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred, :breathiness_pred, :voice_pred], 
        :mel, injector}
    ]

    vocoder_step = [
      {NSFHifiGAN_Vocoder,[:mel, :pitch_pred], :wave_tensor, injector},
      {TensorToWave, :wave_tensor, :audio,[sample_rate: sample_rate]}
    ]

    all_steps = duration_steps ++ pitch_steps ++ variance_steps ++ acoustic_step ++ vocoder_step

    Orchid.Recipe.new(all_steps)
  end
end
{:module, QixuanPipeline, <<70, 79, 82, 49, 0, 0, 39, ...>>, ...}

Add a tracker

defmodule Orchid.Livebook.GanttTracker do
  @moduledoc """
  Collects `[:orchid, :step, ...]` telemetry events in an Agent so they can
  be rendered later as a Gantt-style execution timeline.
  """

  use Agent

  def start_link(_) do
    Agent.start_link(fn -> [] end, name: __MODULE__)
  end

  @doc """
  Telemetry handler: records one `{run_id, status, time, meta}` entry per
  event. `:start` events carry a system timestamp, completion events a
  duration (both in native time units).
  """
  def handle_event([:orchid, :step, status], measurements, meta, _config) do
    # Identify the pipeline run by the outermost caller PID when executed
    # from a Task, falling back to the current process. FIX: `:"$callers"`
    # is unset outside Tasks, and `List.first(nil)` raises — default to [].
    run_id = List.first(Process.get(:"$callers", [])) || self()
    run_id_str = inspect(run_id)

    time_val = if status == :start, do: measurements.system_time, else: measurements.duration

    Agent.update(__MODULE__, fn state ->
      [{run_id_str, status, time_val, meta} | state]
    end)
  end

  @doc """
  Pairs each `:start` event with its matching completion event and returns
  a list of span maps (`:run_id`, `:step`, `:start`, `:end`, `:visual_end`,
  `:duration` in milliseconds, `:status`).
  """
  def get_spans() do
    # Events were prepended, so reverse to chronological order first.
    # (Also fixes the HTML-escaped `&amp;` capture operators of the export.)
    events = Agent.get(__MODULE__, & &1) |> Enum.reverse()

    {_, spans} =
      Enum.reduce(events, {%{}, []}, fn
        {run_id, :start, sys_time, meta}, {pending, spans} ->
          start_ms = System.convert_time_unit(sys_time, :native, :microsecond) / 1000.0
          # A step occurrence is keyed by run + implementation + in/out keys.
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}
          {Map.put(pending, sig, start_ms), spans}

        {run_id, status, duration, meta}, {pending, spans} when status in [:done, :exception, :special] ->
          duration_ms = System.convert_time_unit(duration, :native, :microsecond) / 1000.0
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}

          case Map.pop(pending, sig) do
            {start_ms, new_pending} when not is_nil(start_ms) ->
              impl_name = if is_function(meta.impl), do: "Anonymous", else: inspect(meta.impl) |> String.split(".") |> List.last()
              out_keys = inspect(meta.out_keys) |> String.slice(0..14) |> Kernel.<>("...")

              span = %{
                run_id: run_id,
                step: "#{impl_name}(#{out_keys})",
                start: start_ms,
                end: start_ms + duration_ms,
                # Give very short steps a minimum visual width of 10 ms.
                visual_end: start_ms + max(duration_ms, 10.0),
                duration: duration_ms,
                status: status
              }

              {new_pending, [span | spans]}

            # Completion without a recorded start event: ignore it.
            {nil, ^pending} ->
              {pending, spans}
          end
      end)

    spans
  end

  def clear(), do: Agent.update(__MODULE__, fn _ -> [] end)
end
{:module, Orchid.Livebook.GanttTracker, <<70, 79, 82, 49, 0, 0, 29, ...>>, ...}
# Start the tracker under Kino's supervision and attach it to every Orchid
# step lifecycle event. (The exported `&amp;` capture operator is restored
# to plain `&`, which is the valid Elixir form.)
Kino.start_child({Orchid.Livebook.GanttTracker, []})

events = [
  {[:orchid, :step, :start], "start"},
  {[:orchid, :step, :done], "done"},
  {[:orchid, :step, :exception], "exception"},
  {[:orchid, :step, :special], "special"}
]

for {event, suffix} <- events do
  :telemetry.attach(
    "orchid-gantt-#{suffix}",
    event,
    &Orchid.Livebook.GanttTracker.handle_event/4,
    nil
  )
end
[:ok, :ok, :ok, :ok]
defmodule MermaidRenderer do
  @moduledoc """
  Renders an `Orchid.Recipe` as a Mermaid `graph TD` diagram: one node per
  step, dotted edges from input keys and thick edges to output keys.
  """

  def render(%Orchid.Recipe{} = recipe) do
    recipe.steps
    |> generate_mermaid()
    |> Kino.Mermaid.new()
  end

  defp generate_mermaid(steps) do
    header = "graph TD\n"

    body =
      steps
      |> Enum.with_index()
      |> Enum.flat_map(fn {step, idx} ->
        {impl, in_keys, out_keys, _opts} = Orchid.Step.ensure_full_step(step)

        step_id = "step_#{idx}"
        step_name = format_impl(impl)
        step_node = ~s|    #{step_id}["⚙️ #{step_name}"]:::stepClass|

        in_edges =
          in_keys
          |> normalize_keys()
          |> Enum.map(fn k -> ~s|    #{k}(["#{k}"]) -.-> #{step_id}| end)

        out_edges =
          out_keys
          |> normalize_keys()
          |> Enum.map(fn k -> ~s|    #{step_id} ==> #{k}(["#{k}"])| end)

        [step_node] ++ in_edges ++ out_edges
      end)
      # Shared data keys must appear only once even when used by many steps.
      |> Enum.uniq()
      |> Enum.join("\n")

    styles = """

        classDef stepClass fill:#2eb82e,stroke:#fff,stroke-width:2px,color:#fff;
    """

    header <> body <> styles
  end

  # Normalizes a step's key spec to a flat list of strings.
  # FIX: the export contained `&amp;to_string/1`; the valid capture is `&`.
  defp normalize_keys(keys) do
    keys
    |> Orchid.Step.ID.normalize_keys_to_set()
    |> MapSet.to_list()
    |> Enum.map(&to_string/1)
  end

  defp format_impl(impl) when is_function(impl), do: "Anonymous Function"

  defp format_impl(impl) do
    inspect(impl) |> String.replace_prefix("Elixir.", "")
  end
end
{:module, MermaidRenderer, <<70, 79, 82, 49, 0, 0, 19, ...>>, ...}

Run Workflow

# Register the ONNX sessions and build the pipeline recipe.
QixuanPipeline.load_models(model_root_path, model_config)

recipe = QixuanPipeline.build_recipe(model_config)

# Each word is {[{language, phoneme}, ...], duration, midi_note}.
# NOTE(review): "AP" appears to be a breath/aspiration token and MIDI 0 a
# rest — confirm against the voice bank's phoneme dictionary.
inputs = [
  %Orchid.Param{name: :words, payload: [
    {[{"zh", "AP"}], 10, 0},
    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60},

    {[{"zh", "AP"}], 1, 0},

    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60}
  ]},
]

Orchid.Livebook.GanttTracker.clear()

# Time the full synchronous run; the exported `&amp;` capture is restored to
# the valid `&` form.
{elapse, {:ok, results}} = :timer.tc(
  &Orchid.run/2,
  [recipe, inputs],
  :microsecond
)

require Logger
Logger.info "Used #{elapse / 1000}ms."


11:55:57.387 [info] Loading DiffSinger models...

11:55:57.390 [info] All models loaded successfully.

11:55:57.392 [debug] ** (RuntimeError) bad return value from Logger formatter Logger.Formatter, got [<<185, 220, 181, 192, 213, 253, 212, 218, 177, 187, 185, 216, 177, 213, 161, 163, 92, 114, 92, 110, 34, 41>>, "\e[0m", 10] after "\e[31m\n11:55:57.388 [error] Writer crashed (:\""
    (kernel 10.4.1) logger_h_common.erl:433: :logger_h_common.string_to_binary/1
    (kernel 10.4.1) logger_h_common.erl:399: :logger_h_common.do_log_to_binary/2
    (kernel 10.4.1) logger_h_common.erl:180: :logger_h_common.log/2
    (kernel 10.4.1) logger_backend.erl:54: :logger_backend.call_handlers/3
    (kernel 10.4.1) user_drv.erl:581: :user_drv.server/3
    (stdlib 7.1) gen_statem.erl:3748: :gen_statem.loop_state_callback/11
    (stdlib 7.1) proc_lib.erl:333: :proc_lib.init_p_do_apply/3


11:55:57.392 [info] init got unexpected: {:io_request, #PID<0.92.0>, #Reference<0.2652535936.353632260.93387>,
 {:put_chars, :unicode,
  "Failed to write log message to stdout, trying stderr\n"}}

11:55:57.576 [debug] [Symbiont] Inference took 7.987ms via provider [:cpu]

11:55:57.579 [debug] [Symbiont] Inference took 3.174ms via provider [:cpu]

11:55:57.909 [debug] [Symbiont] Inference took 3.686ms via provider [:cpu]

11:55:58.876 [debug] [Symbiont] Inference took 941.875ms via provider [:cpu]

11:55:59.022 [debug] [Symbiont] Inference took 3.891ms via provider [:cpu]

11:55:59.660 [debug] [Symbiont] Inference took 630.988ms via provider [:cpu]

11:56:03.021 [debug] [Symbiont] Inference took 2828.8ms via provider [:cpu]

11:56:04.527 [debug] [Symbiont] Inference took 1428.172ms via provider [:cpu]

11:56:04.600 [info] Used 7208.755ms.
:ok

Show Graph

MermaidRenderer.render recipe
graph TD
    step_0["⚙️ DurationPredictEncoder"]:::stepClass
    words(["words"]) -.-> step_0
    step_0 ==> word_division(["word_division"])
    step_0 ==> word_duration(["word_duration"])
    step_0 ==> duration_lang(["duration_lang"])
    step_0 ==> duration_phoneme(["duration_phoneme"])
    step_0 ==> duration_ph_midi(["duration_ph_midi"])
    step_1["⚙️ PredictDuration"]:::stepClass
    word_division(["word_division"]) -.-> step_1
    word_duration(["word_duration"]) -.-> step_1
    duration_lang(["duration_lang"]) -.-> step_1
    duration_phoneme(["duration_phoneme"]) -.-> step_1
    duration_ph_midi(["duration_ph_midi"]) -.-> step_1
    step_1 ==> phoneme_duration_predict(["phoneme_duration_predict"])
    step_2["⚙️ PitchPredictEncoder"]:::stepClass
    words(["words"]) -.-> step_2
    step_2 ==> pitch_lang(["pitch_lang"])
    step_2 ==> pitch_phoneme(["pitch_phoneme"])
    step_2 ==> word_duration_from_pitch(["word_duration_from_pitch"])
    step_2 ==> pitch_ph_midi(["pitch_ph_midi"])
    step_3["⚙️ PredictPitch"]:::stepClass
    phoneme_duration_predict(["phoneme_duration_predict"]) -.-> step_3
    pitch_lang(["pitch_lang"]) -.-> step_3
    pitch_phoneme(["pitch_phoneme"]) -.-> step_3
    pitch_ph_midi(["pitch_ph_midi"]) -.-> step_3
    step_3 ==> pitch_pred_midi(["pitch_pred_midi"])
    step_4["⚙️ MIDIToPitch"]:::stepClass
    pitch_pred_midi(["pitch_pred_midi"]) -.-> step_4
    step_4 ==> pitch_pred(["pitch_pred"])
    step_5["⚙️ VarianceEncoder"]:::stepClass
    words(["words"]) -.-> step_5
    step_5 ==> variance_lang(["variance_lang"])
    step_5 ==> variance_phoneme(["variance_phoneme"])
    step_5 ==> word_duration_from_variance(["word_duration_from_variance"])
    step_5 ==> variance_ph_midi(["variance_ph_midi"])
    step_6["⚙️ VarianceModel"]:::stepClass
    phoneme_duration_predict(["phoneme_duration_predict"]) -.-> step_6
    pitch_pred_midi(["pitch_pred_midi"]) -.-> step_6
    variance_lang(["variance_lang"]) -.-> step_6
    variance_phoneme(["variance_phoneme"]) -.-> step_6
    step_6 ==> voice_pred(["voice_pred"])
    step_6 ==> breathiness_pred(["breathiness_pred"])
    step_7["⚙️ Acoustic"]:::stepClass
    pitch_pred(["pitch_pred"]) -.-> step_7
    voice_pred(["voice_pred"]) -.-> step_7
    phoneme_duration_predict(["phoneme_duration_predict"]) -.-> step_7
    variance_lang(["variance_lang"]) -.-> step_7
    variance_phoneme(["variance_phoneme"]) -.-> step_7
    breathiness_pred(["breathiness_pred"]) -.-> step_7
    step_7 ==> mel(["mel"])
    step_8["⚙️ NSFHifiGAN_Vocoder"]:::stepClass
    pitch_pred(["pitch_pred"]) -.-> step_8
    mel(["mel"]) -.-> step_8
    step_8 ==> wave_tensor(["wave_tensor"])
    step_9["⚙️ TensorToWave"]:::stepClass
    wave_tensor(["wave_tensor"]) -.-> step_9
    step_9 ==> audio(["audio"])
    classDef stepClass fill:#2eb82e,stroke:#fff,stroke-width:2px,color:#fff;

Execution Timeline:

alias VegaLite, as: Vl

spans = Orchid.Livebook.GanttTracker.get_spans()
# Normalize timestamps so the chart starts at 0 ms. (The exported
# `&amp; &amp;1.start` capture is restored to valid `& &1.start`.)
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "step" => span.step,
      "start" => span.start - min_start,
      "end" => span.visual_end - min_start,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 600, height: 250, title: "Orchid Workflow Execution Timeline")
|> Vl.data_from_values(chart_data)
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step", type: :nominal, title: "Steps", sort: [field: "start", op: "min"], axis: [labelLimit: 400])
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Time Offset (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, title: "Status", scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Kino.VegaLite.new()

Prepare for the demo:

alias VegaLite, as: Vl


# First batch row of the predicted mel spectrogram, moved to the CPU backend.
mel_tensor =
  results.mel.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)

# One rect per (frame, mel-bin) cell for the heatmap layer.
mel_data =
  mel_tensor
  |> Nx.to_batched(1)
  |> Enum.with_index()
  |> Enum.flat_map(fn {row_tensor, frame_idx} ->
    row_tensor
    |> Nx.to_flat_list()
    |> Enum.with_index()
    |> Enum.map(fn {val, bin_idx} ->
      %{
        "x1" => frame_idx, "x2" => frame_idx + 1,
        "y1" => bin_idx, "y2" => bin_idx + 1,
        "value" => val
      }
    end)
  end)

# Per-frame f0 values for the pitch line layer.
f0_data =
  results.pitch_pred.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()
  |> Enum.with_index()
  |> Enum.map(fn {f0, frame} -> %{"frame" => frame, "f0" => f0} end)

durations = results.phoneme_duration_predict.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()

# Cumulative sums of the durations give the phoneme boundary frames.
boundaries = Enum.scan(durations, 0, fn dur, acc -> dur + acc end)
boundary_data = Enum.map(boundaries, fn b -> %{"frame" => b} end)


# Mel spectrogram heatmap.
mel_layer =
  Vl.new()
  |> Vl.data_from_values(mel_data)
  |> Vl.mark(:rect, tooltip: false, stroke: nil)
  |> Vl.encode_field(:x, "x1", type: :quantitative, title: "Time (Frames)")
  |> Vl.encode_field(:x2, "x2")
  |> Vl.encode_field(:y, "y1", type: :quantitative, title: "Mel Frequency Bin")
  |> Vl.encode_field(:y2, "y2")
  |> Vl.encode_field(:color, "value",
    type: :quantitative,
    scale: [scheme: "greys", reverse: true, domain: [-11.0, 2.0]],
    legend: false
  )

# Predicted f0 curve on a secondary (right-hand) axis.
f0_layer =
  Vl.new()
  |> Vl.data_from_values(f0_data)
  |> Vl.mark(:line, color: "#007bff", strokeWidth: 2)
  |> Vl.encode_field(:x, "frame", type: :quantitative)
  |> Vl.encode_field(:y, "f0",
    type: :quantitative,
    title: "Pitch (Hz)",
    axis: [orient: "right", titleColor: "#007bff"],
    scale: [domain: [50, 800]]
  )

# Dashed vertical rules at each phoneme boundary.
boundary_layer =
  Vl.new()
  |> Vl.data_from_values(boundary_data)
  |> Vl.mark(:rule, color: "red", strokeDash: [4, 4], strokeWidth: 1.5)
  |> Vl.encode_field(:x, "frame", type: :quantitative)

:ok
:ok

Demonstration

# Persist the synthesized audio. NOTE(review): TensorToWave already wrote
# the same bytes to this path as a side effect — this write is redundant.
File.write!("E:/final.wav", results.audio.payload)

# Play the result and overlay the three analysis layers built above.
Kino.Audio.new(results.audio.payload, :wav)
Vl.new(width: 800, height: 400, title: "Output Analysis (Mel Spectrum + F0 + Phoneme Duration)")
|> Vl.resolve(:scale, y: :independent)
|> Vl.layers([mel_layer, f0_layer, boundary_layer])
|> Kino.VegaLite.new()

Multiple Pipeline

Try running three pipeline instances concurrently.

# Restart/clear the tracker, then launch three identical pipeline runs
# concurrently to observe how the inference calls interleave.
Kino.start_child({Orchid.Livebook.GanttTracker,[]})
Orchid.Livebook.GanttTracker.clear()

pipeline_tasks = 
  for _ <- 1..3 do
    Task.async(fn ->
      # NOTE(review): MetaStore/BlobStore are bare module names here; the
      # ETS-backed adapters are only configured in the "Cache" section below.
      Orchid.run(recipe, inputs, [executor_and_opts: {Orchid.Executor.Async, []},
      baggage: %{
        meta_store: MetaStore,
        blob_store: BlobStore
      },
      global_hooks_stack: [OrchidStratum.BypassHook]
    ])
    end)
  end


# :timer.tc/1 on the wrapping fun returns {elapsed_microseconds, result}.
{elapse, _res} = fn -> Task.await_many(pipeline_tasks, :infinity) end
|> :timer.tc

# Est: T = 4 * Concurrent + 3
Logger.info "Total used #{elapse / 1000_000}s."

12:04:36.546 [debug] [Symbiont] Inference took 7.065ms via provider [:cpu]

12:04:36.551 [debug] [Symbiont] Inference took 4.096ms via provider [:cpu]

12:04:36.567 [debug] [Symbiont] Inference took 20.684ms via provider [:cpu]

12:04:36.571 [debug] [Symbiont] Inference took 19.046ms via provider [:cpu]

12:04:36.575 [debug] [Symbiont] Inference took 7.577ms via provider [:cpu]

12:04:36.587 [debug] [Symbiont] Inference took 20.275ms via provider [:cpu]

12:04:36.595 [debug] [Symbiont] Inference took 19.353ms via provider [:cpu]

12:04:36.596 [debug] [Symbiont] Inference took 8.294ms via provider [:cpu]

12:04:36.627 [debug] [Symbiont] Inference took 26.624ms via provider [:cpu]

12:04:37.782 [debug] [Symbiont] Inference took 1200.025ms via provider [:cpu]

12:04:37.793 [debug] [Symbiont] Inference took 10.137ms via provider [:cpu]

12:04:38.928 [debug] [Symbiont] Inference took 1121.996ms via provider [:cpu]

12:04:39.912 [debug] [Symbiont] Inference took 2130.534ms via provider [:cpu]

12:04:39.932 [debug] [Symbiont] Inference took 19.865ms via provider [:cpu]

12:04:42.291 [debug] [Symbiont] Inference took 2346.701ms via provider [:cpu]

12:04:42.834 [debug] [Symbiont] Inference took 2921.267ms via provider [:cpu]

12:04:42.854 [debug] [Symbiont] Inference took 19.353ms via provider [:cpu]

12:04:44.055 [debug] [Symbiont] Inference took 1187.43ms via provider [:cpu]

12:04:44.310 [debug] [Symbiont] Inference took 5380.813ms via provider [:cpu]

12:04:45.921 [debug] [Symbiont] Inference took 1610.854ms via provider [:cpu]

12:04:47.895 [debug] [Symbiont] Inference took 3584.819ms via provider [:cpu]

12:04:49.711 [debug] [Symbiont] Inference took 1815.347ms via provider [:cpu]

12:04:51.681 [debug] [Symbiont] Inference took 3786.035ms via provider [:cpu]

12:04:52.884 [debug] [Symbiont] Inference took 1203.302ms via provider [:cpu]

12:04:52.999 [info] Total used 16.460596s.
:ok
alias VegaLite, as: Vl

spans = Orchid.Livebook.GanttTracker.get_spans()
# Normalize timestamps so all runs share a common zero point. (The exported
# `&amp;` capture is restored to the valid `&` form.)
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "run_id" => span.run_id,
      "step" => span.step,
      "start" => (span.start - min_start) / 1000,
      # FIX: `span.end` dot-access trips over the reserved word `end`;
      # fetch the :end key explicitly instead.
      "end" => (Map.fetch!(span, :end) - min_start) / 1000,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 600, height: 250, title: "Concurrent Orchid Workflows")
|> Vl.data_from_values(chart_data)
|> Vl.encode_field(:row, "run_id", type: :nominal, title: "Pipeline Instance (PID)")
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step",
     type: :nominal,
     title: nil,
     sort: [field: "start", op: "min"],
     axis: [labelLimit: 300]
   )
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Global Time (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Vl.resolve(:scale, x: :shared)
|> Kino.VegaLite.new()

Cache

# ETS-backed stores: one for cached step metadata, one for payload blobs.
metaref = OrchidStratum.MetaStorage.EtsAdapter.init()
blobref = OrchidStratum.BlobStorage.EtsAdapter.init()

# Run options for the cached pipeline: async executor, the two ETS stores
# carried in the baggage, and the bypass hook that short-circuits cache hits.
cache_recipe_opts = [
  executor_and_opts: {Orchid.Executor.Async, []},
  baggage: %{
    meta_store: {OrchidStratum.MetaStorage.EtsAdapter, metaref},
    blob_store: {OrchidStratum.BlobStorage.EtsAdapter, blobref}
  },
  global_hooks_stack: [OrchidStratum.BypassHook]
]

# Walk every step of the recipe and switch caching on. Steps that already
# carry options additionally record their option keys via :cache_keys.
recipe_with_cache =
  Orchid.Recipe.walk(recipe.steps, fn
    {impl, input, output} ->
      {impl, input, output, [cache: true]}

    {impl, input, output, old_opts} ->
      {impl, input, output, old_opts ++ [cache: true, cache_keys: Keyword.keys(old_opts)]}
  end)

# Prewarm: the first run fills the cache so the benchmark below measures hits.
Orchid.run(recipe_with_cache, inputs, cache_recipe_opts)

defmodule BenchmarkRunner do
  @moduledoc """
  Compiled wrappers around `Orchid.run/3` so Benchee measures compiled
  function calls rather than Livebook-evaluated closures.
  """

  # Both names delegate to the same Orchid.run/3; they exist only so the
  # two benchmark scenarios are labelled at the call site.
  defdelegate run_without_cache(recipe, inputs, opts), to: Orchid, as: :run
  defdelegate run_with_cache(recipe_with_cache, inputs, opts), to: Orchid, as: :run
end

# Benchmark the cold pipeline against the cache-backed one. Each job runs
# the full Orchid pipeline; the cached variant should resolve almost
# entirely from the ETS stores populated by the prewarm run above.
jobs = %{
  "run_without_cache" => fn ->
    BenchmarkRunner.run_without_cache(
      recipe,
      inputs,
      executor_and_opts: {Orchid.Executor.Async, []}
    )
  end,
  "run_with_cache" => fn ->
    BenchmarkRunner.run_with_cache(recipe_with_cache, inputs, cache_recipe_opts)
  end
}

Benchee.run(jobs, time: 3, memory_time: 3, reduction_time: 2)

12:03:56.906 [debug] [Symbiont] Inference took 7.68ms via provider [:cpu]

12:03:56.911 [debug] [Symbiont] Inference took 4.403ms via provider [:cpu]

12:03:56.922 [debug] [Symbiont] Inference took 10.444ms via provider [:cpu]

12:03:57.875 [debug] [Symbiont] Inference took 933.888ms via provider [:cpu]

12:03:57.880 [debug] [Symbiont] Inference took 4.505ms via provider [:cpu]

12:03:58.527 [debug] [Symbiont] Inference took 637.644ms via provider [:cpu]

12:04:01.423 [debug] [Symbiont] Inference took 2896.281ms via provider [:cpu]

12:04:02.598 [debug] [Symbiont] Inference took 1173.504ms via provider [:cpu]
Error trying to determine erlang version enoent, falling back to overall OTP version
Warning: the benchmark run_with_cache is using an evaluated function.
  Evaluated functions perform slower than compiled functions.
  You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
  Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs

Warning: the benchmark run_without_cache is using an evaluated function.
  Evaluated functions perform slower than compiled functions.
  You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
  Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs

Operating System: Windows
CPU Information: 12th Gen Intel(R) Core(TM) i5-12400
Number of Available Cores: 12
Available memory: 31.77 GB
Elixir 1.19.3
Erlang 28
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 3 s
memory time: 3 s
reduction time: 2 s
parallel: 1
inputs: none specified
Estimated total run time: 20 s
Excluding outliers: false

Benchmarking run_with_cache ...
Benchmarking run_without_cache ...

12:04:13.067 [debug] [Symbiont] Inference took 8.499ms via provider [:cpu]

12:04:13.068 [debug] [Symbiont] Inference took 1.843ms via provider [:cpu]

12:04:13.074 [debug] [Symbiont] Inference took 5.427ms via provider [:cpu]

12:04:14.070 [debug] [Symbiont] Inference took 992.051ms via provider [:cpu]

12:04:14.081 [debug] [Symbiont] Inference took 10.444ms via provider [:cpu]

12:04:14.787 [debug] [Symbiont] Inference took 698.47ms via provider [:cpu]

12:04:17.627 [debug] [Symbiont] Inference took 2838.937ms via provider [:cpu]

12:04:18.782 [debug] [Symbiont] Inference took 1153.638ms via provider [:cpu]

12:04:18.872 [debug] [Symbiont] Inference took 7.065ms via provider [:cpu]

12:04:18.875 [debug] [Symbiont] Inference took 2.764ms via provider [:cpu]

12:04:18.893 [debug] [Symbiont] Inference took 17.1ms via provider [:cpu]

12:04:19.936 [debug] [Symbiont] Inference took 1034.649ms via provider [:cpu]

12:04:19.946 [debug] [Symbiont] Inference took 7.475ms via provider [:cpu]

12:04:20.582 [debug] [Symbiont] Inference took 629.555ms via provider [:cpu]

12:04:23.500 [debug] [Symbiont] Inference took 2916.556ms via provider [:cpu]

12:04:24.690 [debug] [Symbiont] Inference took 1189.683ms via provider [:cpu]

12:04:24.768 [debug] [Symbiont] Inference took 5.939ms via provider [:cpu]

12:04:24.770 [debug] [Symbiont] Inference took 1.74ms via provider [:cpu]

12:04:24.782 [debug] [Symbiont] Inference took 11.161ms via provider [:cpu]

12:04:25.754 [debug] [Symbiont] Inference took 966.86ms via provider [:cpu]

12:04:25.762 [debug] [Symbiont] Inference took 6.758ms via provider [:cpu]

12:04:26.397 [debug] [Symbiont] Inference took 625.766ms via provider [:cpu]

12:04:29.282 [debug] [Symbiont] Inference took 2883.993ms via provider [:cpu]

12:04:30.416 [debug] [Symbiont] Inference took 1133.875ms via provider [:cpu]

12:04:30.500 [debug] [Symbiont] Inference took 6.451ms via provider [:cpu]

12:04:30.503 [debug] [Symbiont] Inference took 1.843ms via provider [:cpu]

12:04:30.520 [debug] [Symbiont] Inference took 16.793ms via provider [:cpu]

12:04:31.579 [debug] [Symbiont] Inference took 1051.443ms via provider [:cpu]

12:04:31.587 [debug] [Symbiont] Inference took 6.451ms via provider [:cpu]

12:04:32.247 [debug] [Symbiont] Inference took 652.185ms via provider [:cpu]

12:04:35.146 [debug] [Symbiont] Inference took 2897.715ms via provider [:cpu]

12:04:36.295 [debug] [Symbiont] Inference took 1148.518ms via provider [:cpu]
Calculating statistics...
Formatting results...

Name                        ips        average  deviation         median         99th %
run_with_cache           718.93      0.00139 s   ±105.19%      0.00113 s      0.00363 s
run_without_cache         0.170         5.90 s     ±0.00%         5.90 s         5.90 s

Comparison: 
run_with_cache           718.93
run_without_cache         0.170 - 4239.66x slower +5.90 s

Memory usage statistics:

Name                      average  deviation         median         99th %
run_with_cache           72.39 KB     ±1.64%       73.34 KB       73.34 KB
run_without_cache        74.43 KB     ±0.00%       74.43 KB       74.43 KB

Comparison: 
run_with_cache           73.34 KB
run_without_cache        74.43 KB - 1.03x memory usage +2.04 KB

Reduction count statistics:

Name                      average  deviation         median         99th %
run_with_cache             8.73 K     ±0.18%         8.74 K         8.75 K
run_without_cache          8.87 K     ±0.00%         8.87 K         8.87 K

Comparison: 
run_with_cache             8.74 K
run_without_cache          8.87 K - 1.02x reduction count +0.140 K