DiffSingerElixirPoC
# Loosen Hex networking before installing: clear the package cache and raise
# the HTTP timeout so large downloads survive slow connections.
System.shell("mix hex.clean --all")
System.shell("mix hex.config http_timeout 120")

# If the download fails, you can comment out all the git repositories first,
# and then install them incrementally one by one
# after the hex packages are installed.
Mix.install([
  {:orchid, "~> 0.5", override: true},
  {:diff_singer, git: "https://github.com/GES233/DiffSinger.git"},
  {:orchid_stratum, git: "https://github.com/SynapticStrings/OrchidStratum.git"},
  {:kino, "~> 0.19.0"},
  {:vega_lite, "~> 0.1.8"},
  {:kino_vega_lite, "~> 0.1.11"},
  {:kino_benchee, "~> 0.1.0"}
])

# Log exceptions raised inside Orchid steps via telemetry.
:telemetry.attach(
  "orchid-step-exception-logger",
  [:orchid, :step, :exception],
  &Orchid.Runner.Hooks.Telemetry.error_handler/4,
  %{}
)
Fetch Model’s Metadata
Obtain the metadata and basic information of the sound library’s models as the skeleton for subsequent dependency graph construction.
Here, we take Qixuan v2.5.0 (for OpenUTAU), maintained by OpenVPI, as an example.
# If you have your local version,
# change the path to yours.
model_root_path = "E:/ProgramAssets/OpenUTAUSingers/Qixuan_v2.5.0_DiffSinger_OpenUtau"
# Reads the voice bank's overview metadata (model paths, dictionaries, ...)
# used below to build the recipe.
model_config = DiffSinger.VoiceBank.Config.fetch_overview(model_root_path)
:ok
:ok
Prelude Steps
This is a node that converts lyrics/MIDI into corresponding phoneme IDs and pitches (MIDI) based on a phoneme dictionary.
A primary reason for needing MIDI is to use its duration information as a reference for subsequent phoneme duration prediction.
A rasterized node will be implemented later to accommodate the processing of notes and phonemes.
defmodule CommonEncoder do
  @moduledoc """
  The lyrics and pitch are initially encoded for use in subsequent models.

  Each word is a `{phonemes, duration, midi_note}` tuple where `phonemes` is
  a list of `{language, phoneme}` pairs. Returns a 5-tuple of flat lists:
  language ids, phoneme token ids, phonemes-per-word counts, word durations,
  and a per-phoneme MIDI note.
  """

  # Fix: the original accumulated with `acc ++ list` inside the reduce,
  # which is O(n^2) over the whole input. Accumulate reversed and reverse
  # once at the end instead; output order is unchanged.
  def run_partial(%Orchid.Param{payload: words}, opts) do
    lang_dict = Keyword.fetch!(opts, :lang_dict)
    phoneme_dict = Keyword.fetch!(opts, :phoneme_dict)

    {langs, toks, wdivs, wdurs, midis} =
      Enum.reduce(
        words,
        {[], [], [], [], []},
        fn {phonemes, duration, midi_note}, {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} ->
          ph_count = length(phonemes)

          # Look up language and phoneme token ids in one pass.
          {curr_langs, curr_toks} =
            phonemes
            |> Enum.map(fn {lang, phone} -> {lang_dict[lang], phoneme_dict[phone]} end)
            |> Enum.unzip()

          {
            # `Enum.reverse(list, acc)` prepends `list` reversed onto `acc`.
            Enum.reverse(curr_langs, acc_l),
            Enum.reverse(curr_toks, acc_t),
            [ph_count | acc_wdiv],
            [duration | acc_wdur],
            # Every phoneme of a word carries the word's MIDI note.
            List.duplicate(midi_note, ph_count) ++ acc_midi
          }
        end
      )

    {
      Enum.reverse(langs),
      Enum.reverse(toks),
      Enum.reverse(wdivs),
      Enum.reverse(wdurs),
      Enum.reverse(midis)
    }
  end
end
{:module, CommonEncoder, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}
defmodule DurationPredictEncoder do
  use Orchid.Step

  # Wraps the shared word encoding into the five s64 tensors consumed by
  # the duration predictor: language ids, phoneme tokens, word division,
  # word duration, and per-phoneme MIDI.
  def run(param, opts) do
    {langs, toks, w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    outputs =
      [
        lang_map: langs,
        phoneme_map: toks,
        word_division: w_div,
        word_duration: w_dur,
        ph_midi: ph_midis
      ]
      |> Enum.map(fn {name, values} ->
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end)

    {:ok, outputs}
  end
end
{:module, DurationPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
defmodule PitchPredictEncoder do
  use Orchid.Step

  # Encodes words into the four s64 tensors consumed by the pitch
  # predictor; word-division counts are dropped here.
  def run(%Orchid.Param{} = param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    outputs =
      [
        lang_map: langs,
        phoneme_map: toks,
        word_duration: w_dur,
        ph_midi: ph_midis
      ]
      |> Enum.map(fn {name, values} ->
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end)

    {:ok, outputs}
  end
end
{:module, PitchPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
# Used for Variance model
defmodule VarianceEncoder do
  use Orchid.Step

  # Same encoding shape as PitchPredictEncoder, kept as a separate step so
  # the variance branch can use its own dictionaries via opts.
  def run(param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    outputs =
      [
        lang_map: langs,
        phoneme_map: toks,
        word_duration: w_dur,
        ph_midi: ph_midis
      ]
      |> Enum.map(fn {name, values} ->
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end)

    {:ok, outputs}
  end
end
{:module, VarianceEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
Variance Model
Predict Phoneme Duration:
defmodule PredictDuration do
  @behaviour Orchid.Symbiont.Step

  # Models this step needs from the Symbiont registry.
  def required, do: [:duration_linguistic, :duration_predict]

  # Two-stage phoneme duration prediction: a linguistic encoder followed by
  # the duration predictor, both called through Symbiont inference.
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: tokens},
          %Orchid.Param{payload: word_div},
          %Orchid.Param{payload: word_dur},
          %Orchid.Param{payload: ph_midi}
        ],
        %{duration_linguistic: linguistic, duration_predict: predictor},
        _opts
      ) do
    {:ok, {encoder_out, mask}} =
      Orchid.Symbiont.call(linguistic, {:infer, {tokens, languages, word_div, word_dur}}, :infinity)

    {:ok, {durations}} =
      Orchid.Symbiont.call(predictor, {:infer, {encoder_out, mask, ph_midi}}, :infinity)

    {:ok, [Orchid.Param.new(:ph_dur_pred, :encoder_out, durations)]}
  end
end
{:module, PredictDuration, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}
Predict Pitch:
defmodule PredictPitch do
  @behaviour Orchid.Symbiont.Step

  # Models this step needs from the Symbiont registry.
  def required, do: [:pitch_linguistic, :pitch_predict]

  # Two-stage pitch prediction: linguistic encoder, then the pitch
  # predictor run for `opts[:steps]` iterations (default 20).
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: phoneme_duration},
          %Orchid.Param{payload: note_midi}
        ], handlers, opts) do
    # Predicted durations are fractional; the models expect integer frame
    # counts, so round on the CPU (BinaryBackend) first.
    ph_dur =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.pitch_linguistic,
        {:infer, {phonemes, languages, ph_dur}},
        :infinity
      )

    # Total frame count sizes the frame-level conditioning tensors below.
    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # NOTE(review): seeding the RNG with wall-clock time makes every run
    # non-deterministic — confirm that is intended.
    key = Nx.Random.key(System.system_time())

    {pitch_noise, _key} =
      Nx.Random.normal(
        key,
        0.0,
        1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Neutral expressiveness (all 1.0) and a "retake everything" mask
    # (all ones).
    expr =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, total_frames})

    retake =
      Nx.broadcast(Nx.tensor(1, type: :u8), {1, total_frames})

    # MIDI value 0 marks a rest note.
    note_rest =
      note_midi
      |> Nx.equal(0)
      |> Nx.as_type(:u8)

    note_midi = note_midi |> Nx.as_type(:f32)
    # NOTE(review): note durations are fed as the rounded phoneme durations
    # here — confirm the model expects that rather than word durations.
    note_dur = ph_dur

    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    {:ok, {pitch_pred}} =
      Orchid.Symbiont.call(
        handlers.pitch_predict,
        {:infer,
         {
           encoder_out,
           ph_dur,
           note_midi,
           note_rest,
           note_dur,
           pitch_noise,
           expr,
           retake,
           steps
         }},
        :infinity
      )

    # Result is pitch in the MIDI domain; MIDIToPitch converts it to Hz.
    # NOTE(review): this returns a bare param while PredictDuration returns
    # a list — confirm the runner accepts both shapes.
    {:ok, Orchid.Param.new(:pitch_pred, :payload, pitch_pred)}
  end
end
{:module, PredictPitch, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}
defmodule MIDIToPitch do
  use Orchid.Step

  @moduledoc """
  Converts a MIDI-domain pitch curve to frequency in Hz:
  `f0 = 440 * 2^((midi - 69) / 12)`, with non-positive MIDI values
  (rests) mapped to 0.0 Hz.
  """

  def run(%Orchid.Param{payload: midi_pred}, _opts) do
    # Fix: the original transferred the same tensor to the CPU backend
    # twice (once at the head of the chain and again inside the select),
    # copying the full tensor needlessly. Transfer once and reuse.
    midi = Nx.backend_transfer(midi_pred, Nx.BinaryBackend)

    hz =
      midi
      |> Nx.subtract(69.0)
      |> Nx.divide(12.0)
      |> then(&Nx.pow(2.0, &1))
      |> Nx.multiply(440)

    # Negative MIDI values denote rests; force them to 0 Hz.
    f0 = Nx.select(Nx.less(midi, 0.0), Nx.tensor(0.0), hz)

    {:ok, Orchid.Param.new(:f0, :tensor, f0)}
  end
end
Other Variance Params:
defmodule VarianceModel do
  @behaviour Orchid.Symbiont.Step

  # Models this step needs from the Symbiont registry.
  def required, do: [:variance_linguistic, :variance]

  # Predicts frame-level breathiness and voicing curves from the encoded
  # phonemes and the (MIDI-domain) pitch curve.
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: ph_dur},
          %Orchid.Param{payload: pitch}
        ],
        handlers,
        opts
      ) do
    # Round fractional predicted durations to integer frame counts on CPU.
    ph_dur =
      ph_dur
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    # Fix: every other inference call in this pipeline passes :infinity;
    # this one relied on the callee's default timeout and could give up
    # while the model was still running.
    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.variance_linguistic,
        {:infer, {phonemes, languages, ph_dur}},
        :infinity
      )

    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # Independent noise draws for the two predicted curves; the key is
    # threaded through so the draws differ.
    key = Nx.Random.key(System.system_time())

    {breath_noise, key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    {voice_noise, _key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Re-predict both curves for every frame (all-ones retake mask,
    # one channel per predicted curve).
    retake =
      Nx.broadcast(
        Nx.tensor([1, 1], type: :u8),
        {1, total_frames, 2}
      )

    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    {:ok, {breath_pred, voice_pred}} =
      Orchid.Symbiont.call(
        handlers.variance,
        {:infer,
         {
           encoder_out,
           ph_dur,
           pitch,
           breath_noise,
           voice_noise,
           retake,
           steps
         }},
        :infinity
      )

    {:ok, [
      Orchid.Param.new(:breathiness, :payload, breath_pred),
      Orchid.Param.new(:voicing, :payload, voice_pred)
    ]}
  end
end
Acoustic Model and Vocoder
Acoustic Model:
defmodule Acoustic do
  @behaviour Orchid.Symbiont.Step

  # Model this step needs from the Symbiont registry.
  def required, do: [:acoustic]

  # Renders a mel spectrogram from phoneme, pitch, and variance
  # conditioning via the acoustic model.
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: phoneme_duration},
          %Orchid.Param{payload: pitch},
          %Orchid.Param{payload: breathiness},
          %Orchid.Param{payload: voicing}
        ],
        handlers,
        opts
      ) do
    # Round fractional predicted durations to integer frame counts on CPU.
    durations =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    # Frame-level controls sized from the pitch curve: neutral gender
    # (0.0) and default velocity (1.0).
    frames = Nx.axis_size(pitch, 1)

    gender =
      Nx.broadcast(Nx.tensor(0.0, type: :f32), {1, frames})

    velocity =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, frames})

    # Sampler configuration: iteration count and diffusion depth.
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    depth =
      opts
      |> Keyword.get(:depth, 1.0)
      |> Nx.tensor(type: :f32)

    {:ok, {mel}} =
      Orchid.Symbiont.call(
        handlers.acoustic,
        {:infer,
         {
           phonemes,
           languages,
           durations,
           pitch,
           breathiness,
           voicing,
           gender,
           velocity,
           depth,
           steps
         }},
        :infinity
      )

    {:ok, Orchid.Param.new(:mel, :payload, mel)}
  end
end
{:module, Acoustic, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}
Vocoder:
defmodule NSFHifiGAN_Vocoder do
  @behaviour Orchid.Symbiont.Step

  # Model this step needs from the Symbiont registry.
  def required, do: [:vocoder]

  # Renders the final waveform from the mel spectrogram and the f0 curve.
  def run_with_model(
        [%Orchid.Param{payload: mel}, %Orchid.Param{payload: f0}],
        handlers,
        _opts
      ) do
    # Fix: pass :infinity like every other inference call in this file —
    # the vocoder runs for seconds (see the session logs) and the call
    # previously relied on the callee's default timeout.
    {:ok, {audio}} = Orchid.Symbiont.call(handlers.vocoder, {:infer, {mel, f0}}, :infinity)
    {:ok, Orchid.Param.new(:audio, :payload, audio)}
  end
end
{:module, NSFHifiGAN_Vocoder, <<70, 79, 82, 49, 0, 0, 11, ...>>, ...}
Post-process (tensor to audio)
defmodule TensorToWave do
  use Orchid.Step

  @moduledoc """
  Converts a float waveform tensor into a 16-bit mono PCM WAV binary,
  writes it to disk, and returns it as the `:final` audio param.
  """

  # Kept as the default so existing callers still write the same file;
  # override with `output_path: path` in the step opts.
  @default_output_path "E:/final.wav"

  def run(%Orchid.Param{payload: wave_tensor}, opts) do
    # Scale [-1.0, 1.0] float samples to signed 16-bit integers and
    # serialize to raw PCM bytes.
    pcm_data =
      wave_tensor
      |> Nx.flatten()
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.multiply(32767.0)
      |> Nx.clip(-32768.0, 32767.0)
      |> Nx.as_type(:s16)
      |> Nx.to_binary()

    sample_rate = Keyword.get(opts, :sample_rate, 44100)
    wav = wav_header(byte_size(pcm_data), sample_rate) <> pcm_data

    # Generalization: output path is now an option (same default as the
    # previous hard-coded path, so behavior is unchanged for callers).
    File.write!(Keyword.get(opts, :output_path, @default_output_path), wav)

    {:ok, Orchid.Param.new(:final, :audio, wav)}
  end

  # Builds the canonical 44-byte RIFF/WAVE header for 16-bit mono PCM.
  defp wav_header(data_size, sample_rate) do
    byte_rate = sample_rate * 1 * 2 # sample_rate * channels * bytes_per_sample (Int16=2)
    file_size = 36 + data_size

    <<
      "RIFF", file_size::little-integer-size(32), "WAVE",
      "fmt ", 16::little-integer-size(32), # Subchunk1Size
      1::little-integer-size(16), # AudioFormat (1 = PCM)
      1::little-integer-size(16), # NumChannels (1 = Mono)
      sample_rate::little-integer-size(32), # SampleRate
      byte_rate::little-integer-size(32), # ByteRate
      2::little-integer-size(16), # BlockAlign (channels * 2)
      16::little-integer-size(16), # BitsPerSample (16 bits)
      "data", data_size::little-integer-size(32) # Subchunk2Size
    >>
  end
end
{:module, TensorToWave, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}
Build Pipeline and Prepare to Demo
defmodule QixuanPipeline do
  @moduledoc """
  Wires the Qixuan voice-bank models into an Orchid recipe:
  duration -> pitch -> variance -> acoustic -> vocoder.
  """

  require Logger

  @doc """
  Registers every model session with the Symbiont registry under a
  fixed name, resolving paths relative to the voice-bank root.
  """
  def load_models(model_root_path, model_config) do
    Logger.info("Loading DiffSinger models...")

    # {registry name, path segments relative to the voice-bank root}
    models = [
      {:duration_linguistic, model_config.predict_map.maybe_duration.linguistic.path},
      {:duration_predict, model_config.predict_map.maybe_duration.duration.path},
      {:pitch_linguistic, model_config.predict_map.maybe_pitch.linguistic.path},
      {:pitch_predict, model_config.predict_map.maybe_pitch.predict.path},
      {:variance_linguistic, model_config.variance.linguistic.path},
      {:variance, model_config.variance.variance.path},
      {:acoustic, model_config.acoustic.infer.path},
      {:vocoder, model_config.vocoder.path}
    ]

    for {name, rel_path} <- models do
      # rel_path is a list of path segments, hence the list concatenation.
      path = Path.join([model_root_path] ++ rel_path)
      :ok = Orchid.Symbiont.register(name, {Orchid.Symbiont.OrtexRunner, [name: name, path: path]})
    end

    Logger.info("All models loaded successfully.")
    :ok
  end

  @doc """
  Builds the synthesis recipe. Step tuples are
  `{impl, in_keys, out_keys, opts}`; the `injector` hook supplies the
  registered Symbiont model handles to each `run_with_model` step.
  """
  def build_recipe(model_config) do
    injector = [extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]]

    # Each sub-model ships its own phoneme/language dictionaries.
    dur_dict = model_config.predict_map.maybe_duration.phonemes
    pitch_dict = model_config.predict_map.maybe_pitch.phonemes
    var_dict = model_config.variance.phonemes
    sample_rate = model_config.vocoder.maybe_config["sample_rate"]

    duration_steps = [
      {DurationPredictEncoder, :words, [:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi], [lang_dict: dur_dict.maybe_lang_dict, phoneme_dict: dur_dict.phoneme_dict]},
      {PredictDuration, [:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi],
       :phoneme_duration_predict, injector}
    ]

    pitch_steps = [
      {PitchPredictEncoder, :words, [:pitch_lang, :pitch_phoneme, :word_duration_from_pitch, :pitch_ph_midi], [lang_dict: pitch_dict.maybe_lang_dict, phoneme_dict: pitch_dict.phoneme_dict]},
      {PredictPitch, [:pitch_lang, :pitch_phoneme, :phoneme_duration_predict, :pitch_ph_midi],
       :pitch_pred_midi, injector},
      # Converts the MIDI-domain prediction to Hz.
      {MIDIToPitch, :pitch_pred_midi, :pitch_pred}
    ]

    # NOTE(review): VarianceModel consumes the MIDI-domain :pitch_pred_midi
    # while Acoustic consumes the Hz-domain :pitch_pred — confirm intended.
    variance_steps = [
      {VarianceEncoder, :words, [:variance_lang, :variance_phoneme, :word_duration_from_variance, :variance_ph_midi], [lang_dict: var_dict.maybe_lang_dict, phoneme_dict: var_dict.phoneme_dict]},
      {VarianceModel, [:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred_midi], [:breathiness_pred, :voice_pred], injector}
    ]

    acoustic_step = [
      {Acoustic, [:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred, :breathiness_pred, :voice_pred],
       :mel, injector}
    ]

    vocoder_step = [
      {NSFHifiGAN_Vocoder, [:mel, :pitch_pred], :wave_tensor, injector},
      {TensorToWave, :wave_tensor, :audio, [sample_rate: sample_rate]}
    ]

    all_steps = duration_steps ++ pitch_steps ++ variance_steps ++ acoustic_step ++ vocoder_step
    Orchid.Recipe.new(all_steps)
  end
end
{:module, QixuanPipeline, <<70, 79, 82, 49, 0, 0, 39, ...>>, ...}
Add a tracker
defmodule Orchid.Livebook.GanttTracker do
  @moduledoc """
  Agent-backed telemetry sink that records `[:orchid, :step, *]` events
  and folds start/stop pairs into spans for Gantt-chart plotting.
  """

  use Agent

  def start_link(_) do
    Agent.start_link(fn -> [] end, name: __MODULE__)
  end

  @doc """
  Telemetry handler for `[:orchid, :step, status]` events.

  The first `$callers` entry is used as the run id so concurrent pipeline
  runs can be told apart; falls back to the handler's own pid when no
  caller chain is present.
  """
  def handle_event([:orchid, :step, status], measurements, meta, _config) do
    # Fix: `$callers` is unset outside Task/Livebook call chains, and the
    # original `List.first(Process.get(:"$callers"))` raised a
    # FunctionClauseError on nil in that case.
    run_id =
      case Process.get(:"$callers") do
        [caller | _] -> caller
        _ -> self()
      end

    run_id_str = inspect(run_id)
    # :start events carry an absolute system_time; stop events carry a
    # duration, both in native time units.
    time_val = if status == :start, do: measurements.system_time, else: measurements.duration

    Agent.update(__MODULE__, fn state ->
      [{run_id_str, status, time_val, meta} | state]
    end)
  end

  @doc """
  Folds the recorded events into completed spans (milliseconds).

  Start events are held in a pending map keyed by
  `{run_id, impl, in_keys, out_keys}` until the matching stop event
  arrives; stop events without a recorded start are dropped.
  """
  def get_spans() do
    events = Agent.get(__MODULE__, & &1) |> Enum.reverse()

    {_, spans} =
      Enum.reduce(events, {%{}, []}, fn
        {run_id, :start, sys_time, meta}, {pending, spans} ->
          start_ms = System.convert_time_unit(sys_time, :native, :microsecond) / 1000.0
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}
          {Map.put(pending, sig, start_ms), spans}

        {run_id, status, duration, meta}, {pending, spans} when status in [:done, :exception, :special] ->
          duration_ms = System.convert_time_unit(duration, :native, :microsecond) / 1000.0
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}

          case Map.pop(pending, sig) do
            {start_ms, new_pending} when not is_nil(start_ms) ->
              impl_name =
                if is_function(meta.impl),
                  do: "Anonymous",
                  else: inspect(meta.impl) |> String.split(".") |> List.last()

              out_keys = inspect(meta.out_keys) |> String.slice(0..14) |> Kernel.<>("...")

              span = %{
                run_id: run_id,
                step: "#{impl_name}(#{out_keys})",
                start: start_ms,
                end: start_ms + duration_ms,
                # Pads very short steps so they stay visible when charted.
                visual_end: start_ms + max(duration_ms, 10.0),
                duration: duration_ms,
                status: status
              }

              {new_pending, [span | spans]}

            # Stop event without a matching start: ignore it.
            {nil, ^pending} ->
              {pending, spans}
          end
      end)

    spans
  end

  def clear(), do: Agent.update(__MODULE__, fn _ -> [] end)
end
{:module, Orchid.Livebook.GanttTracker, <<70, 79, 82, 49, 0, 0, 29, ...>>, ...}
# Start the tracker under Kino's supervisor so it survives cell
# re-evaluation.
Kino.start_child({Orchid.Livebook.GanttTracker, []})

# One telemetry handler per step lifecycle event.
events = [
  {[:orchid, :step, :start], "start"},
  {[:orchid, :step, :done], "done"},
  {[:orchid, :step, :exception], "exception"},
  {[:orchid, :step, :special], "special"}
]

for {event, suffix} <- events do
  :telemetry.attach(
    "orchid-gantt-#{suffix}",
    event,
    &Orchid.Livebook.GanttTracker.handle_event/4,
    nil
  )
end
[:ok, :ok, :ok, :ok]
defmodule MermaidRenderer do
  @moduledoc """
  Renders an `Orchid.Recipe` as a Mermaid `graph TD` diagram in Livebook.

  Steps become rectangular nodes; every in/out key becomes a rounded
  node, with dashed edges for inputs and thick edges for outputs.
  """

  def render(%Orchid.Recipe{} = recipe) do
    mermaid_code = generate_mermaid(recipe.steps)
    Kino.Mermaid.new(mermaid_code)
  end

  # Builds the Mermaid source: header, deduplicated node/edge lines, and
  # a class definition for step styling.
  # NOTE(review): `body` carries no trailing newline before `styles` is
  # appended — confirm Mermaid parses the concatenated output.
  defp generate_mermaid(steps) do
    header = "graph TD\n"

    body =
      steps
      |> Enum.with_index()
      |> Enum.flat_map(fn {step, idx} ->
        # Normalizes 3-tuple steps into the canonical 4-tuple shape.
        {impl, in_keys, out_keys, _opts} = Orchid.Step.ensure_full_step(step)
        step_id = "step_#{idx}"
        step_name = format_impl(impl)
        step_node = ~s| #{step_id}["⚙️ #{step_name}"]:::stepClass|

        in_edges =
          in_keys
          |> normalize_keys()
          |> Enum.map(fn k -> ~s| #{k}(["#{k}"]) -.-> #{step_id}| end)

        out_edges =
          out_keys
          |> normalize_keys()
          |> Enum.map(fn k -> ~s| #{step_id} ==> #{k}(["#{k}"])| end)

        [step_node] ++ in_edges ++ out_edges
      end)
      # Shared keys appear for several steps; uniq keeps each line once.
      |> Enum.uniq()
      |> Enum.join("\n")

    styles = """
    classDef stepClass fill:#2eb82e,stroke:#fff,stroke-width:2px,color:#fff;
    """

    header <> body <> styles
  end

  # Collapses any key spec (single atom or list) into a deduplicated list
  # of strings.
  defp normalize_keys(keys) do
    keys
    |> Orchid.Step.ID.normalize_keys_to_set()
    |> MapSet.to_list()
    |> Enum.map(&to_string/1)
  end

  defp format_impl(impl) when is_function(impl), do: "Anonymous Function"

  defp format_impl(impl) do
    inspect(impl) |> String.replace_prefix("Elixir.", "")
  end
end
{:module, MermaidRenderer, <<70, 79, 82, 49, 0, 0, 19, ...>>, ...}
Run Workflow
# Register the model sessions and build the recipe.
QixuanPipeline.load_models(model_root_path, model_config)
recipe = QixuanPipeline.build_recipe(model_config)

# A single :words param carrying the score. Each word is
# `{[{language, phoneme}, ...], duration, midi_note}`; "AP" with MIDI 0
# is a breath/rest mark. (Duration unit is presumably frames — confirm.)
inputs = [
  %Orchid.Param{name: :words, payload: [
    {[{"zh", "AP"}], 10, 0},
    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60},
    {[{"zh", "AP"}], 1, 0},
    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60}
  ]}
]
# Fix: removed the trailing comma after the single list element — Elixir
# does not allow trailing commas in list literals.

Orchid.Livebook.GanttTracker.clear()

# Time the full run; :timer.tc/3 reports in the requested unit (µs).
{elapse, {:ok, results}} = :timer.tc(
  &Orchid.run/2,
  [recipe, inputs],
  :microsecond
)

require Logger
Logger.info "Used #{elapse / 1000}ms."
11:55:57.387 [info] Loading DiffSinger models...
11:55:57.390 [info] All models loaded successfully.
11:55:57.392 [debug] ** (RuntimeError) bad return value from Logger formatter Logger.Formatter, got [<<185, 220, 181, 192, 213, 253, 212, 218, 177, 187, 185, 216, 177, 213, 161, 163, 92, 114, 92, 110, 34, 41>>, "\e[0m", 10] after "\e[31m\n11:55:57.388 [error] Writer crashed (:\""
(kernel 10.4.1) logger_h_common.erl:433: :logger_h_common.string_to_binary/1
(kernel 10.4.1) logger_h_common.erl:399: :logger_h_common.do_log_to_binary/2
(kernel 10.4.1) logger_h_common.erl:180: :logger_h_common.log/2
(kernel 10.4.1) logger_backend.erl:54: :logger_backend.call_handlers/3
(kernel 10.4.1) user_drv.erl:581: :user_drv.server/3
(stdlib 7.1) gen_statem.erl:3748: :gen_statem.loop_state_callback/11
(stdlib 7.1) proc_lib.erl:333: :proc_lib.init_p_do_apply/3
11:55:57.392 [info] init got unexpected: {:io_request, #PID<0.92.0>, #Reference<0.2652535936.353632260.93387>,
{:put_chars, :unicode,
"Failed to write log message to stdout, trying stderr\n"}}
11:55:57.576 [debug] [Symbiont] Inference took 7.987ms via provider [:cpu]
11:55:57.579 [debug] [Symbiont] Inference took 3.174ms via provider [:cpu]
11:55:57.909 [debug] [Symbiont] Inference took 3.686ms via provider [:cpu]
11:55:58.876 [debug] [Symbiont] Inference took 941.875ms via provider [:cpu]
11:55:59.022 [debug] [Symbiont] Inference took 3.891ms via provider [:cpu]
11:55:59.660 [debug] [Symbiont] Inference took 630.988ms via provider [:cpu]
11:56:03.021 [debug] [Symbiont] Inference took 2828.8ms via provider [:cpu]
11:56:04.527 [debug] [Symbiont] Inference took 1428.172ms via provider [:cpu]
11:56:04.600 [info] Used 7208.755ms.
:ok
Show Graph
MermaidRenderer.render recipe
graph TD
step_0["⚙️ DurationPredictEncoder"]:::stepClass
words(["words"]) -.-> step_0
step_0 ==> word_division(["word_division"])
step_0 ==> word_duration(["word_duration"])
step_0 ==> duration_lang(["duration_lang"])
step_0 ==> duration_phoneme(["duration_phoneme"])
step_0 ==> duration_ph_midi(["duration_ph_midi"])
step_1["⚙️ PredictDuration"]:::stepClass
word_division(["word_division"]) -.-> step_1
word_duration(["word_duration"]) -.-> step_1
duration_lang(["duration_lang"]) -.-> step_1
duration_phoneme(["duration_phoneme"]) -.-> step_1
duration_ph_midi(["duration_ph_midi"]) -.-> step_1
step_1 ==> phoneme_duration_predict(["phoneme_duration_predict"])
step_2["⚙️ PitchPredictEncoder"]:::stepClass
words(["words"]) -.-> step_2
step_2 ==> pitch_lang(["pitch_lang"])
step_2 ==> pitch_phoneme(["pitch_phoneme"])
step_2 ==> word_duration_from_pitch(["word_duration_from_pitch"])
step_2 ==> pitch_ph_midi(["pitch_ph_midi"])
step_3["⚙️ PredictPitch"]:::stepClass
phoneme_duration_predict(["phoneme_duration_predict"]) -.-> step_3
pitch_lang(["pitch_lang"]) -.-> step_3
pitch_phoneme(["pitch_phoneme"]) -.-> step_3
pitch_ph_midi(["pitch_ph_midi"]) -.-> step_3
step_3 ==> pitch_pred_midi(["pitch_pred_midi"])
step_4["⚙️ MIDIToPitch"]:::stepClass
pitch_pred_midi(["pitch_pred_midi"]) -.-> step_4
step_4 ==> pitch_pred(["pitch_pred"])
step_5["⚙️ VarianceEncoder"]:::stepClass
words(["words"]) -.-> step_5
step_5 ==> variance_lang(["variance_lang"])
step_5 ==> variance_phoneme(["variance_phoneme"])
step_5 ==> word_duration_from_variance(["word_duration_from_variance"])
step_5 ==> variance_ph_midi(["variance_ph_midi"])
step_6["⚙️ VarianceModel"]:::stepClass
phoneme_duration_predict(["phoneme_duration_predict"]) -.-> step_6
pitch_pred_midi(["pitch_pred_midi"]) -.-> step_6
variance_lang(["variance_lang"]) -.-> step_6
variance_phoneme(["variance_phoneme"]) -.-> step_6
step_6 ==> voice_pred(["voice_pred"])
step_6 ==> breathiness_pred(["breathiness_pred"])
step_7["⚙️ Acoustic"]:::stepClass
pitch_pred(["pitch_pred"]) -.-> step_7
voice_pred(["voice_pred"]) -.-> step_7
phoneme_duration_predict(["phoneme_duration_predict"]) -.-> step_7
variance_lang(["variance_lang"]) -.-> step_7
variance_phoneme(["variance_phoneme"]) -.-> step_7
breathiness_pred(["breathiness_pred"]) -.-> step_7
step_7 ==> mel(["mel"])
step_8["⚙️ NSFHifiGAN_Vocoder"]:::stepClass
pitch_pred(["pitch_pred"]) -.-> step_8
mel(["mel"]) -.-> step_8
step_8 ==> wave_tensor(["wave_tensor"])
step_9["⚙️ TensorToWave"]:::stepClass
wave_tensor(["wave_tensor"]) -.-> step_9
step_9 ==> audio(["audio"])
classDef stepClass fill:#2eb82e,stroke:#fff,stroke-width:2px,color:#fff;
Execution Timeline:
alias VegaLite, as: Vl

# Plot the recorded spans as a Gantt chart, normalizing all times to an
# offset from the earliest recorded start.
spans = Orchid.Livebook.GanttTracker.get_spans()
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "step" => span.step,
      "start" => span.start - min_start,
      # visual_end pads very short steps so they stay visible.
      "end" => span.visual_end - min_start,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 600, height: 250, title: "Orchid Workflow Execution Timeline")
|> Vl.data_from_values(chart_data)
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step", type: :nominal, title: "Steps", sort: [field: "start", op: "min"], axis: [labelLimit: 400])
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Time Offset (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, title: "Status", scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Kino.VegaLite.new()
Prepare for the demo:
alias VegaLite, as: Vl

# Move the predicted mel off its backend for host-side iteration.
mel_tensor =
  results.mel.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)

# One heat-map cell per (frame, mel-bin) pair.
mel_data =
  mel_tensor
  |> Nx.to_batched(1)
  |> Enum.with_index()
  |> Enum.flat_map(fn {row_tensor, frame_idx} ->
    row_tensor
    |> Nx.to_flat_list()
    |> Enum.with_index()
    |> Enum.map(fn {val, bin_idx} ->
      %{
        "x1" => frame_idx, "x2" => frame_idx + 1,
        "y1" => bin_idx, "y2" => bin_idx + 1,
        "value" => val
      }
    end)
  end)

# Pitch curve in Hz, one sample per frame.
f0_data =
  results.pitch_pred.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()
  |> Enum.with_index()
  |> Enum.map(fn {f0, frame} -> %{"frame" => frame, "f0" => f0} end)

# Cumulative phoneme durations mark the phoneme boundary frames.
durations = results.phoneme_duration_predict.payload[0]
|> Nx.backend_transfer(Nx.BinaryBackend)
|> Nx.to_flat_list()

boundaries = Enum.scan(durations, 0, fn dur, acc -> dur + acc end)
boundary_data = Enum.map(boundaries, fn b -> %{"frame" => b} end)

# Layer 1: grayscale mel spectrogram.
mel_layer =
  Vl.new()
  |> Vl.data_from_values(mel_data)
  |> Vl.mark(:rect, tooltip: false, stroke: nil)
  |> Vl.encode_field(:x, "x1", type: :quantitative, title: "Time (Frames)")
  |> Vl.encode_field(:x2, "x2")
  |> Vl.encode_field(:y, "y1", type: :quantitative, title: "Mel Frequency Bin")
  |> Vl.encode_field(:y2, "y2")
  |> Vl.encode_field(:color, "value",
    type: :quantitative,
    scale: [scheme: "greys", reverse: true, domain: [-11.0, 2.0]],
    legend: false
  )

# Layer 2: blue f0 line on its own right-hand axis.
f0_layer =
  Vl.new()
  |> Vl.data_from_values(f0_data)
  |> Vl.mark(:line, color: "#007bff", strokeWidth: 2)
  |> Vl.encode_field(:x, "frame", type: :quantitative)
  |> Vl.encode_field(:y, "f0",
    type: :quantitative,
    title: "Pitch (Hz)",
    axis: [orient: "right", titleColor: "#007bff"],
    scale: [domain: [50, 800]]
  )

# Layer 3: dashed red rules at phoneme boundaries.
boundary_layer =
  Vl.new()
  |> Vl.data_from_values(boundary_data)
  |> Vl.mark(:rule, color: "red", strokeDash: [4, 4], strokeWidth: 1.5)
  |> Vl.encode_field(:x, "frame", type: :quantitative)

:ok
:ok
Demonstration
# Persist and play the synthesized audio.
# NOTE(review): TensorToWave already writes this same path inside the
# pipeline — this write is redundant; confirm whether both are wanted.
File.write!("E:/final.wav", results.audio.payload)
Kino.Audio.new(results.audio.payload, :wav)

# Overlay the three analysis layers built in the previous cell.
Vl.new(width: 800, height: 400, title: "Output Analysis (Mel Spectrum + F0 + Phoneme Duration)")
|> Vl.resolve(:scale, y: :independent)
|> Vl.layers([mel_layer, f0_layer, boundary_layer])
|> Kino.VegaLite.new()
Multiple Pipeline
Try running several pipeline instances concurrently.
# Restart the tracker and launch three pipeline runs concurrently.
Kino.start_child({Orchid.Livebook.GanttTracker, []})
Orchid.Livebook.GanttTracker.clear()

pipeline_tasks =
  for _ <- 1..3 do
    Task.async(fn ->
      Orchid.run(recipe, inputs, [executor_and_opts: {Orchid.Executor.Async, []},
        # NOTE(review): MetaStore/BlobStore are bare atoms here, while the
        # cache section below passes {adapter, ref} tuples — confirm which
        # shape the hooks expect.
        baggage: %{
          meta_store: MetaStore,
          blob_store: BlobStore
        },
        global_hooks_stack: [OrchidStratum.BypassHook]
      ])
    end)
  end

# :timer.tc/1 returns {elapsed_microseconds, result}.
{elapse, _res} = fn -> Task.await_many(pipeline_tasks, :infinity) end
|> :timer.tc

# Estimate: T = 4 * Concurrent + 3 (seconds)
Logger.info "Total used #{elapse / 1000_000}s."
12:04:36.546 [debug] [Symbiont] Inference took 7.065ms via provider [:cpu]
12:04:36.551 [debug] [Symbiont] Inference took 4.096ms via provider [:cpu]
12:04:36.567 [debug] [Symbiont] Inference took 20.684ms via provider [:cpu]
12:04:36.571 [debug] [Symbiont] Inference took 19.046ms via provider [:cpu]
12:04:36.575 [debug] [Symbiont] Inference took 7.577ms via provider [:cpu]
12:04:36.587 [debug] [Symbiont] Inference took 20.275ms via provider [:cpu]
12:04:36.595 [debug] [Symbiont] Inference took 19.353ms via provider [:cpu]
12:04:36.596 [debug] [Symbiont] Inference took 8.294ms via provider [:cpu]
12:04:36.627 [debug] [Symbiont] Inference took 26.624ms via provider [:cpu]
12:04:37.782 [debug] [Symbiont] Inference took 1200.025ms via provider [:cpu]
12:04:37.793 [debug] [Symbiont] Inference took 10.137ms via provider [:cpu]
12:04:38.928 [debug] [Symbiont] Inference took 1121.996ms via provider [:cpu]
12:04:39.912 [debug] [Symbiont] Inference took 2130.534ms via provider [:cpu]
12:04:39.932 [debug] [Symbiont] Inference took 19.865ms via provider [:cpu]
12:04:42.291 [debug] [Symbiont] Inference took 2346.701ms via provider [:cpu]
12:04:42.834 [debug] [Symbiont] Inference took 2921.267ms via provider [:cpu]
12:04:42.854 [debug] [Symbiont] Inference took 19.353ms via provider [:cpu]
12:04:44.055 [debug] [Symbiont] Inference took 1187.43ms via provider [:cpu]
12:04:44.310 [debug] [Symbiont] Inference took 5380.813ms via provider [:cpu]
12:04:45.921 [debug] [Symbiont] Inference took 1610.854ms via provider [:cpu]
12:04:47.895 [debug] [Symbiont] Inference took 3584.819ms via provider [:cpu]
12:04:49.711 [debug] [Symbiont] Inference took 1815.347ms via provider [:cpu]
12:04:51.681 [debug] [Symbiont] Inference took 3786.035ms via provider [:cpu]
12:04:52.884 [debug] [Symbiont] Inference took 1203.302ms via provider [:cpu]
12:04:52.999 [info] Total used 16.460596s.
:ok
alias VegaLite, as: Vl

# Per-run Gantt chart of the concurrent pipelines, normalized to seconds
# from the earliest recorded start.
spans = Orchid.Livebook.GanttTracker.get_spans()
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "run_id" => span.run_id,
      "step" => span.step,
      "start" => (span.start - min_start) / 1000,
      # Fix: `span.end` is a syntax error — `end` is a reserved word and
      # cannot follow the dot operator; the :end key needs bracket access.
      "end" => (span[:end] - min_start) / 1000,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 600, height: 250, title: "Concurrent Orchid Workflows")
|> Vl.data_from_values(chart_data)
|> Vl.encode_field(:row, "run_id", type: :nominal, title: "Pipeline Instance (PID)")
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step",
  type: :nominal,
  title: nil,
  sort: [field: "start", op: "min"],
  axis: [labelLimit: 300]
)
# Fix: values above are divided down to seconds, so label the axis in
# seconds rather than milliseconds.
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Global Time (s)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Vl.resolve(:scale, x: :shared)
|> Kino.VegaLite.new()
Cache
# ETS-backed meta/blob stores for step-level result caching.
metaref = OrchidStratum.MetaStorage.EtsAdapter.init()
blobref = OrchidStratum.BlobStorage.EtsAdapter.init()

cache_recipe_opts = [
  executor_and_opts: {Orchid.Executor.Async, []},
  baggage: %{
    meta_store: {OrchidStratum.MetaStorage.EtsAdapter, metaref},
    blob_store: {OrchidStratum.BlobStorage.EtsAdapter, blobref}
  },
  global_hooks_stack: [OrchidStratum.BypassHook]
]

# Mark every step cacheable; existing step opts contribute to the key.
recipe_with_cache = Orchid.Recipe.walk(
  recipe.steps,
  fn step ->
    case step do
      {impl, i, o} -> {impl, i, o, [cache: true]}
      {impl, i, o, old_opts} ->
        {impl, i, o, old_opts ++
          [cache: true,
           # NOTE(review): cache_keys lists only the option *names* —
           # confirm the cache does not need the option values in the key.
           cache_keys: Keyword.keys(old_opts)
          ]
        }
    end
  end
)

# Prewarm
Orchid.run(recipe_with_cache, inputs, cache_recipe_opts)
defmodule BenchmarkRunner do
  @moduledoc """
  Compiled wrappers around `Orchid.run/3` so Benchee measures a module
  function call instead of an evaluated closure.
  """

  def run_without_cache(recipe, inputs, opts), do: Orchid.run(recipe, inputs, opts)

  def run_with_cache(recipe_with_cache, inputs, opts),
    do: Orchid.run(recipe_with_cache, inputs, opts)
end
# Compare cached vs. uncached runs; cached runs skip all model inference
# and are expected to be orders of magnitude faster.
Benchee.run(
  %{
    "run_without_cache" => fn ->
      BenchmarkRunner.run_without_cache(recipe, inputs, [executor_and_opts: {Orchid.Executor.Async, []}]) end,
    "run_with_cache" => fn ->
      BenchmarkRunner.run_with_cache(recipe_with_cache, inputs, cache_recipe_opts) end
  },
  time: 3,
  memory_time: 3,
  reduction_time: 2
)
12:03:56.906 [debug] [Symbiont] Inference took 7.68ms via provider [:cpu]
12:03:56.911 [debug] [Symbiont] Inference took 4.403ms via provider [:cpu]
12:03:56.922 [debug] [Symbiont] Inference took 10.444ms via provider [:cpu]
12:03:57.875 [debug] [Symbiont] Inference took 933.888ms via provider [:cpu]
12:03:57.880 [debug] [Symbiont] Inference took 4.505ms via provider [:cpu]
12:03:58.527 [debug] [Symbiont] Inference took 637.644ms via provider [:cpu]
12:04:01.423 [debug] [Symbiont] Inference took 2896.281ms via provider [:cpu]
12:04:02.598 [debug] [Symbiont] Inference took 1173.504ms via provider [:cpu]
Error trying to determine erlang version enoent, falling back to overall OTP version
Warning: the benchmark run_with_cache is using an evaluated function.
Evaluated functions perform slower than compiled functions.
You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs
Warning: the benchmark run_without_cache is using an evaluated function.
Evaluated functions perform slower than compiled functions.
You can move the Benchee caller to a function in a module and invoke `Mod.fun()` instead.
Alternatively, you can move the benchmark into a benchmark.exs file and run mix run benchmark.exs
Operating System: Windows
CPU Information: 12th Gen Intel(R) Core(TM) i5-12400
Number of Available Cores: 12
Available memory: 31.77 GB
Elixir 1.19.3
Erlang 28
JIT enabled: true
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 3 s
memory time: 3 s
reduction time: 2 s
parallel: 1
inputs: none specified
Estimated total run time: 20 s
Excluding outliers: false
Benchmarking run_with_cache ...
Benchmarking run_without_cache ...
12:04:13.067 [debug] [Symbiont] Inference took 8.499ms via provider [:cpu]
12:04:13.068 [debug] [Symbiont] Inference took 1.843ms via provider [:cpu]
12:04:13.074 [debug] [Symbiont] Inference took 5.427ms via provider [:cpu]
12:04:14.070 [debug] [Symbiont] Inference took 992.051ms via provider [:cpu]
12:04:14.081 [debug] [Symbiont] Inference took 10.444ms via provider [:cpu]
12:04:14.787 [debug] [Symbiont] Inference took 698.47ms via provider [:cpu]
12:04:17.627 [debug] [Symbiont] Inference took 2838.937ms via provider [:cpu]
12:04:18.782 [debug] [Symbiont] Inference took 1153.638ms via provider [:cpu]
12:04:18.872 [debug] [Symbiont] Inference took 7.065ms via provider [:cpu]
12:04:18.875 [debug] [Symbiont] Inference took 2.764ms via provider [:cpu]
12:04:18.893 [debug] [Symbiont] Inference took 17.1ms via provider [:cpu]
12:04:19.936 [debug] [Symbiont] Inference took 1034.649ms via provider [:cpu]
12:04:19.946 [debug] [Symbiont] Inference took 7.475ms via provider [:cpu]
12:04:20.582 [debug] [Symbiont] Inference took 629.555ms via provider [:cpu]
12:04:23.500 [debug] [Symbiont] Inference took 2916.556ms via provider [:cpu]
12:04:24.690 [debug] [Symbiont] Inference took 1189.683ms via provider [:cpu]
12:04:24.768 [debug] [Symbiont] Inference took 5.939ms via provider [:cpu]
12:04:24.770 [debug] [Symbiont] Inference took 1.74ms via provider [:cpu]
12:04:24.782 [debug] [Symbiont] Inference took 11.161ms via provider [:cpu]
12:04:25.754 [debug] [Symbiont] Inference took 966.86ms via provider [:cpu]
12:04:25.762 [debug] [Symbiont] Inference took 6.758ms via provider [:cpu]
12:04:26.397 [debug] [Symbiont] Inference took 625.766ms via provider [:cpu]
12:04:29.282 [debug] [Symbiont] Inference took 2883.993ms via provider [:cpu]
12:04:30.416 [debug] [Symbiont] Inference took 1133.875ms via provider [:cpu]
12:04:30.500 [debug] [Symbiont] Inference took 6.451ms via provider [:cpu]
12:04:30.503 [debug] [Symbiont] Inference took 1.843ms via provider [:cpu]
12:04:30.520 [debug] [Symbiont] Inference took 16.793ms via provider [:cpu]
12:04:31.579 [debug] [Symbiont] Inference took 1051.443ms via provider [:cpu]
12:04:31.587 [debug] [Symbiont] Inference took 6.451ms via provider [:cpu]
12:04:32.247 [debug] [Symbiont] Inference took 652.185ms via provider [:cpu]
12:04:35.146 [debug] [Symbiont] Inference took 2897.715ms via provider [:cpu]
12:04:36.295 [debug] [Symbiont] Inference took 1148.518ms via provider [:cpu]
Calculating statistics...
Formatting results...
Name ips average deviation median 99th %
run_with_cache 718.93 0.00139 s ±105.19% 0.00113 s 0.00363 s
run_without_cache 0.170 5.90 s ±0.00% 5.90 s 5.90 s
Comparison:
run_with_cache 718.93
run_without_cache 0.170 - 4239.66x slower +5.90 s
Memory usage statistics:
Name average deviation median 99th %
run_with_cache 72.39 KB ±1.64% 73.34 KB 73.34 KB
run_without_cache 74.43 KB ±0.00% 74.43 KB 74.43 KB
Comparison:
run_with_cache 73.34 KB
run_without_cache 74.43 KB - 1.03x memory usage +2.04 KB
Reduction count statistics:
Name average deviation median 99th %
run_with_cache 8.73 K ±0.18% 8.74 K 8.75 K
run_without_cache 8.87 K ±0.00% 8.87 K 8.87 K
Comparison:
run_with_cache 8.74 K
run_without_cache 8.87 K - 1.02x reduction count +0.140 K