DiffSingerElixirPoC
# Install the PoC dependencies: the DiffSinger Elixir port itself, plus
# Kino and VegaLite for rendering audio/charts inside Livebook.
Mix.install([
  {:diff_singer, git: "https://github.com/GES233/DiffSinger.git"},
  {:kino, "~> 0.19.0"},
  {:vega_lite, "~> 0.1.8"},
  {:kino_vega_lite, "~> 0.1.11"}
])
# Log exceptions raised inside Orchid steps: route the [:orchid, :step,
# :exception] telemetry event to the library's error handler.
:telemetry.attach(
  "orchid-step-exception-logger",
  [:orchid, :step, :exception],
  &Orchid.Runner.Hooks.Telemetry.error_handler/4,
  %{}
)
Fetch Model’s Metadata
Obtain the metadata and basic information of the sound library’s models as the skeleton for subsequent dependency graph construction.
Here, we take Qixuan v2.5.0 (for OpenUTAU), maintained by OpenVPI, as an example.
# If you have your local version
# Change the path to yours
model_root_path = "E:/ProgramAssets/OpenUTAUSingers/Qixuan_v2.5.0_DiffSinger_OpenUtau"

# Reads the voicebank overview: model paths, phoneme/language dictionaries,
# vocoder configuration. Used below to build the recipe.
model_config = DiffSinger.VoiceBank.Config.fetch_overview(model_root_path)
:ok
:ok
Prelude Steps
This is a node that converts lyrics/MIDI into corresponding phoneme IDs and pitches (MIDI) based on a phoneme dictionary.
A primary reason for needing MIDI is to use its duration information as a reference for subsequent phoneme duration prediction.
A rasterized node will be implemented later to accommodate the processing of notes and phonemes.
defmodule CommonEncoder do
  @moduledoc """
  The lyrics and pitch are initially encoded for use in subsequent models.

  Each word is a `{phonemes, duration, midi_note}` tuple, where `phonemes`
  is a list of `{language, phoneme}` pairs. The encoder flattens the word
  list into five parallel sequences: language IDs, phoneme token IDs,
  per-word phoneme counts, per-word durations, and a per-phoneme MIDI note
  (the word's note repeated once per phoneme).
  """

  @doc """
  Flattens `words` into `{langs, tokens, word_divisions, word_durations, ph_midis}`.

  ## Options

    * `:lang_dict` (required) - maps a language string to its integer ID
    * `:phoneme_dict` (required) - maps a phoneme string to its token ID
  """
  def run_partial(%Orchid.Param{payload: words}, opts) do
    lang_dict = Keyword.fetch!(opts, :lang_dict)
    phoneme_dict = Keyword.fetch!(opts, :phoneme_dict)

    # Accumulate by prepending (O(1)) and reverse once at the end, instead
    # of the original `acc ++ list` which is O(n) per word (O(n^2) total).
    {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} =
      Enum.reduce(
        words,
        {[], [], [], [], []},
        fn {phonemes, duration, midi_note}, {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} ->
          ph_count = length(phonemes)

          {curr_langs, curr_toks} =
            phonemes
            |> Enum.map(fn {lang, phone} -> {lang_dict[lang], phoneme_dict[phone]} end)
            |> Enum.unzip()

          {
            # Enum.reverse/2 prepends the reversed list onto the accumulator.
            Enum.reverse(curr_langs, acc_l),
            Enum.reverse(curr_toks, acc_t),
            [ph_count | acc_wdiv],
            [duration | acc_wdur],
            Enum.reverse(List.duplicate(midi_note, ph_count), acc_midi)
          }
        end
      )

    {
      Enum.reverse(acc_l),
      Enum.reverse(acc_t),
      Enum.reverse(acc_wdiv),
      Enum.reverse(acc_wdur),
      Enum.reverse(acc_midi)
    }
  end
end
{:module, CommonEncoder, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}
defmodule DurationPredictEncoder do
  use Orchid.Step

  @doc """
  Encodes the word list into the five `[1, n]` s64 tensors consumed by the
  duration-prediction models: language map, phoneme map, word division,
  word duration and per-phoneme MIDI.
  """
  def run(param, opts) do
    {langs, toks, w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    # Build all five params uniformly from a name -> sequence table.
    params =
      [
        lang_map: langs,
        phoneme_map: toks,
        word_division: w_div,
        word_duration: w_dur,
        ph_midi: ph_midis
      ]
      |> Enum.map(fn {name, values} ->
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end)

    {:ok, params}
  end
end
{:module, DurationPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
defmodule PitchPredictEncoder do
  use Orchid.Step

  @doc """
  Encodes the word list into the four `[1, n]` s64 tensors consumed by the
  pitch-prediction models. Word division is discarded here: only durations
  and per-phoneme MIDI are needed.
  """
  def run(%Orchid.Param{} = param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    params =
      for {name, values} <- [
            lang_map: langs,
            phoneme_map: toks,
            word_duration: w_dur,
            ph_midi: ph_midis
          ] do
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end

    {:ok, params}
  end
end
{:module, PitchPredictEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
# Used for Variance model. Structurally identical to PitchPredictEncoder,
# but kept separate because it is configured with the variance model's own
# phoneme/language dictionaries in the recipe.
defmodule VarianceEncoder do
  use Orchid.Step

  def run(param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    params =
      for {name, values} <- [
            lang_map: langs,
            phoneme_map: toks,
            word_duration: w_dur,
            ph_midi: ph_midis
          ] do
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end

    {:ok, params}
  end
end
{:module, VarianceEncoder, <<70, 79, 82, 49, 0, 0, 12, ...>>, ...}
Variance Model
Predict Phoneme Duration:
defmodule PredictDuration do
  @moduledoc """
  Predicts per-phoneme durations.

  Chains two ONNX sessions registered with `Orchid.Symbiont`: the
  `:duration_linguistic` encoder first, then `:duration_predict`, which is
  conditioned on the encoder output, its mask and the per-phoneme MIDI.
  """

  @behaviour Orchid.Symbiont.Step

  # Model handles this step requires from the Symbiont registry.
  def required, do: [:duration_linguistic, :duration_predict]

  # Params arrive in the order declared by the recipe (all built as
  # [1, n] s64 tensors by DurationPredictEncoder).
  def run_with_model(
        [
          %Orchid.Param{payload: lang_map},
          %Orchid.Param{payload: phoneme_map},
          %Orchid.Param{payload: word_division},
          %Orchid.Param{payload: word_duration},
          %Orchid.Param{payload: phoneme_midi}
        ],
        handlers,
        _opts
      ) do
    duration_linguistic = handlers.duration_linguistic
    duration_predict = handlers.duration_predict

    inputs = {phoneme_map, lang_map, word_division, word_duration}

    # Stage 1: linguistic encoder -> {encoder_out, mask}.
    {:ok, {encoder_out_tensor, mask_tensor}} = Orchid.Symbiont.call(duration_linguistic, {:infer, inputs})
    # Stage 2: duration predictor, conditioned on the per-phoneme MIDI.
    {:ok, {result}} = Orchid.Symbiont.call(duration_predict, {:infer, {encoder_out_tensor, mask_tensor, phoneme_midi}})

    {:ok, [
      Orchid.Param.new(:ph_dur_pred, :encoder_out, result)
    ]}
  end
end
{:module, PredictDuration, <<70, 79, 82, 49, 0, 0, 13, ...>>, ...}
Predict Pitch:
defmodule PredictPitch do
  @moduledoc """
  Predicts a frame-level pitch curve (in MIDI note numbers).

  Runs the `:pitch_linguistic` encoder over phonemes/languages/durations,
  then samples the `:pitch_predict` model with Gaussian noise spanning the
  total frame count.
  """

  @behaviour Orchid.Symbiont.Step

  def required, do: [:pitch_linguistic, :pitch_predict]

  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: phoneme_duration},
          %Orchid.Param{payload: note_midi}
        ], handlers, opts) do
    # Predicted durations are floats; the model expects integer frame
    # counts, so round and cast on the binary backend.
    ph_dur =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.pitch_linguistic,
        {:infer, {phonemes, languages, ph_dur}}
      )

    # Total number of output frames = sum of all phoneme durations.
    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # Gaussian noise for the sampler, keyed off the wall clock — each run
    # therefore yields a slightly different pitch curve.
    key = Nx.Random.key(System.system_time())

    {pitch_noise, _key} =
      Nx.Random.normal(
        key,
        0.0,
        1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Expression fixed to 1.0 and retake enabled for every frame.
    expr =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, total_frames})

    retake =
      Nx.broadcast(Nx.tensor(1, type: :u8), {1, total_frames})

    # MIDI 0 marks a rest note (see the "AP" entries in the inputs);
    # flag those so the model treats them as unpitched.
    note_rest =
      note_midi
      |> Nx.equal(0)
      |> Nx.as_type(:u8)

    note_midi = note_midi |> Nx.as_type(:f32)
    note_dur = ph_dur

    # Number of sampling steps, tunable via opts (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    {:ok, {pitch_pred}} =
      Orchid.Symbiont.call(
        handlers.pitch_predict,
        {:infer,
         {
           encoder_out,
           ph_dur,
           note_midi,
           note_rest,
           note_dur,
           pitch_noise,
           expr,
           retake,
           steps
         }}
      )

    {:ok, Orchid.Param.new(:pitch_pred, :payload, pitch_pred)}
  end
end
{:module, PredictPitch, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}
defmodule MIDIToPitch do
  @moduledoc """
  Converts the predicted pitch curve from MIDI note numbers to a frequency
  curve (f0, Hz) with the equal-temperament formula
  `f0 = 440 * 2^((midi - 69) / 12)` (A4 = MIDI 69 = 440 Hz).

  Frames with a negative MIDI value are forced to 0.0 Hz.
  """

  use Orchid.Step

  def run(%Orchid.Param{payload: midi_pred}, _opts) do
    # Transfer to the binary backend once and reuse; the original
    # transferred the same tensor a second time inside the select mask.
    midi = Nx.backend_transfer(midi_pred, Nx.BinaryBackend)

    f0 =
      midi
      |> Nx.add(-69.0)
      |> Nx.divide(12.0)
      |> then(&Nx.pow(2.0, &1))
      |> Nx.multiply(440)
      |> then(fn converted_f0 ->
        # Negative MIDI -> unvoiced frame -> 0 Hz.
        Nx.select(Nx.less(midi, 0.0), Nx.tensor(0.0), converted_f0)
      end)

    {:ok, Orchid.Param.new(:f0, :tensor, f0)}
  end
end
Other Variance Params:
defmodule VarianceModel do
  @moduledoc """
  Predicts the remaining variance parameters — breathiness and voicing —
  from phonemes, durations and the predicted pitch curve.

  Runs the `:variance_linguistic` encoder first, then samples the
  `:variance` model with two independent noise tensors (one per output).
  """

  @behaviour Orchid.Symbiont.Step

  def required, do: [:variance_linguistic, :variance]

  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: ph_dur},
          %Orchid.Param{payload: pitch}
        ],
        handlers,
        opts
      ) do
    # Round the float duration predictions to integer frame counts.
    ph_dur =
      ph_dur
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    {:ok, {encoder_out, _mask}} =
      Orchid.Symbiont.call(
        handlers.variance_linguistic,
        {:infer, {phonemes, languages, ph_dur}}
      )

    total_frames =
      ph_dur
      |> Nx.sum()
      |> Nx.to_number()

    # Two independent Gaussian noises; the PRNG key is threaded through so
    # breath and voice noise differ.
    key = Nx.Random.key(System.system_time())

    {breath_noise, key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    {voice_noise, _key} =
      Nx.Random.normal(key, 0.0, 1.0,
        shape: {1, total_frames},
        type: :f32
      )

    # Retake flags for both channels (breathiness, voicing), every frame.
    retake =
      Nx.broadcast(
        Nx.tensor([1,1], type: :u8),
        {1, total_frames, 2}
      )

    # Sampling steps (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    # :infinity — no timeout on this long-running inference call.
    {:ok, {breath_pred, voice_pred}} =
      Orchid.Symbiont.call(
        handlers.variance,
        {:infer,
         {
           encoder_out,
           ph_dur,
           pitch,
           breath_noise,
           voice_noise,
           retake,
           steps
         }},
        :infinity
      )

    {:ok, [
      Orchid.Param.new(:breathiness, :payload, breath_pred),
      Orchid.Param.new(:voicing, :payload, voice_pred)
    ]}
  end
end
{:module, VarianceModel, <<70, 79, 82, 49, 0, 0, 18, ...>>, ...}
Acoustic Model and Vocoder
Acoustic Model:
defmodule Acoustic do
  @moduledoc """
  Runs the acoustic model: phonemes, durations, pitch and the variance
  parameters are turned into a mel spectrogram.

  Gender and velocity are fixed to neutral values (0.0 / 1.0) for every
  frame; `:steps` and `:depth` are tunable via opts.
  """

  @behaviour Orchid.Symbiont.Step

  def required, do: [:acoustic]

  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: phoneme_duration},
          %Orchid.Param{payload: pitch},
          %Orchid.Param{payload: breathiness},
          %Orchid.Param{payload: voicing}
        ],
        handlers,
        opts
      ) do
    # Round float duration predictions to integer frame counts.
    durations =
      phoneme_duration
      |> Nx.backend_transfer(Nx.BinaryBackend)
      |> Nx.round()
      |> Nx.as_type(:s64)

    # Frame count taken from the pitch curve's second axis.
    frames = Nx.axis_size(pitch, 1)

    # Neutral gender (0.0) and unit velocity (1.0) for every frame.
    gender =
      Nx.broadcast(Nx.tensor(0.0, type: :f32), {1, frames})

    velocity =
      Nx.broadcast(Nx.tensor(1.0, type: :f32), {1, frames})

    # Sampling steps (default 20).
    steps =
      opts
      |> Keyword.get(:steps, 20)
      |> Nx.tensor(type: :s64)

    # Depth scalar (default 1.0) — presumably the shallow-diffusion depth;
    # TODO(review): confirm against the acoustic model's input spec.
    depth =
      opts
      |> Keyword.get(:depth, 1.0)
      |> Nx.tensor(type: :f32)

    # :infinity — no timeout; this inference takes seconds (see the logs).
    {:ok, {mel}} =
      Orchid.Symbiont.call(
        handlers.acoustic,
        {:infer,
         {
           phonemes,
           languages,
           durations,
           pitch,
           breathiness,
           voicing,
           gender,
           velocity,
           depth,
           steps
         }},
        :infinity
      )

    {:ok, Orchid.Param.new(:mel, :payload, mel)}
  end
end
{:module, Acoustic, <<70, 79, 82, 49, 0, 0, 15, ...>>, ...}
Vocoder:
defmodule NSFHifiGAN_Vocoder do
  @moduledoc """
  Final synthesis step: feeds the mel spectrogram and the f0 curve to the
  NSF-HiFiGAN vocoder model and returns the raw audio tensor.
  """

  @behaviour Orchid.Symbiont.Step

  def required, do: [:vocoder]

  def run_with_model(
        [%Orchid.Param{payload: mel}, %Orchid.Param{payload: f0}],
        handlers,
        _opts
      ) do
    vocoder = handlers.vocoder
    {:ok, {audio}} = Orchid.Symbiont.call(vocoder, {:infer, {mel, f0}})
    audio_param = Orchid.Param.new(:audio, :payload, audio)
    {:ok, audio_param}
  end
end
{:module, NSFHifiGAN_Vocoder, <<70, 79, 82, 49, 0, 0, 11, ...>>, ...}
Post-process (tensor to audio)
defmodule TensorToWave do
  @moduledoc """
  Converts a float waveform tensor (values expected in [-1.0, 1.0]) into a
  16-bit mono PCM WAV binary.

  ## Options

    * `:sample_rate` - output sample rate in Hz (default: `44100`)
    * `:output_path` - where the WAV file is additionally written; pass
      `nil` to skip the disk write. Defaults to `"E:/final.wav"` to match
      the original hard-coded behaviour.
  """

  use Orchid.Step

  def run(%Orchid.Param{payload: wave_tensor}, opts) do
    pcm_data =
      wave_tensor
      |> Nx.flatten()
      |> Nx.backend_transfer(Nx.BinaryBackend)
      # Scale [-1.0, 1.0] floats into the signed 16-bit range and clamp.
      |> Nx.multiply(32767.0)
      |> Nx.clip(-32768.0, 32767.0)
      |> Nx.as_type(:s16)
      |> Nx.to_binary()

    sample_rate = Keyword.get(opts, :sample_rate, 44100)
    wav = wav_header(byte_size(pcm_data), sample_rate) <> pcm_data

    # Side effect kept for backward compatibility; disable with
    # `output_path: nil`.
    case Keyword.get(opts, :output_path, "E:/final.wav") do
      nil -> :ok
      path -> File.write!(path, wav)
    end

    {:ok, Orchid.Param.new(:final, :audio, wav)}
  end

  # Builds the canonical 44-byte RIFF/WAVE header for 16-bit mono PCM.
  defp wav_header(data_size, sample_rate) do
    channels = 1
    bits_per_sample = 16
    block_align = div(channels * bits_per_sample, 8)
    byte_rate = sample_rate * block_align

    <<
      "RIFF", 36 + data_size::little-integer-size(32), "WAVE",
      # Subchunk1Size = 16 for PCM
      "fmt ", 16::little-integer-size(32),
      # AudioFormat: 1 = PCM
      1::little-integer-size(16),
      channels::little-integer-size(16),
      sample_rate::little-integer-size(32),
      byte_rate::little-integer-size(32),
      block_align::little-integer-size(16),
      bits_per_sample::little-integer-size(16),
      # Subchunk2Size
      "data", data_size::little-integer-size(32)
    >>
  end
end
Build Pipeline and Prepare to Demo
defmodule Orchid.Livebook.GanttTracker do
  @moduledoc """
  Collects `[:orchid, :step, *]` telemetry events into an Agent so step
  start/duration pairs can be rendered as Gantt-chart spans.
  """

  use Agent

  def start_link(_) do
    Agent.start_link(fn -> [] end, name: __MODULE__)
  end

  @doc """
  Telemetry handler: records a `{run_id, status, time, meta}` tuple.

  `run_id` identifies the pipeline run — the outermost entry of `$callers`
  when present (e.g. the Task spawned per pipeline), otherwise the
  handler's own pid.
  """
  def handle_event([:orchid, :step, status], measurements, meta, _config) do
    # `$callers` is unset outside Task/Agent call chains; fall back to an
    # empty list so List.first/1 is not called with nil (which raises
    # FunctionClauseError in the original).
    run_id = List.first(Process.get(:"$callers") || []) || self()
    run_id_str = inspect(run_id)
    # :start carries an absolute system time; terminating events carry a
    # duration in native units.
    time_val = if status == :start, do: measurements.system_time, else: measurements.duration

    Agent.update(__MODULE__, fn state ->
      [{run_id_str, status, time_val, meta} | state]
    end)
  end

  @doc """
  Pairs each `:start` event with its terminating event (`:done`,
  `:exception` or `:special`) and returns span maps with millisecond
  timings. Unmatched terminating events are dropped.
  """
  def get_spans() do
    events = Agent.get(__MODULE__, & &1) |> Enum.reverse()

    {_, spans} =
      Enum.reduce(events, {%{}, []}, fn
        {run_id, :start, sys_time, meta}, {pending, spans} ->
          start_ms = System.convert_time_unit(sys_time, :native, :microsecond) / 1000.0
          # A step is keyed by run + implementation + IO keys so repeated or
          # concurrent executions of the same step do not collide.
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}
          {Map.put(pending, sig, start_ms), spans}

        {run_id, status, duration, meta}, {pending, spans}
        when status in [:done, :exception, :special] ->
          duration_ms = System.convert_time_unit(duration, :native, :microsecond) / 1000.0
          sig = {run_id, meta.impl, meta.in_keys, meta.out_keys}

          case Map.pop(pending, sig) do
            {start_ms, new_pending} when not is_nil(start_ms) ->
              impl_name =
                if is_function(meta.impl),
                  do: "Anonymous",
                  else: inspect(meta.impl) |> String.split(".") |> List.last()

              out_keys = inspect(meta.out_keys) |> String.slice(0..14) |> Kernel.<>("...")

              span = %{
                run_id: run_id,
                step: "#{impl_name}(#{out_keys})",
                start: start_ms,
                end: start_ms + duration_ms,
                # Give very fast steps a minimum visual width of 10 ms.
                visual_end: start_ms + max(duration_ms, 10.0),
                duration: duration_ms,
                status: status
              }

              {new_pending, [span | spans]}

            # Terminating event without a recorded start: drop it.
            {nil, ^pending} ->
              {pending, spans}
          end
      end)

    spans
  end

  def clear(), do: Agent.update(__MODULE__, fn _ -> [] end)
end
{:module, Orchid.Livebook.GanttTracker, <<70, 79, 82, 49, 0, 0, 29, ...>>, ...}
# Start the tracker Agent under Livebook's supervision tree.
Kino.start_child({Orchid.Livebook.GanttTracker, []})

# Attach the tracker to every Orchid step telemetry event. Each attachment
# needs a globally unique handler id, hence the per-event suffix.
events =[
  {[:orchid, :step, :start], "start"},
  {[:orchid, :step, :done], "done"},
  {[:orchid, :step, :exception], "exception"},
  {[:orchid, :step, :special], "special"}
]

for {event, suffix} <- events do
  :telemetry.attach(
    "orchid-gantt-#{suffix}",
    event,
    &Orchid.Livebook.GanttTracker.handle_event/4,
    nil
  )
end
[:ok, :ok, :ok, :ok]
Run Workflow
defmodule QixuanPipeline do
  @moduledoc """
  Wires the full DiffSinger inference pipeline for the Qixuan voicebank:
  ONNX model registration plus the Orchid recipe describing the step graph.
  """

  require Logger

  @doc """
  Registers every ONNX model of the voicebank with `Orchid.Symbiont` under
  a well-known name, resolving each relative path (a list of segments)
  against `model_root_path`.
  """
  def load_models(model_root_path, model_config) do
    Logger.info("Loading DiffSinger models...")

    models =[
      {:duration_linguistic, model_config.predict_map.maybe_duration.linguistic.path},
      {:duration_predict, model_config.predict_map.maybe_duration.duration.path},
      {:pitch_linguistic, model_config.predict_map.maybe_pitch.linguistic.path},
      {:pitch_predict, model_config.predict_map.maybe_pitch.predict.path},
      {:variance_linguistic, model_config.variance.linguistic.path},
      {:variance, model_config.variance.variance.path},
      {:acoustic, model_config.acoustic.infer.path},
      {:vocoder, model_config.vocoder.path}
    ]

    for {name, rel_path} <- models do
      path = Path.join([model_root_path] ++ rel_path)
      # Register an Ortex-backed session; the handler name must match the
      # atoms declared in each step module's required/0.
      :ok = Orchid.Symbiont.register(name, {Orchid.Symbiont.OrtexRunner, [name: name, path: path]})
    end

    Logger.info("All models loaded successfully.")
    :ok
  end

  @doc """
  Builds the Orchid recipe: encoders -> duration -> pitch -> variance ->
  acoustic -> vocoder -> WAV. Step tuples are `{impl, in_key(s), out_key(s), opts}`.
  """
  def build_recipe(model_config) do
    # Hook stack that injects the registered model handlers into each
    # Symbiont-backed step.
    injector = [extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]]

    # Each model family ships its own phoneme/language dictionaries.
    dur_dict = model_config.predict_map.maybe_duration.phonemes
    pitch_dict = model_config.predict_map.maybe_pitch.phonemes
    var_dict = model_config.variance.phonemes
    sample_rate = model_config.vocoder.maybe_config["sample_rate"]

    duration_steps =[
      {DurationPredictEncoder, :words,[:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi],[lang_dict: dur_dict.maybe_lang_dict, phoneme_dict: dur_dict.phoneme_dict]},
      {PredictDuration,[:duration_lang, :duration_phoneme, :word_division, :word_duration, :duration_ph_midi],
       :phoneme_duration_predict, injector}
    ]

    pitch_steps = [
      {PitchPredictEncoder, :words,[:pitch_lang, :pitch_phoneme, :word_duration_from_pitch, :pitch_ph_midi],[lang_dict: pitch_dict.maybe_lang_dict, phoneme_dict: pitch_dict.phoneme_dict]},
      # Note: PredictPitch reads :phoneme_duration_predict, creating the
      # duration -> pitch dependency in the graph.
      {PredictPitch,[:pitch_lang, :pitch_phoneme, :phoneme_duration_predict, :pitch_ph_midi],
       :pitch_pred_midi, injector},
      {MIDIToPitch, :pitch_pred_midi, :pitch_pred}
    ]

    variance_steps = [
      {VarianceEncoder, :words,[:variance_lang, :variance_phoneme, :word_duration_from_variance, :variance_ph_midi],[lang_dict: var_dict.maybe_lang_dict, phoneme_dict: var_dict.phoneme_dict]},
      {VarianceModel,[:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred_midi],[:breathiness_pred, :voice_pred], injector}
    ]

    acoustic_step =[
      {Acoustic,[:variance_lang, :variance_phoneme, :phoneme_duration_predict, :pitch_pred, :breathiness_pred, :voice_pred],
       :mel, injector}
    ]

    vocoder_step = [
      {NSFHifiGAN_Vocoder,[:mel, :pitch_pred], :wave_tensor, injector},
      {TensorToWave, :wave_tensor, :audio,[sample_rate: sample_rate]}
    ]

    all_steps = duration_steps ++ pitch_steps ++ variance_steps ++ acoustic_step ++ vocoder_step
    Orchid.Recipe.new(all_steps)
  end
end
# Register all ONNX sessions, then build the recipe from the same config.
QixuanPipeline.load_models(model_root_path, model_config)
recipe = QixuanPipeline.build_recipe(model_config)

# Input score: each entry is {[{language, phoneme}, ...], duration, midi_note}.
# MIDI 0 marks a rest ("AP" — presumably an aspirated/breath pause; see the
# note_rest flag in PredictPitch).
inputs = [
  %Orchid.Param{name: :words, payload: [
    {[{"zh", "AP"}], 10, 0},
    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60},
    {[{"zh", "AP"}], 1, 0},
    {[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
    {[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
    {[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
    {[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60}
  ]},
]

# Reset the Gantt tracker, then run the whole pipeline and time it.
Orchid.Livebook.GanttTracker.clear()
{elapse, {:ok, results}} = :timer.tc(&Orchid.run/2, [recipe, inputs], :microsecond)
require Logger
Logger.info "Used #{elapse / 1000}ms."
14:42:57.019 [info] Loading DiffSinger models...
14:42:57.023 [info] All models loaded successfully.
14:42:57.024 [info] init got unexpected: {:io_request, #PID<0.92.0>, #Reference<0.2798130424.2264662018.125731>,
{:put_chars, :unicode,
"Failed to write log message to stdout, trying stderr\n"}}
14:42:57.024 [debug] ** (RuntimeError) bad return value from Logger formatter Logger.Formatter, got [<<185, 220, 181, 192, 213, 253, 212, 218, 177, 187, 185, 216, 177, 213, 161, 163, 92, 114, 92, 110, 34, 41>>, "\e[0m", 10] after "\e[31m\n14:42:57.022 [error] Writer crashed (:\""
(kernel 10.4.1) logger_h_common.erl:433: :logger_h_common.string_to_binary/1
(kernel 10.4.1) logger_h_common.erl:399: :logger_h_common.do_log_to_binary/2
(kernel 10.4.1) logger_h_common.erl:180: :logger_h_common.log/2
(kernel 10.4.1) logger_backend.erl:54: :logger_backend.call_handlers/3
(kernel 10.4.1) user_drv.erl:581: :user_drv.server/3
(stdlib 7.1) gen_statem.erl:3748: :gen_statem.loop_state_callback/11
(stdlib 7.1) proc_lib.erl:333: :proc_lib.init_p_do_apply/3
14:42:57.236 [debug] [Symbiont] Inference took 9.728ms via provider [:cpu]
14:42:57.238 [debug] [Symbiont] Inference took 1.536ms via provider [:cpu]
14:42:57.592 [debug] [Symbiont] Inference took 8.806ms via provider [:cpu]
14:42:58.660 [debug] [Symbiont] Inference took 986.214ms via provider [:cpu]
14:42:58.818 [debug] [Symbiont] Inference took 4.71ms via provider [:cpu]
14:42:59.445 [debug] [Symbiont] Inference took 612.352ms via provider [:cpu]
14:43:02.839 [debug] [Symbiont] Inference took 2934.272ms via provider [:cpu]
14:43:04.373 [debug] [Symbiont] Inference took 1445.068ms via provider [:cpu]
14:43:04.441 [info] Used 7415.5ms.
:ok
Execution Timeline:
alias VegaLite, as: Vl

spans = Orchid.Livebook.GanttTracker.get_spans()
# Offset everything by the earliest start so the x-axis begins at 0.
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "step" => span.step,
      "start" => span.start - min_start,
      # visual_end pads very short steps so their bar stays visible.
      "end" => span.visual_end - min_start,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 650, height: 250, title: "Orchid Workflow Execution Timeline")
|> Vl.data_from_values(chart_data)
|> Vl.mark(:bar, corner_radius: 4, height: 20)
# Sort steps by first start time so the Gantt reads top-to-bottom.
|> Vl.encode_field(:y, "step", type: :nominal, title: "Steps", sort: [field: "start", op: "min"], axis: [labelLimit: 400])
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Time Offset (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, title: "Status", scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
|> Kino.VegaLite.new()
Prepare for the Spectrum demo:
alias VegaLite, as: Vl

# Pull the first (batch) slice of the mel spectrogram onto the CPU backend.
mel_tensor =
  results.mel.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)

# Flatten the spectrogram into one heatmap cell per (frame, mel-bin) pair.
mel_data =
  mel_tensor
  |> Nx.to_batched(1)
  |> Enum.with_index()
  |> Enum.flat_map(fn {row_tensor, frame_idx} ->
    row_tensor
    |> Nx.to_flat_list()
    |> Enum.with_index()
    |> Enum.map(fn {val, bin_idx} ->
      %{
        "x1" => frame_idx, "x2" => frame_idx + 1,
        "y1" => bin_idx, "y2" => bin_idx + 1,
        "value" => val
      }
    end)
  end)

# f0 curve: one {frame, Hz} point per frame.
f0_data =
  results.pitch_pred.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()
  |> Enum.with_index()
  |> Enum.map(fn {f0, frame} -> %{"frame" => frame, "f0" => f0} end)

durations = results.phoneme_duration_predict.payload[0]
  |> Nx.backend_transfer(Nx.BinaryBackend)
  |> Nx.to_flat_list()

# Cumulative duration sums give the phoneme boundary frames.
boundaries = Enum.scan(durations, 0, fn dur, acc -> dur + acc end)
boundary_data = Enum.map(boundaries, fn b -> %{"frame" => b} end)

# Grey-scale mel heatmap layer.
mel_layer =
  Vl.new()
  |> Vl.data_from_values(mel_data)
  |> Vl.mark(:rect, tooltip: false, stroke: nil)
  |> Vl.encode_field(:x, "x1", type: :quantitative, title: "Time (Frames)")
  |> Vl.encode_field(:x2, "x2")
  |> Vl.encode_field(:y, "y1", type: :quantitative, title: "Mel Frequency Bin")
  |> Vl.encode_field(:y2, "y2")
  |> Vl.encode_field(:color, "value",
    type: :quantitative,
    scale: [scheme: "greys", reverse: true, domain: [-11.0, 2.0]],
    legend: false
  )

# Blue f0 line layer (right-hand Hz axis).
f0_layer =
  Vl.new()
  |> Vl.data_from_values(f0_data)
  |> Vl.mark(:line, color: "#007bff", strokeWidth: 2)
  |> Vl.encode_field(:x, "frame", type: :quantitative)
  |> Vl.encode_field(:y, "f0",
    type: :quantitative,
    title: "Pitch (Hz)",
    axis: [orient: "right", titleColor: "#007bff"],
    scale: [domain: [50, 800]]
  )

# Dashed red rules at phoneme boundaries.
boundary_layer =
  Vl.new()
  |> Vl.data_from_values(boundary_data)
  |> Vl.mark(:rule, color: "red", strokeDash: [4, 4], strokeWidth: 1.5)
  |> Vl.encode_field(:x, "frame", type: :quantitative)

:ok
:ok
Demonstration
# NOTE(review): TensorToWave already wrote this exact file; this writes the
# final WAV a second time.
File.write!("E:/final.wav", results.audio.payload)

# Inline audio player for the synthesized result.
Kino.Audio.new(results.audio.payload, :wav)

# Alternative HTML-based player, kept for reference:
# audio_html = """
#
# Qixuan DiffSinger Output
# Two Tigers (Liang Zhi Lao Hu)
#
#
#
# Your browser does not support the audio element.
#
#
# """
# Kino.HTML.new(audio_html)

# Combined view: mel spectrogram + f0 curve + phoneme boundaries, with
# independent y-scales per layer (mel bins vs. Hz).
Vl.new(width: 800, height: 400, title: "Output Analysis (Mel Spectrum + F0 + Phoneme Duration)")
|> Vl.resolve(:scale, y: :independent)
|> Vl.layers([mel_layer, f0_layer, boundary_layer])
|> Kino.VegaLite.new()
Multiple Pipelines
# Ensure the tracker is running, then drop spans from the single-run demo.
Kino.start_child({Orchid.Livebook.GanttTracker,[]})
Orchid.Livebook.GanttTracker.clear()

# Launch three identical pipelines concurrently; the async executor lets
# independent steps of different runs overlap.
pipeline_tasks =
  for _ <- 1..3 do
    Task.async(fn ->
      Orchid.run(recipe, inputs, executor_and_opts: {Orchid.Executor.Async,[]})
    end)
  end

Task.await_many(pipeline_tasks, :infinity)
14:43:05.899 [debug] [Symbiont] Inference took 6.348ms via provider [:cpu]
14:43:05.906 [debug] [Symbiont] Inference took 6.348ms via provider [:cpu]
14:43:05.906 [debug] [Symbiont] Inference took 6.86ms via provider [:cpu]
14:43:05.910 [debug] [Symbiont] Inference took 3.481ms via provider [:cpu]
14:43:05.914 [debug] [Symbiont] Inference took 7.987ms via provider [:cpu]
14:43:05.924 [debug] [Symbiont] Inference took 17.92ms via provider [:cpu]
14:43:05.926 [debug] [Symbiont] Inference took 11.98ms via provider [:cpu]
14:43:05.932 [debug] [Symbiont] Inference took 7.884ms via provider [:cpu]
14:43:05.955 [debug] [Symbiont] Inference took 15.667ms via provider [:cpu]
14:43:06.920 [debug] [Symbiont] Inference took 996.454ms via provider [:cpu]
14:43:06.930 [debug] [Symbiont] Inference took 10.547ms via provider [:cpu]
14:43:07.877 [debug] [Symbiont] Inference took 935.219ms via provider [:cpu]
14:43:08.243 [debug] [Symbiont] Inference took 1324.032ms via provider [:cpu]
14:43:08.252 [debug] [Symbiont] Inference took 7.27ms via provider [:cpu]
14:43:09.842 [debug] [Symbiont] Inference took 1576.345ms via provider [:cpu]
14:43:10.317 [debug] [Symbiont] Inference took 2072.678ms via provider [:cpu]
14:43:10.326 [debug] [Symbiont] Inference took 9.216ms via provider [:cpu]
14:43:11.038 [debug] [Symbiont] Inference took 697.753ms via provider [:cpu]
14:43:11.625 [debug] [Symbiont] Inference took 3751.628ms via provider [:cpu]
14:43:12.882 [debug] [Symbiont] Inference took 1251.635ms via provider [:cpu]
14:43:14.437 [debug] [Symbiont] Inference took 2812.518ms via provider [:cpu]
14:43:15.677 [debug] [Symbiont] Inference took 1235.968ms via provider [:cpu]
14:43:17.236 [debug] [Symbiont] Inference took 2796.748ms via provider [:cpu]
14:43:18.365 [debug] [Symbiont] Inference took 1125.785ms via provider [:cpu]
[
ok: %{
words: %Orchid.Param{
name: :words,
type: nil,
payload: [
{[{"zh", "AP"}], 10, 0},
{[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
{[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
{[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
{[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60},
{[{"zh", "AP"}], 1, 0},
{[{"zh", "zh/l"}, {"zh", "zh/iang"}], 40, 60},
{[{"zh", "zh/zh"}, {"zh", "zh/i"}], 40, 62},
{[{"zh", "zh/l"}, {"zh", "zh/ao"}], 40, 64},
{[{"zh", "zh/h"}, {"zh", "zh/u"}], 40, 60}
],
metadata: %{}
},
...
},
...
]
alias VegaLite, as: Vl

spans = Orchid.Livebook.GanttTracker.get_spans()
# Offset all timestamps by the earliest start so the x-axis begins at 0.
min_start = spans |> Enum.map(& &1.start) |> Enum.min(fn -> 0 end)

chart_data =
  Enum.map(spans, fn span ->
    %{
      "run_id" => span.run_id,
      "step" => span.step,
      "start" => span.start - min_start,
      # `span.end` does not parse — `end` is a reserved word in Elixir and
      # cannot follow the dot operator — so read the key via Access.
      "end" => span[:end] - min_start,
      "duration" => Float.round(span.duration, 2),
      "status" => Atom.to_string(span.status)
    }
  end)

Vl.new(width: 650, height: 250, title: "Concurrent Orchid Workflows")
|> Vl.data_from_values(chart_data)
# One facet row per pipeline run so concurrent runs are visually separated.
|> Vl.encode_field(:row, "run_id", type: :nominal, title: "Pipeline Instance (PID)")
|> Vl.mark(:bar, corner_radius: 4, height: 20)
|> Vl.encode_field(:y, "step",
  type: :nominal,
  title: nil,
  sort: [field: "start", op: "min"],
  axis: [labelLimit: 300]
)
|> Vl.encode_field(:x, "start", type: :quantitative, title: "Global Time (ms)")
|> Vl.encode_field(:x2, "end")
|> Vl.encode_field(:color, "status", type: :nominal, scale: [range: ["#2eb82e", "#d9534f"]])
|> Vl.encode_field(:tooltip, "duration", type: :quantitative, title: "Duration (ms)")
# Shared x-scale across facets so runs line up on a global timeline.
|> Vl.resolve(:scale, x: :shared)
|> Kino.VegaLite.new()