DiffSingerElixirPoC
Mix.install([
# https://github.com/GES233/DiffSinger
{:diff_singer, path: "D:/CodeRepo/DiffSingerRepo/DiffSingerEx"}
])
获得声库模型元数据
获取声库模型的元数据以及基本情况,作为后续构建依赖图的骨架。
# Absolute path to the local DiffSinger voicebank (OpenUtau layout) under test.
model_root_path = "E:/ProgramAssets/OpenUTAUSingers/Qixuan_v2.5.0_DiffSinger_OpenUtau"
# Load the voicebank metadata overview (predictor map, model paths, phoneme
# dictionaries) — used below as the skeleton for wiring the dependency graph.
model_config = DiffSinger.VoiceBank.Config.fetch_overview(model_root_path)
# Return :ok so the notebook cell does not dump the whole config struct.
:ok
一个简单的 Inspector 。
# An inspector
# Peek at the ONNX signature of the pitch predictor to learn its expected
# input/output tensor names. NOTE(review): the pitch branch is accessed here as
# `maybe_pitch.predict`, while the duration branch below uses
# `maybe_duration.duration` — confirm the config schema is intentionally asymmetric.
model_config.predict_map.maybe_pitch.predict.signature
构建入点的节点
这里是将歌词/MIDI 根据音素字典转变为对应音素 ID /音高的一个节点。
需要 MIDI 的一个主要原因是应用其中的时长信息作为一个 reference 以用作后续音素的时长预测。
defmodule CommonEncoder do
  @moduledoc """
  Shared word-to-list encoding for the predictor input steps.

  Turns a list of `{phonemes, word_duration, midi_note}` words into five flat,
  word-ordered lists: language ids, token ids, phonemes-per-word counts, word
  durations and one MIDI note per phoneme.
  """

  @type current_phoneme :: {lang_name :: binary(), current_phoneme_under_lang :: binary()}
  @type current_word :: [{[current_phoneme], word_duration :: non_neg_integer()}]

  # Output feeds the duration.linguistic model inputs directly:
  # ["tokens", "languages", "word_div", "word_dur"]
  def run_partial(%Orchid.Param{payload: words}, opts) do
    lang_dict = Keyword.fetch!(opts, :lang_dict)
    phoneme_dict = Keyword.fetch!(opts, :phoneme_dict)

    # Accumulate by prepending (O(1)) and reverse once at the end, instead of
    # the original `acc ++ list` per word, which made the reduce quadratic in
    # the total number of phonemes/words.
    {langs, toks, w_div, w_dur, midis} =
      Enum.reduce(
        words,
        {[], [], [], [], []},
        fn {phonemes, duration, midi_note}, {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} ->
          ph_count = length(phonemes)

          {curr_langs, curr_toks} =
            phonemes
            # `dict[key]` yields nil for unknown symbols — kept as in the
            # original; failures surface downstream when building tensors.
            |> Enum.map(fn {lang, phone} -> {lang_dict[lang], phoneme_dict[phone]} end)
            |> Enum.unzip()

          {
            Enum.reverse(curr_langs, acc_l),
            Enum.reverse(curr_toks, acc_t),
            [ph_count | acc_wdiv],
            [duration | acc_wdur],
            # All entries are the same note, so no per-word reverse is needed.
            List.duplicate(midi_note, ph_count) ++ acc_midi
          }
        end
      )

    {
      Enum.reverse(langs),
      Enum.reverse(toks),
      Enum.reverse(w_div),
      Enum.reverse(w_dur),
      Enum.reverse(midis)
    }
  end
end
defmodule DurationPredictEncoder do
  use Orchid.Step

  # Encodes a word list into the five s64 input tensors consumed by the
  # duration models: languages, tokens, word division, word duration and
  # a per-phoneme MIDI note.
  def run(param, opts) do
    {langs, toks, w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    outputs =
      [
        lang_map: langs,
        phoneme_map: toks,
        word_division: w_div,
        word_duration: w_dur,
        ph_midi: ph_midis
      ]
      |> Enum.map(fn {name, values} ->
        # Each list becomes a rank-2 (batch of 1) signed 64-bit tensor.
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end)

    {:ok, outputs}
  end
end
defmodule PitchPredictEncoder do
  use Orchid.Step

  # Like DurationPredictEncoder but drops word division — the pitch models
  # take languages, tokens, word duration and per-phoneme MIDI only.
  def run(%Orchid.Param{} = param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    specs = [lang_map: langs, phoneme_map: toks, word_duration: w_dur, ph_midi: ph_midis]

    outputs =
      for {name, values} <- specs do
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end

    {:ok, outputs}
  end
end
# Variance model: pull the language/phoneme lookup dictionaries used to map
# symbolic phonemes to integer ids for the variance encoder.
# NOTE(review): this reads `model_config.variance`, while the duration/pitch
# cells read `model_config.predict_map.*` — confirm both paths exist.
%{maybe_lang_dict: lang_dict, phoneme_dict: phoneme_dict} = model_config.variance.phonemes
defmodule VarianceEncoder do
  use Orchid.Step

  # Encodes words for the variance branch: same four s64 tensors as the pitch
  # encoder (word division is discarded).
  def run(param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    # Local helper keeps the four Param constructions uniform.
    wrap = fn name, values ->
      Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
    end

    {:ok,
     [
       wrap.(:lang_map, langs),
       wrap.(:phoneme_map, toks),
       wrap.(:word_duration, w_dur),
       wrap.(:ph_midi, ph_midis)
     ]}
  end
end
# Smoke-test the encoder with the two syllables of "你好" (ni, hao).
# Each entry is {[{lang, phoneme}, ...], word_duration, midi_note}.
VarianceEncoder.run(
  %Orchid.Param{payload: [
    {[{"zh", "zh/n"}, {"zh", "zh/i"}], 200, 62},
    {[{"zh", "zh/h"}, {"zh", "zh/ao"}], 220, 61}
  ]},
  lang_dict: lang_dict,
  phoneme_dict: phoneme_dict
)
搭建变分模型与音高时长预测模型
预测音素时长:
defmodule PredictDuration do
  @behaviour Orchid.Symbiont.Step

  # Model handles this step needs; the Injector hook supplies them by name.
  def required, do: [:duration_linguistic, :duration_predict]

  # Chains the two ONNX sessions: linguistic encoder, then duration predictor.
  # Expects the five params produced by DurationPredictEncoder, in order.
  # [{Orchid.Symbiont.Step.get_required(), [foo: :bar]}]
  def run_with_model(
        [
          %Orchid.Param{payload: lang_map},
          %Orchid.Param{payload: phoneme_map},
          %Orchid.Param{payload: word_division},
          %Orchid.Param{payload: word_duration},
          %Orchid.Param{payload: phoneme_midi}
        ],
        handlers,
        _opts
      ) do
    linguistic = handlers.duration_linguistic
    predictor = handlers.duration_predict

    # Note the input order the linguistic model expects: tokens first.
    {:ok, {encoder_out, mask}} =
      Orchid.Symbiont.call(
        linguistic,
        {:infer, {phoneme_map, lang_map, word_division, word_duration}}
      )

    {:ok, predicted} =
      Orchid.Symbiont.call(predictor, {:infer, {encoder_out, mask, phoneme_midi}})

    {:ok, [Orchid.Param.new(:ph_dur_pred, :encoder_out, predicted)]}
  end
end
# Register the two ONNX sessions backing duration prediction. The names match
# PredictDuration.required/0 so the Injector hook can hand them to the step.
:ok = Orchid.Symbiont.register(
  :duration_linguistic,
  {Orchid.Symbiont.OrtexRunner, [name: :duration_linguistic, path: Path.join([model_root_path | model_config.predict_map.maybe_duration.linguistic.path])]}
)
:ok = Orchid.Symbiont.register(
  :duration_predict,
  {Orchid.Symbiont.OrtexRunner, [name: :duration_predict, path: Path.join([model_root_path | model_config.predict_map.maybe_duration.duration.path])]}
)
# Dictionaries specific to the duration branch of the voicebank.
%{maybe_lang_dict: lang_dict, phoneme_dict: phoneme_dict} = model_config.predict_map.maybe_duration.phonemes
# Two-step recipe: encode :words into the five named tensors, then run the
# chained duration models; the Injector hook supplies the registered handlers.
recipe = Orchid.Recipe.new([
  {DurationPredictEncoder, :words, [:lang_map, :phoneme_map, :word_division, :word_duration, :ph_midi], [lang_dict: lang_dict, phoneme_dict: phoneme_dict]},
  {PredictDuration, [:lang_map, :phoneme_map, :word_division, :word_duration, :ph_midi], :res, extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]}
])
# "ni hao" again: each word is {[{lang, phoneme}, ...], word_duration, midi_note}.
inputs = %Orchid.Param{name: :words, payload: [
  {[{"zh", "zh/n"}, {"zh", "zh/i"}], 200, 60},
  {[{"zh", "zh/h"}, {"zh", "zh/ao"}], 220, 62}
]}
Orchid.run(recipe, inputs)
预测音素音高:
defmodule PredictPitch do
  @behaviour Orchid.Symbiont.Step

  # Model handles this step needs; the Injector hook supplies them by name.
  def required, do: [:pitch_linguistic, :pitch_predict]

  # Chains pitch.linguistic -> pitch predictor. Input params, in order:
  #   languages / phonemes         - id tensors from the encoder step
  #   phoneme_duration             - per-phoneme durations (possibly float)
  #   note_midi / note_rest / note_dur - note pitch, rest flags (休止符), durations
  #   expr                         - expressiveness curve (zeros when unused)
  #   pitch_base                   - base pitch or Gaussian noise for diffusion
  #   retake                       - inpainting mask; all 1s for fresh generation
  #   steps                        - diffusion step count, e.g. Nx.tensor(20, type: :s64)
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: phoneme_duration},
          %Orchid.Param{payload: note_midi},
          %Orchid.Param{payload: note_rest},
          %Orchid.Param{payload: note_dur},
          %Orchid.Param{payload: expr},
          %Orchid.Param{payload: pitch_base},
          %Orchid.Param{payload: retake},
          %Orchid.Param{payload: steps}
        ],
        handlers,
        _opts
      ) do
    pitch_linguistic = handlers.pitch_linguistic
    pitch_predict = handlers.pitch_predict

    # The mask is not consumed by the pitch predictor; the original bound it
    # only to IO.inspect it (debug leftover, removed here).
    {:ok, {encoder_out_tensor, _mask_tensor}} =
      Orchid.Symbiont.call(pitch_linguistic, {:infer, {phonemes, languages, phoneme_duration}})

    # The predictor wants integer frame counts; round before casting to s64.
    ph_dur =
      phoneme_duration
      |> Nx.round()
      |> Nx.as_type(:s64)

    # Predictor input order:
    # ["encoder_out", "ph_dur", "note_midi", "note_rest", "note_dur", "pitch", "expr", "retake", "steps"]
    {:ok, result} =
      Orchid.Symbiont.call(
        pitch_predict,
        {:infer, {encoder_out_tensor, ph_dur, note_midi, note_rest, note_dur, pitch_base, expr, retake, steps}}
      )

    # NOTE(review): output name :ph_dur_pred looks copy-pasted from
    # PredictDuration — consider :pitch_pred once downstream wiring is fixed
    # (kept unchanged here to avoid breaking any name-based consumer).
    {:ok, [
      Orchid.Param.new(:ph_dur_pred, :encoder_out, result)
    ]}
  end
end
# Register the two ONNX sessions backing pitch prediction; names match
# PredictPitch.required/0.
:ok = Orchid.Symbiont.register(
  :pitch_linguistic,
  {Orchid.Symbiont.OrtexRunner, [name: :pitch_linguistic, path: Path.join([model_root_path | model_config.predict_map.maybe_pitch.linguistic.path])]}
)
# FIX: the original registered :pitch_predict with the *duration* model path
# (`maybe_duration.duration.path`) — a copy-paste slip from the duration cell.
# The pitch predictor entry is taken as `maybe_pitch.predict`, matching the
# inspector cell (`maybe_pitch.predict.signature`) earlier in this notebook.
# NOTE(review): confirm against the actual config schema.
:ok = Orchid.Symbiont.register(
  :pitch_predict,
  {Orchid.Symbiont.OrtexRunner, [name: :pitch_predict, path: Path.join([model_root_path | model_config.predict_map.maybe_pitch.predict.path])]}
)
# Dictionaries specific to the pitch branch of the voicebank.
%{maybe_lang_dict: lang_dict, phoneme_dict: phoneme_dict} = model_config.predict_map.maybe_pitch.phonemes
# Draft pitch recipe (disabled). NOTE(review): PredictPitch.run_with_model/3
# pattern-matches ten input params, but this wiring feeds it only four —
# note_midi/note_rest/note_dur, expr, pitch_base, retake and steps still need
# producing steps before this can be enabled.
# recipe = Orchid.Recipe.new([
#   {DurationPredictEncoder, :words, [:duration_lang, :duration_phoneme, :duration_word_division, :word_duration, :ph_midi], [lang_dict: lang_dict, phoneme_dict: phoneme_dict]},
#   {PredictDuration, [:duration_lang, :duration_phoneme, :duration_word_division, :word_duration, :ph_midi], :phoneme_duration, extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]},
#   {PredictPitch, [:lang_map, :phoneme_map, :phoneme_duration, :ph_midi], :res, extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]},
# ])
# inputs = %Orchid.Param{name: :words, payload: [
#   {[{"zh", "zh/n"}, {"zh", "zh/i"}], 200, 60},
#   {[{"zh", "zh/h"}, {"zh", "zh/ao"}], 220, 62}
# ]}
# Orchid.run(recipe, inputs)
其他变分参数:TODO
搭建声学模型与声码器
声学模型:
声码器:
后处理(将张量变为可被播放的片段)
End-to-end 管线
# Placeholders for the end-to-end pipeline results (TODO); underscore-prefixed
# so the unused bindings compile without warnings.
_phoneme_predict = []
_pitch_predict = []