Powered by AppSignal & Oban Pro

DiffSingerElixirPoC

simple_run.livemd

DiffSingerElixirPoC

# Install the notebook's single dependency, the DiffSinger Elixir library.
# It is loaded from an absolute local filesystem path, so the path has to be
# adjusted when the notebook is run on another machine.
Mix.install([
  # https://github.com/GES233/DiffSinger
  {:diff_singer, path: "D:/CodeRepo/DiffSingerRepo/DiffSingerEx"}
])

获得声库模型元数据

获取声库中各模型的元数据及基本信息,作为后续构建依赖图的骨架。

这里以 OpenVPI 管理的 Qixuan 为例。

# Root directory of the voice bank on the local disk (absolute path).
model_root_path = "E:/ProgramAssets/OpenUTAUSingers/Qixuan_v2.5.0_DiffSinger_OpenUtau"
# Overview of the voice bank's configuration. Later snippets read model file
# paths and phoneme dictionaries from this value.
model_config = DiffSinger.VoiceBank.Config.fetch_overview(model_root_path)

# End on :ok so this snippet evaluates to :ok rather than to the config value.
:ok

一个简单的 Inspector 。

# An inspector: show the `signature` field stored under
# `predict_map.maybe_pitch.predict` in the voice bank overview.
model_config.predict_map.maybe_pitch.predict.signature

构建入点的节点

这里是将歌词/MIDI 根据音素字典转变为对应音素 ID /音高的一个节点。

需要 MIDI 的一个主要原因是:其中的时长信息可以作为参考(reference),用于后续的音素时长预测。

defmodule CommonEncoder do
  @moduledoc """
  Shared front end of the encoder steps: flattens a list of words into parallel
  per-phoneme and per-word lists of ids, counts, durations and MIDI notes.
  """

  @type current_phoneme :: {lang_name :: binary(), current_phoneme_under_lang :: binary()}
  @type current_word :: [
          {[current_phoneme], word_duration :: non_neg_integer(), midi_note :: integer()}
        ]

  @doc """
  Encodes the words carried by `param` into five plain lists.

  The payload is a list of `{phonemes, duration, midi_note}` tuples, where
  `phonemes` is a list of `{language, phoneme}` pairs.

  ## Options

    * `:lang_dict` - required; maps a language name to its id.
    * `:phoneme_dict` - required; maps a phoneme name to its id.

  Both options are read with `Keyword.fetch!/2`, so a missing option raises
  `KeyError`. Ids are looked up with `dict[key]`, so an unknown language or
  phoneme yields `nil` instead of raising.

  ## Return value

  A 5-tuple `{languages, tokens, word_div, word_dur, ph_midi}`:

    * `languages` / `tokens` - one id per phoneme, in input order;
    * `word_div` - number of phonemes of each word;
    * `word_dur` - duration of each word;
    * `ph_midi` - the word's MIDI note repeated once per phoneme.

  The first four lists carry the data for the inputs of the duration.linguistic
  model (`["tokens", "languages", "word_div", "word_dur"]`); note that the tuple
  puts the languages before the tokens.
  """
  def run_partial(%Orchid.Param{payload: words}, opts) do
    lang_dict = Keyword.fetch!(opts, :lang_dict)
    phoneme_dict = Keyword.fetch!(opts, :phoneme_dict)

    # Every accumulator is built back to front (prepend only) and reversed once
    # at the end, so the cost stays linear in the number of phonemes instead of
    # re-copying the accumulated lists for every word.
    {rev_langs, rev_toks, rev_wdiv, rev_wdur, rev_midi} =
      Enum.reduce(
        words,
        {[], [], [], [], []},
        fn {phonemes, duration, midi_note}, {acc_l, acc_t, acc_wdiv, acc_wdur, acc_midi} ->
          ph_count = length(phonemes)

          {curr_langs, curr_toks} =
            phonemes
            |> Enum.map(fn {lang, phone} -> {lang_dict[lang], phoneme_dict[phone]} end)
            |> Enum.unzip()

          {
            # Enum.reverse/2 pushes the current ids, reversed, onto the accumulator.
            Enum.reverse(curr_langs, acc_l),
            Enum.reverse(curr_toks, acc_t),
            [ph_count | acc_wdiv],
            [duration | acc_wdur],
            # All elements are equal, so their order inside the run is irrelevant.
            List.duplicate(midi_note, ph_count) ++ acc_midi
          }
        end
      )

    {
      Enum.reverse(rev_langs),
      Enum.reverse(rev_toks),
      Enum.reverse(rev_wdiv),
      Enum.reverse(rev_wdur),
      Enum.reverse(rev_midi)
    }
  end
end
defmodule DurationPredictEncoder do
  use Orchid.Step

  # Encodes the words payload with CommonEncoder.run_partial/2 and wraps every
  # resulting list in a single outer list before turning it into an s64 tensor.
  # Returns `{:ok, params}` with the params in the order
  # :lang_map, :phoneme_map, :word_division, :word_duration, :ph_midi.
  def run(param, opts) do
    {lang_ids, token_ids, word_div, word_dur, midi_per_phoneme} =
      CommonEncoder.run_partial(param, opts)

    params =
      for {name, values} <- [
            lang_map: lang_ids,
            phoneme_map: token_ids,
            word_division: word_div,
            word_duration: word_dur,
            ph_midi: midi_per_phoneme
          ] do
        Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
      end

    {:ok, params}
  end
end
defmodule PitchPredictEncoder do
  use Orchid.Step

  # Encodes the words payload with CommonEncoder.run_partial/2 and returns
  # `{:ok, [lang_map, phoneme_map, word_duration, ph_midi]}`; the per-word
  # phoneme counts are not part of this step's output.
  def run(%Orchid.Param{} = param, opts) do
    {lang_ids, token_ids, _word_div, word_durations, midi_per_phoneme} =
      CommonEncoder.run_partial(param, opts)

    {:ok,
     [
       to_param(:lang_map, lang_ids),
       to_param(:phoneme_map, token_ids),
       to_param(:word_duration, word_durations),
       to_param(:ph_midi, midi_per_phoneme)
     ]}
  end

  # Wraps `values` in an outer list and stores it as an s64 tensor payload.
  defp to_param(name, values) do
    Orchid.Param.new(name, :payload, Nx.tensor([values], type: :s64))
  end
end
# Variance model: bind the language and phoneme dictionaries from its config so
# they can be passed as options to the encoder call further down.
%{maybe_lang_dict: lang_dict, phoneme_dict: phoneme_dict} = model_config.variance.phonemes

defmodule VarianceEncoder do
  use Orchid.Step

  # Encodes the words payload with CommonEncoder.run_partial/2 and returns
  # `{:ok, [lang_map, phoneme_map, word_duration, ph_midi]}`, each list wrapped
  # in an outer list and stored as an s64 tensor.
  def run(param, opts) do
    {langs, toks, _w_div, w_dur, ph_midis} = CommonEncoder.run_partial(param, opts)

    outputs =
      [:lang_map, :phoneme_map, :word_duration, :ph_midi]
      |> Enum.zip([langs, toks, w_dur, ph_midis])
      |> Enum.map(fn {name, values} ->
        values_tensor = Nx.tensor([values], type: :s64)
        Orchid.Param.new(name, :payload, values_tensor)
      end)

    {:ok, outputs}
  end
end

# "ni hao" ("hello"): two words, each given as a list of {language, phoneme}
# pairs together with a word duration and a MIDI note.
VarianceEncoder.run(
  %Orchid.Param{payload: [
    {[{"zh", "zh/n"}, {"zh", "zh/i"}], 200, 62},
    {[{"zh", "zh/h"}, {"zh", "zh/ao"}], 220, 61}
  ]},
  lang_dict: lang_dict,
  phoneme_dict: phoneme_dict
)

搭建变分模型与音高时长预测模型

预测音素时长

defmodule PredictDuration do
  @behaviour Orchid.Symbiont.Step

  # Names of the registered models whose handlers this step reads.
  def required, do: [:duration_linguistic, :duration_predict]

  # [{Orchid.Symbiont.Step.get_required(), [foo: :bar]}]
  #
  # Runs the linguistic encoder and then the duration predictor.
  # Expects exactly five params (language ids, phoneme ids, word division,
  # word duration, per-phoneme MIDI) and returns `{:ok, [param]}` holding the
  # predictor's result. Any reply other than `{:ok, _}` from a model call
  # raises `MatchError`.
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: tokens},
          %Orchid.Param{payload: word_div},
          %Orchid.Param{payload: word_dur},
          %Orchid.Param{payload: ph_midi}
        ],
        handlers,
        _opts
      ) do
    linguistic_model = handlers.duration_linguistic
    predictor_model = handlers.duration_predict

    # The linguistic model takes the tokens first, then the languages.
    {:ok, {encoded, mask}} =
      Orchid.Symbiont.call(
        linguistic_model,
        {:infer, {tokens, languages, word_div, word_dur}}
      )

    {:ok, predicted} =
      Orchid.Symbiont.call(predictor_model, {:infer, {encoded, mask, ph_midi}})

    {:ok, [Orchid.Param.new(:ph_dur_pred, :encoder_out, predicted)]}
  end
end

# Register the two models that PredictDuration lists in required/0, each served
# by Orchid.Symbiont.OrtexRunner. The model file path is the voice bank root
# followed by the `path` entries taken from the config (presumably a list of
# path segments, since it is used as the tail of the list given to Path.join/1).
:ok = Orchid.Symbiont.register(
  :duration_linguistic,
  {Orchid.Symbiont.OrtexRunner, [name: :duration_linguistic, path: Path.join([model_root_path | model_config.predict_map.maybe_duration.linguistic.path])]}
)

:ok = Orchid.Symbiont.register(
  :duration_predict,
  {Orchid.Symbiont.OrtexRunner, [name: :duration_predict, path: Path.join([model_root_path | model_config.predict_map.maybe_duration.duration.path])]}
)


# Rebind the dictionaries to the ones from the duration model's config.
%{maybe_lang_dict: lang_dict, phoneme_dict: phoneme_dict} = model_config.predict_map.maybe_duration.phonemes

# Two-step recipe: encode the :words input into five tensors, then feed them to
# PredictDuration, whose result is published as :res.
recipe = Orchid.Recipe.new([
  {DurationPredictEncoder, :words, [:lang_map, :phoneme_map, :word_division, :word_duration, :ph_midi], [lang_dict: lang_dict, phoneme_dict: phoneme_dict]},
  {PredictDuration, [:lang_map, :phoneme_map, :word_division, :word_duration, :ph_midi], :res, extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]}
])
# Sample input: two words, each {phonemes, duration, midi_note}.
inputs = %Orchid.Param{name: :words, payload: [
    {[{"zh", "zh/n"}, {"zh", "zh/i"}], 200, 60},
    {[{"zh", "zh/h"}, {"zh", "zh/ao"}], 220, 62}
  ]}
Orchid.run(recipe, inputs)

预测音素音高

defmodule PredictPitch do
  @behaviour Orchid.Symbiont.Step

  # Names of the registered models whose handlers this step reads.
  def required, do: [:pitch_linguistic, :pitch_predict]

  # Runs the pitch linguistic encoder and then the pitch predictor.
  # Expects exactly ten params in the order matched below and returns
  # `{:ok, [param]}` holding the predictor's result. Any reply other than
  # `{:ok, _}` from a model call raises `MatchError`.
  def run_with_model(
        [
          %Orchid.Param{payload: languages},
          %Orchid.Param{payload: phonemes},
          %Orchid.Param{payload: phoneme_duration},
          %Orchid.Param{payload: note_midi},
          # whether the note is a rest
          %Orchid.Param{payload: note_rest},
          %Orchid.Param{payload: note_dur},
          # expressiveness parameter (may be filled with 0 when unavailable)
          %Orchid.Param{payload: expr},
          # base pitch, or Gaussian noise
          %Orchid.Param{payload: pitch_base},
          # mask for partial re-rendering; usually all 1 for a fresh generation
          %Orchid.Param{payload: retake},
          # number of diffusion steps, e.g. Nx.tensor(20, type: :s64)
          %Orchid.Param{payload: steps}
        ],
        handlers,
        _opts
      ) do
    linguistic_model = handlers.pitch_linguistic
    predictor_model = handlers.pitch_predict

    # The linguistic model receives the phoneme durations as they were passed in.
    {:ok, {encoded, mask}} =
      Orchid.Symbiont.call(
        linguistic_model,
        {:infer, {phonemes, languages, phoneme_duration}}
      )

    # The predictor receives the durations rounded and cast to s64.
    rounded_ph_dur = Nx.as_type(Nx.round(phoneme_duration), :s64)

    # Debug output: print the mask to check whether it is all `1`.
    IO.inspect(mask)

    # Predictor input order:
    # ["encoder_out", "ph_dur", "note_midi", "note_rest", "note_dur", "pitch", "expr", "retake", "steps"]
    predictor_inputs =
      {encoded, rounded_ph_dur, note_midi, note_rest, note_dur, pitch_base, expr, retake, steps}

    {:ok, predicted} = Orchid.Symbiont.call(predictor_model, {:infer, predictor_inputs})

    # NOTE(review): the output param reuses the name :ph_dur_pred and the
    # :encoder_out tag of PredictDuration's output; confirm that is intended
    # for a pitch result.
    {:ok, [Orchid.Param.new(:ph_dur_pred, :encoder_out, predicted)]}
  end
end

# Register the pitch linguistic model under :pitch_linguistic, one of the two
# names PredictPitch lists in required/0. The model file path is the voice bank
# root followed by the `path` entries from the pitch config.
:ok = Orchid.Symbiont.register(
  :pitch_linguistic,
  {Orchid.Symbiont.OrtexRunner, [name: :pitch_linguistic, path: Path.join([model_root_path | model_config.predict_map.maybe_pitch.linguistic.path])]}
)

# Register the pitch predictor model under :pitch_predict. Its path must come
# from the pitch section of the config (`maybe_pitch.predict`), not from the
# duration predictor's entry; otherwise :pitch_predict would load the duration
# model file, which PredictPitch then calls with the pitch predictor's inputs.
# NOTE(review): assumes `maybe_pitch.predict` carries a `path` field like the
# other model entries do; confirm against the config struct.
:ok = Orchid.Symbiont.register(
  :pitch_predict,
  {Orchid.Symbiont.OrtexRunner, [name: :pitch_predict, path: Path.join([model_root_path | model_config.predict_map.maybe_pitch.predict.path])]}
)

# Rebind the dictionaries to the ones from the pitch model's config.
%{maybe_lang_dict: lang_dict, phoneme_dict: phoneme_dict} = model_config.predict_map.maybe_pitch.phonemes

# Disabled draft of the duration -> pitch pipeline. As written, the PredictPitch
# step is given 4 inputs while PredictPitch.run_with_model/3 matches a list of
# 10 params, so the remaining inputs have to be produced before enabling it.
# recipe = Orchid.Recipe.new([
#   {DurationPredictEncoder, :words, [:duration_lang, :duration_phoneme, :duration_word_division, :word_duration, :ph_midi], [lang_dict: lang_dict, phoneme_dict: phoneme_dict]},
#   {PredictDuration, [:duration_lang, :duration_phoneme, :duration_word_division, :word_duration, :ph_midi], :phoneme_duration, extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]},
#   {PredictPitch, [:lang_map, :phoneme_map, :phoneme_duration, :ph_midi], :res, extra_hooks_stack: [Orchid.Symbiont.Hooks.Injector]},
# ])
# inputs = %Orchid.Param{name: :words, payload: [
#     {[{"zh", "zh/n"}, {"zh", "zh/i"}], 200, 60},
#     {[{"zh", "zh/h"}, {"zh", "zh/ao"}], 220, 62}
#   ]}
# Orchid.run(recipe, inputs)

其他变分参数:TODO

搭建声学模型与声码器

声学模型

声码器

后处理(将张量变为可被播放的片段)

End-to-end 管线

# Placeholders for the end-to-end pipeline: both are empty lists, and the
# leading underscore marks them as not used yet.
_phoneme_predict = []
_pitch_predict = []