From 6914240a802c097f680ed36dddff7bed6e2888b5 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 23 Sep 2023 00:39:12 +0800 Subject: [PATCH 1/2] Support shallow diffusion --- OpenUtau.Core/DiffSinger/DiffSingerConfig.cs | 3 +++ .../DiffSinger/DiffSingerRenderer.cs | 27 ++++++++++++++++--- OpenUtau.Core/Util/Preferences.cs | 1 + OpenUtau/Strings/Strings.axaml | 3 ++- OpenUtau/Strings/Strings.zh-CN.axaml | 3 ++- OpenUtau/ViewModels/PreferencesViewModel.cs | 7 +++++ OpenUtau/Views/PreferencesDialog.axaml | 12 +++++++++ 7 files changed, 51 insertions(+), 5 deletions(-) diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs index 480eabb5a..07d9e2d3e 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs @@ -26,6 +26,8 @@ public class DsConfig { public bool useEnergyEmbed = false; public bool useBreathinessEmbed= false; public AugmentationArgs augmentationArgs; + public bool useShallowDiffusion = false; + public int maxDepth = -1; public string dur; public string linguistic; public string pitch; @@ -33,6 +35,7 @@ public class DsConfig { public int hop_size = 512; public int sample_rate = 44100; public bool predict_dur = true; + public bool use_note_rest = false; public float frameMs(){ return 1000f * hop_size / sample_rate; } diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index 585bbab17..c9443d410 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -71,8 +71,23 @@ public Task Render(RenderPhrase phrase, Progress progress, Cancell return new RenderResult(); } var result = Layout(phrase); + + // calculate real depth int speedup = Core.Util.Preferences.Default.DiffsingerSpeedup; - var wavPath = Path.Join(PathManager.Inst.CachePath, $"ds-{phrase.hash:x16}-{speedup}x.wav"); + var singer = (DiffSingerSinger) phrase.singer; + int depth = Core.Util.Preferences.Default.DiffSingerDepth; + if (singer.dsConfig.useShallowDiffusion) { + int kStep = singer.dsConfig.maxDepth; + if (kStep < 0) { + throw new InvalidDataException("Max depth is unset or is negative."); + } + depth = Math.Min(depth, kStep); // make sure depth <= K_step + depth = depth / speedup * speedup; // make sure depth can be divided by speedup + } + var wavName = singer.dsConfig.useShallowDiffusion + ? $"ds-{phrase.hash:x16}-kstep{depth}-{speedup}x.wav" // if the depth changes, phrase should be re-rendered + : $"ds-{phrase.hash:x16}-{speedup}x.wav"; // preserve this for not invalidating cache from older versions + var wavPath = Path.Join(PathManager.Inst.CachePath, wavName); string progressInfo = $"{this}{speedup}x \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\""; if (File.Exists(wavPath)) { try { @@ -84,7 +99,7 @@ public Task Render(RenderPhrase phrase, Progress progress, Cancell } } if (result.samples == null) { - result.samples = InvokeDiffsinger(phrase, speedup); + result.samples = InvokeDiffsinger(phrase, depth, speedup); var source = new WaveSource(0, 0, 0, 1); source.SetSamples(result.samples); WaveFileWriter.CreateWaveFile16(wavPath, new ExportAdapter(source).ToMono(1, 0)); @@ -103,7 +118,7 @@ public Task Render(RenderPhrase phrase, Progress progress, Cancell leadingMs、positionMs、estimatedLengthMs: timeaxis layout in Ms, double */ - float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) { + float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup) { var singer = phrase.singer as DiffSingerSinger; //Check if dsconfig.yaml is correct if(String.IsNullOrEmpty(singer.dsConfig.vocoder) || @@ -151,6 +166,12 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) { var f0tensor = new DenseTensor(f0, new int[] { f0.Length }) .Reshape(new int[] { 1, f0.Length }); acousticInputs.Add(NamedOnnxValue.CreateFromTensor("f0",f0tensor)); + + // sampling acceleration related + if (singer.dsConfig.useShallowDiffusion) { + acousticInputs.Add(NamedOnnxValue.CreateFromTensor("depth", + new DenseTensor(new long[] { depth }, new int[] { 1 }, false))); + } acousticInputs.Add(NamedOnnxValue.CreateFromTensor("speedup", new DenseTensor(new long[] { speedup }, new int[] { 1 },false))); diff --git a/OpenUtau.Core/Util/Preferences.cs b/OpenUtau.Core/Util/Preferences.cs index f0918351c..80a7b2e93 100644 --- a/OpenUtau.Core/Util/Preferences.cs +++ b/OpenUtau.Core/Util/Preferences.cs @@ -118,6 +118,7 @@ public class SerializablePreferences { public string OnnxRunner = string.Empty; public int OnnxGpu = 0; public int DiffsingerSpeedup = 50; + public int DiffSingerDepth = 1000; public string Language = string.Empty; public string SortingOrder = string.Empty; public List RecentFiles = new List(); diff --git a/OpenUtau/Strings/Strings.axaml b/OpenUtau/Strings/Strings.axaml index 86980f6aa..a48da7a59 100644 --- a/OpenUtau/Strings/Strings.axaml +++ b/OpenUtau/Strings/Strings.axaml @@ -301,7 +301,8 @@ Warning: this option removes custom presets. Move cursor back to where you started playing Test Rendering - Diffsinger Render Speedup + DiffSinger Render Speedup + DiffSinger Render Depth GPU Machine Learning Runner Phase Compensation diff --git a/OpenUtau/Strings/Strings.zh-CN.axaml b/OpenUtau/Strings/Strings.zh-CN.axaml index 74cf9db7f..6ec19147e 100644 --- a/OpenUtau/Strings/Strings.zh-CN.axaml +++ b/OpenUtau/Strings/Strings.zh-CN.axaml @@ -288,7 +288,8 @@ 将播放标记移回开始播放处 测试 渲染 - Diffsinger渲染加速 + DiffSinger 渲染加速 + DiffSinger 渲染深度 机器学习运行器 GPU 相位修正 diff --git a/OpenUtau/ViewModels/PreferencesViewModel.cs b/OpenUtau/ViewModels/PreferencesViewModel.cs index d6460b3a3..115eedeb9 100644 --- a/OpenUtau/ViewModels/PreferencesViewModel.cs +++ b/OpenUtau/ViewModels/PreferencesViewModel.cs @@ -35,6 +35,7 @@ public AudioOutputDevice? AudioOutputDevice { public List OnnxGpuOptions { get; set; } [Reactive] public GpuInfo OnnxGpu { get; set; } public List DiffsingerSpeedupOptions { get; } = new List { 1, 5, 10, 20, 50, 100 }; + [Reactive] public int DiffSingerDepth { get; set; } [Reactive] public int DiffsingerSpeedup { get; set; } [Reactive] public bool HighThreads { get; set; } [Reactive] public int Theme { get; set; } @@ -124,6 +125,7 @@ public PreferencesViewModel() { OnnxRunnerOptions[0] : Preferences.Default.OnnxRunner; OnnxGpuOptions = Onnx.getGpuInfo(); OnnxGpu = OnnxGpuOptions.FirstOrDefault(x => x.deviceId == Preferences.Default.OnnxGpu, OnnxGpuOptions[0]); + DiffSingerDepth = Preferences.Default.DiffSingerDepth; DiffsingerSpeedup = Preferences.Default.DiffsingerSpeedup; Theme = Preferences.Default.Theme; UseTrackColor = Preferences.Default.UseTrackColor; @@ -268,6 +270,11 @@ public PreferencesViewModel() { Preferences.Default.DiffsingerSpeedup = index; Preferences.Save(); }); + this.WhenAnyValue(vm => vm.DiffSingerDepth) + .Subscribe(index => { + Preferences.Default.DiffSingerDepth = index; + Preferences.Save(); + }); } public void TestAudioOutputDevice() { diff --git a/OpenUtau/Views/PreferencesDialog.axaml b/OpenUtau/Views/PreferencesDialog.axaml index 0325a23b0..fd009b2ef 100644 --- a/OpenUtau/Views/PreferencesDialog.axaml +++ b/OpenUtau/Views/PreferencesDialog.axaml @@ -118,6 +118,18 @@ + + + + + + + + + + + From 9e147def3f9478d8be7d2ad04a89efbfa7cfcaa4 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Sat, 23 Sep 2023 01:16:35 +0800 Subject: [PATCH 2/2] kstep => depth --- OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index c9443d410..22c7a8a65 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -85,7 +85,7 @@ public Task Render(RenderPhrase phrase, Progress progress, Cancell depth = depth / speedup * speedup; // make sure depth can be divided by speedup } var wavName = singer.dsConfig.useShallowDiffusion - ? $"ds-{phrase.hash:x16}-kstep{depth}-{speedup}x.wav" // if the depth changes, phrase should be re-rendered + ? $"ds-{phrase.hash:x16}-depth{depth}-{speedup}x.wav" // if the depth changes, phrase should be re-rendered : $"ds-{phrase.hash:x16}-{speedup}x.wav"; // preserve this for not invalidating cache from older versions var wavPath = Path.Join(PathManager.Inst.CachePath, wavName); string progressInfo = $"{this}{speedup}x \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\"";