Skip to content

Commit

Permalink
Merge pull request #62 from yqzhishen/shallow-diffusion
Browse files Browse the repository at this point in the history
Support acoustic models with shallow diffusion
  • Loading branch information
oxygen-dioxide authored Sep 23, 2023
2 parents 7377bab + bd1c395 commit 3915ebc
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 5 deletions.
3 changes: 3 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,16 @@ public class DsConfig {
public bool useEnergyEmbed = false;
public bool useBreathinessEmbed= false;
public AugmentationArgs augmentationArgs;
public bool useShallowDiffusion = false;
public int maxDepth = -1;
public string dur;
public string linguistic;
public string pitch;
public string variance;
public int hop_size = 512;
public int sample_rate = 44100;
public bool predict_dur = true;
public bool use_note_rest = false;
/// <summary>
/// Computes the duration of a single frame in milliseconds from the
/// configured <c>hop_size</c> and <c>sample_rate</c>.
/// </summary>
/// <returns>
/// <c>1000 * hop_size / sample_rate</c>, evaluated in single precision.
/// NOTE(review): presumably hop_size is in samples and sample_rate in Hz — confirm.
/// </returns>
public float frameMs() => 1000f * hop_size / sample_rate;
Expand Down
27 changes: 24 additions & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,23 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, Cancell
return new RenderResult();
}
var result = Layout(phrase);

// calculate the diffusion depth that will actually be used for rendering
int speedup = Core.Util.Preferences.Default.DiffsingerSpeedup;
var wavPath = Path.Join(PathManager.Inst.CachePath, $"ds-{phrase.hash:x16}-{speedup}x.wav");
var singer = (DiffSingerSinger) phrase.singer;
int depth = Core.Util.Preferences.Default.DiffSingerDepth;
if (singer.dsConfig.useShallowDiffusion) {
int kStep = singer.dsConfig.maxDepth;
if (kStep < 0) {
throw new InvalidDataException("Max depth is unset or is negative.");
}
depth = Math.Min(depth, kStep); // make sure depth <= K_step
depth = depth / speedup * speedup; // make sure depth can be divided by speedup
}
var wavName = singer.dsConfig.useShallowDiffusion
? $"ds-{phrase.hash:x16}-depth{depth}-{speedup}x.wav" // if the depth changes, phrase should be re-rendered
: $"ds-{phrase.hash:x16}-{speedup}x.wav"; // preserve this for not invalidating cache from older versions
var wavPath = Path.Join(PathManager.Inst.CachePath, wavName);
string progressInfo = $"{this}{speedup}x \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\"";
if (File.Exists(wavPath)) {
try {
Expand All @@ -84,7 +99,7 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, Cancell
}
}
if (result.samples == null) {
result.samples = InvokeDiffsinger(phrase, speedup);
result.samples = InvokeDiffsinger(phrase, depth, speedup);
var source = new WaveSource(0, 0, 0, 1);
source.SetSamples(result.samples);
WaveFileWriter.CreateWaveFile16(wavPath, new ExportAdapter(source).ToMono(1, 0));
Expand All @@ -103,7 +118,7 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, Cancell
leadingMs, positionMs, estimatedLengthMs: time axis layout in ms, double
*/

float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) {
float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup) {
var singer = phrase.singer as DiffSingerSinger;
//Check if dsconfig.yaml is correct
if(String.IsNullOrEmpty(singer.dsConfig.vocoder) ||
Expand Down Expand Up @@ -151,6 +166,12 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) {
var f0tensor = new DenseTensor<float>(f0, new int[] { f0.Length })
.Reshape(new int[] { 1, f0.Length });
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("f0",f0tensor));

// sampling acceleration related
if (singer.dsConfig.useShallowDiffusion) {
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("depth",
new DenseTensor<long>(new long[] { depth }, new int[] { 1 }, false)));
}
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("speedup",
new DenseTensor<long>(new long[] { speedup }, new int[] { 1 },false)));

Expand Down
1 change: 1 addition & 0 deletions OpenUtau.Core/Util/Preferences.cs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ public class SerializablePreferences {
public string OnnxRunner = string.Empty;
public int OnnxGpu = 0;
public int DiffsingerSpeedup = 50;
public int DiffSingerDepth = 1000;
public string Language = string.Empty;
public string SortingOrder = string.Empty;
public List<string> RecentFiles = new List<string>();
Expand Down
3 changes: 2 additions & 1 deletion OpenUtau/Strings/Strings.axaml
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ Warning: this option removes custom presets.</system:String>
<system:String x:Key="prefs.playback.lockstarttime.on">Move cursor back to where you started playing</system:String>
<system:String x:Key="prefs.playback.test">Test</system:String>
<system:String x:Key="prefs.rendering">Rendering</system:String>
<system:String x:Key="prefs.rendering.diffsingerspeedup">Diffsinger Render Speedup</system:String>
<system:String x:Key="prefs.rendering.diffsingerspeedup">DiffSinger Render Speedup</system:String>
<system:String x:Key="prefs.rendering.diffsingerdepth">DiffSinger Render Depth</system:String>
<system:String x:Key="prefs.rendering.onnxgpu">GPU</system:String>
<system:String x:Key="prefs.rendering.onnxrunner">Machine Learning Runner</system:String>
<system:String x:Key="prefs.rendering.phasecomp">Phase Compensation</system:String>
Expand Down
3 changes: 2 additions & 1 deletion OpenUtau/Strings/Strings.zh-CN.axaml
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@
<system:String x:Key="prefs.playback.lockstarttime.on">将播放标记移回开始播放处</system:String>
<system:String x:Key="prefs.playback.test">测试</system:String>
<system:String x:Key="prefs.rendering">渲染</system:String>
<system:String x:Key="prefs.rendering.diffsingerspeedup">Diffsinger渲染加速</system:String>
<system:String x:Key="prefs.rendering.diffsingerspeedup">DiffSinger 渲染加速</system:String>
<system:String x:Key="prefs.rendering.diffsingerdepth">DiffSinger 渲染深度</system:String>
<system:String x:Key="prefs.rendering.onnxrunner">机器学习运行器</system:String>
<system:String x:Key="prefs.rendering.onnxgpu">GPU</system:String>
<system:String x:Key="prefs.rendering.phasecomp">相位修正</system:String>
Expand Down
7 changes: 7 additions & 0 deletions OpenUtau/ViewModels/PreferencesViewModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public AudioOutputDevice? AudioOutputDevice {
public List<GpuInfo> OnnxGpuOptions { get; set; }
[Reactive] public GpuInfo OnnxGpu { get; set; }
public List<int> DiffsingerSpeedupOptions { get; } = new List<int> { 1, 5, 10, 20, 50, 100 };
[Reactive] public int DiffSingerDepth { get; set; }
[Reactive] public int DiffsingerSpeedup { get; set; }
[Reactive] public bool HighThreads { get; set; }
[Reactive] public int Theme { get; set; }
Expand Down Expand Up @@ -124,6 +125,7 @@ public PreferencesViewModel() {
OnnxRunnerOptions[0] : Preferences.Default.OnnxRunner;
OnnxGpuOptions = Onnx.getGpuInfo();
OnnxGpu = OnnxGpuOptions.FirstOrDefault(x => x.deviceId == Preferences.Default.OnnxGpu, OnnxGpuOptions[0]);
DiffSingerDepth = Preferences.Default.DiffSingerDepth;
DiffsingerSpeedup = Preferences.Default.DiffsingerSpeedup;
Theme = Preferences.Default.Theme;
UseTrackColor = Preferences.Default.UseTrackColor;
Expand Down Expand Up @@ -268,6 +270,11 @@ public PreferencesViewModel() {
Preferences.Default.DiffsingerSpeedup = index;
Preferences.Save();
});
this.WhenAnyValue(vm => vm.DiffSingerDepth)
.Subscribe(index => {
Preferences.Default.DiffSingerDepth = index;
Preferences.Save();
});
}

public void TestAudioOutputDevice() {
Expand Down
12 changes: 12 additions & 0 deletions OpenUtau/Views/PreferencesDialog.axaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,18 @@
<TextBlock Classes="restart"/>
<TextBlock Text="{DynamicResource prefs.rendering.diffsingerspeedup}" Margin="0,10,0,0"/>
<ComboBox HorizontalAlignment="Stretch" ItemsSource="{Binding DiffsingerSpeedupOptions}" SelectedItem="{Binding DiffsingerSpeedup}"/>
<Grid ColumnDefinitions="Auto,8,40,8,*" Margin="0,10,0,0">
<TextBlock Grid.Column="0" Text="{DynamicResource prefs.rendering.diffsingerdepth}"/>
<TextBlock Grid.Column="2">
<TextBlock.Text>
<MultiBinding StringFormat="{}{0:#0}">
<Binding Path="DiffSingerDepth"/>
</MultiBinding>
</TextBlock.Text>
</TextBlock>
<Slider Grid.Column="4" Classes="fader" Value="{Binding DiffSingerDepth}" Minimum="0" Maximum="1000"
TickPlacement="BottomRight" TickFrequency="20" IsSnapToTickEnabled="true"/>
</Grid>
</StackPanel>
</HeaderedContentControl>
<HeaderedContentControl Classes="groupbox" Header="{DynamicResource prefs.appearance}">
Expand Down

0 comments on commit 3915ebc

Please sign in to comment.