diff --git a/.gitignore b/.gitignore index 99908ab74..c1c66cd8c 100644 --- a/.gitignore +++ b/.gitignore @@ -170,6 +170,7 @@ data/* !/default_config.yml /Web/ /emotional/*/*.bin +/slm/*/*.bin /bert/*/*.bin /bert/*/*.h5 /bert/*/*.model diff --git a/bert_gen.py b/bert_gen.py index 588c768c8..81175967c 100644 --- a/bert_gen.py +++ b/bert_gen.py @@ -1,17 +1,16 @@ -import argparse -from multiprocessing import Pool, cpu_count - import torch -import torch.multiprocessing as mp -from tqdm import tqdm - +from multiprocessing import Pool import commons import utils +from tqdm import tqdm +from text import check_bert_models, cleaned_text_to_sequence, get_bert +import argparse +import torch.multiprocessing as mp from config import config -from text import cleaned_text_to_sequence, get_bert -def process_line(line): +def process_line(x): + line, add_blank = x device = config.bert_gen_config.device if config.bert_gen_config.use_multi_device: rank = mp.current_process()._identity @@ -28,12 +27,13 @@ def process_line(line): word2ph = [i for i in word2ph] phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) - phone = commons.intersperse(phone, 0) - tone = commons.intersperse(tone, 0) - language = commons.intersperse(language, 0) - for i in range(len(word2ph)): - word2ph[i] = word2ph[i] * 2 - word2ph[0] += 1 + if add_blank: + phone = commons.intersperse(phone, 0) + tone = commons.intersperse(tone, 0) + language = commons.intersperse(language, 0) + for i in range(len(word2ph)): + word2ph[i] = word2ph[i] * 2 + word2ph[0] += 1 bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt") @@ -59,16 +59,23 @@ def process_line(line): args, _ = parser.parse_known_args() config_path = args.config hps = utils.get_hparams_from_file(config_path) + check_bert_models() lines = [] with open(hps.data.training_files, encoding="utf-8") as f: lines.extend(f.readlines()) with open(hps.data.validation_files, encoding="utf-8") as f: lines.extend(f.readlines()) + 
add_blank = [hps.data.add_blank] * len(lines) + if len(lines) != 0: - num_processes = min(args.num_processes, cpu_count()) + num_processes = args.num_processes with Pool(processes=num_processes) as pool: - for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)): - pass + for _ in tqdm( + pool.imap_unordered(process_line, zip(lines, add_blank)), + total=len(lines), + ): + # 这里是缩进的代码块,表示循环体 + pass # 使用pass语句作为占位符 print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!") diff --git a/clap_gen.py b/clap_gen.py index 20380abe6..3054759d6 100644 --- a/clap_gen.py +++ b/clap_gen.py @@ -27,7 +27,7 @@ def process_line(line): device = torch.device("cpu") wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") - clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy") + clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.pt") if os.path.isfile(clap_path): return diff --git a/configs/config.json b/configs/config.json index d1828ffb2..29295770c 100644 --- a/configs/config.json +++ b/configs/config.json @@ -10,18 +10,20 @@ 0.99 ], "eps": 1e-09, - "batch_size": 12, - "fp16_run": false, + "batch_size": 16, + "bf16_run": false, "lr_decay": 0.99995, "segment_size": 16384, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, + "c_commit": 100, "skip_optimizer": true, "freeze_ZH_bert": false, "freeze_JP_bert": false, - "freeze_EN_bert": false + "freeze_EN_bert": false, + "freeze_emo": false }, "data": { "training_files": "filelists/train.list", @@ -35,7 +37,7 @@ "mel_fmin": 0.0, "mel_fmax": null, "add_blank": true, - "n_speakers": 896, + "n_speakers": 850, "cleaned_text": true, "spk2id": { "派蒙_ZH": 0, @@ -119,203 +121,203 @@ "伊迪娅_ZH": 78, "留云借风真君_ZH": 79, "绮良良_ZH": 80, - "七七_ZH": 81, - "式大将_ZH": 82, - "瑶瑶_ZH": 83, - "奥兹_ZH": 84, - "菲米尼_ZH": 85, - "米卡_ZH": 86, - "哲平_ZH": 87, - "大肉丸_ZH": 88, - "托克_ZH": 89, - "蒂玛乌斯_ZH": 90, - "昆钧_ZH": 91, - "欧菲妮_ZH": 92, - "塞琉斯_ZH": 93, - "仆人_ZH": 94, - "迈勒斯_ZH": 95, - "希格雯_ZH": 
96, - "阿守_ZH": 97, - "拉赫曼_ZH": 98, - "杜拉夫_ZH": 99, - "伊利亚斯_ZH": 100, - "阿晃_ZH": 101, - "旁白_ZH": 102, - "爱德琳_ZH": 103, - "埃洛伊_ZH": 104, - "德沃沙克_ZH": 105, - "玛乔丽_ZH": 106, - "塞塔蕾_ZH": 107, - "柊千里_ZH": 108, - "海芭夏_ZH": 109, - "九条镰治_ZH": 110, - "阿娜耶_ZH": 111, - "笼钓瓶一心_ZH": 112, - "回声海螺_ZH": 113, - "劳维克_ZH": 114, - "元太_ZH": 115, - "阿扎尔_ZH": 116, - "查尔斯_ZH": 117, - "阿洛瓦_ZH": 118, - "埃勒曼_ZH": 119, - "纳比尔_ZH": 120, - "莎拉_ZH": 121, - "康纳_ZH": 122, - "博来_ZH": 123, - "玛塞勒_ZH": 124, - "阿祇_ZH": 125, - "博士_ZH": 126, - "玛格丽特_ZH": 127, - "迪尔菲_ZH": 128, - "宛烟_ZH": 129, - "羽生田千鹤_ZH": 130, - "海妮耶_ZH": 131, - "旅行者_ZH": 132, - "霍夫曼_ZH": 133, - "佐西摩斯_ZH": 134, - "鹿野奈奈_ZH": 135, - "舒伯特_ZH": 136, - "天叔_ZH": 137, - "艾莉丝_ZH": 138, - "龙二_ZH": 139, - "莺儿_ZH": 140, - "嘉良_ZH": 141, - "一心传名刀_ZH": 142, - "费迪南德_ZH": 143, - "珊瑚_ZH": 144, - "言笑_ZH": 145, - "久利须_ZH": 146, - "嘉玛_ZH": 147, - "艾文_ZH": 148, - "克洛琳德_ZH": 149, - "丹吉尔_ZH": 150, - "女士_ZH": 151, - "白老先生_ZH": 152, - "天目十五_ZH": 153, - "老孟_ZH": 154, - "巴达维_ZH": 155, - "长生_ZH": 156, - "吴船长_ZH": 157, - "拉齐_ZH": 158, - "艾伯特_ZH": 159, - "松浦_ZH": 160, - "埃泽_ZH": 161, - "阿圆_ZH": 162, - "莫塞伊思_ZH": 163, - "阿拉夫_ZH": 164, - "杜吉耶_ZH": 165, - "石头_ZH": 166, - "百闻_ZH": 167, - "波洛_ZH": 168, - "斯坦利_ZH": 169, - "博易_ZH": 170, - "迈蒙_ZH": 171, - "掇星攫辰天君_ZH": 172, - "毗伽尔_ZH": 173, - "芙卡洛斯_ZH": 174, - "恶龙_ZH": 175, - "恕筠_ZH": 176, - "知易_ZH": 177, - "克列门特_ZH": 178, - "大慈树王_ZH": 179, - "西拉杰_ZH": 180, - "上杉_ZH": 181, - "阿尔卡米_ZH": 182, - "纯水精灵_ZH": 183, - "常九爷_ZH": 184, - "沙扎曼_ZH": 185, - "田铁嘴_ZH": 186, - "克罗索_ZH": 187, - "阿巴图伊_ZH": 188, - "悦_ZH": 189, + "陌生人_ZH": 81, + "七七_ZH": 82, + "式大将_ZH": 83, + "瑶瑶_ZH": 84, + "奥兹_ZH": 85, + "菲米尼_ZH": 86, + "米卡_ZH": 87, + "哲平_ZH": 88, + "浮游水蕈兽·元素生命_ZH": 89, + "大肉丸_ZH": 90, + "托克_ZH": 91, + "蒂玛乌斯_ZH": 92, + "昆钧_ZH": 93, + "欧菲妮_ZH": 94, + "塞琉斯_ZH": 95, + "仆人_ZH": 96, + "迈勒斯_ZH": 97, + "希格雯_ZH": 98, + "阿守_ZH": 99, + "拉赫曼_ZH": 100, + "杜拉夫_ZH": 101, + "伊利亚斯_ZH": 102, + "阿晃_ZH": 103, + "旁白_ZH": 104, + "爱德琳_ZH": 105, + "埃洛伊_ZH": 106, + 
"德沃沙克_ZH": 107, + "玛乔丽_ZH": 108, + "塞塔蕾_ZH": 109, + "柊千里_ZH": 110, + "海芭夏_ZH": 111, + "九条镰治_ZH": 112, + "阿娜耶_ZH": 113, + "笼钓瓶一心_ZH": 114, + "回声海螺_ZH": 115, + "劳维克_ZH": 116, + "元太_ZH": 117, + "阿扎尔_ZH": 118, + "查尔斯_ZH": 119, + "阿洛瓦_ZH": 120, + "埃勒曼_ZH": 121, + "纳比尔_ZH": 122, + "莎拉_ZH": 123, + "康纳_ZH": 124, + "博来_ZH": 125, + "玛塞勒_ZH": 126, + "阿祇_ZH": 127, + "博士_ZH": 128, + "玛格丽特_ZH": 129, + "迪尔菲_ZH": 130, + "宛烟_ZH": 131, + "羽生田千鹤_ZH": 132, + "海妮耶_ZH": 133, + "旅行者_ZH": 134, + "霍夫曼_ZH": 135, + "佐西摩斯_ZH": 136, + "鹿野奈奈_ZH": 137, + "舒伯特_ZH": 138, + "天叔_ZH": 139, + "艾莉丝_ZH": 140, + "龙二_ZH": 141, + "莺儿_ZH": 142, + "嘉良_ZH": 143, + "一心传名刀_ZH": 144, + "珊瑚_ZH": 145, + "言笑_ZH": 146, + "久利须_ZH": 147, + "嘉玛_ZH": 148, + "艾文_ZH": 149, + "克洛琳德_ZH": 150, + "丹吉尔_ZH": 151, + "女士_ZH": 152, + "白老先生_ZH": 153, + "天目十五_ZH": 154, + "老孟_ZH": 155, + "巴达维_ZH": 156, + "长生_ZH": 157, + "吴船长_ZH": 158, + "拉齐_ZH": 159, + "艾伯特_ZH": 160, + "松浦_ZH": 161, + "埃泽_ZH": 162, + "阿圆_ZH": 163, + "莫塞伊思_ZH": 164, + "阿拉夫_ZH": 165, + "杜吉耶_ZH": 166, + "石头_ZH": 167, + "百闻_ZH": 168, + "波洛_ZH": 169, + "斯坦利_ZH": 170, + "博易_ZH": 171, + "迈蒙_ZH": 172, + "掇星攫辰天君_ZH": 173, + "毗伽尔_ZH": 174, + "芙卡洛斯_ZH": 175, + "恶龙_ZH": 176, + "恕筠_ZH": 177, + "知易_ZH": 178, + "克列门特_ZH": 179, + "大慈树王_ZH": 180, + "西拉杰_ZH": 181, + "上杉_ZH": 182, + "阿尔卡米_ZH": 183, + "纯水精灵_ZH": 184, + "常九爷_ZH": 185, + "沙扎曼_ZH": 186, + "田铁嘴_ZH": 187, + "克罗索_ZH": 188, + "阿巴图伊_ZH": 189, "阿佩普_ZH": 190, "埃尔欣根_ZH": 191, "萨赫哈蒂_ZH": 192, "塔杰·拉德卡尼_ZH": 193, "安西_ZH": 194, - "埃舍尔_ZH": 195, - "萨齐因_ZH": 196, - "派蒙_JP": 197, - "纳西妲_JP": 198, - "凯亚_JP": 199, - "阿贝多_JP": 200, - "温迪_JP": 201, - "枫原万叶_JP": 202, - "钟离_JP": 203, - "荒泷一斗_JP": 204, - "八重神子_JP": 205, - "艾尔海森_JP": 206, - "提纳里_JP": 207, - "迪希雅_JP": 208, - "卡维_JP": 209, - "宵宫_JP": 210, - "那维莱特_JP": 211, - "莱依拉_JP": 212, - "赛诺_JP": 213, - "莫娜_JP": 214, - "诺艾尔_JP": 215, - "托马_JP": 216, - "凝光_JP": 217, - "林尼_JP": 218, - "北斗_JP": 219, - "柯莱_JP": 220, - "神里绫华_JP": 221, - "可莉_JP": 222, - "芭芭拉_JP": 223, - "雷电将军_JP": 224, - "娜维娅_JP": 
225, - "芙宁娜_JP": 226, - "珊瑚宫心海_JP": 227, - "鹿野院平藏_JP": 228, - "迪奥娜_JP": 229, - "琴_JP": 230, - "五郎_JP": 231, - "班尼特_JP": 232, - "达达利亚_JP": 233, - "安柏_JP": 234, - "莱欧斯利_JP": 235, - "夜兰_JP": 236, - "妮露_JP": 237, - "辛焱_JP": 238, - "丽莎_JP": 239, - "珐露珊_JP": 240, - "魈_JP": 241, - "香菱_JP": 242, - "迪卢克_JP": 243, - "砂糖_JP": 244, - "烟绯_JP": 245, - "早柚_JP": 246, - "云堇_JP": 247, - "刻晴_JP": 248, - "重云_JP": 249, - "优菈_JP": 250, - "胡桃_JP": 251, - "流浪者_JP": 252, - "久岐忍_JP": 253, - "神里绫人_JP": 254, - "甘雨_JP": 255, - "戴因斯雷布_JP": 256, - "菲谢尔_JP": 257, - "白术_JP": 258, - "行秋_JP": 259, - "九条裟罗_JP": 260, - "夏洛蒂_JP": 261, - "雷泽_JP": 262, - "申鹤_JP": 263, - "空_JP": 264, - "荧_JP": 265, - "迪娜泽黛_JP": 266, - "凯瑟琳_JP": 267, - "多莉_JP": 268, - "坎蒂丝_JP": 269, - "琳妮特_JP": 270, - "萍姥姥_JP": 271, - "罗莎莉亚_JP": 272, - "埃德_JP": 273, - "爱贝尔_JP": 274, - "伊迪娅_JP": 275, - "留云借风真君_JP": 276, - "绮良良_JP": 277, + "陆行岩本真蕈·元素生命_ZH": 195, + "派蒙_JP": 196, + "纳西妲_JP": 197, + "凯亚_JP": 198, + "阿贝多_JP": 199, + "温迪_JP": 200, + "枫原万叶_JP": 201, + "钟离_JP": 202, + "荒泷一斗_JP": 203, + "八重神子_JP": 204, + "艾尔海森_JP": 205, + "提纳里_JP": 206, + "迪希雅_JP": 207, + "卡维_JP": 208, + "宵宫_JP": 209, + "那维莱特_JP": 210, + "莱依拉_JP": 211, + "赛诺_JP": 212, + "莫娜_JP": 213, + "诺艾尔_JP": 214, + "托马_JP": 215, + "凝光_JP": 216, + "林尼_JP": 217, + "北斗_JP": 218, + "柯莱_JP": 219, + "神里绫华_JP": 220, + "可莉_JP": 221, + "芭芭拉_JP": 222, + "雷电将军_JP": 223, + "娜维娅_JP": 224, + "芙宁娜_JP": 225, + "珊瑚宫心海_JP": 226, + "鹿野院平藏_JP": 227, + "迪奥娜_JP": 228, + "琴_JP": 229, + "五郎_JP": 230, + "班尼特_JP": 231, + "达达利亚_JP": 232, + "安柏_JP": 233, + "莱欧斯利_JP": 234, + "夜兰_JP": 235, + "妮露_JP": 236, + "辛焱_JP": 237, + "丽莎_JP": 238, + "珐露珊_JP": 239, + "魈_JP": 240, + "香菱_JP": 241, + "迪卢克_JP": 242, + "砂糖_JP": 243, + "烟绯_JP": 244, + "早柚_JP": 245, + "云堇_JP": 246, + "刻晴_JP": 247, + "重云_JP": 248, + "优菈_JP": 249, + "胡桃_JP": 250, + "流浪者_JP": 251, + "久岐忍_JP": 252, + "神里绫人_JP": 253, + "甘雨_JP": 254, + "戴因斯雷布_JP": 255, + "菲谢尔_JP": 256, + "白术_JP": 257, + "行秋_JP": 258, + "九条裟罗_JP": 259, + "夏洛蒂_JP": 260, + "雷泽_JP": 
261, + "申鹤_JP": 262, + "空_JP": 263, + "荧_JP": 264, + "迪娜泽黛_JP": 265, + "凯瑟琳_JP": 266, + "多莉_JP": 267, + "坎蒂丝_JP": 268, + "琳妮特_JP": 269, + "萍姥姥_JP": 270, + "罗莎莉亚_JP": 271, + "埃德_JP": 272, + "爱贝尔_JP": 273, + "伊迪娅_JP": 274, + "留云借风真君_JP": 275, + "绮良良_JP": 276, + "陌生人_JP": 277, "七七_JP": 278, "式大将_JP": 279, "瑶瑶_JP": 280, @@ -323,576 +325,571 @@ "菲米尼_JP": 282, "米卡_JP": 283, "哲平_JP": 284, - "大肉丸_JP": 285, - "托克_JP": 286, - "蒂玛乌斯_JP": 287, - "昆钧_JP": 288, - "欧菲妮_JP": 289, - "塞琉斯_JP": 290, - "仆人_JP": 291, - "迈勒斯_JP": 292, - "希格雯_JP": 293, - "阿守_JP": 294, - "拉赫曼_JP": 295, - "杜拉夫_JP": 296, - "伊利亚斯_JP": 297, - "阿晃_JP": 298, - "旁白_JP": 299, - "爱德琳_JP": 300, - "埃洛伊_JP": 301, - "德沃沙克_JP": 302, - "玛乔丽_JP": 303, - "塞塔蕾_JP": 304, - "柊千里_JP": 305, - "海芭夏_JP": 306, - "九条镰治_JP": 307, - "阿娜耶_JP": 308, - "笼钓瓶一心_JP": 309, - "回声海螺_JP": 310, - "劳维克_JP": 311, - "元太_JP": 312, - "阿扎尔_JP": 313, - "查尔斯_JP": 314, - "阿洛瓦_JP": 315, - "埃勒曼_JP": 316, - "纳比尔_JP": 317, - "莎拉_JP": 318, - "康纳_JP": 319, - "博来_JP": 320, - "玛塞勒_JP": 321, - "阿祇_JP": 322, - "博士_JP": 323, - "迪尔菲_JP": 324, - "玛格丽特_JP": 325, - "宛烟_JP": 326, - "羽生田千鹤_JP": 327, - "海妮耶_JP": 328, - "霍夫曼_JP": 329, - "旅行者_JP": 330, - "佐西摩斯_JP": 331, - "舒伯特_JP": 332, - "鹿野奈奈_JP": 333, - "天叔_JP": 334, - "龙二_JP": 335, - "艾莉丝_JP": 336, - "莺儿_JP": 337, - "嘉良_JP": 338, - "珊瑚_JP": 339, - "言笑_JP": 340, - "一心传名刀_JP": 341, - "费迪南德_JP": 342, - "久利须_JP": 343, - "嘉玛_JP": 344, - "艾文_JP": 345, - "克洛琳德_JP": 346, - "丹吉尔_JP": 347, - "天目十五_JP": 348, - "女士_JP": 349, - "老孟_JP": 350, - "白老先生_JP": 351, - "舍利夫_JP": 352, - "巴达维_JP": 353, - "拉齐_JP": 354, - "长生_JP": 355, - "吴船长_JP": 356, - "艾伯特_JP": 357, - "松浦_JP": 358, - "埃泽_JP": 359, - "阿圆_JP": 360, - "阿拉夫_JP": 361, - "莫塞伊思_JP": 362, - "石头_JP": 363, - "百闻_JP": 364, - "杜吉耶_JP": 365, - "波洛_JP": 366, - "掇星攫辰天君_JP": 367, - "迈蒙_JP": 368, - "博易_JP": 369, - "诗筠_JP": 370, - "斯坦利_JP": 371, - "毗伽尔_JP": 372, - "芙卡洛斯_JP": 373, - "恶龙_JP": 374, - "小仓澪_JP": 375, - "恕筠_JP": 376, - "知易_JP": 377, - "克列门特_JP": 378, - "大慈树王_JP": 379, - "望雅_JP": 
380, - "黑田_JP": 381, - "卡莉娜_JP": 382, - "马姆杜_JP": 383, - "科林斯_JP": 384, - "上杉_JP": 385, - "西拉杰_JP": 386, - "菲尔戈黛特_JP": 387, - "一平_JP": 388, - "纯水精灵_JP": 389, - "阿尔卡米_JP": 390, - "老戴_JP": 391, - "谢赫祖拜尔_JP": 392, - "沙扎曼_JP": 393, - "田铁嘴_JP": 394, - "小野寺_JP": 395, - "百识_JP": 396, - "克罗索_JP": 397, - "莱斯格_JP": 398, - "芷巧_JP": 399, - "加藤洋平_JP": 400, - "阿巴图伊_JP": 401, - "埃尔欣根_JP": 402, - "斯嘉莉_JP": 403, - "阿佩普_JP": 404, - "巫女_JP": 405, - "卡布斯_JP": 406, - "洛伦佐_JP": 407, - "萨赫哈蒂_JP": 408, - "娜德瓦_JP": 409, - "塞德娜_JP": 410, - "塔杰·拉德卡尼_JP": 411, - "绘星_JP": 412, - "泽田_JP": 413, - "安西_JP": 414, - "拉伊德_JP": 415, - "亚卡巴_JP": 416, - "有乐斋_JP": 417, - "莱昂_JP": 418, - "尤苏波夫_JP": 419, - "夏妮_JP": 420, - "埃舍尔_JP": 421, - "萨齐因_JP": 422, - "古山_JP": 423, - "自称渊上之物_JP": 424, - "丹羽_JP": 425, - "塞萨尔的日记_JP": 426, - "派蒙_EN": 427, - "纳西妲_EN": 428, - "凯亚_EN": 429, - "阿贝多_EN": 430, - "温迪_EN": 431, - "枫原万叶_EN": 432, - "钟离_EN": 433, - "荒泷一斗_EN": 434, - "八重神子_EN": 435, - "艾尔海森_EN": 436, - "提纳里_EN": 437, - "迪希雅_EN": 438, - "卡维_EN": 439, - "宵宫_EN": 440, - "莱依拉_EN": 441, - "那维莱特_EN": 442, - "赛诺_EN": 443, - "莫娜_EN": 444, - "诺艾尔_EN": 445, - "托马_EN": 446, - "凝光_EN": 447, - "林尼_EN": 448, - "北斗_EN": 449, - "柯莱_EN": 450, - "神里绫华_EN": 451, - "可莉_EN": 452, - "芭芭拉_EN": 453, - "雷电将军_EN": 454, - "娜维娅_EN": 455, - "芙宁娜_EN": 456, - "珊瑚宫心海_EN": 457, - "鹿野院平藏_EN": 458, - "迪奥娜_EN": 459, - "五郎_EN": 460, - "琴_EN": 461, - "班尼特_EN": 462, - "达达利亚_EN": 463, - "安柏_EN": 464, - "莱欧斯利_EN": 465, - "夜兰_EN": 466, - "妮露_EN": 467, - "辛焱_EN": 468, - "珐露珊_EN": 469, - "丽莎_EN": 470, - "魈_EN": 471, - "香菱_EN": 472, - "迪卢克_EN": 473, - "砂糖_EN": 474, - "烟绯_EN": 475, - "早柚_EN": 476, - "云堇_EN": 477, - "刻晴_EN": 478, - "重云_EN": 479, - "优菈_EN": 480, - "胡桃_EN": 481, - "流浪者_EN": 482, - "久岐忍_EN": 483, - "神里绫人_EN": 484, - "甘雨_EN": 485, - "戴因斯雷布_EN": 486, - "菲谢尔_EN": 487, - "白术_EN": 488, - "行秋_EN": 489, - "九条裟罗_EN": 490, - "夏洛蒂_EN": 491, - "雷泽_EN": 492, - "申鹤_EN": 493, - "荧_EN": 494, - "空_EN": 495, - "迪娜泽黛_EN": 496, - "凯瑟琳_EN": 497, - "多莉_EN": 498, - 
"坎蒂丝_EN": 499, - "琳妮特_EN": 500, - "萍姥姥_EN": 501, - "罗莎莉亚_EN": 502, - "埃德_EN": 503, - "爱贝尔_EN": 504, - "伊迪娅_EN": 505, - "留云借风真君_EN": 506, - "绮良良_EN": 507, - "七七_EN": 508, - "式大将_EN": 509, - "瑶瑶_EN": 510, - "奥兹_EN": 511, - "菲米尼_EN": 512, - "米卡_EN": 513, - "哲平_EN": 514, - "大肉丸_EN": 515, - "托克_EN": 516, - "蒂玛乌斯_EN": 517, - "昆钧_EN": 518, - "欧菲妮_EN": 519, - "塞琉斯_EN": 520, - "仆人_EN": 521, - "迈勒斯_EN": 522, - "希格雯_EN": 523, - "阿守_EN": 524, - "拉赫曼_EN": 525, - "杜拉夫_EN": 526, - "伊利亚斯_EN": 527, - "阿晃_EN": 528, - "旁白_EN": 529, - "爱德琳_EN": 530, - "埃洛伊_EN": 531, - "德沃沙克_EN": 532, - "玛乔丽_EN": 533, - "塞塔蕾_EN": 534, - "柊千里_EN": 535, - "海芭夏_EN": 536, - "九条镰治_EN": 537, - "阿娜耶_EN": 538, - "笼钓瓶一心_EN": 539, - "回声海螺_EN": 540, - "劳维克_EN": 541, - "元太_EN": 542, - "阿扎尔_EN": 543, - "查尔斯_EN": 544, - "阿洛瓦_EN": 545, - "埃勒曼_EN": 546, - "纳比尔_EN": 547, - "莎拉_EN": 548, - "康纳_EN": 549, - "博来_EN": 550, - "玛塞勒_EN": 551, - "阿祇_EN": 552, - "博士_EN": 553, - "迪尔菲_EN": 554, - "宛烟_EN": 555, - "玛格丽特_EN": 556, - "羽生田千鹤_EN": 557, - "海妮耶_EN": 558, - "霍夫曼_EN": 559, - "旅行者_EN": 560, - "佐西摩斯_EN": 561, - "鹿野奈奈_EN": 562, - "舒伯特_EN": 563, - "天叔_EN": 564, - "艾莉丝_EN": 565, - "龙二_EN": 566, - "莺儿_EN": 567, - "嘉良_EN": 568, - "珊瑚_EN": 569, - "费迪南德_EN": 570, - "言笑_EN": 571, - "一心传名刀_EN": 572, - "久利须_EN": 573, - "嘉玛_EN": 574, - "艾文_EN": 575, - "克洛琳德_EN": 576, - "丹吉尔_EN": 577, - "女士_EN": 578, - "天目十五_EN": 579, - "老孟_EN": 580, - "白老先生_EN": 581, - "舍利夫_EN": 582, - "巴达维_EN": 583, - "拉齐_EN": 584, - "长生_EN": 585, - "吴船长_EN": 586, - "艾伯特_EN": 587, - "松浦_EN": 588, - "埃泽_EN": 589, - "阿圆_EN": 590, - "阿拉夫_EN": 591, - "莫塞伊思_EN": 592, - "石头_EN": 593, - "百闻_EN": 594, - "杜吉耶_EN": 595, - "波洛_EN": 596, - "斯坦利_EN": 597, - "掇星攫辰天君_EN": 598, - "迈蒙_EN": 599, - "博易_EN": 600, - "诗筠_EN": 601, - "毗伽尔_EN": 602, - "慧心_EN": 603, - "芙卡洛斯_EN": 604, - "恶龙_EN": 605, - "小仓澪_EN": 606, - "恕筠_EN": 607, - "知易_EN": 608, - "克列门特_EN": 609, - "大慈树王_EN": 610, - "维多利亚_EN": 611, - "黑田_EN": 612, - "马姆杜_EN": 613, - "科林斯_EN": 614, - "上杉_EN": 615, - "西拉杰_EN": 616, - "宁禄_EN": 
617, - "纯水精灵_EN": 618, - "常九爷_EN": 619, - "阿尔卡米_EN": 620, - "沙扎曼_EN": 621, - "田铁嘴_EN": 622, - "加萨尼_EN": 623, - "克罗索_EN": 624, - "星稀_EN": 625, - "莱斯格_EN": 626, - "阿巴图伊_EN": 627, - "悦_EN": 628, - "德田_EN": 629, - "埃尔欣根_EN": 630, - "阿佩普_EN": 631, - "萨赫哈蒂_EN": 632, - "洛伦佐_EN": 633, - "塔杰·拉德卡尼_EN": 634, - "泽田_EN": 635, - "安西_EN": 636, - "理水叠山真君_EN": 637, + "浮游水蕈兽·元素生命_JP": 285, + "大肉丸_JP": 286, + "托克_JP": 287, + "蒂玛乌斯_JP": 288, + "昆钧_JP": 289, + "欧菲妮_JP": 290, + "塞琉斯_JP": 291, + "仆人_JP": 292, + "迈勒斯_JP": 293, + "希格雯_JP": 294, + "阿守_JP": 295, + "拉赫曼_JP": 296, + "杜拉夫_JP": 297, + "伊利亚斯_JP": 298, + "阿晃_JP": 299, + "旁白_JP": 300, + "爱德琳_JP": 301, + "埃洛伊_JP": 302, + "德沃沙克_JP": 303, + "玛乔丽_JP": 304, + "塞塔蕾_JP": 305, + "柊千里_JP": 306, + "海芭夏_JP": 307, + "九条镰治_JP": 308, + "阿娜耶_JP": 309, + "笼钓瓶一心_JP": 310, + "回声海螺_JP": 311, + "劳维克_JP": 312, + "元太_JP": 313, + "阿扎尔_JP": 314, + "查尔斯_JP": 315, + "阿洛瓦_JP": 316, + "埃勒曼_JP": 317, + "纳比尔_JP": 318, + "莎拉_JP": 319, + "康纳_JP": 320, + "博来_JP": 321, + "玛塞勒_JP": 322, + "阿祇_JP": 323, + "博士_JP": 324, + "迪尔菲_JP": 325, + "玛格丽特_JP": 326, + "宛烟_JP": 327, + "羽生田千鹤_JP": 328, + "海妮耶_JP": 329, + "霍夫曼_JP": 330, + "旅行者_JP": 331, + "佐西摩斯_JP": 332, + "舒伯特_JP": 333, + "鹿野奈奈_JP": 334, + "天叔_JP": 335, + "龙二_JP": 336, + "艾莉丝_JP": 337, + "莺儿_JP": 338, + "嘉良_JP": 339, + "珊瑚_JP": 340, + "言笑_JP": 341, + "一心传名刀_JP": 342, + "费迪南德_JP": 343, + "久利须_JP": 344, + "嘉玛_JP": 345, + "艾文_JP": 346, + "克洛琳德_JP": 347, + "丹吉尔_JP": 348, + "天目十五_JP": 349, + "女士_JP": 350, + "老孟_JP": 351, + "白老先生_JP": 352, + "舍利夫_JP": 353, + "巴达维_JP": 354, + "拉齐_JP": 355, + "长生_JP": 356, + "吴船长_JP": 357, + "艾伯特_JP": 358, + "松浦_JP": 359, + "埃泽_JP": 360, + "阿圆_JP": 361, + "阿拉夫_JP": 362, + "莫塞伊思_JP": 363, + "石头_JP": 364, + "百闻_JP": 365, + "杜吉耶_JP": 366, + "波洛_JP": 367, + "掇星攫辰天君_JP": 368, + "迈蒙_JP": 369, + "博易_JP": 370, + "诗筠_JP": 371, + "斯坦利_JP": 372, + "毗伽尔_JP": 373, + "芙卡洛斯_JP": 374, + "恶龙_JP": 375, + "小仓澪_JP": 376, + "恕筠_JP": 377, + "知易_JP": 378, + "克列门特_JP": 379, + "大慈树王_JP": 380, + "望雅_JP": 381, + 
"黑田_JP": 382, + "卡莉娜_JP": 383, + "马姆杜_JP": 384, + "科林斯_JP": 385, + "上杉_JP": 386, + "西拉杰_JP": 387, + "菲尔戈黛特_JP": 388, + "一平_JP": 389, + "纯水精灵_JP": 390, + "阿尔卡米_JP": 391, + "老戴_JP": 392, + "谢赫祖拜尔_JP": 393, + "沙扎曼_JP": 394, + "田铁嘴_JP": 395, + "小野寺_JP": 396, + "百识_JP": 397, + "克罗索_JP": 398, + "莱斯格_JP": 399, + "芷巧_JP": 400, + "加藤洋平_JP": 401, + "阿巴图伊_JP": 402, + "埃尔欣根_JP": 403, + "斯嘉莉_JP": 404, + "阿佩普_JP": 405, + "巫女_JP": 406, + "卡布斯_JP": 407, + "洛伦佐_JP": 408, + "萨赫哈蒂_JP": 409, + "娜德瓦_JP": 410, + "塞德娜_JP": 411, + "塔杰·拉德卡尼_JP": 412, + "绘星_JP": 413, + "泽田_JP": 414, + "安西_JP": 415, + "拉伊德_JP": 416, + "亚卡巴_JP": 417, + "有乐斋_JP": 418, + "莱昂_JP": 419, + "尤苏波夫_JP": 420, + "夏妮_JP": 421, + "埃舍尔_JP": 422, + "萨齐因_JP": 423, + "古山_JP": 424, + "自称渊上之物_JP": 425, + "丹羽_JP": 426, + "塞萨尔的日记_JP": 427, + "派蒙_EN": 428, + "纳西妲_EN": 429, + "凯亚_EN": 430, + "阿贝多_EN": 431, + "温迪_EN": 432, + "枫原万叶_EN": 433, + "钟离_EN": 434, + "荒泷一斗_EN": 435, + "八重神子_EN": 436, + "艾尔海森_EN": 437, + "提纳里_EN": 438, + "迪希雅_EN": 439, + "卡维_EN": 440, + "宵宫_EN": 441, + "莱依拉_EN": 442, + "那维莱特_EN": 443, + "赛诺_EN": 444, + "莫娜_EN": 445, + "诺艾尔_EN": 446, + "托马_EN": 447, + "凝光_EN": 448, + "林尼_EN": 449, + "北斗_EN": 450, + "柯莱_EN": 451, + "神里绫华_EN": 452, + "可莉_EN": 453, + "芭芭拉_EN": 454, + "雷电将军_EN": 455, + "娜维娅_EN": 456, + "芙宁娜_EN": 457, + "珊瑚宫心海_EN": 458, + "鹿野院平藏_EN": 459, + "迪奥娜_EN": 460, + "五郎_EN": 461, + "琴_EN": 462, + "班尼特_EN": 463, + "达达利亚_EN": 464, + "安柏_EN": 465, + "莱欧斯利_EN": 466, + "夜兰_EN": 467, + "妮露_EN": 468, + "辛焱_EN": 469, + "珐露珊_EN": 470, + "丽莎_EN": 471, + "魈_EN": 472, + "香菱_EN": 473, + "迪卢克_EN": 474, + "砂糖_EN": 475, + "烟绯_EN": 476, + "早柚_EN": 477, + "云堇_EN": 478, + "刻晴_EN": 479, + "重云_EN": 480, + "优菈_EN": 481, + "胡桃_EN": 482, + "流浪者_EN": 483, + "久岐忍_EN": 484, + "神里绫人_EN": 485, + "甘雨_EN": 486, + "戴因斯雷布_EN": 487, + "菲谢尔_EN": 488, + "白术_EN": 489, + "行秋_EN": 490, + "九条裟罗_EN": 491, + "夏洛蒂_EN": 492, + "雷泽_EN": 493, + "申鹤_EN": 494, + "荧_EN": 495, + "空_EN": 496, + "迪娜泽黛_EN": 497, + "凯瑟琳_EN": 498, + "多莉_EN": 499, + "坎蒂丝_EN": 
500, + "琳妮特_EN": 501, + "萍姥姥_EN": 502, + "罗莎莉亚_EN": 503, + "埃德_EN": 504, + "爱贝尔_EN": 505, + "伊迪娅_EN": 506, + "留云借风真君_EN": 507, + "绮良良_EN": 508, + "陌生人_EN": 509, + "七七_EN": 510, + "式大将_EN": 511, + "瑶瑶_EN": 512, + "奥兹_EN": 513, + "菲米尼_EN": 514, + "米卡_EN": 515, + "哲平_EN": 516, + "浮游水蕈兽·元素生命_EN": 517, + "大肉丸_EN": 518, + "托克_EN": 519, + "蒂玛乌斯_EN": 520, + "昆钧_EN": 521, + "欧菲妮_EN": 522, + "塞琉斯_EN": 523, + "仆人_EN": 524, + "迈勒斯_EN": 525, + "希格雯_EN": 526, + "阿守_EN": 527, + "拉赫曼_EN": 528, + "杜拉夫_EN": 529, + "伊利亚斯_EN": 530, + "阿晃_EN": 531, + "旁白_EN": 532, + "爱德琳_EN": 533, + "埃洛伊_EN": 534, + "德沃沙克_EN": 535, + "玛乔丽_EN": 536, + "塞塔蕾_EN": 537, + "柊千里_EN": 538, + "海芭夏_EN": 539, + "九条镰治_EN": 540, + "阿娜耶_EN": 541, + "笼钓瓶一心_EN": 542, + "回声海螺_EN": 543, + "劳维克_EN": 544, + "元太_EN": 545, + "阿扎尔_EN": 546, + "查尔斯_EN": 547, + "阿洛瓦_EN": 548, + "埃勒曼_EN": 549, + "纳比尔_EN": 550, + "莎拉_EN": 551, + "康纳_EN": 552, + "博来_EN": 553, + "玛塞勒_EN": 554, + "阿祇_EN": 555, + "博士_EN": 556, + "迪尔菲_EN": 557, + "宛烟_EN": 558, + "玛格丽特_EN": 559, + "羽生田千鹤_EN": 560, + "海妮耶_EN": 561, + "霍夫曼_EN": 562, + "旅行者_EN": 563, + "佐西摩斯_EN": 564, + "鹿野奈奈_EN": 565, + "舒伯特_EN": 566, + "天叔_EN": 567, + "艾莉丝_EN": 568, + "龙二_EN": 569, + "莺儿_EN": 570, + "嘉良_EN": 571, + "珊瑚_EN": 572, + "费迪南德_EN": 573, + "言笑_EN": 574, + "一心传名刀_EN": 575, + "久利须_EN": 576, + "嘉玛_EN": 577, + "艾文_EN": 578, + "克洛琳德_EN": 579, + "丹吉尔_EN": 580, + "女士_EN": 581, + "天目十五_EN": 582, + "老孟_EN": 583, + "白老先生_EN": 584, + "舍利夫_EN": 585, + "巴达维_EN": 586, + "拉齐_EN": 587, + "长生_EN": 588, + "吴船长_EN": 589, + "艾伯特_EN": 590, + "松浦_EN": 591, + "埃泽_EN": 592, + "阿圆_EN": 593, + "阿拉夫_EN": 594, + "莫塞伊思_EN": 595, + "石头_EN": 596, + "百闻_EN": 597, + "杜吉耶_EN": 598, + "波洛_EN": 599, + "斯坦利_EN": 600, + "掇星攫辰天君_EN": 601, + "迈蒙_EN": 602, + "博易_EN": 603, + "诗筠_EN": 604, + "毗伽尔_EN": 605, + "慧心_EN": 606, + "芙卡洛斯_EN": 607, + "恶龙_EN": 608, + "小仓澪_EN": 609, + "恕筠_EN": 610, + "知易_EN": 611, + "克列门特_EN": 612, + "大慈树王_EN": 613, + "维多利亚_EN": 614, + "黑田_EN": 615, + "马姆杜_EN": 616, + "科林斯_EN": 617, + "上杉_EN": 
618, + "西拉杰_EN": 619, + "宁禄_EN": 620, + "纯水精灵_EN": 621, + "常九爷_EN": 622, + "阿尔卡米_EN": 623, + "沙扎曼_EN": 624, + "田铁嘴_EN": 625, + "加萨尼_EN": 626, + "克罗索_EN": 627, + "星稀_EN": 628, + "莱斯格_EN": 629, + "阿巴图伊_EN": 630, + "埃尔欣根_EN": 631, + "阿佩普_EN": 632, + "萨赫哈蒂_EN": 633, + "洛伦佐_EN": 634, + "塔杰·拉德卡尼_EN": 635, + "泽田_EN": 636, + "安西_EN": 637, "埃舍尔_EN": 638, - "萨齐因_EN": 639, - "古田_EN": 640, - "三月七_ZH": 641, - "丹恒_ZH": 642, - "希儿_ZH": 643, - "娜塔莎_ZH": 644, - "希露瓦_ZH": 645, - "瓦尔特_ZH": 646, - "佩拉_ZH": 647, - "布洛妮娅_ZH": 648, - "虎克_ZH": 649, - "素裳_ZH": 650, - "克拉拉_ZH": 651, - "符玄_ZH": 652, - "白露_ZH": 653, - "杰帕德_ZH": 654, - "景元_ZH": 655, - "藿藿_ZH": 656, - "姬子_ZH": 657, - "穹_ZH": 658, - "星_ZH": 659, - "卡芙卡_ZH": 660, - "桂乃芬_ZH": 661, - "艾丝妲_ZH": 662, - "玲可_ZH": 663, - "彦卿_ZH": 664, - "托帕_ZH": 665, - "驭空_ZH": 666, - "浮烟_ZH": 667, - "停云_ZH": 668, - "镜流_ZH": 669, - "罗刹_ZH": 670, - "卢卡_ZH": 671, - "史瓦罗_ZH": 672, - "黑塔_ZH": 673, - "桑博_ZH": 674, - "伦纳德_ZH": 675, - "明曦_ZH": 676, - "银狼_ZH": 677, - "帕姆_ZH": 678, - "青雀_ZH": 679, - "乔瓦尼_ZH": 680, - "公输师傅_ZH": 681, - "晴霓_ZH": 682, - "螺丝咕姆_ZH": 683, - "阿兰_ZH": 684, - "奥列格_ZH": 685, - "丹枢_ZH": 686, - "尾巴_ZH": 687, - "寒鸦_ZH": 688, - "雪衣_ZH": 689, - "可可利亚_ZH": 690, - "青镞_ZH": 691, - "半夏_ZH": 692, - "银枝_ZH": 693, - "大毫_ZH": 694, - "霄翰_ZH": 695, - "信使_ZH": 696, - "费斯曼_ZH": 697, - "绿芙蓉_ZH": 698, - "dev_成男_ZH": 699, - "金人会长_ZH": 700, - "维利特_ZH": 701, - "维尔德_ZH": 702, - "斯科特_ZH": 703, - "卡波特_ZH": 704, - "刃_ZH": 705, - "岩明_ZH": 706, - "浣溪_ZH": 707, - "三月七_JP": 708, - "丹恒_JP": 709, - "希儿_JP": 710, - "娜塔莎_JP": 711, - "希露瓦_JP": 712, - "瓦尔特_JP": 713, - "佩拉_JP": 714, - "布洛妮娅_JP": 715, - "虎克_JP": 716, - "素裳_JP": 717, - "克拉拉_JP": 718, - "符玄_JP": 719, - "白露_JP": 720, - "杰帕德_JP": 721, - "景元_JP": 722, - "藿藿_JP": 723, - "姬子_JP": 724, - "卡芙卡_JP": 725, - "穹_JP": 726, - "星_JP": 727, - "桂乃芬_JP": 728, - "艾丝妲_JP": 729, - "彦卿_JP": 730, - "玲可_JP": 731, - "托帕_JP": 732, - "驭空_JP": 733, - "浮烟_JP": 734, - "停云_JP": 735, - "镜流_JP": 736, - "罗刹_JP": 737, - "卢卡_JP": 738, - "史瓦罗_JP": 
739, - "黑塔_JP": 740, - "桑博_JP": 741, - "伦纳德_JP": 742, - "明曦_JP": 743, - "银狼_JP": 744, - "帕姆_JP": 745, - "青雀_JP": 746, - "乔瓦尼_JP": 747, - "公输师傅_JP": 748, - "晴霓_JP": 749, - "螺丝咕姆_JP": 750, - "阿兰_JP": 751, - "奥列格_JP": 752, - "丹枢_JP": 753, - "尾巴_JP": 754, - "寒鸦_JP": 755, - "雪衣_JP": 756, - "可可利亚_JP": 757, - "青镞_JP": 758, - "半夏_JP": 759, - "银枝_JP": 760, - "大毫_JP": 761, - "霄翰_JP": 762, - "信使_JP": 763, - "费斯曼_JP": 764, - "绿芙蓉_JP": 765, - "dev_成男_JP": 766, - "金人会长_JP": 767, - "维利特_JP": 768, - "维尔德_JP": 769, - "斯科特_JP": 770, - "刃_JP": 771, - "卡波特_JP": 772, - "岩明_JP": 773, - "浣溪_JP": 774, - "净砚_JP": 775, - "紫月季_JP": 776, - "歌蒂_JP": 777, - "奇怪的云骑_JP": 778, - "幻胧_JP": 779, - "斯薇塔_JP": 780, - "隐书_JP": 781, - "三月七_EN": 782, - "丹恒_EN": 783, - "希儿_EN": 784, - "娜塔莎_EN": 785, - "希露瓦_EN": 786, - "瓦尔特_EN": 787, - "佩拉_EN": 788, - "布洛妮娅_EN": 789, - "虎克_EN": 790, - "素裳_EN": 791, - "克拉拉_EN": 792, - "符玄_EN": 793, - "白露_EN": 794, - "杰帕德_EN": 795, - "景元_EN": 796, - "藿藿_EN": 797, - "姬子_EN": 798, - "卡芙卡_EN": 799, - "穹_EN": 800, - "星_EN": 801, - "桂乃芬_EN": 802, - "艾丝妲_EN": 803, - "彦卿_EN": 804, - "玲可_EN": 805, - "托帕_EN": 806, - "驭空_EN": 807, - "浮烟_EN": 808, - "停云_EN": 809, - "镜流_EN": 810, - "罗刹_EN": 811, - "卢卡_EN": 812, - "史瓦罗_EN": 813, - "黑塔_EN": 814, - "桑博_EN": 815, - "伦纳德_EN": 816, - "明曦_EN": 817, - "银狼_EN": 818, - "帕姆_EN": 819, - "青雀_EN": 820, - "乔瓦尼_EN": 821, - "公输师傅_EN": 822, - "晴霓_EN": 823, - "螺丝咕姆_EN": 824, - "阿兰_EN": 825, - "奥列格_EN": 826, - "丹枢_EN": 827, - "尾巴_EN": 828, - "寒鸦_EN": 829, - "雪衣_EN": 830, - "可可利亚_EN": 831, - "青镞_EN": 832, - "半夏_EN": 833, - "银枝_EN": 834, - "大毫_EN": 835, - "霄翰_EN": 836, - "信使_EN": 837, - "费斯曼_EN": 838, - "绿芙蓉_EN": 839, - "dev_成男_EN": 840, - "金人会长_EN": 841, - "维利特_EN": 842, - "维尔德_EN": 843, - "刃_EN": 844, - "卡波特_EN": 845, - "岩明_EN": 846, - "浣溪_EN": 847, - "紫月季_EN": 848, - "幻胧_EN": 849, - "女声_EN": 850, - "陆景和": 851, - "莫弈": 852, - "左然": 853, - "夏彦": 854 + "三月七_ZH": 639, + "丹恒_ZH": 640, + "希儿_ZH": 641, + "娜塔莎_ZH": 642, + "希露瓦_ZH": 643, + "瓦尔特_ZH": 644, + "佩拉_ZH": 
645, + "布洛妮娅_ZH": 646, + "虎克_ZH": 647, + "素裳_ZH": 648, + "克拉拉_ZH": 649, + "符玄_ZH": 650, + "白露_ZH": 651, + "杰帕德_ZH": 652, + "景元_ZH": 653, + "藿藿_ZH": 654, + "姬子_ZH": 655, + "穹_ZH": 656, + "星_ZH": 657, + "卡芙卡_ZH": 658, + "桂乃芬_ZH": 659, + "艾丝妲_ZH": 660, + "玲可_ZH": 661, + "彦卿_ZH": 662, + "托帕_ZH": 663, + "驭空_ZH": 664, + "浮烟_ZH": 665, + "停云_ZH": 666, + "镜流_ZH": 667, + "罗刹_ZH": 668, + "卢卡_ZH": 669, + "史瓦罗_ZH": 670, + "黑塔_ZH": 671, + "桑博_ZH": 672, + "伦纳德_ZH": 673, + "明曦_ZH": 674, + "银狼_ZH": 675, + "帕姆_ZH": 676, + "青雀_ZH": 677, + "乔瓦尼_ZH": 678, + "公输师傅_ZH": 679, + "晴霓_ZH": 680, + "螺丝咕姆_ZH": 681, + "阿兰_ZH": 682, + "奥列格_ZH": 683, + "丹枢_ZH": 684, + "尾巴_ZH": 685, + "寒鸦_ZH": 686, + "雪衣_ZH": 687, + "可可利亚_ZH": 688, + "青镞_ZH": 689, + "半夏_ZH": 690, + "银枝_ZH": 691, + "大毫_ZH": 692, + "霄翰_ZH": 693, + "信使_ZH": 694, + "费斯曼_ZH": 695, + "绿芙蓉_ZH": 696, + "金人会长_ZH": 697, + "维利特_ZH": 698, + "维尔德_ZH": 699, + "斯科特_ZH": 700, + "卡波特_ZH": 701, + "刃_ZH": 702, + "岩明_ZH": 703, + "浣溪_ZH": 704, + "三月七_JP": 705, + "丹恒_JP": 706, + "希儿_JP": 707, + "娜塔莎_JP": 708, + "希露瓦_JP": 709, + "瓦尔特_JP": 710, + "佩拉_JP": 711, + "布洛妮娅_JP": 712, + "虎克_JP": 713, + "素裳_JP": 714, + "克拉拉_JP": 715, + "符玄_JP": 716, + "白露_JP": 717, + "杰帕德_JP": 718, + "景元_JP": 719, + "藿藿_JP": 720, + "姬子_JP": 721, + "卡芙卡_JP": 722, + "穹_JP": 723, + "星_JP": 724, + "桂乃芬_JP": 725, + "艾丝妲_JP": 726, + "彦卿_JP": 727, + "玲可_JP": 728, + "托帕_JP": 729, + "驭空_JP": 730, + "浮烟_JP": 731, + "停云_JP": 732, + "镜流_JP": 733, + "罗刹_JP": 734, + "卢卡_JP": 735, + "史瓦罗_JP": 736, + "黑塔_JP": 737, + "桑博_JP": 738, + "伦纳德_JP": 739, + "明曦_JP": 740, + "银狼_JP": 741, + "帕姆_JP": 742, + "青雀_JP": 743, + "乔瓦尼_JP": 744, + "公输师傅_JP": 745, + "晴霓_JP": 746, + "螺丝咕姆_JP": 747, + "阿兰_JP": 748, + "奥列格_JP": 749, + "丹枢_JP": 750, + "尾巴_JP": 751, + "寒鸦_JP": 752, + "雪衣_JP": 753, + "可可利亚_JP": 754, + "青镞_JP": 755, + "半夏_JP": 756, + "银枝_JP": 757, + "大毫_JP": 758, + "霄翰_JP": 759, + "信使_JP": 760, + "费斯曼_JP": 761, + "绿芙蓉_JP": 762, + "金人会长_JP": 763, + "维利特_JP": 764, + "维尔德_JP": 765, + "斯科特_JP": 766, + "刃_JP": 
767, + "卡波特_JP": 768, + "岩明_JP": 769, + "浣溪_JP": 770, + "净砚_JP": 771, + "紫月季_JP": 772, + "歌蒂_JP": 773, + "奇怪的云骑_JP": 774, + "幻胧_JP": 775, + "斯薇塔_JP": 776, + "隐书_JP": 777, + "三月七_EN": 778, + "丹恒_EN": 779, + "希儿_EN": 780, + "娜塔莎_EN": 781, + "希露瓦_EN": 782, + "瓦尔特_EN": 783, + "佩拉_EN": 784, + "布洛妮娅_EN": 785, + "虎克_EN": 786, + "素裳_EN": 787, + "克拉拉_EN": 788, + "符玄_EN": 789, + "白露_EN": 790, + "杰帕德_EN": 791, + "景元_EN": 792, + "藿藿_EN": 793, + "姬子_EN": 794, + "卡芙卡_EN": 795, + "穹_EN": 796, + "星_EN": 797, + "桂乃芬_EN": 798, + "艾丝妲_EN": 799, + "彦卿_EN": 800, + "玲可_EN": 801, + "托帕_EN": 802, + "驭空_EN": 803, + "浮烟_EN": 804, + "停云_EN": 805, + "镜流_EN": 806, + "罗刹_EN": 807, + "卢卡_EN": 808, + "史瓦罗_EN": 809, + "黑塔_EN": 810, + "桑博_EN": 811, + "伦纳德_EN": 812, + "明曦_EN": 813, + "银狼_EN": 814, + "帕姆_EN": 815, + "青雀_EN": 816, + "乔瓦尼_EN": 817, + "公输师傅_EN": 818, + "晴霓_EN": 819, + "螺丝咕姆_EN": 820, + "阿兰_EN": 821, + "奥列格_EN": 822, + "丹枢_EN": 823, + "尾巴_EN": 824, + "寒鸦_EN": 825, + "雪衣_EN": 826, + "可可利亚_EN": 827, + "青镞_EN": 828, + "半夏_EN": 829, + "银枝_EN": 830, + "大毫_EN": 831, + "霄翰_EN": 832, + "信使_EN": 833, + "费斯曼_EN": 834, + "绿芙蓉_EN": 835, + "金人会长_EN": 836, + "维利特_EN": 837, + "维尔德_EN": 838, + "刃_EN": 839, + "卡波特_EN": 840, + "岩明_EN": 841, + "浣溪_EN": 842, + "紫月季_EN": 843, + "幻胧_EN": 844, + "女声_EN": 845, + "陆景和": 846, + "莫弈": 847, + "左然": 848, + "夏彦": 849 } }, "model": { @@ -947,7 +944,14 @@ ], "n_layers_q": 3, "use_spectral_norm": false, - "gin_channels": 256 + "gin_channels": 256, + "slm": { + "model": "./slm/wavlm-base-plus", + "sr": 16000, + "hidden": 768, + "nlayers": 13, + "initial_channel": 64 + } }, "version": "2.2" } diff --git a/data_utils.py b/data_utils.py index ef89656ff..ab9e6f25e 100644 --- a/data_utils.py +++ b/data_utils.py @@ -44,10 +44,6 @@ def __init__(self, audiopaths_sid_text, hparams): self.min_text_len = getattr(hparams, "min_text_len", 1) self.max_text_len = getattr(hparams, "max_text_len", 384) - self.empty_emo = torch.squeeze( - torch.load("empty_emo.npy", map_location="cpu"), 
dim=1 - ) - random.seed(1234) random.shuffle(self.audiopaths_sid_text) self._filter() @@ -98,14 +94,7 @@ def get_audio_text_speaker_pair(self, audiopath_sid_text): spec, wav = self.get_audio(audiopath) sid = torch.LongTensor([int(self.spk_map[sid])]) - if np.random.rand() > 0.1: - emo = torch.squeeze( - torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"), - dim=1, - ) - else: - emo = self.empty_emo - return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo) + return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert) def get_audio(self, filename): audio, sampling_rate = load_wav_to_torch(filename) @@ -168,15 +157,15 @@ def get_text(self, text, word2ph, phone, tone, language_str, wav_path): if language_str == "ZH": bert = bert_ori - ja_bert = torch.rand(1024, len(phone)) - en_bert = torch.rand(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "JP": - bert = torch.rand(1024, len(phone)) + bert = torch.randn(1024, len(phone)) ja_bert = bert_ori - en_bert = torch.rand(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "EN": - bert = torch.rand(1024, len(phone)) - ja_bert = torch.rand(1024, len(phone)) + bert = torch.randn(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) en_bert = bert_ori phone = torch.LongTensor(phone) tone = torch.LongTensor(tone) @@ -226,7 +215,6 @@ def __call__(self, batch): bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len) ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len) en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len) - emo = torch.FloatTensor(len(batch), 512) spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) @@ -238,7 +226,6 @@ def __call__(self, batch): bert_padded.zero_() ja_bert_padded.zero_() en_bert_padded.zero_() - emo.zero_() for i in 
range(len(ids_sorted_decreasing)): row = batch[ids_sorted_decreasing[i]] @@ -272,8 +259,6 @@ def __call__(self, batch): en_bert = row[8] en_bert_padded[i, :, : en_bert.size(1)] = en_bert - emo[i, :] = row[9] - return ( text_padded, text_lengths, @@ -287,7 +272,6 @@ def __call__(self, batch): bert_padded, ja_bert_padded, en_bert_padded, - emo, ) diff --git a/empty_emo.npy b/empty_emo.npy deleted file mode 100644 index 6865293ad..000000000 Binary files a/empty_emo.npy and /dev/null differ diff --git a/for_deploy/infer.py b/for_deploy/infer.py new file mode 100644 index 000000000..516810969 --- /dev/null +++ b/for_deploy/infer.py @@ -0,0 +1,386 @@ +""" +版本管理、兼容推理及模型加载实现。 +版本说明: + 1. 版本号与github的release版本号对应,使用哪个release版本训练的模型即对应其版本号 + 2. 请在模型的config.json中显示声明版本号,添加一个字段"version" : "你的版本号" +特殊版本说明: + 1.1.1-fix: 1.1.1版本训练的模型,但是在推理时使用dev的日语修复 + 2.2:当前版本 +""" +import torch +import commons +from text import cleaned_text_to_sequence, get_bert +from clap_wrapper import get_clap_audio_feature, get_clap_text_feature +from text.cleaner import clean_text +import utils +import numpy as np + +from models import SynthesizerTrn +from text.symbols import symbols + +from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn +from oldVersion.V210.text import symbols as V210symbols +from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn +from oldVersion.V200.text import symbols as V200symbols +from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn +from oldVersion.V111.text import symbols as V111symbols +from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn +from oldVersion.V110.text import symbols as V110symbols +from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn +from oldVersion.V101.text import symbols as V101symbols + +from oldVersion import V111, V110, V101, V200, V210 + +# 当前版本信息 +latest_version = "2.2" + +# 版本兼容 +SynthesizerTrnMap = { + "2.1": V210SynthesizerTrn, + "2.0.2-fix": V200SynthesizerTrn, 
+    "2.0.1": V200SynthesizerTrn,
+    "2.0": V200SynthesizerTrn,
+    "1.1.1-fix": V111SynthesizerTrn,
+    "1.1.1": V111SynthesizerTrn,
+    "1.1": V110SynthesizerTrn,
+    "1.1.0": V110SynthesizerTrn,
+    "1.0.1": V101SynthesizerTrn,
+    "1.0": V101SynthesizerTrn,
+    "1.0.0": V101SynthesizerTrn,
+}
+
+symbolsMap = {
+    "2.1": V210symbols,
+    "2.0.2-fix": V200symbols,
+    "2.0.1": V200symbols,
+    "2.0": V200symbols,
+    "1.1.1-fix": V111symbols,
+    "1.1.1": V111symbols,
+    "1.1": V110symbols,
+    "1.1.0": V110symbols,
+    "1.0.1": V101symbols,
+    "1.0": V101symbols,
+    "1.0.0": V101symbols,
+}
+
+
+# def get_emo_(reference_audio, emotion, sid):
+#     emo = (
+#         torch.from_numpy(get_emo(reference_audio))
+#         if reference_audio and emotion == -1
+#         else torch.FloatTensor(
+#             np.load(f"emo_clustering/{sid}/cluster_center_{emotion}.npy")
+#         )
+#     )
+#     return emo
+
+
+def get_net_g(model_path: str, version: str, device: str, hps):
+    """Build the synthesizer for ``version`` and load ``model_path`` into it.
+
+    Any version other than ``latest_version`` is looked up in
+    ``SynthesizerTrnMap`` and ``symbolsMap``. The model is moved to ``device``
+    and switched to eval mode before the checkpoint is loaded.
+    """
+    if version != latest_version:
+        net_g = SynthesizerTrnMap[version](
+            len(symbolsMap[version]),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model,
+        ).to(device)
+    else:
+        # Current-version model
+        net_g = SynthesizerTrn(
+            len(symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model,
+        ).to(device)
+    _ = net_g.eval()
+    _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
+    return net_g
+
+
+def get_text(text, language_str, bert, hps, device):
+    """Turn ``text`` into id tensors and per-language BERT features.
+
+    ``bert`` maps a language code to an object with ``get_bert_feature``.
+    Only the feature for ``language_str`` is computed; the two other
+    languages receive ``torch.randn(1024, len(phone))`` placeholders.
+
+    Returns ``(bert, ja_bert, en_bert, phone, tone, language)``.
+    ``language_str`` must be "ZH", "JP" or "EN".
+    """
+    # get_text for the current version
+    norm_text, phone, tone, word2ph = clean_text(text, language_str)
+    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
+
+    if hps.data.add_blank:
+        phone = commons.intersperse(phone, 0)
+        tone = commons.intersperse(tone, 0)
+        language = commons.intersperse(language, 0)
+        for i in range(len(word2ph)):
+            word2ph[i] = word2ph[i] * 2
+        word2ph[0] += 1
+    # bert_ori = get_bert(norm_text, word2ph, language_str, device)
+    bert_ori = bert[language_str].get_bert_feature(norm_text, word2ph, device)
+    del word2ph
+    assert bert_ori.shape[-1] == len(phone), phone
+
+    if language_str == "ZH":
+        bert = bert_ori
+        ja_bert = torch.randn(1024, len(phone))
+        en_bert = torch.randn(1024, len(phone))
+    elif language_str == "JP":
+        bert = torch.randn(1024, len(phone))
+        ja_bert = bert_ori
+        en_bert = torch.randn(1024, len(phone))
+    elif language_str == "EN":
+        bert = torch.randn(1024, len(phone))
+        ja_bert = torch.randn(1024, len(phone))
+        en_bert = bert_ori
+    else:
+        raise ValueError("language_str should be ZH, JP or EN")
+
+    assert bert.shape[-1] == len(
+        phone
+    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
+
+    phone = torch.LongTensor(phone)
+    tone = torch.LongTensor(tone)
+    language = torch.LongTensor(language)
+    return bert, ja_bert, en_bert, phone, tone, language
+
+
+def infer(
+    text,
+    emotion,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    sid,
+    language,
+    hps,
+    net_g,
+    device,
+    bert=None,
+    clap=None,
+    reference_audio=None,
+    skip_start=False,
+    skip_end=False,
+):
+    """Synthesize one utterance and return the waveform as a NumPy array.
+
+    When ``hps.version`` names a legacy version listed in one of the maps
+    below, the call is forwarded to that version's ``infer``. Otherwise the
+    current implementation runs; it needs ``bert`` (language code ->
+    feature extractor, see ``get_text``) and ``clap`` (object providing
+    ``get_clap_audio_feature`` / ``get_clap_text_feature``).
+
+    The emotion embedding comes from ``reference_audio`` when it is a
+    ``numpy.ndarray`` and from the ``emotion`` prompt otherwise.
+    ``skip_start`` drops the first 3 positions of every sequence and
+    ``skip_end`` the last 2.
+    """
+    # Argument order changed in version 2.2
+    # 2.1 added the emotion, reference_audio, skip_start and skip_end arguments
+    inferMap_V3 = {
+        "2.1": V210.infer,
+    }
+    # Versions supporting Chinese, Japanese and English
+    inferMap_V2 = {
+        "2.0.2-fix": V200.infer,
+        "2.0.1": V200.infer,
+        "2.0": V200.infer,
+        "1.1.1-fix": V111.infer_fix,
+        "1.1.1": V111.infer,
+        "1.1": V110.infer,
+        "1.1.0": V110.infer,
+    }
+    # Chinese-only versions
+    # In testing, models of the two versions were not found to be incompatible
+    inferMap_V1 = {
+        "1.0.1": V101.infer,
+        "1.0": V101.infer,
+        "1.0.0": V101.infer,
+    }
+    version = hps.version if hasattr(hps, "version") else latest_version
+    # Not the current version: pick the matching infer by version number
+    if version != latest_version:
+        if version in inferMap_V3:
+            return inferMap_V3[version](
+                text,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                sid,
+                language,
+                hps,
+                net_g,
+                device,
+                reference_audio,
+                emotion,
+                skip_start,
+                skip_end,
+            )
+        if version in inferMap_V2:
+            return inferMap_V2[version](
+                text,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                sid,
+                language,
+                hps,
+                net_g,
+                device,
+            )
+        if version in inferMap_V1:
+            return inferMap_V1[version](
+                text,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                sid,
+                hps,
+                net_g,
+                device,
+            )
+    # Inference for the current version
+    # emo = get_emo_(reference_audio, emotion, sid)
+    if isinstance(reference_audio, np.ndarray):
+        emo = clap.get_clap_audio_feature(reference_audio, device)
+    else:
+        emo = clap.get_clap_text_feature(emotion, device)
+    emo = torch.squeeze(emo, dim=1)
+
+    bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+        text, language, bert, hps, device
+    )
+    if skip_start:
+        phones = phones[3:]
+        tones = tones[3:]
+        lang_ids = lang_ids[3:]
+        bert = bert[:, 3:]
+        ja_bert = ja_bert[:, 3:]
+        en_bert = en_bert[:, 3:]
+    if skip_end:
+        phones = phones[:-2]
+        tones = tones[:-2]
+        lang_ids = lang_ids[:-2]
+        bert = bert[:, :-2]
+        ja_bert = ja_bert[:, :-2]
+        en_bert = en_bert[:, :-2]
+    with torch.no_grad():
+        x_tst = phones.to(device).unsqueeze(0)
+        tones = tones.to(device).unsqueeze(0)
+        lang_ids = lang_ids.to(device).unsqueeze(0)
+        bert = bert.to(device).unsqueeze(0)
+        ja_bert = ja_bert.to(device).unsqueeze(0)
+        en_bert = en_bert.to(device).unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+        emo = emo.to(device).unsqueeze(0)
+        del phones
+        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+        audio = (
+            net_g.infer(
+                x_tst,
+                x_tst_lengths,
+                speakers,
+                tones,
+                lang_ids,
+                bert,
+                ja_bert,
+                en_bert,
+                emo,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+            )[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )
+        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio
+
+
+def infer_multilang(
+    text,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    sid,
+    language,
+    hps,
+    net_g,
+    device,
+    bert=None,
+    clap=None,
+    reference_audio=None,
+    emotion=None,
+    skip_start=False,
+    skip_end=False,
+):
+    """Synthesize one utterance made of several (text, language) segments.
+
+    ``text`` and ``language`` are parallel sequences. Each segment goes
+    through ``get_text``; the results are concatenated along the time axis
+    and synthesized in a single ``net_g.infer`` call. ``bert`` and ``clap``
+    have the same meaning as in ``infer``.
+
+    Interior segment boundaries are always trimmed (first 3 / last 2
+    positions). ``skip_start`` and ``skip_end`` only decide whether the
+    very first start and the very last end are trimmed as well.
+    """
+    # Accumulators get their own names: ``bert`` has to remain the feature
+    # map handed to get_text until every segment has been processed.
+    bert_parts, ja_bert_parts, en_bert_parts = [], [], []
+    phone_parts, tone_parts, lang_id_parts = [], [], []
+    # emo = get_emo_(reference_audio, emotion, sid)
+    if isinstance(reference_audio, np.ndarray):
+        emo = clap.get_clap_audio_feature(reference_audio, device)
+    else:
+        emo = clap.get_clap_text_feature(emotion, device)
+    emo = torch.squeeze(emo, dim=1)
+    last_idx = len(text) - 1
+    for idx, (txt, lang) in enumerate(zip(text, language)):
+        # Per-segment flags; the caller's skip_start / skip_end must not be
+        # overwritten, or the final segment would always lose its tail.
+        _skip_start = (idx != 0) or skip_start
+        _skip_end = (idx != last_idx) or skip_end
+        (
+            temp_bert,
+            temp_ja_bert,
+            temp_en_bert,
+            temp_phones,
+            temp_tones,
+            temp_lang_ids,
+        ) = get_text(txt, lang, bert, hps, device)
+        if _skip_start:
+            temp_bert = temp_bert[:, 3:]
+            temp_ja_bert = temp_ja_bert[:, 3:]
+            temp_en_bert = temp_en_bert[:, 3:]
+            temp_phones = temp_phones[3:]
+            temp_tones = temp_tones[3:]
+            temp_lang_ids = temp_lang_ids[3:]
+        if _skip_end:
+            temp_bert = temp_bert[:, :-2]
+            temp_ja_bert = temp_ja_bert[:, :-2]
+            temp_en_bert = temp_en_bert[:, :-2]
+            temp_phones = temp_phones[:-2]
+            temp_tones = temp_tones[:-2]
+            temp_lang_ids = temp_lang_ids[:-2]
+        bert_parts.append(temp_bert)
+        ja_bert_parts.append(temp_ja_bert)
+        en_bert_parts.append(temp_en_bert)
+        phone_parts.append(temp_phones)
+        tone_parts.append(temp_tones)
+        lang_id_parts.append(temp_lang_ids)
+    bert = torch.concatenate(bert_parts, dim=1)
+    ja_bert = torch.concatenate(ja_bert_parts, dim=1)
+    en_bert = torch.concatenate(en_bert_parts, dim=1)
+    phones = torch.concatenate(phone_parts, dim=0)
+    tones = torch.concatenate(tone_parts, dim=0)
+    lang_ids = torch.concatenate(lang_id_parts, dim=0)
+    with torch.no_grad():
+        x_tst = phones.to(device).unsqueeze(0)
+        tones = tones.to(device).unsqueeze(0)
+        lang_ids = lang_ids.to(device).unsqueeze(0)
+        bert = bert.to(device).unsqueeze(0)
+        ja_bert = ja_bert.to(device).unsqueeze(0)
+        en_bert = en_bert.to(device).unsqueeze(0)
+        emo = emo.to(device).unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+        del phones
+        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+        audio = (
+            net_g.infer(
+                x_tst,
+                x_tst_lengths,
+                speakers,
+                tones,
+                lang_ids,
+                bert,
+                ja_bert,
+                en_bert,
+                emo,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+            )[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )
+        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio
diff --git a/for_deploy/infer_utils.py b/for_deploy/infer_utils.py
new file mode 100644
index 000000000..dd00e4c9c
--- /dev/null
+++ b/for_deploy/infer_utils.py
@@ -0,0 +1,138 @@
+import sys
+
+import torch
+from transformers import (
+    AutoModelForMaskedLM,
+    AutoTokenizer,
+    DebertaV2Model,
+    DebertaV2Tokenizer,
+    ClapModel,
+    ClapProcessor,
+)
+
+from config import config
+from text.japanese import text2sep_kata
+
+
+class BertFeature:
+    """Tokenizer and BERT model for one language, loaded at construction."""
+
+    def __init__(self, model_path, language="ZH"):
+        self.model_path = model_path
+        self.language = language
+        self.tokenizer = None
+        self.model = None
+        self.device = None
+
+        self._prepare()
+
+    def _get_device(self, device=config.bert_gen_config.device):
+        """Resolve the device: "cpu" -> "mps" on darwin with MPS, empty -> "cuda"."""
+        if (
+            sys.platform == "darwin"
+            and torch.backends.mps.is_available()
+            and device == "cpu"
+        ):
+            device = "mps"
+        if not device:
+            device = "cuda"
+        return device
+
+    def _prepare(self):
+        """Load the tokenizer and model onto the resolved device."""
+        self.device = self._get_device()
+
+        if self.language == "EN":
+            self.tokenizer = DebertaV2Tokenizer.from_pretrained(self.model_path)
+            self.model = DebertaV2Model.from_pretrained(self.model_path).to(self.device)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+            self.model = AutoModelForMaskedLM.from_pretrained(self.model_path).to(
+                self.device
+            )
+        self.model.eval()
+
+    def get_bert_feature(self, text, word2ph, device=None):
+        """Return phone-level features for ``text``, one column per phone.
+
+        The third-from-last hidden state of token ``i`` is repeated
+        ``word2ph[i]`` times, giving ``sum(word2ph)`` columns.
+
+        ``device`` exists because callers pass it positionally; it is
+        ignored, as inputs must sit on ``self.device`` with the model.
+        """
+        if self.language == "JP":
+            text = "".join(text2sep_kata(text)[0])
+        with torch.no_grad():
+            inputs = self.tokenizer(text, return_tensors="pt")
+            for i in inputs:
+                inputs[i] = inputs[i].to(self.device)
+            res = self.model(**inputs, output_hidden_states=True)
+            res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+
+        word2phone = word2ph
+        phone_level_feature = []
+        for i in range(len(word2phone)):
+            repeat_feature = res[i].repeat(word2phone[i], 1)
+            phone_level_feature.append(repeat_feature)
+
+        phone_level_feature = torch.cat(phone_level_feature, dim=0)
+
+        return phone_level_feature.T
+
+
+class ClapFeature:
+    """CLAP processor and model, loaded at construction."""
+
+    def __init__(self, model_path):
+        self.model_path = model_path
+        self.processor = None
+        self.model = None
+        self.device = None
+
+        self._prepare()
+
+    def _get_device(self, device=config.bert_gen_config.device):
+        """Resolve the device: "cpu" -> "mps" on darwin with MPS, empty -> "cuda"."""
+        if (
+            sys.platform == "darwin"
+            and torch.backends.mps.is_available()
+            and device == "cpu"
+        ):
+            device = "mps"
+        if not device:
+            device = "cuda"
+        return device
+
+    def _prepare(self):
+        """Load the processor and model onto the resolved device."""
+        self.device = self._get_device()
+
+        self.processor = ClapProcessor.from_pretrained(self.model_path)
+        self.model = ClapModel.from_pretrained(self.model_path).to(self.device)
+        self.model.eval()
+
+    def get_clap_audio_feature(self, audio_data, device=None):
+        """Return the transposed CLAP audio embedding of ``audio_data``.
+
+        The audio is handed to the processor with ``sampling_rate=48000``.
+        ``device`` exists because callers pass it positionally; it is
+        ignored, as inputs must sit on ``self.device`` with the model.
+        """
+        with torch.no_grad():
+            inputs = self.processor(
+                audios=audio_data, return_tensors="pt", sampling_rate=48000
+            ).to(self.device)
+            emb = self.model.get_audio_features(**inputs)
+        return emb.T
+
+    def get_clap_text_feature(self, text, device=None):
+        """Return the transposed CLAP text embedding of ``text``.
+
+        ``device`` exists because callers pass it positionally; it is
+        ignored, as inputs must sit on ``self.device`` with the model.
+        """
+        with torch.no_grad():
+            inputs = self.processor(text=text, return_tensors="pt").to(self.device)
+            emb = self.model.get_text_features(**inputs)
+        return emb.T
diff --git a/for_deploy/webui.py b/for_deploy/webui.py
new file mode 100644
index 000000000..f813e513e
--- /dev/null
+++ b/for_deploy/webui.py
@@ -0,0 +1,556 @@
+# flake8: noqa: E402
+import os
+import logging
+import re_matching
+from tools.sentence import split_by_language
+
+logging.getLogger("numba").setLevel(logging.WARNING)
+logging.getLogger("markdown_it").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("matplotlib").setLevel(logging.WARNING)
+
+logging.basicConfig(
+    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
+)
+
+logger = logging.getLogger(__name__)
+
+import torch
+import utils
+from infer import infer, latest_version, get_net_g, infer_multilang
+import gradio as gr
+import webbrowser
+import numpy as np
+from config import config
+from tools.translate import translate
+import librosa
+from infer_utils import BertFeature, ClapFeature
+
+
+net_g = None
+
+device = config.webui_config.device
+if device == "mps":
+    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["MKL_NUM_THREADS"] = "1"
+
+# One feature extractor per language, loaded once at import time and handed
+# to every infer / infer_multilang call through ``bert=``.
+bert_feature_map = {
+    "ZH": BertFeature(
+        "./bert/chinese-roberta-wwm-ext-large",
+        language="ZH",
+    ),
+    "JP": BertFeature(
+        "./bert/deberta-v2-large-japanese-char-wwm",
+        language="JP",
+    ),
+    "EN": BertFeature(
+        "./bert/deberta-v3-large",
+        language="EN",
+    ),
+}
+
+# Shared CLAP model, handed to every infer / infer_multilang call via ``clap=``.
+clap_feature = ClapFeature("./emotional/clap-htsat-fused")
+
+
+def generate_audio(
+    slices,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    speaker,
+    language,
+    reference_audio,
+    emotion,
+    skip_start=False,
+    skip_end=False,
+):
+    """Synthesize every piece of ``slices`` with ``infer``.
+
+    Returns one 16-bit clip per piece. Reads the module-level ``hps``,
+    ``net_g`` and ``device``.
+    """
+    audio_list = []
+    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
+    with torch.no_grad():
+        for idx, piece in enumerate(slices):
+            skip_start = (idx != 0) and skip_start
+            skip_end = (idx != len(slices) - 1) and skip_end
+            audio = infer(
+                piece,
+                reference_audio=reference_audio,
+                emotion=emotion,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+                sid=speaker,
+                language=language,
+                hps=hps,
+                net_g=net_g,
+                device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
+                bert=bert_feature_map,
+                clap=clap_feature,
+            )
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
+            audio_list.append(audio16bit)
+            # audio_list.append(silence)  # append the silence to the list
+    return audio_list
+
+
+def generate_audio_multilang(
+    slices,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    speaker,
+    language,
+    reference_audio,
+    emotion,
+    skip_start=False,
+    skip_end=False,
+):
+    """Synthesize every piece of ``slices`` with ``infer_multilang``.
+
+    ``slices[idx]`` and ``language[idx]`` are the parallel text and language
+    lists of one piece. Returns one 16-bit clip per piece. Reads the
+    module-level ``hps``, ``net_g`` and ``device``.
+    """
+    audio_list = []
+    # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
+    with torch.no_grad():
+        for idx, piece in enumerate(slices):
+            skip_start = (idx != 0) and skip_start
+            skip_end = (idx != len(slices) - 1) and skip_end
+            audio = infer_multilang(
+                piece,
+                reference_audio=reference_audio,
+                emotion=emotion,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+                sid=speaker,
+                language=language[idx],
+                hps=hps,
+                net_g=net_g,
+                device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
+                # infer_multilang defaults both to None and dereferences
+                # them, so they have to be supplied here.
+                bert=bert_feature_map,
+                clap=clap_feature,
+            )
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
+            audio_list.append(audio16bit)
+            # audio_list.append(silence)  # append the silence to the list
+    return audio_list
+
+
+def tts_split(
+    text: str,
+    speaker,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    language,
+    cut_by_sent,
+    interval_between_para,
+    interval_between_sent,
+    reference_audio,
+    emotion,
+):
+    """Synthesize ``text`` paragraph by paragraph, optionally per sentence.
+
+    Silence of ``interval_between_para`` seconds follows each paragraph and,
+    when ``cut_by_sent`` is set, ``interval_between_sent`` seconds follow
+    each sentence. Returns ``(status, (44100, samples))``, or
+    ``("invalid", None)`` for the "mix" language.
+
+    NOTE(review): the sample rate is hard-coded to 44100 here, whereas
+    tts_fn reports hps.data.sampling_rate -- confirm they always agree.
+    """
+    if language == "mix":
+        return ("invalid", None)
+    while text.find("\n\n") != -1:
+        text = text.replace("\n\n", "\n")
+    para_list = re_matching.cut_para(text)
+    audio_list = []
+    if not cut_by_sent:
+        for idx, p in enumerate(para_list):
+            skip_start = idx != 0
+            skip_end = idx != len(para_list) - 1
+            audio = infer(
+                p,
+                reference_audio=reference_audio,
+                emotion=emotion,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+                sid=speaker,
+                language=language,
+                hps=hps,
+                net_g=net_g,
+                device=device,
+                skip_start=skip_start,
+                skip_end=skip_end,
+                bert=bert_feature_map,
+                clap=clap_feature,
+            )
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
+            audio_list.append(audio16bit)
+            silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16)
+            audio_list.append(silence)
+    else:
+        for idx, p in enumerate(para_list):
+            skip_start = idx != 0
+            skip_end = idx != len(para_list) - 1
+            audio_list_sent = []
+            sent_list = re_matching.cut_sent(p)
+            for idx, s in enumerate(sent_list):
+                skip_start = (idx != 0) and skip_start
+                skip_end = (idx != len(sent_list) - 1) and skip_end
+                audio = infer(
+                    s,
+                    reference_audio=reference_audio,
+                    emotion=emotion,
+                    sdp_ratio=sdp_ratio,
+                    noise_scale=noise_scale,
+                    noise_scale_w=noise_scale_w,
+                    length_scale=length_scale,
+                    sid=speaker,
+                    language=language,
+                    hps=hps,
+                    net_g=net_g,
+                    device=device,
+                    skip_start=skip_start,
+                    skip_end=skip_end,
+                    bert=bert_feature_map,
+                    clap=clap_feature,
+                )
+                audio_list_sent.append(audio)
+                silence = np.zeros((int)(44100 * interval_between_sent))
+                audio_list_sent.append(silence)
+            if (interval_between_para - interval_between_sent) > 0:
+                silence = np.zeros(
+                    (int)(44100 * (interval_between_para - interval_between_sent))
+                )
+                audio_list_sent.append(silence)
+            audio16bit = gr.processing_utils.convert_to_16_bit_wav(
+                np.concatenate(audio_list_sent)
+            )  # normalize the volume over the whole group of sentences
+            audio_list.append(audio16bit)
+    audio_concat = np.concatenate(audio_list)
+    return ("Success", (44100, audio_concat))
+
+
+def tts_fn(
+    text: str,
+    speaker,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    language,
+    reference_audio,
+    emotion,
+    prompt_mode,
+):
+    if prompt_mode == "Audio prompt":
+        if reference_audio == None:
+            return ("Invalid audio prompt", None)
+        else:
+            reference_audio = load_audio(reference_audio)[1]
+    else:
+        reference_audio = None
+    audio_list = []
+    if language == "mix":
+        bool_valid, str_valid = re_matching.validate_text(text)
+        if not bool_valid:
+            return str_valid, (
+                hps.data.sampling_rate,
+                np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
+            )
+        result = []
+        for slice in re_matching.text_matching(text):
+            _speaker = slice.pop()
+            temp_contant = []
+            temp_lang = []
+            for lang, content in slice:
+                if "|" in content:
+                    temp = []
+                    temp_ = []
+                    for i in
content.split("|"): + if i != "": + temp.append([i]) + temp_.append([lang]) + else: + temp.append([]) + temp_.append([]) + temp_contant += temp + temp_lang += temp_ + else: + if len(temp_contant) == 0: + temp_contant.append([]) + temp_lang.append([]) + temp_contant[-1].append(content) + temp_lang[-1].append(lang) + for i, j in zip(temp_lang, temp_contant): + result.append([*zip(i, j), _speaker]) + for i, one in enumerate(result): + skip_start = i != 0 + skip_end = i != len(result) - 1 + _speaker = one.pop() + idx = 0 + while idx < len(one): + text_to_generate = [] + lang_to_generate = [] + while True: + lang, content = one[idx] + temp_text = [content] + if len(text_to_generate) > 0: + text_to_generate[-1] += [temp_text.pop(0)] + lang_to_generate[-1] += [lang] + if len(temp_text) > 0: + text_to_generate += [[i] for i in temp_text] + lang_to_generate += [[lang]] * len(temp_text) + if idx + 1 < len(one): + idx += 1 + else: + break + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(one) - 1) and skip_end + print(text_to_generate, lang_to_generate) + audio_list.extend( + generate_audio_multilang( + text_to_generate, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + _speaker, + lang_to_generate, + reference_audio, + emotion, + skip_start, + skip_end, + ) + ) + idx += 1 + elif language.lower() == "auto": + for idx, slice in enumerate(text.split("|")): + if slice == "": + continue + skip_start = idx != 0 + skip_end = idx != len(text.split("|")) - 1 + sentences_list = split_by_language( + slice, target_languages=["zh", "ja", "en"] + ) + idx = 0 + while idx < len(sentences_list): + text_to_generate = [] + lang_to_generate = [] + while True: + content, lang = sentences_list[idx] + temp_text = [content] + lang = lang.upper() + if lang == "JA": + lang = "JP" + if len(text_to_generate) > 0: + text_to_generate[-1] += [temp_text.pop(0)] + lang_to_generate[-1] += [lang] + if len(temp_text) > 0: + text_to_generate += [[i] for i in temp_text] + 
lang_to_generate += [[lang]] * len(temp_text) + if idx + 1 < len(sentences_list): + idx += 1 + else: + break + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(sentences_list) - 1) and skip_end + print(text_to_generate, lang_to_generate) + audio_list.extend( + generate_audio_multilang( + text_to_generate, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + lang_to_generate, + reference_audio, + emotion, + skip_start, + skip_end, + ) + ) + idx += 1 + else: + audio_list.extend( + generate_audio( + text.split("|"), + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + language, + reference_audio, + emotion, + ) + ) + + audio_concat = np.concatenate(audio_list) + return "Success", (hps.data.sampling_rate, audio_concat) + + +def load_audio(path): + audio, sr = librosa.load(path, 48000) + # audio = librosa.resample(audio, 44100, 48000) + return sr, audio + + +def gr_util(item): + if item == "Text prompt": + return {"visible": True, "__type__": "update"}, { + "visible": False, + "__type__": "update", + } + else: + return {"visible": False, "__type__": "update"}, { + "visible": True, + "__type__": "update", + } + + +if __name__ == "__main__": + if config.webui_config.debug: + logger.info("Enable DEBUG-LEVEL log") + logging.basicConfig(level=logging.DEBUG) + hps = utils.get_hparams_from_file(config.webui_config.config_path) + # 若config.json中未指定版本则默认为最新版本 + version = hps.version if hasattr(hps, "version") else latest_version + net_g = get_net_g( + model_path=config.webui_config.model, version=version, device=device, hps=hps + ) + speaker_ids = hps.data.spk2id + speakers = list(speaker_ids.keys()) + languages = ["ZH", "JP", "EN", "mix", "auto"] + with gr.Blocks() as app: + with gr.Row(): + with gr.Column(): + text = gr.TextArea( + label="输入文本内容", + placeholder=""" + 如果你选择语言为\'mix\',必须按照格式输入,否则报错: + 格式举例(zh是中文,jp是日语,不区分大小写;说话人举例:gongzi): + [说话人1]你好,こんにちは! こんにちは,世界。 + [说话人2]你好吗?元気ですか? + [说话人3]谢谢。どういたしまして。 + ... 
+ 另外,所有的语言选项都可以用'|'分割长段实现分句生成。 + """, + ) + trans = gr.Button("中翻日", variant="primary") + slicer = gr.Button("快速切分", variant="primary") + speaker = gr.Dropdown( + choices=speakers, value=speakers[0], label="Speaker" + ) + _ = gr.Markdown( + value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n" + ) + prompt_mode = gr.Radio( + ["Text prompt", "Audio prompt"], + label="Prompt Mode", + value="Text prompt", + ) + text_prompt = gr.Textbox( + label="Text prompt", + placeholder="用文字描述生成风格。如:Happy", + value="Happy", + visible=True, + ) + audio_prompt = gr.Audio( + label="Audio prompt", type="filepath", visible=False + ) + sdp_ratio = gr.Slider( + minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio" + ) + noise_scale = gr.Slider( + minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise" + ) + noise_scale_w = gr.Slider( + minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W" + ) + length_scale = gr.Slider( + minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length" + ) + language = gr.Dropdown( + choices=languages, value=languages[0], label="Language" + ) + btn = gr.Button("生成音频!", variant="primary") + with gr.Column(): + with gr.Row(): + with gr.Column(): + interval_between_sent = gr.Slider( + minimum=0, + maximum=5, + value=0.2, + step=0.1, + label="句间停顿(秒),勾选按句切分才生效", + ) + interval_between_para = gr.Slider( + minimum=0, + maximum=10, + value=1, + step=0.1, + label="段间停顿(秒),需要大于句间停顿才有效", + ) + opt_cut_by_sent = gr.Checkbox( + label="按句切分 在按段落切分的基础上再按句子切分文本" + ) + slicer = gr.Button("切分生成", variant="primary") + text_output = gr.Textbox(label="状态信息") + audio_output = gr.Audio(label="输出音频") + # explain_image = gr.Image( + # label="参数解释信息", + # show_label=True, + # show_share_button=False, + # show_download_button=False, + # value=os.path.abspath("./img/参数说明.png"), + # ) + btn.click( + tts_fn, + inputs=[ + text, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + audio_prompt, + text_prompt, + prompt_mode, + ], + 
outputs=[text_output, audio_output], + ) + + trans.click( + translate, + inputs=[text], + outputs=[text], + ) + slicer.click( + tts_split, + inputs=[ + text, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + opt_cut_by_sent, + interval_between_para, + interval_between_sent, + audio_prompt, + text_prompt, + ], + outputs=[text_output, audio_output], + ) + + prompt_mode.change( + lambda x: gr_util(x), + inputs=[prompt_mode], + outputs=[text_prompt, audio_prompt], + ) + + audio_prompt.upload( + lambda x: load_audio(x), + inputs=[audio_prompt], + outputs=[audio_prompt], + ) + + print("推理页面已开启!") + webbrowser.open(f"http://127.0.0.1:{config.webui_config.port}") + app.launch(share=config.webui_config.share, server_port=config.webui_config.port) diff --git a/infer.py b/infer.py index 7262f1891..0d8fffd9e 100644 --- a/infer.py +++ b/infer.py @@ -10,7 +10,8 @@ import torch import commons from text import cleaned_text_to_sequence, get_bert -from clap_wrapper import get_clap_audio_feature, get_clap_text_feature + +# from clap_wrapper import get_clap_audio_feature, get_clap_text_feature from text.cleaner import clean_text import utils import numpy as np @@ -98,7 +99,8 @@ def get_net_g(model_path: str, version: str, device: str, hps): return net_g -def get_text(text, language_str, hps, device): +def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7): + style_text = None if style_text == "" else style_text # 在此处实现当前版本的get_text norm_text, phone, tone, word2ph = clean_text(text, language_str) phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) @@ -110,21 +112,23 @@ def get_text(text, language_str, hps, device): for i in range(len(word2ph)): word2ph[i] = word2ph[i] * 2 word2ph[0] += 1 - bert_ori = get_bert(norm_text, word2ph, language_str, device) + bert_ori = get_bert( + norm_text, word2ph, language_str, device, style_text, style_weight + ) del word2ph assert bert_ori.shape[-1] == len(phone), 
phone if language_str == "ZH": bert = bert_ori - ja_bert = torch.rand(1024, len(phone)) - en_bert = torch.rand(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "JP": - bert = torch.rand(1024, len(phone)) + bert = torch.randn(1024, len(phone)) ja_bert = bert_ori - en_bert = torch.rand(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "EN": - bert = torch.rand(1024, len(phone)) - ja_bert = torch.rand(1024, len(phone)) + bert = torch.randn(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) en_bert = bert_ori else: raise ValueError("language_str should be ZH, JP or EN") @@ -154,6 +158,8 @@ def infer( reference_audio=None, skip_start=False, skip_end=False, + style_text=None, + style_weight=0.7, ): # 2.2版本参数位置变了 # 2.1 参数新增 emotion reference_audio skip_start skip_end @@ -181,6 +187,7 @@ def infer( # 非当前版本,根据版本号选择合适的infer if version != latest_version: if version in inferMap_V3.keys(): + emotion = 0 return inferMap_V3[version]( text, sdp_ratio, @@ -196,6 +203,8 @@ def infer( emotion, skip_start, skip_end, + style_text, + style_weight, ) if version in inferMap_V2.keys(): return inferMap_V2[version]( @@ -224,14 +233,19 @@ def infer( ) # 在此处实现当前版本的推理 # emo = get_emo_(reference_audio, emotion, sid) - if isinstance(reference_audio, np.ndarray): - emo = get_clap_audio_feature(reference_audio, device) - else: - emo = get_clap_text_feature(emotion, device) - emo = torch.squeeze(emo, dim=1) + # if isinstance(reference_audio, np.ndarray): + # emo = get_clap_audio_feature(reference_audio, device) + # else: + # emo = get_clap_text_feature(emotion, device) + # emo = torch.squeeze(emo, dim=1) bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( - text, language, hps, device + text, + language, + hps, + device, + style_text=style_text, + style_weight=style_weight, ) if skip_start: phones = phones[3:] @@ -255,7 +269,7 @@ def infer( ja_bert = 
ja_bert.to(device).unsqueeze(0) en_bert = en_bert.to(device).unsqueeze(0) x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) - emo = emo.to(device).unsqueeze(0) + # emo = emo.to(device).unsqueeze(0) del phones speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) audio = ( @@ -268,7 +282,6 @@ def infer( bert, ja_bert, en_bert, - emo, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, @@ -278,7 +291,16 @@ def infer( .float() .numpy() ) - del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo + del ( + x_tst, + tones, + lang_ids, + bert, + x_tst_lengths, + speakers, + ja_bert, + en_bert, + ) # , emo if torch.cuda.is_available(): torch.cuda.empty_cache() return audio @@ -302,14 +324,14 @@ def infer_multilang( ): bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], [] # emo = get_emo_(reference_audio, emotion, sid) - if isinstance(reference_audio, np.ndarray): - emo = get_clap_audio_feature(reference_audio, device) - else: - emo = get_clap_text_feature(emotion, device) - emo = torch.squeeze(emo, dim=1) + # if isinstance(reference_audio, np.ndarray): + # emo = get_clap_audio_feature(reference_audio, device) + # else: + # emo = get_clap_text_feature(emotion, device) + # emo = torch.squeeze(emo, dim=1) for idx, (txt, lang) in enumerate(zip(text, language)): - skip_start = (idx != 0) or (skip_start and idx == 0) - skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1) + _skip_start = (idx != 0) or (skip_start and idx == 0) + _skip_end = (idx != len(language) - 1) or skip_end ( temp_bert, temp_ja_bert, @@ -318,14 +340,14 @@ def infer_multilang( temp_tones, temp_lang_ids, ) = get_text(txt, lang, hps, device) - if skip_start: + if _skip_start: temp_bert = temp_bert[:, 3:] temp_ja_bert = temp_ja_bert[:, 3:] temp_en_bert = temp_en_bert[:, 3:] temp_phones = temp_phones[3:] temp_tones = temp_tones[3:] temp_lang_ids = temp_lang_ids[3:] - if skip_end: + if _skip_end: temp_bert = 
temp_bert[:, :-2] temp_ja_bert = temp_ja_bert[:, :-2] temp_en_bert = temp_en_bert[:, :-2] @@ -351,7 +373,7 @@ def infer_multilang( bert = bert.to(device).unsqueeze(0) ja_bert = ja_bert.to(device).unsqueeze(0) en_bert = en_bert.to(device).unsqueeze(0) - emo = emo.to(device).unsqueeze(0) + # emo = emo.to(device).unsqueeze(0) x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) del phones speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) @@ -365,7 +387,6 @@ def infer_multilang( bert, ja_bert, en_bert, - emo, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, @@ -375,7 +396,16 @@ def infer_multilang( .float() .numpy() ) - del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo + del ( + x_tst, + tones, + lang_ids, + bert, + x_tst_lengths, + speakers, + ja_bert, + en_bert, + ) # , emo if torch.cuda.is_available(): torch.cuda.empty_cache() return audio diff --git a/losses.py b/losses.py index b1b263e4c..53357aa92 100644 --- a/losses.py +++ b/losses.py @@ -1,4 +1,6 @@ import torch +import torchaudio +from transformers import AutoModel def feature_loss(fmap_r, fmap_g): @@ -56,3 +58,93 @@ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): kl = torch.sum(kl * z_mask) l = kl / torch.sum(z_mask) return l + + +class WavLMLoss(torch.nn.Module): + def __init__(self, model, wd, model_sr, slm_sr=16000): + super(WavLMLoss, self).__init__() + self.wavlm = AutoModel.from_pretrained(model) + self.wd = wd + self.resample = torchaudio.transforms.Resample(model_sr, slm_sr) + + def forward(self, wav, y_rec): + with torch.no_grad(): + wav_16 = self.resample(wav) + wav_embeddings = self.wavlm( + input_values=wav_16, output_hidden_states=True + ).hidden_states + y_rec_16 = self.resample(y_rec) + y_rec_embeddings = self.wavlm( + input_values=y_rec_16.squeeze(), output_hidden_states=True + ).hidden_states + + floss = 0 + for er, eg in zip(wav_embeddings, y_rec_embeddings): + floss += torch.mean(torch.abs(er - eg)) + + return 
floss.mean() + + def generator(self, y_rec): + y_rec_16 = self.resample(y_rec) + y_rec_embeddings = self.wavlm( + input_values=y_rec_16, output_hidden_states=True + ).hidden_states + y_rec_embeddings = ( + torch.stack(y_rec_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + y_df_hat_g = self.wd(y_rec_embeddings) + loss_gen = torch.mean((1 - y_df_hat_g) ** 2) + + return loss_gen + + def discriminator(self, wav, y_rec): + with torch.no_grad(): + wav_16 = self.resample(wav) + wav_embeddings = self.wavlm( + input_values=wav_16, output_hidden_states=True + ).hidden_states + y_rec_16 = self.resample(y_rec) + y_rec_embeddings = self.wavlm( + input_values=y_rec_16, output_hidden_states=True + ).hidden_states + + y_embeddings = ( + torch.stack(wav_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + y_rec_embeddings = ( + torch.stack(y_rec_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + + y_d_rs = self.wd(y_embeddings) + y_d_gs = self.wd(y_rec_embeddings) + + y_df_hat_r, y_df_hat_g = y_d_rs, y_d_gs + + r_loss = torch.mean((1 - y_df_hat_r) ** 2) + g_loss = torch.mean((y_df_hat_g) ** 2) + + loss_disc_f = r_loss + g_loss + + return loss_disc_f.mean() + + def discriminator_forward(self, wav): + with torch.no_grad(): + wav_16 = self.resample(wav) + wav_embeddings = self.wavlm( + input_values=wav_16, output_hidden_states=True + ).hidden_states + y_embeddings = ( + torch.stack(wav_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + + y_d_rs = self.wd(y_embeddings) + + return y_d_rs diff --git a/models.py b/models.py index 6257d8e21..97dcdce32 100644 --- a/models.py +++ b/models.py @@ -40,33 +40,22 @@ def __init__( self.norm_2 = modules.LayerNorm(filter_channels) self.dur_proj = nn.Conv1d(1, filter_channels, 1) - self.pre_out_conv_1 = nn.Conv1d( - 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + self.LSTM = nn.LSTM( + 2 * 
filter_channels, filter_channels, batch_first=True, bidirectional=True ) - self.pre_out_norm_1 = modules.LayerNorm(filter_channels) - self.pre_out_conv_2 = nn.Conv1d( - filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) - self.pre_out_norm_2 = modules.LayerNorm(filter_channels) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, in_channels, 1) - self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + self.output_layer = nn.Sequential( + nn.Linear(2 * filter_channels, 1), nn.Sigmoid() + ) - def forward_probability(self, x, x_mask, dur, g=None): + def forward_probability(self, x, dur): dur = self.dur_proj(dur) x = torch.cat([x, dur], dim=1) - x = self.pre_out_conv_1(x * x_mask) - x = torch.relu(x) - x = self.pre_out_norm_1(x) - x = self.drop(x) - x = self.pre_out_conv_2(x * x_mask) - x = torch.relu(x) - x = self.pre_out_norm_2(x) - x = self.drop(x) - x = x * x_mask x = x.transpose(1, 2) + x, _ = self.LSTM(x) output_prob = self.output_layer(x) return output_prob @@ -86,7 +75,7 @@ def forward(self, x, x_mask, dur_r, dur_hat, g=None): output_probs = [] for dur in [dur_r, dur_hat]: - output_prob = self.forward_probability(x, x_mask, dur, g) + output_prob = self.forward_probability(x, dur) output_probs.append(output_prob) return output_probs @@ -354,7 +343,6 @@ def __init__( n_layers, kernel_size, p_dropout, - n_speakers, gin_channels=0, ): super().__init__() @@ -376,31 +364,6 @@ def __init__( self.bert_proj = nn.Conv1d(1024, hidden_channels, 1) self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1) self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1) - # self.emo_proj = nn.Linear(512, hidden_channels) - self.in_feature_net = nn.Sequential( - # input is assumed to an already normalized embedding - nn.Linear(512, 1028, bias=False), - nn.GELU(), - nn.LayerNorm(1028), - *[Block(1028, 512) for _ in range(1)], - nn.Linear(1028, 512, bias=False), - # normalize before passing to VQ? 
- # nn.GELU(), - # nn.LayerNorm(512), - ) - self.emo_vq = VectorQuantize( - dim=512, - codebook_size=64, - codebook_dim=32, - commitment_weight=0.1, - decay=0.85, - heads=32, - kmeans_iters=20, - separate_codebook_per_head=True, - stochastic_sample_codes=True, - threshold_ema_dead_code=2, - ) - self.out_feature_net = nn.Linear(512, hidden_channels) self.encoder = attentions.Encoder( hidden_channels, @@ -413,18 +376,10 @@ def __init__( ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward( - self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=None - ): - sid = sid.cpu() + def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None): bert_emb = self.bert_proj(bert).transpose(1, 2) ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2) en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2) - emo_emb = self.in_feature_net(emo) - emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1)) - loss_commit = loss_commit.mean() - emo_emb = self.out_feature_net(emo_emb) - # emo_emb = self.emo_proj(emo.unsqueeze(1)) x = ( self.emb(x) + self.tone_emb(tone) @@ -432,7 +387,6 @@ def forward( + bert_emb + ja_bert_emb + en_bert_emb - + emo_emb ) * math.sqrt( self.hidden_channels ) # [b, t, h] @@ -445,7 +399,7 @@ def forward( stats = self.proj(x) * x_mask m, logs = torch.split(stats, self.out_channels, dim=1) - return x, m, logs, x_mask, loss_commit + return x, m, logs, x_mask class ResidualCouplingBlock(nn.Module): @@ -748,6 +702,55 @@ def forward(self, y, y_hat): return y_d_rs, y_d_gs, fmap_rs, fmap_gs +class WavLMDiscriminator(nn.Module): + """docstring for Discriminator.""" + + def __init__( + self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False + ): + super(WavLMDiscriminator, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.pre = norm_f( + Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0) + ) + + self.convs = nn.ModuleList( 
+ [ + norm_f( + nn.Conv1d( + initial_channel, initial_channel * 2, kernel_size=5, padding=2 + ) + ), + norm_f( + nn.Conv1d( + initial_channel * 2, + initial_channel * 4, + kernel_size=5, + padding=2, + ) + ), + norm_f( + nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2) + ), + ] + ) + + self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1)) + + def forward(self, x): + x = self.pre(x) + + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + x = torch.flatten(x, 1, -1) + + return x + + class ReferenceEncoder(nn.Module): """ inputs --- [N, Ty/r, n_mels*r] mels @@ -878,7 +881,6 @@ def __init__( n_layers, kernel_size, p_dropout, - self.n_speakers, gin_channels=self.enc_gin_channels, ) self.dec = Generator( @@ -946,14 +948,13 @@ def forward( bert, ja_bert, en_bert, - emo=None, ): if self.n_speakers > 0: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] else: g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) - x, m_p, logs_p, x_mask, loss_commit = self.enc_p( - x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g ) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) @@ -996,9 +997,11 @@ def forward( logw_ = torch.log(w + 1e-6) * x_mask logw = self.dp(x, x_mask, g=g) + logw_sdp = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=1.0) l_length_dp = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum( x_mask ) # for averaging + l_length_sdp += torch.sum((logw_sdp - logw_) ** 2, [1, 2]) / torch.sum(x_mask) l_length = l_length_dp + l_length_sdp @@ -1018,9 +1021,8 @@ def forward( x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), - (x, logw, logw_), + (x, logw, logw_, logw_sdp), g, - loss_commit, ) def infer( @@ -1033,7 +1035,6 @@ def infer( bert, ja_bert, en_bert, - emo=None, noise_scale=0.667, length_scale=1, 
noise_scale_w=0.8, @@ -1047,8 +1048,8 @@ def infer( g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] else: g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) - x, m_p, logs_p, x_mask, _ = self.enc_p( - x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g ) logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * ( sdp_ratio diff --git a/oldVersion/V210/__init__.py b/oldVersion/V210/__init__.py index eb1aea283..89cfab3e2 100644 --- a/oldVersion/V210/__init__.py +++ b/oldVersion/V210/__init__.py @@ -7,7 +7,7 @@ from .text.cleaner import clean_text -def get_text(text, language_str, hps, device): +def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7): # 在此处实现当前版本的get_text norm_text, phone, tone, word2ph = clean_text(text, language_str) phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) @@ -19,7 +19,9 @@ def get_text(text, language_str, hps, device): for i in range(len(word2ph)): word2ph[i] = word2ph[i] * 2 word2ph[0] += 1 - bert_ori = get_bert(norm_text, word2ph, language_str, device) + bert_ori = get_bert( + norm_text, word2ph, language_str, device, style_text, style_weight + ) del word2ph assert bert_ori.shape[-1] == len(phone), phone @@ -74,9 +76,11 @@ def infer( emotion=None, skip_start=False, skip_end=False, + style_text=None, + style_weight=0.7, ): bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( - text, language, hps, device + text, language, hps, device, style_text, style_weight ) emo = get_emo_(reference_audio, emotion) if skip_start: diff --git a/oldVersion/V210/text/__init__.py b/oldVersion/V210/text/__init__.py index e7a61585f..e29856f98 100644 --- a/oldVersion/V210/text/__init__.py +++ b/oldVersion/V210/text/__init__.py @@ -18,13 +18,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language): return phones, tones, lang_ids -def get_bert(norm_text, word2ph, 
language, device): +def get_bert(norm_text, word2ph, language, device, style_text, style_weight): from .chinese_bert import get_bert_feature as zh_bert from .english_bert_mock import get_bert_feature as en_bert from .japanese_bert import get_bert_feature as jp_bert lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} - bert = lang_bert_func_map[language](norm_text, word2ph, device) + bert = lang_bert_func_map[language]( + norm_text, word2ph, device, style_text, style_weight + ) return bert diff --git a/oldVersion/V210/text/chinese_bert.py b/oldVersion/V210/text/chinese_bert.py index 36f1e2a09..1b60bb4fc 100644 --- a/oldVersion/V210/text/chinese_bert.py +++ b/oldVersion/V210/text/chinese_bert.py @@ -12,7 +12,13 @@ models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -29,12 +35,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) 
phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/oldVersion/V210/text/english_bert_mock.py b/oldVersion/V210/text/english_bert_mock.py index 85b241c40..2f3c9af3d 100644 --- a/oldVersion/V210/text/english_bert_mock.py +++ b/oldVersion/V210/text/english_bert_mock.py @@ -13,7 +13,13 @@ models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -30,11 +36,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/oldVersion/V210/text/japanese_bert.py b/oldVersion/V210/text/japanese_bert.py index 7dbe28423..ae4bfb8d2 100644 --- a/oldVersion/V210/text/japanese_bert.py +++ b/oldVersion/V210/text/japanese_bert.py @@ -13,8 +13,16 @@ models = dict() -def 
get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): text = "".join(text2sep_kata(text)[0]) + if style_text: + style_text = "".join(text2sep_kata(style_text)[0]) if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -31,12 +39,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/re_matching.py b/re_matching.py index 3c5340c99..dd464a5cc 100644 --- a/re_matching.py +++ b/re_matching.py @@ -44,7 +44,6 @@ def text_matching(text: str) -> list: result = [] for speaker, dialogue in matches: result.append(extract_language_and_text_updated(speaker, dialogue)) - print(result) return result diff --git a/resample_legacy.py b/resample_legacy.py new file mode 100644 index 000000000..a0d617558 --- /dev/null +++ b/resample_legacy.py @@ -0,0 +1,71 @@ +import os +import argparse +import librosa +from multiprocessing import Pool, cpu_count 
+ +import soundfile +from tqdm import tqdm + +from config import config + + +def process(item): + wav_name, args = item + wav_path = os.path.join(args.in_dir, wav_name) + if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"): + wav, sr = librosa.load(wav_path, sr=args.sr) + soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--sr", + type=int, + default=config.resample_config.sampling_rate, + help="sampling rate", + ) + parser.add_argument( + "--in_dir", + type=str, + default=config.resample_config.in_dir, + help="path to source dir", + ) + parser.add_argument( + "--out_dir", + type=str, + default=config.resample_config.out_dir, + help="path to target dir", + ) + parser.add_argument( + "--processes", + type=int, + default=0, + help="cpu_processes", + ) + args, _ = parser.parse_known_args() + # autodl 无卡模式会识别出46个cpu + if args.processes == 0: + processes = cpu_count() - 2 if cpu_count() > 4 else 1 + else: + processes = args.processes + pool = Pool(processes=processes) + + tasks = [] + + for dirpath, _, filenames in os.walk(args.in_dir): + if not os.path.isdir(args.out_dir): + os.makedirs(args.out_dir, exist_ok=True) + for filename in filenames: + if filename.lower().endswith(".wav"): + tasks.append((filename, args)) + + for _ in tqdm( + pool.imap_unordered(process, tasks), + ): + pass + + pool.close() + pool.join() + + print("音频重采样完毕!") diff --git a/server_fastapi.py b/server_fastapi.py index cbc1170bd..e0f0d8b61 100644 --- a/server_fastapi.py +++ b/server_fastapi.py @@ -204,6 +204,8 @@ async def _voice( auto_split: bool, emotion: Optional[Union[int, str]] = None, reference_audio=None, + style_text: Optional[str] = None, + style_weight: float = 0.7, ) -> Union[Response, Dict[str, any]]: """TTS实现函数""" # 检查模型是否存在 @@ -261,6 +263,8 @@ async def _voice( device=loaded_models.models[model_id].device, emotion=emotion, reference_audio=ref_audio, + 
style_text=style_text, + style_weight=style_weight, ) audio = gradio.processing_utils.convert_to_16_bit_wav(audio) else: @@ -282,6 +286,8 @@ async def _voice( device=loaded_models.models[model_id].device, emotion=emotion, reference_audio=ref_audio, + style_text=style_text, + style_weight=style_weight, ) ) audios.append(np.zeros(int(44100 * 0.2))) @@ -312,6 +318,8 @@ async def voice( auto_split: bool = Query(False, description="自动切分"), emotion: Optional[Union[int, str]] = Query(None, description="emo"), reference_audio: UploadFile = File(None), + style_text: Optional[str] = Form(None, description="风格文本"), + style_weight: float = Query(0.7, description="风格权重"), ): """语音接口,若需要上传参考音频请仅使用post请求""" logger.info( @@ -331,6 +339,8 @@ async def voice( auto_split=auto_split, emotion=emotion, reference_audio=reference_audio, + style_text=style_text, + style_weight=style_weight, ) @app.get("/voice") @@ -350,6 +360,8 @@ async def voice( auto_translate: bool = Query(False, description="自动翻译"), auto_split: bool = Query(False, description="自动切分"), emotion: Optional[Union[int, str]] = Query(None, description="emo"), + style_text: Optional[str] = Query(None, description="风格文本"), + style_weight: float = Query(0.7, description="风格权重"), ): """语音接口""" logger.info( @@ -368,6 +380,8 @@ async def voice( auto_translate=auto_translate, auto_split=auto_split, emotion=emotion, + style_text=style_text, + style_weight=style_weight, ) @app.get("/models/info") diff --git a/slm/wavlm-base-plus/.gitattributes b/slm/wavlm-base-plus/.gitattributes new file mode 100644 index 000000000..6d34772f5 --- /dev/null +++ b/slm/wavlm-base-plus/.gitattributes @@ -0,0 +1,27 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs 
-text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/slm/wavlm-base-plus/README.md b/slm/wavlm-base-plus/README.md new file mode 100644 index 000000000..119267160 --- /dev/null +++ b/slm/wavlm-base-plus/README.md @@ -0,0 +1,65 @@ +--- +language: +- en +datasets: +tags: +- speech +inference: false +--- + +# WavLM-Base-Plus + +[Microsoft's WavLM](https://github.com/microsoft/unilm/tree/master/wavlm) + +The base model pretrained on 16kHz sampled speech audio. When using the model, make sure that your speech input is also sampled at 16kHz. + +**Note**: This model does not have a tokenizer as it was pretrained on audio alone. In order to use this model **speech recognition**, a tokenizer should be created and the model should be fine-tuned on labeled text data. Check out [this blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) for more in-detail explanation of how to fine-tune the model. 
+ +The model was pre-trained on: + +- 60,000 hours of [Libri-Light](https://arxiv.org/abs/1912.07875) +- 10,000 hours of [GigaSpeech](https://arxiv.org/abs/2106.06909) +- 24,000 hours of [VoxPopuli](https://arxiv.org/abs/2101.00390) + +[Paper: WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) + +Authors: Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei + +**Abstract** +*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* + +The original model can be found under https://github.com/microsoft/unilm/tree/master/wavlm. 
+ +# Usage + +This is an English pre-trained speech model that has to be fine-tuned on a downstream task like speech recognition or audio classification before it can be +used in inference. The model was pre-trained in English and should therefore perform well only in English. The model has been shown to work well on the [SUPERB benchmark](https://superbbenchmark.org/). + +**Note**: The model was pre-trained on phonemes rather than characters. This means that one should make sure that the input text is converted to a sequence +of phonemes before fine-tuning. + +## Speech Recognition + +To fine-tune the model for speech recognition, see [the official speech recognition example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition). + +## Speech Classification + +To fine-tune the model for speech classification, see [the official audio classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/audio-classification). + +## Speaker Verification + +TODO + +## Speaker Diarization + +TODO + +# Contribution + +The model was contributed by [cywang](https://huggingface.co/cywang) and [patrickvonplaten](https://huggingface.co/patrickvonplaten). 
+ +# License + +The official license can be found [here](https://github.com/microsoft/UniSpeech/blob/main/LICENSE) + +![design](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/wavlm.png) \ No newline at end of file diff --git a/slm/wavlm-base-plus/config.json b/slm/wavlm-base-plus/config.json new file mode 100644 index 000000000..b7b4e5f6c --- /dev/null +++ b/slm/wavlm-base-plus/config.json @@ -0,0 +1,99 @@ +{ + "_name_or_path": "wavlm-base-plus", + "activation_dropout": 0.0, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "WavLMModel" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wavlm", + "no_mask_channel_overlap": false, + 
"no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_buckets": 320, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_ctc_classes": 80, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "replace_prob": 0.5, + "torch_dtype": "float32", + "transformers_version": "4.13.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "tokenizer_class": "Wav2Vec2CTCTokenizer" +} diff --git a/slm/wavlm-base-plus/preprocessor_config.json b/slm/wavlm-base-plus/preprocessor_config.json new file mode 100644 index 000000000..10f6def8c --- /dev/null +++ b/slm/wavlm-base-plus/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/text/__init__.py b/text/__init__.py index 816ad01d9..98276d2a8 100644 --- a/text/__init__.py +++ b/text/__init__.py @@ -18,13 +18,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language): return phones, tones, lang_ids -def get_bert(norm_text, word2ph, language, device): +def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7): from .chinese_bert import get_bert_feature as zh_bert from .english_bert_mock import get_bert_feature as en_bert from .japanese_bert import get_bert_feature as jp_bert lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} - bert = lang_bert_func_map[language](norm_text, word2ph, device) + bert = lang_bert_func_map[language]( + norm_text, word2ph, device, style_text, style_weight + ) return bert diff --git a/text/chinese_bert.py b/text/chinese_bert.py index 36f1e2a09..cfa7f6032 100644 --- a/text/chinese_bert.py +++ 
b/text/chinese_bert.py @@ -12,7 +12,13 @@ models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -29,12 +35,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() - + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/text/english.py b/text/english.py index 0443829d6..4a2af9523 100644 --- a/text/english.py +++ b/text/english.py @@ -5,6 +5,7 @@ from transformers import DebertaV2Tokenizer from text import symbols +from text.symbols import punctuation current_file_path = os.path.dirname(__file__) CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") @@ -217,6 +218,8 @@ def refine_ph(phn): if re.search(r"\d$", phn): tone = int(phn[-1]) + 1 phn = phn[:-1] + else: + tone = 3 return phn.lower(), tone @@ -389,45 +392,84 @@ def sep_text(text): return words +def text_to_words(text): + tokens = 
tokenizer.tokenize(text) + words = [] + for idx, t in enumerate(tokens): + if t.startswith("▁"): + words.append([t[1:]]) + else: + if t in punctuation: + if idx == len(tokens) - 1: + words.append([f"{t}"]) + else: + if ( + not tokens[idx + 1].startswith("▁") + and tokens[idx + 1] not in punctuation + ): + if idx == 0: + words.append([]) + words[-1].append(f"{t}") + else: + words.append([f"{t}"]) + else: + if idx == 0: + words.append([]) + words[-1].append(f"{t}") + return words + + def g2p(text): phones = [] tones = [] - # word2ph = [] - words = sep_text(text) - tokens = [tokenizer.tokenize(i) for i in words] + phone_len = [] + # words = sep_text(text) + # tokens = [tokenizer.tokenize(i) for i in words] + words = text_to_words(text) + for word in words: - if word.upper() in eng_dict: - phns, tns = refine_syllables(eng_dict[word.upper()]) - phones.append([post_replace_ph(i) for i in phns]) - tones.append(tns) - # word2ph.append(len(phns)) - else: - phone_list = list(filter(lambda p: p != " ", _g2p(word))) - phns = [] - tns = [] - for ph in phone_list: - if ph in arpa: - ph, tn = refine_ph(ph) - phns.append(ph) - tns.append(tn) - else: - phns.append(ph) - tns.append(0) - phones.append([post_replace_ph(i) for i in phns]) - tones.append(tns) - # word2ph.append(len(phns)) - # phones = [post_replace_ph(i) for i in phones] + temp_phones, temp_tones = [], [] + if len(word) > 1: + if "'" in word: + word = ["".join(word)] + for w in word: + if w in punctuation: + temp_phones.append(w) + temp_tones.append(0) + continue + if w.upper() in eng_dict: + phns, tns = refine_syllables(eng_dict[w.upper()]) + temp_phones += [post_replace_ph(i) for i in phns] + temp_tones += tns + # w2ph.append(len(phns)) + else: + phone_list = list(filter(lambda p: p != " ", _g2p(w))) + phns = [] + tns = [] + for ph in phone_list: + if ph in arpa: + ph, tn = refine_ph(ph) + phns.append(ph) + tns.append(tn) + else: + phns.append(ph) + tns.append(0) + temp_phones += [post_replace_ph(i) for i in phns] + 
temp_tones += tns + phones += temp_phones + tones += temp_tones + phone_len.append(len(temp_phones)) + # phones = [post_replace_ph(i) for i in phones] word2ph = [] - for token, phoneme in zip(tokens, phones): - phone_len = len(phoneme) + for token, pl in zip(words, phone_len): word_len = len(token) - aaa = distribute_phone(phone_len, word_len) + aaa = distribute_phone(pl, word_len) word2ph += aaa - phones = ["_"] + [j for i in phones for j in i] + ["_"] - tones = [0] + [j for i in tones for j in i] + [0] + phones = ["_"] + phones + ["_"] + tones = [0] + tones + [0] word2ph = [1] + word2ph + [1] assert len(phones) == len(tones), text assert len(phones) == sum(word2ph), text diff --git a/text/english_bert_mock.py b/text/english_bert_mock.py index 85b241c40..2f3c9af3d 100644 --- a/text/english_bert_mock.py +++ b/text/english_bert_mock.py @@ -13,7 +13,13 @@ models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -30,11 +36,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) 
* (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/text/japanese_bert.py b/text/japanese_bert.py index d47be80f0..c69f41923 100644 --- a/text/japanese_bert.py +++ b/text/japanese_bert.py @@ -13,8 +13,16 @@ models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): text = "".join(text2sep_kata(text)[0]) + if style_text: + style_text = "".join(text2sep_kata(style_text)[0]) if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -31,12 +39,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/text/tone_sandhi.py b/text/tone_sandhi.py index 6a6e4c3e6..372308604 100644 --- a/text/tone_sandhi.py +++ 
b/text/tone_sandhi.py @@ -634,9 +634,11 @@ def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] # output seg: [['听一听', 'v']] def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] + new_seg = [] * len(seg) # function 1 - for i, (word, pos) in enumerate(seg): + i = 0 + while i < len(seg): + word, pos = seg[i] if ( i - 1 >= 0 and word == "一" @@ -645,6 +647,7 @@ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: and seg[i - 1][1] == "v" ): new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] + i += 2 else: if ( i - 2 >= 0 @@ -655,7 +658,8 @@ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: continue else: new_seg.append([word, pos]) - seg = new_seg + i += 1 + seg = [i for i in new_seg if len(i) > 0] new_seg = [] # function 2 for i, (word, pos) in enumerate(seg): diff --git a/train_ms.py b/train_ms.py index d02b42dec..fe795c0a4 100644 --- a/train_ms.py +++ b/train_ms.py @@ -27,8 +27,15 @@ SynthesizerTrn, MultiPeriodDiscriminator, DurationDiscriminator, + WavLMDiscriminator, +) +from losses import ( + generator_loss, + discriminator_loss, + feature_loss, + kl_loss, + WavLMLoss, ) -from losses import generator_loss, discriminator_loss, feature_loss, kl_loss from mel_processing import mel_spectrogram_torch, spec_to_mel_torch from text.symbols import symbols @@ -42,7 +49,6 @@ torch.backends.cuda.enable_mem_efficient_sdp( True ) # Not available if torch version is lower than 2.0 -torch.backends.cuda.enable_math_sdp(True) global_step = 0 @@ -173,6 +179,8 @@ def run(): 0.1, gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, ).cuda(local_rank) + else: + net_dur_disc = None if ( "use_spk_conditioned_encoder" in hps.model.keys() and hps.model.use_spk_conditioned_encoder is True @@ -210,6 +218,9 @@ def run(): param.requires_grad = False net_d = 
MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank) + net_wd = WavLMDiscriminator( + hps.model.slm.hidden, hps.model.slm.nlayers, hps.model.slm.initial_channel + ).cuda(local_rank) optim_g = torch.optim.AdamW( filter(lambda p: p.requires_grad, net_g.parameters()), hps.train.learning_rate, @@ -222,6 +233,12 @@ def run(): betas=hps.train.betas, eps=hps.train.eps, ) + optim_wd = torch.optim.AdamW( + net_wd.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) if net_dur_disc is not None: optim_dur_disc = torch.optim.AdamW( net_dur_disc.parameters(), @@ -233,12 +250,11 @@ def run(): optim_dur_disc = None net_g = DDP(net_g, device_ids=[local_rank], bucket_cap_mb=512) net_d = DDP(net_d, device_ids=[local_rank], bucket_cap_mb=512) - dur_resume_lr = None + net_wd = DDP(net_wd, device_ids=[local_rank], bucket_cap_mb=512) if net_dur_disc is not None: net_dur_disc = DDP( net_dur_disc, device_ids=[local_rank], - find_unused_parameters=True, bucket_cap_mb=512, ) @@ -250,9 +266,10 @@ def run(): token=config.openi_token, mirror=config.mirror, ) - - try: - if net_dur_disc is not None: + dur_resume_lr = hps.train.learning_rate + wd_resume_lr = hps.train.learning_rate + if net_dur_disc is not None: + try: _, _, dur_resume_lr, epoch_str = utils.load_checkpoint( utils.latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), net_dur_disc, @@ -261,28 +278,32 @@ def run(): if "skip_optimizer" in hps.train else True, ) - _, optim_g, g_resume_lr, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), - net_g, - optim_g, - skip_optimizer=hps.train.skip_optimizer - if "skip_optimizer" in hps.train - else True, - ) - _, optim_d, d_resume_lr, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), - net_d, - optim_d, - skip_optimizer=hps.train.skip_optimizer - if "skip_optimizer" in hps.train - else True, - ) - if not optim_g.param_groups[0].get("initial_lr"): - 
optim_g.param_groups[0]["initial_lr"] = g_resume_lr - if not optim_d.param_groups[0].get("initial_lr"): - optim_d.param_groups[0]["initial_lr"] = d_resume_lr if not optim_dur_disc.param_groups[0].get("initial_lr"): optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr + except: + print("Initialize dur_disc") + + try: + _, optim_g, g_resume_lr, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), + net_g, + optim_g, + skip_optimizer=hps.train.skip_optimizer + if "skip_optimizer" in hps.train + else True, + ) + _, optim_d, d_resume_lr, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), + net_d, + optim_d, + skip_optimizer=hps.train.skip_optimizer + if "skip_optimizer" in hps.train + else True, + ) + if not optim_g.param_groups[0].get("initial_lr"): + optim_g.param_groups[0]["initial_lr"] = g_resume_lr + if not optim_d.param_groups[0].get("initial_lr"): + optim_d.param_groups[0]["initial_lr"] = d_resume_lr epoch_str = max(epoch_str, 1) # global_step = (epoch_str - 1) * len(train_loader) @@ -297,21 +318,36 @@ def run(): epoch_str = 1 global_step = 0 + try: + _, optim_wd, wd_resume_lr, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "WD_*.pth"), + net_wd, + optim_wd, + skip_optimizer=hps.train.skip_optimizer + if "skip_optimizer" in hps.train + else True, + ) + if not optim_wd.param_groups[0].get("initial_lr"): + optim_wd.param_groups[0]["initial_lr"] = wd_resume_lr + except Exception as e: + print(e) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) scheduler_d = torch.optim.lr_scheduler.ExponentialLR( optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) + scheduler_wd = torch.optim.lr_scheduler.ExponentialLR( + optim_wd, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) if net_dur_disc is not None: - if not optim_dur_disc.param_groups[0].get("initial_lr"): - 
optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR( optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) else: scheduler_dur_disc = None - scaler = GradScaler(enabled=hps.train.fp16_run) + scaler = GradScaler(enabled=hps.train.bf16_run) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: @@ -320,9 +356,9 @@ def run(): local_rank, epoch, hps, - [net_g, net_d, net_dur_disc], - [optim_g, optim_d, optim_dur_disc], - [scheduler_g, scheduler_d, scheduler_dur_disc], + [net_g, net_d, net_dur_disc, net_wd], + [optim_g, optim_d, optim_dur_disc, optim_wd], + [scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd], scaler, [train_loader, eval_loader], logger, @@ -334,9 +370,9 @@ def run(): local_rank, epoch, hps, - [net_g, net_d, net_dur_disc], - [optim_g, optim_d, optim_dur_disc], - [scheduler_g, scheduler_d, scheduler_dur_disc], + [net_g, net_d, net_dur_disc, net_wd], + [optim_g, optim_d, optim_dur_disc, optim_wd], + [scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd], scaler, [train_loader, None], None, @@ -361,18 +397,25 @@ def train_and_evaluate( logger, writers, ): - net_g, net_d, net_dur_disc = nets - optim_g, optim_d, optim_dur_disc = optims - scheduler_g, scheduler_d, scheduler_dur_disc = schedulers + net_g, net_d, net_dur_disc, net_wd = nets + optim_g, optim_d, optim_dur_disc, optim_wd = optims + scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd = schedulers train_loader, eval_loader = loaders if writers is not None: writer, writer_eval = writers + wl = WavLMLoss( + hps.model.slm.model, + net_wd, + hps.data.sampling_rate, + hps.model.slm.sr, + ).to(local_rank) train_loader.batch_sampler.set_epoch(epoch) global global_step net_g.train() net_d.train() + net_wd.train() if net_dur_disc is not None: net_dur_disc.train() for batch_idx, ( @@ -388,7 +431,6 @@ def train_and_evaluate( bert, ja_bert, en_bert, - emo, ) in enumerate(tqdm(train_loader)): if 
net_g.module.use_noise_scaled_mas: current_mas_noise_scale = ( @@ -411,9 +453,8 @@ def train_and_evaluate( bert = bert.cuda(local_rank, non_blocking=True) ja_bert = ja_bert.cuda(local_rank, non_blocking=True) en_bert = en_bert.cuda(local_rank, non_blocking=True) - emo = emo.cuda(local_rank, non_blocking=True) - with autocast(enabled=hps.train.fp16_run): + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): ( y_hat, l_length, @@ -422,9 +463,8 @@ def train_and_evaluate( x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), - (hidden_x, logw, logw_), + (hidden_x, logw, logw_, logw_sdp), g, - loss_commit, ) = net_g( x, x_lengths, @@ -436,7 +476,6 @@ def train_and_evaluate( bert, ja_bert, en_bert, - emo, ) mel = spec_to_mel_torch( spec, @@ -450,7 +489,7 @@ def train_and_evaluate( mel, ids_slice, hps.train.segment_size // hps.data.hop_length ) y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), + y_hat.squeeze(1).float(), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, @@ -466,7 +505,7 @@ def train_and_evaluate( # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( y_d_hat_r, y_d_hat_g ) @@ -475,11 +514,20 @@ def train_and_evaluate( y_dur_hat_r, y_dur_hat_g = net_dur_disc( hidden_x.detach(), x_mask.detach(), + logw_.detach(), logw.detach(), + g.detach(), + ) + y_dur_hat_r_sdp, y_dur_hat_g_sdp = net_dur_disc( + hidden_x.detach(), + x_mask.detach(), logw_.detach(), + logw_sdp.detach(), g.detach(), ) - with autocast(enabled=False): + y_dur_hat_r = y_dur_hat_r + y_dur_hat_r_sdp + y_dur_hat_g = y_dur_hat_g + y_dur_hat_g_sdp + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): # TODO: I think need to mean using the mask, but for now, just mean all ( loss_dur_disc, @@ -490,31 +538,60 @@ def train_and_evaluate( optim_dur_disc.zero_grad() 
scaler.scale(loss_dur_disc_all).backward() scaler.unscale_(optim_dur_disc) - commons.clip_grad_value_(net_dur_disc.parameters(), None) + # torch.nn.utils.clip_grad_norm_( + # parameters=net_dur_disc.parameters(), max_norm=100 + # ) + grad_norm_dur = commons.clip_grad_value_( + net_dur_disc.parameters(), None + ) scaler.step(optim_dur_disc) optim_d.zero_grad() scaler.scale(loss_disc_all).backward() scaler.unscale_(optim_d) + if getattr(hps.train, "bf16_run", False): + torch.nn.utils.clip_grad_norm_(parameters=net_d.parameters(), max_norm=200) grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) scaler.step(optim_d) - with autocast(enabled=hps.train.fp16_run): + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): + loss_slm = wl.discriminator( + y.detach().squeeze(), y_hat.detach().squeeze() + ).mean() + + optim_wd.zero_grad() + scaler.scale(loss_slm).backward() + scaler.unscale_(optim_wd) + # torch.nn.utils.clip_grad_norm_(parameters=net_wd.parameters(), max_norm=200) + grad_norm_wd = commons.clip_grad_value_(net_wd.parameters(), None) + scaler.step(optim_wd) + + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): # Generator y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) if net_dur_disc is not None: - y_dur_hat_r, y_dur_hat_g = net_dur_disc( - hidden_x, x_mask, logw, logw_, g - ) - with autocast(enabled=False): + _, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw, g) + _, y_dur_hat_g_sdp = net_dur_disc(hidden_x, x_mask, logw_, logw_sdp, g) + y_dur_hat_g = y_dur_hat_g + y_dur_hat_g_sdp + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): loss_dur = torch.sum(l_length.float()) loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) loss_gen, losses_gen = generator_loss(y_d_hat_g) + + loss_lm = wl(y.detach().squeeze(), y_hat.squeeze()).mean() + loss_lm_gen = wl.generator(y_hat.squeeze()) + 
loss_gen_all = ( - loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_commit + loss_gen + + loss_fm + + loss_mel + + loss_dur + + loss_kl + + loss_lm + + loss_lm_gen ) if net_dur_disc is not None: loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g) @@ -522,6 +599,8 @@ def train_and_evaluate( optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) + if getattr(hps.train, "bf16_run", False): + torch.nn.utils.clip_grad_norm_(parameters=net_g.parameters(), max_norm=500) grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) scaler.step(optim_g) scaler.update() @@ -540,9 +619,12 @@ def train_and_evaluate( scalar_dict = { "loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, + "loss/wd/total": loss_slm, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g, + "grad_norm_dur": grad_norm_dur, + "grad_norm_wd": grad_norm_wd, } scalar_dict.update( { @@ -550,6 +632,8 @@ def train_and_evaluate( "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, + "loss/g/lm": loss_lm, + "loss/g/lm_gen": loss_lm_gen, } ) scalar_dict.update( @@ -562,6 +646,30 @@ def train_and_evaluate( {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} ) + if net_dur_disc is not None: + scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all}) + + scalar_dict.update( + { + "loss/dur_disc_g/{}".format(i): v + for i, v in enumerate(losses_dur_disc_g) + } + ) + scalar_dict.update( + { + "loss/dur_disc_r/{}".format(i): v + for i, v in enumerate(losses_dur_disc_r) + } + ) + + scalar_dict.update({"loss/g/dur_gen": loss_dur_gen}) + scalar_dict.update( + { + "loss/g/dur_gen_{}".format(i): v + for i, v in enumerate(losses_dur_gen) + } + ) + image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy( y_mel[0].data.cpu().numpy() @@ -599,6 +707,13 @@ def train_and_evaluate( epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), ) + utils.save_checkpoint( + net_wd, + optim_wd, + 
hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "WD_{}.pth".format(global_step)), + ) if net_dur_disc is not None: utils.save_checkpoint( net_dur_disc, @@ -642,7 +757,6 @@ def evaluate(hps, generator, eval_loader, writer_eval): bert, ja_bert, en_bert, - emo, ) in enumerate(eval_loader): x, x_lengths = x.cuda(), x_lengths.cuda() spec, spec_lengths = spec.cuda(), spec_lengths.cuda() @@ -653,7 +767,6 @@ def evaluate(hps, generator, eval_loader, writer_eval): en_bert = en_bert.cuda() tone = tone.cuda() language = language.cuda() - emo = emo.cuda() for use_sdp in [True, False]: y_hat, attn, mask, *_ = generator.module.infer( x, @@ -664,7 +777,6 @@ def evaluate(hps, generator, eval_loader, writer_eval): bert, ja_bert, en_bert, - emo, y=spec, max_len=1000, sdp_ratio=0.0 if not use_sdp else 1.0, diff --git a/utils.py b/utils.py index 7c1440593..68fd148fe 100644 --- a/utils.py +++ b/utils.py @@ -301,7 +301,11 @@ def x_sorted(_x): to_del = [ os.path.join(path_to_models, fn) - for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + for fn in ( + x_sorted("G")[:-n_ckpts_to_keep] + + x_sorted("D")[:-n_ckpts_to_keep] + + x_sorted("WD")[:-n_ckpts_to_keep] + ) ] def del_info(fn): diff --git a/webui.py b/webui.py index dfec239b2..a5465f3b2 100644 --- a/webui.py +++ b/webui.py @@ -42,6 +42,8 @@ def generate_audio( language, reference_audio, emotion, + style_text, + style_weight, skip_start=False, skip_end=False, ): @@ -49,8 +51,8 @@ def generate_audio( # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16) with torch.no_grad(): for idx, piece in enumerate(slices): - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(slices) - 1) and skip_end + skip_start = idx != 0 + skip_end = idx != len(slices) - 1 audio = infer( piece, reference_audio=reference_audio, @@ -66,10 +68,11 @@ def generate_audio( device=device, skip_start=skip_start, skip_end=skip_end, + style_text=style_text, + style_weight=style_weight, ) audio16bit = 
gr.processing_utils.convert_to_16_bit_wav(audio) audio_list.append(audio16bit) - # audio_list.append(silence) # 将静音添加到列表中 return audio_list @@ -90,8 +93,8 @@ def generate_audio_multilang( # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16) with torch.no_grad(): for idx, piece in enumerate(slices): - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(slices) - 1) and skip_end + skip_start = idx != 0 + skip_end = idx != len(slices) - 1 audio = infer_multilang( piece, reference_audio=reference_audio, @@ -110,7 +113,6 @@ def generate_audio_multilang( ) audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) audio_list.append(audio16bit) - # audio_list.append(silence) # 将静音添加到列表中 return audio_list @@ -127,63 +129,50 @@ def tts_split( interval_between_sent, reference_audio, emotion, + style_text, + style_weight, ): - if language == "mix": - return ("invalid", None) while text.find("\n\n") != -1: text = text.replace("\n\n", "\n") + text = text.replace("|", "") para_list = re_matching.cut_para(text) + para_list = [p for p in para_list if p != ""] audio_list = [] - if not cut_by_sent: - for idx, p in enumerate(para_list): - skip_start = idx != 0 - skip_end = idx != len(para_list) - 1 - audio = infer( + for p in para_list: + if not cut_by_sent: + audio_list += process_text( p, - reference_audio=reference_audio, - emotion=emotion, - sdp_ratio=sdp_ratio, - noise_scale=noise_scale, - noise_scale_w=noise_scale_w, - length_scale=length_scale, - sid=speaker, - language=language, - hps=hps, - net_g=net_g, - device=device, - skip_start=skip_start, - skip_end=skip_end, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + style_text, + style_weight, ) - audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) - audio_list.append(audio16bit) silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16) audio_list.append(silence) - else: - for idx, p in enumerate(para_list): - 
skip_start = idx != 0 - skip_end = idx != len(para_list) - 1 + else: audio_list_sent = [] sent_list = re_matching.cut_sent(p) - for idx, s in enumerate(sent_list): - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(sent_list) - 1) and skip_end - audio = infer( + sent_list = [s for s in sent_list if s != ""] + for s in sent_list: + audio_list_sent += process_text( s, - reference_audio=reference_audio, - emotion=emotion, - sdp_ratio=sdp_ratio, - noise_scale=noise_scale, - noise_scale_w=noise_scale_w, - length_scale=length_scale, - sid=speaker, - language=language, - hps=hps, - net_g=net_g, - device=device, - skip_start=skip_start, - skip_end=skip_end, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + style_text, + style_weight, ) - audio_list_sent.append(audio) silence = np.zeros((int)(44100 * interval_between_sent)) audio_list_sent.append(silence) if (interval_between_para - interval_between_sent) > 0: @@ -196,10 +185,47 @@ def tts_split( ) # 对完整句子做音量归一 audio_list.append(audio16bit) audio_concat = np.concatenate(audio_list) - return ("Success", (44100, audio_concat)) + return ("Success", (hps.data.sampling_rate, audio_concat)) -def tts_fn( +def process_mix(slice): + _speaker = slice.pop() + _text, _lang = [], [] + for lang, content in slice: + content = content.split("|") + content = [part for part in content if part != ""] + if len(content) == 0: + continue + if len(_text) == 0: + _text = [[part] for part in content] + _lang = [[lang] for part in content] + else: + _text[-1].append(content[0]) + _lang[-1].append(lang) + if len(content) > 1: + _text += [[part] for part in content[1:]] + _lang += [[lang] for part in content[1:]] + return _text, _lang, _speaker + + +def process_auto(text): + _text, _lang = [], [] + for slice in text.split("|"): + if slice == "": + continue + temp_text, temp_lang = [], [] + sentences_list = split_by_language(slice, target_languages=["zh", "ja", "en"]) + 
for sentence, lang in sentences_list: + if sentence == "": + continue + temp_text.append(sentence) + temp_lang.append(lang.upper()) + _text.append(temp_text) + _lang.append(temp_lang) + return _text, _lang + + +def process_text( text: str, speaker, sdp_ratio, @@ -209,15 +235,9 @@ def tts_fn( language, reference_audio, emotion, - prompt_mode, + style_text=None, + style_weight=0, ): - if prompt_mode == "Audio prompt": - if reference_audio == None: - return ("Invalid audio prompt", None) - else: - reference_audio = load_audio(reference_audio)[1] - else: - reference_audio = None audio_list = [] if language == "mix": bool_valid, str_valid = re_matching.validate_text(text) @@ -226,120 +246,40 @@ def tts_fn( hps.data.sampling_rate, np.concatenate([np.zeros(hps.data.sampling_rate // 2)]), ) - result = [] for slice in re_matching.text_matching(text): - _speaker = slice.pop() - temp_contant = [] - temp_lang = [] - for lang, content in slice: - if "|" in content: - temp = [] - temp_ = [] - for i in content.split("|"): - if i != "": - temp.append([i]) - temp_.append([lang]) - else: - temp.append([]) - temp_.append([]) - temp_contant += temp - temp_lang += temp_ - else: - if len(temp_contant) == 0: - temp_contant.append([]) - temp_lang.append([]) - temp_contant[-1].append(content) - temp_lang[-1].append(lang) - for i, j in zip(temp_lang, temp_contant): - result.append([*zip(i, j), _speaker]) - for i, one in enumerate(result): - skip_start = i != 0 - skip_end = i != len(result) - 1 - _speaker = one.pop() - idx = 0 - while idx < len(one): - text_to_generate = [] - lang_to_generate = [] - while True: - lang, content = one[idx] - temp_text = [content] - if len(text_to_generate) > 0: - text_to_generate[-1] += [temp_text.pop(0)] - lang_to_generate[-1] += [lang] - if len(temp_text) > 0: - text_to_generate += [[i] for i in temp_text] - lang_to_generate += [[lang]] * len(temp_text) - if idx + 1 < len(one): - idx += 1 - else: - break - skip_start = (idx != 0) and skip_start - skip_end = 
(idx != len(one) - 1) and skip_end - print(text_to_generate, lang_to_generate) - audio_list.extend( - generate_audio_multilang( - text_to_generate, - sdp_ratio, - noise_scale, - noise_scale_w, - length_scale, - _speaker, - lang_to_generate, - reference_audio, - emotion, - skip_start, - skip_end, - ) + _text, _lang, _speaker = process_mix(slice) + if _speaker is None: + continue + print(f"Text: {_text}\nLang: {_lang}") + audio_list.extend( + generate_audio_multilang( + _text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + _speaker, + _lang, + reference_audio, + emotion, ) - idx += 1 + ) elif language.lower() == "auto": - for idx, slice in enumerate(text.split("|")): - if slice == "": - continue - skip_start = idx != 0 - skip_end = idx != len(text.split("|")) - 1 - sentences_list = split_by_language( - slice, target_languages=["zh", "ja", "en"] + _text, _lang = process_auto(text) + print(f"Text: {_text}\nLang: {_lang}") + audio_list.extend( + generate_audio_multilang( + _text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + _lang, + reference_audio, + emotion, ) - idx = 0 - while idx < len(sentences_list): - text_to_generate = [] - lang_to_generate = [] - while True: - content, lang = sentences_list[idx] - temp_text = [content] - lang = lang.upper() - if lang == "JA": - lang = "JP" - if len(text_to_generate) > 0: - text_to_generate[-1] += [temp_text.pop(0)] - lang_to_generate[-1] += [lang] - if len(temp_text) > 0: - text_to_generate += [[i] for i in temp_text] - lang_to_generate += [[lang]] * len(temp_text) - if idx + 1 < len(sentences_list): - idx += 1 - else: - break - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(sentences_list) - 1) and skip_end - print(text_to_generate, lang_to_generate) - audio_list.extend( - generate_audio_multilang( - text_to_generate, - sdp_ratio, - noise_scale, - noise_scale_w, - length_scale, - speaker, - lang_to_generate, - reference_audio, - emotion, - skip_start, - skip_end, - 
) - ) - idx += 1 + ) else: audio_list.extend( generate_audio( @@ -352,13 +292,65 @@ def tts_fn( language, reference_audio, emotion, + style_text, + style_weight, ) ) + return audio_list + + +def tts_fn( + text: str, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + prompt_mode, + style_text=None, + style_weight=0, +): + if style_text == "": + style_text = None + if prompt_mode == "Audio prompt": + if reference_audio == None: + return ("Invalid audio prompt", None) + else: + reference_audio = load_audio(reference_audio)[1] + else: + reference_audio = None + + audio_list = process_text( + text, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + style_text, + style_weight, + ) audio_concat = np.concatenate(audio_list) return "Success", (hps.data.sampling_rate, audio_concat) +def format_utils(text, speaker): + _text, _lang = process_auto(text) + res = f"[{speaker}]" + for lang_s, content_s in zip(_lang, _text): + for lang, content in zip(lang_s, content_s): + res += f"<{lang.lower()}>{content}" + res += "|" + return "mix", res[:-1] + + def load_audio(path): audio, sr = librosa.load(path, 48000) # audio = librosa.resample(audio, 44100, 48000) @@ -408,34 +400,37 @@ def gr_util(item): ) trans = gr.Button("中翻日", variant="primary") slicer = gr.Button("快速切分", variant="primary") + formatter = gr.Button("检测语言,并整理为 MIX 格式", variant="primary") speaker = gr.Dropdown( choices=speakers, value=speakers[0], label="Speaker" ) _ = gr.Markdown( - value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n" + value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n", + visible=False, ) prompt_mode = gr.Radio( ["Text prompt", "Audio prompt"], label="Prompt Mode", value="Text prompt", + visible=False, ) text_prompt = gr.Textbox( label="Text prompt", placeholder="用文字描述生成风格。如:Happy", value="Happy", - visible=True, + visible=False, ) audio_prompt = gr.Audio( 
label="Audio prompt", type="filepath", visible=False ) sdp_ratio = gr.Slider( - minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio" + minimum=0, maximum=1, value=0.5, step=0.1, label="SDP Ratio" ) noise_scale = gr.Slider( minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise" ) noise_scale_w = gr.Slider( - minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W" + minimum=0.1, maximum=2, value=0.9, step=0.1, label="Noise_W" ) length_scale = gr.Slider( minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length" @@ -445,6 +440,21 @@ def gr_util(item): ) btn = gr.Button("生成音频!", variant="primary") with gr.Column(): + with gr.Accordion("融合文本语义", open=False): + gr.Markdown( + value="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n" + "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)\n\n" + "效果较不明确,留空即为不使用该功能" + ) + style_text = gr.Textbox(label="辅助文本") + style_weight = gr.Slider( + minimum=0, + maximum=1, + value=0.7, + step=0.1, + label="Weight", + info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本", + ) with gr.Row(): with gr.Column(): interval_between_sent = gr.Slider( @@ -487,6 +497,8 @@ def gr_util(item): audio_prompt, text_prompt, prompt_mode, + style_text, + style_weight, ], outputs=[text_output, audio_output], ) @@ -511,6 +523,8 @@ def gr_util(item): interval_between_sent, audio_prompt, text_prompt, + style_text, + style_weight, ], outputs=[text_output, audio_output], ) @@ -527,6 +541,12 @@ def gr_util(item): outputs=[audio_prompt], ) + formatter.click( + format_utils, + inputs=[text, speaker], + outputs=[language, text], + ) + print("推理页面已开启!") webbrowser.open(f"http://127.0.0.1:{config.webui_config.port}") app.launch(share=config.webui_config.share, server_port=config.webui_config.port) diff --git a/webui_preprocess.py b/webui_preprocess.py index af2ddeeb8..b5d6294c2 100644 --- a/webui_preprocess.py +++ b/webui_preprocess.py @@ -19,9 +19,9 @@ def generate_config(data_dir, batch_size): assert data_dir != "", "数据集名称不能为空" start_path, _, train_path, val_path, 
config_path = get_path(data_dir) if os.path.isfile(config_path): - config = json.load(open(config_path)) + config = json.load(open(config_path, "r", encoding="utf-8")) else: - config = json.load(open("configs/config.json")) + config = json.load(open("configs/config.json", "r", encoding="utf-8")) config["data"]["training_files"] = train_path config["data"]["validation_files"] = val_path config["train"]["batch_size"] = batch_size @@ -44,7 +44,7 @@ def resample(data_dir): in_dir = os.path.join(start_path, "raw") out_dir = os.path.join(start_path, "wavs") subprocess.run( - f"python resample.py " + f"python resample_legacy.py " f"--sr 44100 " f"--in_dir {in_dir} " f"--out_dir {out_dir} ", @@ -60,7 +60,9 @@ def preprocess_text(data_dir): with open(lbl_path, "w", encoding="utf-8") as f: for line in lines: path, spk, language, text = line.strip().split("|") - path = os.path.join(start_path, "wavs", os.path.basename(path)) + path = os.path.join(start_path, "wavs", os.path.basename(path)).replace( + "\\", "/" + ) f.writelines(f"{path}|{spk}|{language}|{text}\n") subprocess.run( f"python preprocess_text.py " @@ -83,16 +85,6 @@ def bert_gen(data_dir): return "BERT 特征文件生成完成" -def clap_gen(data_dir): - assert data_dir != "", "数据集名称不能为空" - _, _, _, _, config_path = get_path(data_dir) - subprocess.run( - f"python clap_gen.py " f"--config {config_path}", - shell=True, - ) - return "CLAP 特征文件生成完成" - - if __name__ == "__main__": with gr.Blocks() as app: with gr.Row(): @@ -104,9 +96,9 @@ def clap_gen(data_dir): "- [中文 RoBERTa](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)\n" "- [日文 DeBERTa](https://huggingface.co/ku-nlp/deberta-v2-large-japanese-char-wwm)\n" "- [英文 DeBERTa](https://huggingface.co/microsoft/deberta-v3-large)\n" - "- [CLAP](https://huggingface.co/laion/clap-htsat-fused)\n" + "- [WavLM](https://huggingface.co/microsoft/wavlm-base-plus)\n" "\n" - "将 BERT 模型放置到 `bert` 文件夹下,CLAP 模型放置到 `emotional` 文件夹下,覆盖同名文件夹。\n" + "将 BERT 模型放置到 `bert` 文件夹下,WavLM 模型放置到 `slm` 
文件夹下,覆盖同名文件夹。\n" "\n" "数据准备:\n" "将数据放置在 data 文件夹下,按照如下结构组织:\n" @@ -156,12 +148,10 @@ def clap_gen(data_dir): preprocess_text_btn = gr.Button(value="执行", variant="primary") _ = gr.Markdown(value="## 第四步:生成 BERT 特征文件") bert_gen_btn = gr.Button(value="执行", variant="primary") - _ = gr.Markdown(value="## 第五步:生成 CLAP 特征文件") - clap_gen_btn = gr.Button(value="执行", variant="primary") _ = gr.Markdown( value="## 训练模型及部署:\n" "修改根目录下的 `config.yml` 中 `dataset_path` 一项为 `data/{你的数据集名称}`\n" - "- 训练:将[预训练模型文件](https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/show_model)(`D_0.pth`、`DUR_0.pth` 和 `G_0.pth`)放到 `data/{你的数据集名称}/models` 文件夹下,执行 `torchrun --nproc_per_node=1 train_ms.py` 命令(多卡运行可参考 `run_MnodesAndMgpus.sh` 中的命令。\n" + "- 训练:将[预训练模型文件](https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/show_model)(`D_0.pth`、`DUR_0.pth`、`WD_0.pth` 和 `G_0.pth`)放到 `data/{你的数据集名称}/models` 文件夹下,执行 `torchrun --nproc_per_node=1 train_ms.py` 命令(多卡运行可参考 `run_MnodesAndMgpus.sh` 中的命令。\n" "- 部署:修改根目录下的 `config.yml` 中 `webui` 下 `model` 一项为 `models/{权重文件名}.pth` (如 G_10000.pth),然后执行 `python webui.py`" ) @@ -171,7 +161,6 @@ def clap_gen(data_dir): resample_btn.click(resample, inputs=[data_dir], outputs=[info]) preprocess_text_btn.click(preprocess_text, inputs=[data_dir], outputs=[info]) bert_gen_btn.click(bert_gen, inputs=[data_dir], outputs=[info]) - clap_gen_btn.click(clap_gen, inputs=[data_dir], outputs=[info]) webbrowser.open("http://127.0.0.1:7860") app.launch(share=False, server_port=7860)