In [ ]:
from utils.hparams import load_hparams_json
from utils.util import intersperse
import json
from models.synthesizer.models.vits import Vits
import torch
import numpy as np
import IPython.display as ipd
from models.synthesizer.utils.symbols import symbols
from models.synthesizer.utils.text import text_to_sequence

hps = load_hparams_json("data/ckpt/synthesizer/vits5/config.json")
print(hps.train)
model = Vits(
    len(symbols),
    hps["data"]["filter_length"] // 2 + 1,
    hps["train"]["segment_size"] // hps["data"]["hop_length"],
    n_speakers=hps["data"]["n_speakers"],
    **hps["model"])
_ = model.eval()
device = torch.device("cpu")
checkpoint = torch.load(str("data/ckpt/synthesizer/vits5/G_56000.pth"), map_location=device)
if "model_state" in checkpoint:
    state = checkpoint["model_state"]
else:
    state = checkpoint["model"]
model.load_state_dict(state, strict=False)

# Root directory from which emotion reference embeddings are randomly sampled
random_emotion_root = "D:\\audiodata\\SV2TTS\\synthesizer\\emo\\"
import random, re
from pypinyin import lazy_pinyin, Style
import os

def tts(txt, emotion, sid=0):
    txt = " ".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=False))
    text_norm = text_to_sequence(txt, hps["data"]["text_cleaners"])
    # if hps["data"]["add_blank"]:
    #     text_norm = intersperse(text_norm, 0)
    stn_tst = torch.LongTensor(text_norm)
    with torch.no_grad():  # inference mode
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([sid])
        if emotion.endswith("wav"):
            # Extract the emotion embedding directly from a reference wav
            from models.synthesizer.preprocess_audio import extract_emo
            import librosa
            wav, sr = librosa.load(emotion, sr=16000)
            emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))
        elif emotion == "random_sample":
            # Pick a random precomputed emotion embedding
            rand_emo = random.sample(os.listdir(random_emotion_root), 1)[0]
            print(rand_emo)
            emo = torch.FloatTensor(np.load(f"{random_emotion_root}\\{rand_emo}")).unsqueeze(0)
        elif emotion.endswith("npy"):
            # Load a specific precomputed emotion embedding
            print(emotion)
            emo = torch.FloatTensor(np.load(f"{random_emotion_root}\\{emotion}")).unsqueeze(0)
        else:
            print("Invalid emotion argument")
            return
        audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8,
                            length_scale=1, emo=emo)[0][0, 0].data.float().numpy()
    ipd.display(ipd.Audio(audio, rate=hps["data"]["sampling_rate"], normalize=False))
Inference:
In [ ]:
txt = "我们将其拓展到文本驱动数字人形象领域"
# Neutral:
tts(txt, emotion='emo-T0055G4906S0052.wav_00.npy', sid=100)
# Fast: emo-T0055G2323S0179.wav_00.npy
# Sad:
tts(txt, emotion='emo-15_4581_20170825202626.wav_00.npy', sid=100)
# Happy: T0055G2412S0498.wav
tts(txt, emotion='emo-T0055G2412S0498.wav_00.npy', sid=100)
# Angry: T0055G1371S0363.wav T0055G1344S0160.wav
tts(txt, emotion='emo-T0055G1344S0160.wav_00.npy', sid=100)
# Tired:
tts(txt, emotion='emo-T0055G2294S0476.wav_00.npy', sid=100)
# Anxious:
tts(txt, emotion='emo-T0055G1671S0170.wav_00.npy', sid=100)
In [ ]:
txt = "我们将其拓展到文本驱动数字人形象领域"
tts(txt, emotion='random_sample', sid=100)
tts(txt, emotion='random_sample', sid=100)
tts(txt, emotion='random_sample', sid=100)
tts(txt, emotion='random_sample', sid=100)
tts(txt, emotion='random_sample', sid=100)
tts(txt, emotion='random_sample', sid=100)
In [ ]:
txt = "我们将其拓展到文本驱动数字人形象领域"
# Local reference wavs named by emotion: calm, agitated, tired, excited, dejected, happy
types = ["平淡", "激动", "疲惫", "兴奋", "沮丧", "开心"]
for t in types:
    print(t)
    tts(txt, emotion=f'C:\\Users\\babys\\Music\\{t}.wav', sid=100)
# tts(txt, emotion='D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G1858\\T0055G1858S0342.wav', sid=5)
Preprocessing:
In [ ]:
from models.synthesizer.preprocess import preprocess_dataset
from pathlib import Path
from utils.hparams import HParams

datasets_root = Path("../audiodata/")
hparams = HParams(
    n_fft = 1024,            # filter_length
    num_mels = 80,
    hop_size = 256,          # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
    win_size = 1024,         # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
    fmin = 55,
    min_level_db = -100,
    ref_level_db = 20,
    max_abs_value = 4.,      # Gradient explodes if too big, premature convergence if too small.
    sample_rate = 16000,
    rescale = True,
    max_mel_frames = 900,
    rescaling_max = 0.9,
    preemphasis = 0.97,      # Filter coefficient to use if preemphasize is True
    preemphasize = True,
    ### Mel Visualization and Griffin-Lim
    signal_normalization = True,
    utterance_min_duration = 1.6,  # Duration in seconds below which utterances are discarded
    ### Audio processing options
    fmax = 7600,             # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization = True,  # Used when signal_normalization = True
    clip_mels_length = True,  # If true, discards samples exceeding max_mel_frames
    use_lws = False,          # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels = True,    # Sets mel range to [-max_abs_value, max_abs_value] if True,
                              # and [0, max_abs_value] if False
    trim_silence = False,     # Use with sample_rate of 16000 for best results
)
preprocess_dataset(datasets_root=datasets_root,
                   out_dir=datasets_root.joinpath("SV2TTS", "synthesizer"),
                   n_processes=8,
                   skip_existing=True,
                   hparams=hparams,
                   no_alignments=False,
                   dataset="aidatatang_200zh",
                   emotion_extract=True)
Training:
In [ ]:
from models.synthesizer.train_vits import run
from pathlib import Path
from utils.hparams import HParams
import torch, os
import torch.multiprocessing as mp

datasets_root = Path("../audiodata/SV2TTS/synthesizer")
hparams = HParams(
    model_dir = "data/ckpt/synthesizer/vits",
)
hparams.loadJson(Path(hparams.model_dir).joinpath("config.json"))
hparams.data["training_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["validation_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["datasets_root"] = str(datasets_root)

n_gpus = torch.cuda.device_count()
# for spawn
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8899'
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))
Keep only the metadata entries that have a matching emo file
In [ ]:
from pathlib import Path
import os

root = Path('../audiodata/SV2TTS/synthesizer')
dict_info = []
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        if not raw:
            continue
        v = raw.split("|")[0].replace("audio", "emo")
        emo_fpath = root.joinpath("emo").joinpath(v)
        if emo_fpath.exists():
            dict_info.append(raw)
        # else:
        #     print(emo_fpath)

# Write the filtered metadata
meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')
metadata_file = meta2.open("w", encoding="utf-8")
for new_info in dict_info:
    metadata_file.write(new_info)
metadata_file.close()
Hold out 10% of the training set as a test set
In [ ]:
from pathlib import Path

root = Path('../audiodata/SV2TTS/synthesizer')
dict_info1 = []
dict_info2 = []
count = 1
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        if not raw:
            continue
        if count % 10 == 0:
            dict_info2.append(raw)
        else:
            dict_info1.append(raw)
        count += 1

# Write the 90% training split
meta1 = Path('../audiodata/SV2TTS/synthesizer/train1.txt')
metadata_file = meta1.open("w", encoding="utf-8")
for new_info in dict_info1:
    metadata_file.write(new_info)
metadata_file.close()

# Write the 10% evaluation split
meta2 = Path('../audiodata/SV2TTS/synthesizer/eval.txt')
metadata_file = meta2.open("w", encoding="utf-8")
for new_info in dict_info2:
    metadata_file.write(new_info)
metadata_file.close()
Evaluation
In [ ]:
from pathlib import Path

root = Path('../audiodata/SV2TTS/synthesizer')
spks = []
spk_id = {}
rows = []
with open(root.joinpath("eval.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        # e.g. "audio-T0055G1858S0342.wav_00.npy|..." -> speaker "1858"
        speaker_name = raw.split("-")[1][6:10]
        if speaker_name not in spk_id:
            spks.append(speaker_name)
            spk_id[speaker_name] = 1
        rows.append(raw)

# Assign consecutive integer ids to speakers in sorted order
i = 0
spks.sort()
for sp in spks:
    spk_id[sp] = str(i)
    i = i + 1
print(len(spks))

# Append the speaker id as an extra field to each metadata row
meta2 = Path('../audiodata/SV2TTS/synthesizer/eval2.txt')
metadata_file = meta2.open("w", encoding="utf-8")
for row in rows:
    speaker_n = row.split("-")[1][6:10]
    metadata_file.write(row.strip() + "|" + spk_id[speaker_n] + "\n")
metadata_file.close()
[Not Recommended] Try mapping the transcript to a more detailed phone-level format: ni3 hao3 -> n i3 h ao3
After a couple of tests, I don't think this method improves the quality of the results, and it may cause the monotonic alignment to crash.
In [ ]:
from pathlib import Path

datasets_root = Path("../audiodata/SV2TTS/synthesizer/")
dictionary_fp = Path("../audiodata/ProDiff/processed/mandarin_pinyin.dict")
dict_map = {}
for l in open(dictionary_fp, encoding='utf-8').readlines():
    item = l.split("\t")
    dict_map[item[0]] = item[1].replace("\n", "")

with datasets_root.joinpath('train2.txt').open("w+", encoding='utf-8') as f:
    for l in open(datasets_root.joinpath('train.txt'), encoding='utf-8').readlines():
        items = l.strip().replace("\n", "").replace("\t", " ").split("|")
        phs_str = ""
        for word in items[5].split(" "):
            if word in dict_map:
                phs_str += dict_map[word]
            else:
                phs_str += word
            phs_str += " _ "
        items[5] = phs_str
        # if not os.path.exists(mfa_input_root.joinpath('train.txt')):
        #     with open(mfa_input_root.joinpath(fileName + 'lab'), 'w+', encoding="utf-8") as f:
        f.write("|".join(items) + "\n")
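For reference, here is a minimal, self-contained sketch of the same syllable-to-phone lookup on the example from the note above; the dictionary entries are hypothetical stand-ins for the tab-separated lines of mandarin_pinyin.dict.
In [ ]:
# Minimal sketch of the syllable -> phone mapping, with hypothetical entries
# standing in for what mandarin_pinyin.dict provides.
dict_map = {"ni3": "n i3", "hao3": "h ao3"}  # hypothetical entries for illustration
phs_str = ""
for word in "ni3 hao3".split(" "):
    phs_str += dict_map.get(word, word)  # fall back to the syllable itself if unmapped
    phs_str += " _ "
print(phs_str)  # -> "n i3 _ h ao3 _ "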
Visualizing the preprocessed data
In [ ]:
import matplotlib.pyplot as plt
import librosa.display
import librosa, torch
import numpy as np
from utils.audio_utils import spectrogram, mel_spectrogram, load_wav_to_torch, spec_to_mel

# x, sr = librosa.load("D:\audiodata\SV2TTS\synthesizer\audio\audio-T0055G2333S0196.wav_00.npy")
x = np.load("D:\\audiodata\\SV2TTS\\synthesizer\\audio\\audio-T0055G1858S0342.wav_00.npy")
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x)
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, x_axis='time', y_axis='hz')

# spectrogram = np.load("D:\\audiodata\\SV2TTS\\synthesizer\\mels\\mel-T0055G1858S0342.wav_00.npy")
audio = torch.from_numpy(x.astype(np.float32))
# audio, sampling_rate = load_wav_to_torch("D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G1858\\T0055G1858S0342.wav")
# audio_norm = audio / 32768.0
audio_norm = audio.unsqueeze(0)
spec = spectrogram(audio_norm, 1024, 256, 1024, center=False)
# spec = spec_to_mel()
spec = torch.squeeze(spec, 0)
mel = spec_to_mel(spec, 1024, 80, 16000, 0, None)
fig = plt.figure(figsize=(10, 8))
ax2 = fig.add_subplot(211)
im = ax2.imshow(mel, interpolation="none")
Emotion clustering
In [ ]:
# from sklearn import metrics
# from sklearn.mixture import GaussianMixture  # Gaussian mixture model
import os
import numpy as np
import librosa
import IPython.display as ipd
from random import sample

embs = []
wavnames = []
emo_root_path = "D:\\audiodata\\SV2TTS\\synthesizer\\emo\\"
wav_root_path = "D:\\audiodata\\aidatatang_200zh\\corpus\\train\\"
for idx, emo_fpath in enumerate(sample(os.listdir(emo_root_path), 10000)):
    if emo_fpath.endswith(".npy") and emo_fpath.startswith("emo-T"):
        embs.append(np.expand_dims(np.load(emo_root_path + emo_fpath), axis=0))
        # e.g. "emo-T0055G1858S0342.wav_00.npy" -> ...\train\G1858\T0055G1858S0342.wav
        wav_fpath = wav_root_path + emo_fpath[9:14] + "\\" + emo_fpath.split("_00")[0][4:]
        wavnames.append(wav_fpath)
print(len(embs))
x = np.concatenate(embs, axis=0)
In [ ]:
# Number of clusters for the clustering algorithm
n_clusters = 20
from sklearn.cluster import *
# model = KMeans(n_clusters=n_clusters, random_state=10)
# model = DBSCAN(eps=0.002, min_samples=2)
# Feel free to try other clustering algorithms
# model = Birch(n_clusters=n_clusters, threshold=0.2)
# model = SpectralClustering(n_clusters=n_clusters)
model = AgglomerativeClustering(n_clusters=n_clusters)

import random
y_predict = model.fit_predict(x)

def disp(wavname):
    wav, sr = librosa.load(wavname, sr=16000)
    display(ipd.Audio(wav, rate=sr))

classes = [[] for i in range(y_predict.max() + 1)]
for idx, wavname in enumerate(wavnames):
    classes[y_predict[idx]].append(wavname)

for i in range(y_predict.max() + 1):
    print("Cluster:", i, "number of samples in this cluster:", len(classes[i]))
    # Preview only 2 audio clips per cluster
    for j in range(2):
        idx = random.randint(0, len(classes[i]) - 1)
        print(classes[i][idx])
        disp(classes[i][idx])
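To compare cluster counts or algorithms more objectively than by listening, a minimal sketch that scores the clustering with the silhouette coefficient (the commented-out sklearn metrics import in the embedding-loading cell hints at this); it assumes `x` and `y_predict` from the previous cells.
In [ ]:
# Minimal sketch: score the clustering above with the silhouette coefficient.
# Assumes x (emotion embeddings) and y_predict (cluster labels) from the previous cells.
from sklearn.metrics import silhouette_score

score = silhouette_score(x, y_predict)  # in [-1, 1]; higher means tighter, better-separated clusters
print(f"silhouette score for {n_clusters} clusters: {score:.3f}")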