Mirror of https://github.com/babysor/MockingBird.git, synced 2024-11-27 04:49:52 +08:00

Commit 9f1dbeeecc (parent b78d0d2a26)

Some changes to make it easier to install the dependencies
README-CN.md (10 lines changed)
@@ -44,6 +44,16 @@
 * Run `pip install -r requirements.txt` to install the remaining required packages.
 * Install webrtcvad with `pip install webrtcvad-wheels`.
+
+or
+- install the dependencies with `conda` or `mamba`
+
+```conda env create -n env_name -f env.yml```
+
+```mamba env create -n env_name -f env.yml```
+
+This creates a new environment with the required dependencies installed; afterwards, switch to it with `conda activate env_name` and you are done.
+> env.yml only contains the dependencies needed at runtime and does not yet include monotonic-align; see the official PyTorch site if you want to install the GPU build of PyTorch.

 #### 1.2 Environment setup on an M1 Mac (inference time)

 > The following environment is set up for x86-64 and uses the original `demo_toolbox.py`; it can serve as a workaround for quick use without changing any code.
 >
README.md (10 lines changed)
@@ -39,6 +39,16 @@
 * Run `pip install -r requirements.txt` to install the remaining necessary packages.
 * Install webrtcvad with `pip install webrtcvad-wheels` (if you need it).
+
+or
+- install dependencies with `conda` or `mamba`
+
+```conda env create -n env_name -f env.yml```
+
+```mamba env create -n env_name -f env.yml```
+
+This will create a virtual environment with the necessary dependencies installed. Switch to the new environment with `conda activate env_name` and you are ready to go.
+> env.yml only includes the dependencies necessary to run the project and temporarily leaves out monotonic-align. Check the official website for how to install the GPU version of pytorch.

 #### 1.2 Setup with an M1 Mac

 > The following steps are a workaround to use the original `demo_toolbox.py` directly without changing any code.
 >
@@ -78,7 +78,7 @@ if __name__ == "__main__":
         else:
             train_hifigan(0, args, h)
     elif args.vocoder_type == "fregan":
-        with open('vocoder/fregan/config.json') as f:
+        with Path('vocoder/fregan/config.json').open() as f:
             json_config = json.load(f)
         h = AttrDict(json_config)
         if h.num_gpus > 1:
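The new form relies on `pathlib`; `Path(...).open()` behaves like the builtin `open()` but keeps the path as a `Path` object. A minimal sketch of the pattern, assuming `from pathlib import Path` is already imported in the training script (the import is not visible in this hunk):

```python
import json
from pathlib import Path

config_path = Path('vocoder/fregan/config.json')

# Equivalent to `with open('vocoder/fregan/config.json') as f:`, but the Path object
# can be reused for .exists(), .parent, etc.
with config_path.open() as f:
    json_config = json.load(f)
```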
@@ -33,7 +33,7 @@ colormap = np.array([
     [0, 0, 0],
     [183, 183, 183],
     [76, 255, 0],
-], dtype=np.float) / 255
+], dtype=float) / 255

 default_text = \
     "欢迎使用工具箱, 现已支持中文输入!"
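This and the later `np.bool`/`np.complex` hunks are the same fix: the `np.float`, `np.bool`, and `np.complex` aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24, so they raise `AttributeError` on current NumPy, and the builtin types are drop-in replacements. A minimal sketch:

```python
import numpy as np

colormap = np.array([
    [0, 0, 0],
    [183, 183, 183],
    [76, 255, 0],
], dtype=float) / 255                                    # was dtype=np.float

mask = np.round(np.array([0.2, 0.7, 0.9])).astype(bool)  # was np.bool
spec = np.abs(np.array([1.0, -2.0])).astype(complex)     # was np.complex
```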
@@ -402,8 +402,8 @@ class UI(QDialog):
         self.app.processEvents()

     def set_loading(self, value, maximum=1):
-        self.loading_bar.setValue(value * 100)
-        self.loading_bar.setMaximum(maximum * 100)
+        self.loading_bar.setValue(int(value * 100))
+        self.loading_bar.setMaximum(int(maximum * 100))
         self.loading_bar.setTextVisible(value != 0)
         self.app.processEvents()

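The `int(...)` casts are needed because `QProgressBar.setValue()` and `setMaximum()` take integer arguments, and recent PyQt5 releases raise a `TypeError` rather than silently truncating a float such as `value * 100` when `value` is fractional. A small stand-alone illustration of the pattern (the real code calls these on `self.loading_bar`):

```python
from PyQt5.QtWidgets import QApplication, QProgressBar

app = QApplication([])
bar = QProgressBar()

value, maximum = 0.37, 1
bar.setValue(int(value * 100))       # 37; passing 37.0 fails on recent PyQt5
bar.setMaximum(int(maximum * 100))   # 100
```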
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],

     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
+        wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)

     # Apply the preprocessing: normalize volume and shorten long silences
     if normalize:
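The switch to keyword arguments is what keeps this call working on current librosa: from librosa 0.10 onward the sampling-rate arguments of `librosa.resample` are keyword-only, so the old positional call raises a `TypeError`. A minimal sketch with synthetic audio:

```python
import numpy as np
import librosa

source_sr, sampling_rate = 48000, 16000
wav = np.random.randn(source_sr).astype(np.float32)  # one second of synthetic audio

# librosa >= 0.10: orig_sr and target_sr must be passed by keyword
wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
print(wav.shape)  # (16000,)
```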
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
         return ret[width - 1:] / width

     audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
+    audio_mask = np.round(audio_mask).astype(bool)

     # Dilate the voiced regions
     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
@@ -21,7 +21,7 @@ colormap = np.array([
     [33, 0, 127],
     [0, 0, 0],
     [183, 183, 183],
-], dtype=np.float) / 255
+], dtype=float) / 255


 class Visualizations:
@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
         fs: int = 16000,
         n_fft: int = 512,
         n_mels: int = 80,
-        fmin: float = None,
+        fmin: float = 0,
         fmax: float = None,
         htk: bool = False,
         norm=1,
     ):
         super().__init__()

-        fmin = 0 if fmin is None else fmin
         fmax = fs / 2 if fmax is None else fmax
         _mel_options = dict(
             sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
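Giving `fmin` a real float default lets the `fmin = 0 if fmin is None else fmin` fold be dropped, and it also makes the annotation accurate: `None` is not a `float`, so the old signature would strictly need `Optional[float]`. A small sketch of the two equivalent signatures (the helper names here are hypothetical, not part of the module):

```python
from typing import Optional

def mel_options_old(fs: int = 16000, fmin: Optional[float] = None, fmax: Optional[float] = None):
    fmin = 0 if fmin is None else fmin       # fold needed because the default is None
    fmax = fs / 2 if fmax is None else fmax
    return dict(sr=fs, fmin=fmin, fmax=fmax)

def mel_options_new(fs: int = 16000, fmin: float = 0, fmax: Optional[float] = None):
    fmax = fs / 2 if fmax is None else fmax  # only fmax still needs a fold
    return dict(sr=fs, fmin=fmin, fmax=fmax)

assert mel_options_old() == mel_options_new()
```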
@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
     Based on https://github.com/librosa/librosa/issues/434
     """
     angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-    S_complex = np.abs(S).astype(np.complex)
+    S_complex = np.abs(S).astype(complex)
     y = _istft(S_complex * angles, hparams)
     for i in range(hparams.griffin_lim_iters):
         angles = np.exp(1j * np.angle(_stft(y, hparams)))
@@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,

     func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                    hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
-    job = Pool(n_processes).imap(func, speaker_dirs)
+    job = Pool(n_processes).imap_unordered(func, speaker_dirs)

     for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
         if speaker_metadata is not None:
             for metadatum in speaker_metadata:
-                metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
+                metadata_file.write("|".join(map(str,metadatum)) + "\n")
     metadata_file.close()

     # Verify the contents of the metadata file
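`imap_unordered` yields each result as soon as its worker finishes instead of in submission order, so one slow speaker no longer stalls the progress bar or the metadata writes; the order of lines in the metadata file does not matter, so the reordering is safe. A self-contained sketch of the pattern (the `process_speaker` worker and paths are hypothetical stand-ins):

```python
from functools import partial
from multiprocessing import Pool

from tqdm import tqdm

def process_speaker(speaker_dir, out_dir):
    # stand-in for the real per-speaker preprocessing
    return f"{speaker_dir} -> {out_dir}"

if __name__ == "__main__":
    speaker_dirs = [f"speaker_{i:02d}" for i in range(8)]
    func = partial(process_speaker, out_dir="SV2TTS/synthesizer")

    with Pool(4) as pool:
        # Results arrive in completion order, not submission order.
        job = pool.imap_unordered(func, speaker_dirs)
        for speaker_metadata in tqdm(job, "Preprocessing", len(speaker_dirs), unit="speakers"):
            pass  # write speaker_metadata to the metadata file here
```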
@@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
     # Embed the utterances in separate threads
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
-    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+    tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

 def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
     wav_dir = synthesizer_root.joinpath("audio")
@@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
     # Embed the utterances in separate threads
     func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
     job = Pool(n_processes).imap(func, fpaths)
-    list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
+    tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
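`Pool.imap` hands back a lazy iterator, so something must drain it for the embedding and emotion jobs to run to completion (and for `tqdm` to show progress); `list(...)` and `tuple(...)` both do that, and the switch between them is purely stylistic. A tiny sketch of the draining idiom (the `square` worker is a stand-in):

```python
from multiprocessing import Pool
from tqdm import tqdm

def square(x):
    return x * x

if __name__ == "__main__":
    fpaths = list(range(100))
    with Pool(4) as pool:
        job = pool.imap(square, fpaths)  # results are yielded lazily
        # Draining the iterator blocks until every job has finished and lets tqdm count them.
        tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
```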
@@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
         wav = logmmse.denoise(wav, profile, eta=0)

     resp = pinyin(words, style=Style.TONE3)
-    res = [v[0] for v in resp if v[0].strip()]
+    res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
     res = " ".join(res)

     return wav, res

 def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
     metadata = []
-    extensions = ["*.wav", "*.flac", "*.mp3"]
-    for extension in extensions:
-        wav_fpath_list = speaker_dir.glob(extension)
-        # Iterate over each wav
-        for wav_fpath in wav_fpath_list:
-            words = dict_info.get(wav_fpath.name.split(".")[0])
-            words = dict_info.get(wav_fpath.name) if not words else words # try with extension
-            if not words:
-                print("no wordS")
-                continue
-            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-            wav, text = _split_on_silences(wav_fpath, words, hparams)
-            result = _process_utterance(wav, text, out_dir, sub_basename,
-                                        skip_existing, hparams, encoder_model_fpath)
-            if result is None:
-                continue
-            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-            metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
-    return [m for m in metadata if m is not None]
+    extensions = ("*.wav", "*.flac", "*.mp3")
+    if skip_existing:
+        for extension in extensions:
+            wav_fpath_list = speaker_dir.glob(extension)
+            # Iterate over each wav
+            for wav_fpath in wav_fpath_list:
+                words = dict_info.get(wav_fpath.name.split(".")[0])
+                if not words:
+                    words = dict_info.get(wav_fpath.name) # try with extension
+                if not words:
+                    print("no wordS")
+                    continue
+                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+
+                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+
+                if mel_fpath.exists() and wav_fpath_.exists():
+                    continue
+
+                wav, text = _split_on_silences(wav_fpath, words, hparams)
+                result = _process_utterance(wav, text, out_dir, sub_basename,
+                                            False, hparams, encoder_model_fpath) # accelarate
+                if result is None:
+                    continue
+                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    else:
+        for extension in extensions:
+            wav_fpath_list = speaker_dir.glob(extension)
+            # Iterate over each wav
+            for wav_fpath in wav_fpath_list:
+                words = dict_info.get(wav_fpath.name.split(".")[0])
+                if not words:
+                    words = dict_info.get(wav_fpath.name) # try with extension
+                if not words:
+                    print("no wordS")
+                    continue
+                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+
+                wav, text = _split_on_silences(wav_fpath, words, hparams)
+                result = _process_utterance(wav, text, out_dir, sub_basename,
+                                            False, hparams, encoder_model_fpath)
+                if result is None:
+                    continue
+                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    return metadata
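In the pinyin part of this hunk, the `filter`/`map` pipeline is a lazy equivalent of the old list comprehension and is consumed by the following `" ".join(res)`; for the non-empty syllables that `pypinyin` returns, the two forms give the same string. A small sketch:

```python
from pypinyin import Style, pinyin

words = "欢迎使用工具箱"
resp = pinyin(words, style=Style.TONE3)  # e.g. [['huan1'], ['ying2'], ...]

res_old = " ".join(v[0] for v in resp if v[0].strip())
res_new = " ".join(filter(lambda v: not v.isspace(), map(lambda v: v[0], resp)))

assert res_old == res_new
```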
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):


 def build_mel_basis():
-    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
+    return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


 def normalize(S):
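As with `resample`, `librosa.filters.mel` became keyword-only in librosa 0.10, so `sr` and `n_fft` have to be named. A minimal sketch; the hyperparameter values stand in for the real `hp` settings and are assumptions:

```python
import librosa

sample_rate, n_fft, num_mels, fmin = 16000, 800, 80, 55  # assumed hp values for illustration

mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin)
print(mel_basis.shape)  # (80, 401) == (num_mels, n_fft // 2 + 1)
```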
@@ -1,9 +1,8 @@
 umap-learn
 visdom
-librosa==0.8.1
+librosa
 matplotlib>=3.3.0
-numpy==1.19.3; platform_system == "Windows"
-numpy==1.19.4; platform_system != "Windows"
+numpy
 scipy>=1.0.0
 tqdm
 sounddevice
@@ -13,22 +12,22 @@ inflect
 PyQt5
 multiprocess
 numba
-webrtcvad; platform_system != "Windows"
+webrtcvad
 pypinyin
 flask
 flask_wtf
-flask_cors==3.0.10
-gevent==21.8.0
+flask_cors
+gevent
 flask_restx
-tensorboard==1.15
-streamlit==1.8.0
-PyYAML==5.4.1
+tensorboard
+streamlit
+PyYAML
 torch_complex
 espnet
 PyWavelets
 monotonic-align==0.0.3
-transformers==4.26.0
+transformers
 fastapi
 loguru
 typer[all]
-click==8.0.4
+click