From 297d92bf5dd0d23000fdd6f9df339e25dd0925b6 Mon Sep 17 00:00:00 2001 From: autumnmotor <59357372+autumnmotor@users.noreply.github.com> Date: Sat, 22 Apr 2023 20:39:47 +0900 Subject: [PATCH] some change precision audio processing (#94) * some change precision audio processing * fix clipping problem in resample resample sometimes causes signal clipping, not just librosa.resample * fix error --- extract_f0_print.py | 4 +++- my_utils.py | 4 ++-- train/data_utils.py | 10 ++++++++-- trainset_preprocess_pipeline_print.py | 23 +++++++++++++++++++---- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/extract_f0_print.py b/extract_f0_print.py index d330c90..d2bc805 100644 --- a/extract_f0_print.py +++ b/extract_f0_print.py @@ -33,7 +33,9 @@ class FeatureInput(object): self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) def compute_f0(self, path, f0_method): - x, sr = librosa.load(path, self.fs) + # default resample type of librosa.resample is "soxr_hq". + # Quality: soxr_vhq > soxr_hq + x, sr = librosa.load(path, self.fs, res_type='soxr_vhq') p_len = x.shape[0] // self.hop assert sr == self.fs if f0_method == "pm": diff --git a/my_utils.py b/my_utils.py index 89a1527..8b7e427 100644 --- a/my_utils.py +++ b/my_utils.py @@ -12,10 +12,10 @@ def load_audio(file, sr): ) # 防止小白拷路径头尾带了空格和"和回车 out, _ = ( ffmpeg.input(file, threads=0) - .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) except Exception as e: raise RuntimeError(f"Failed to load audio: {e}") - return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + return np.frombuffer(out, np.float32).flatten() \ No newline at end of file diff --git a/train/data_utils.py b/train/data_utils.py index ee7d4d1..87a435f 100644 --- a/train/data_utils.py +++ b/train/data_utils.py @@ -98,7 +98,10 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): sampling_rate, self.sampling_rate ) ) - audio_norm = audio / self.max_wav_value + audio_norm = audio +# audio_norm = audio / self.max_wav_value +# audio_norm = audio / np.abs(audio).max() + audio_norm = audio_norm.unsqueeze(0) spec_filename = filename.replace(".wav", ".spec.pt") if os.path.exists(spec_filename): @@ -287,7 +290,10 @@ class TextAudioLoader(torch.utils.data.Dataset): sampling_rate, self.sampling_rate ) ) - audio_norm = audio / self.max_wav_value + audio_norm = audio +# audio_norm = audio / self.max_wav_value +# audio_norm = audio / np.abs(audio).max() + audio_norm = audio_norm.unsqueeze(0) spec_filename = filename.replace(".wav", ".spec.pt") if os.path.exists(spec_filename): diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index caaf533..7b5833a 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -59,19 +59,34 @@ class PreProcess: wavfile.write( "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, - (tmp_audio * 32768).astype(np.int16), + (tmp_audio * 1).astype(np.float32), ) - tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000) + + # default resample type of librosa.resample is "soxr_hq". + # Quality: soxr_vhq > soxr_hq + tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq") + tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + ( + 1 - self.alpha + ) * tmp_audio + wavfile.write( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + self.sr, + (tmp_audio * 1).astype(np.float32), + ) + wavfile.write( "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, - (tmp_audio * 32768).astype(np.int16), + (tmp_audio * 1).astype(np.float32), ) def pipeline(self, path, idx0): try: audio = load_audio(path, self.sr) - audio = signal.filtfilt(self.bh, self.ah, audio) + # zero phased digital filter cause pre-ringing noise... + # audio = signal.filtfilt(self.bh, self.ah, audio) + audio = signal.lfilter(self.bh, self.ah, audio) + idx1 = 0 for audio in self.slicer.slice(audio): i = 0