diff --git a/train/data_utils.py b/train/data_utils.py
index ee7d4d1..759cfbb 100644
--- a/train/data_utils.py
+++ b/train/data_utils.py
@@ -98,7 +98,12 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
                     sampling_rate, self.sampling_rate
                 )
             )
-        audio_norm = audio / self.max_wav_value
+        # Bind audio_norm before it is used below; the waveform is passed through
+        # unscaled. NOTE(review): assumes the wav is already float in [-1, 1] - confirm.
+        audio_norm = audio
+#        audio_norm = audio / self.max_wav_value
+#        audio_norm = audio / np.abs(audio).max()
+
         audio_norm = audio_norm.unsqueeze(0)
         spec_filename = filename.replace(".wav", ".spec.pt")
         if os.path.exists(spec_filename):
@@ -287,7 +292,12 @@ class TextAudioLoader(torch.utils.data.Dataset):
                     sampling_rate, self.sampling_rate
                 )
             )
-        audio_norm = audio / self.max_wav_value
+        # Bind audio_norm before it is used below; the waveform is passed through
+        # unscaled. NOTE(review): assumes the wav is already float in [-1, 1] - confirm.
+        audio_norm = audio
+#        audio_norm = audio / self.max_wav_value
+#        audio_norm = audio / np.abs(audio).max()
+
         audio_norm = audio_norm.unsqueeze(0)
         spec_filename = filename.replace(".wav", ".spec.pt")
         if os.path.exists(spec_filename):
diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py
index b698203..a771c0e 100644
--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@@ -65,6 +65,17 @@ class PreProcess:
         # default resample type of librosa.resample is "soxr_hq".
         # Quality: soxr_vhq > soxr_hq
         tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq")
+        # NOTE(review): tmp_audio was resampled to 16 kHz just above, yet it is
+        # written here with self.sr as the sample rate - confirm this is intended.
+        tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
+            1 - self.alpha
+        ) * tmp_audio
+        wavfile.write(
+            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
+            self.sr,
+            (tmp_audio * 1).astype(np.float32),
+        )
+
         wavfile.write(
             "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
             16000,