Improve the precision of audio processing

2025-05-06 20:01:37 +08:00 · 2023-04-18 18:17:29 +09:00 · 2023-04-18 18:17:29 +09:00 · 30d5f02a3d
commit 30d5f02a3d
parent a4c64b0253
3 changed files with 15 additions and 7 deletions
--- a/extract_f0_print.py
+++ b/extract_f0_print.py
@ -33,7 +33,9 @@ class FeatureInput(object):
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
-        x, sr = librosa.load(path, self.fs)
+        # The default resample type of librosa.resample is "soxr_hq".
+        # Quality: soxr_vhq > soxr_hq, so request the higher-quality resampler.
+        x, sr = librosa.load(path, self.fs, res_type='soxr_vhq')
        p_len = x.shape[0] // self.hop
        assert sr == self.fs
        if f0_method == "pm":
--- a/my_utils.py
+++ b/my_utils.py
@ -12,10 +12,10 @@ def load_audio(file, sr):
        )  # 防止小白拷路径头尾带了空格和"和回车
        out, _ = (
            ffmpeg.input(file, threads=0)
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load audio: {e}")

-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+    return np.frombuffer(out, np.float32).flatten()
--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@ -59,19 +59,25 @@ class PreProcess:
        wavfile.write(
            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
            self.sr,
-            (tmp_audio * 32768).astype(np.int16),
+            (tmp_audio * 1).astype(np.float32),
        )
-        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
+
+        # The default resample type of librosa.resample is "soxr_hq".
+        # Quality: soxr_vhq > soxr_hq, so request the higher-quality resampler.
+        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq")
        wavfile.write(
            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
            16000,
-            (tmp_audio * 32768).astype(np.int16),
+            (tmp_audio * 1).astype(np.float32),
        )

    def pipeline(self, path, idx0):
        try:
            audio = load_audio(path, self.sr)
-            audio = signal.filtfilt(self.bh, self.ah, audio)
+            # A zero-phase digital filter (filtfilt) causes pre-ringing noise, so use lfilter instead.
+            # audio = signal.filtfilt(self.bh, self.ah, audio) 
+            audio = signal.lfilter(self.bh, self.ah, audio)
+
            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0