shithub: opus

Download patch

ref: 587c1020feea25920851e984f7e2aef784263a57
parent: 7487168d52edd79c5dba9c10007c1aa821893a76
author: Jan Buethe <jbuethe@amazon.de>
date: Sat Jul 22 11:16:23 EDT 2023

clean-up

--- a/dnn/torch/testsuite/run_test.py
+++ b/dnn/torch/testsuite/run_test.py
@@ -37,7 +37,6 @@
 import yaml
 
 from utils.files import get_wave_file_list
-from utils.warpq import compute_WAPRQ
 from utils.pesq import compute_PESQ
 from utils.pitch import compute_pitch_error
 
@@ -51,7 +50,7 @@
 parser.add_argument('--fs', type=int, help="sampling rate at which input is presented as wave file (defaults to 16000)", default=16000)
 parser.add_argument('--num-workers', type=int, help="number of subprocesses to be used (default=4)", default=4)
 parser.add_argument('--plc-suffix', type=str, default="_is_lost.txt", help="suffix of plc error pattern file: only relevant if command chain uses PLCFILE (default=_is_lost.txt)")
-parser.add_argument('--metrics', type=str, default='warpq', help='comma separated string of metrics, supported: {{"warpq", "pesq"}}, default="warpq"')
+parser.add_argument('--metrics', type=str, default='pesq', help='comma separated string of metrics, supported: {{"pesq", "pitch_error", "voicing_error"}}, default="pesq"')
 parser.add_argument('--verbose', action='store_true', help='enables printouts of all commands run in the pipeline')
 
 def check_for_sox_in_path():
@@ -69,7 +68,7 @@
         raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}")
 
 
-def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'warpq'}, plc_suffix="_is_lost.txt", verbose=False):
+def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'pesq'}, plc_suffix="_is_lost.txt", verbose=False):
 
     # prepare model input
     model_input = output_path + ".resamp.wav"
@@ -86,10 +85,7 @@
     scores = dict()
     cache = dict()
     for metric in metrics:
-        if metric == 'warpq':
-            # run warpq
-            score = compute_WAPRQ(input_path, output_path, sr=fs)
-        elif metric == 'pesq':
+        if metric == 'pesq':
             # run pesq
             score = compute_PESQ(input_path, output_path, fs=fs)
         elif metric == 'pitch_error':
@@ -241,7 +237,6 @@
                 """)
 
 metric_sorting_signs = {
-    'warpq'         : -1,
     'pesq'          : 1,
     'pitch_error'   : -1,
     'voicing_error' : -1
--- a/dnn/torch/testsuite/utils/warpq.py
+++ /dev/null
@@ -1,177 +1,0 @@
-
-"""
-WARP-Q: Quality Prediction For Generative Neural Speech Codecs
-
-This is the WARP-Q version used in the ICASSP 2021 Paper:
-
-W. A. Jassim, J. Skoglund, M. Chinen, and A. Hines, “WARP-Q: Quality prediction
-for generative neural speech codecs,” paper accepted for presentation at the 2021 IEEE
-International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021).
-Date of acceptance: 30 Jan 2021. Preprint: https://arxiv.org/pdf/2102.10449
-
-Run using python 3.x and include these package dependencies in your virtual environment:
-    - pandas
-    - librosa
-    - numpy
-    - pyvad
-    - skimage
-    - speechpy
-    - soundfile
-    - scipy (optional)
-    - seaborn (optional, for plotting only)
-    - multiprocessing (optional, for parallel computing mode only)
-    - joblib (optional, for parallel computing mode only)
-
-Input:
-    - The main_test function calls a csv file that contains paths of audio files.
-    - The csv file cosists of four columns:
-        - Ref_Wave: reference speech
-        - Test_Wave: test speech
-        - MOS: subjective score (optinal, for plotting only)
-        - Codec: type of speech codec for the test speech (optinal, for plotting only)
-
-Output:
-    - Code will compute the WARP-Q quality scores between Ref_Wave and Test_Wave,
-    and will store the obrained results in a new column in the same csv file.
-
-
-Releases:
-
-Warning: While this code has been tested and commented giving invalid input
-files may cause unexpected results and will not be caught by robust exception
-handling or validation checking. It will just fail or give you the wrong answer.
-
-In this simple and basic demo, we compute WARP-Q scores for 8 speech samples only.
-More data should should be provided to have better score distributions.
-
-
-(c) Dr Wissam Jassim
-    University College Dublin
-    wissam.a.jassim@gmail.com
-    wissam.jassim@ucd.ie
-    November 28, 2020
-
-"""
-
-# Load libraries
-import librosa, librosa.core, librosa.display
-import numpy as np
-from pyvad import vad
-from skimage.util.shape import view_as_windows
-import speechpy
-import soundfile as sf
-
-################################ WARP-Q #######################################
-def compute_WAPRQ(ref_path,test_path,sr=16000,n_mfcc=12,fmax=5000,patch_size=0.4,
-                  sigma=np.array([[1,1],[3,2],[1,3]])):
-
-    # Inputs:
-    # refPath: path of reference speech
-    # disPath: path pf degraded speech
-    # sr: sampling frequency, Hz
-    # n_mfcc: number of MFCCs
-    # fmax: cutoff frequency
-    # patch_size: size of each patch in s
-    # sigma: step size conditon for DTW
-
-    # Output:
-    # WARP-Q quality score between refPath and disPath
-
-
-    ####################### Load speech files #################################
-    # Load Ref Speech
-    if ref_path[-4:] == '.wav':
-        speech_Ref, sr_Ref = librosa.load(ref_path,sr=sr)
-    else:
-        if ref_path[-4:] == '.SRC': #For ITUT database if applicable
-            speech_Ref, sr_Ref  = sf.read(ref_path, format='RAW', channels=1, samplerate=16000,
-                           subtype='PCM_16', endian='LITTLE')
-            if sr_Ref != sr:
-                speech_Ref = librosa.resample(speech_Ref, sr_Ref, sr)
-                sr_Ref = sr
-
-    # Load Coded Speech
-    if test_path[-4:] == '.wav':
-        speech_Coded, sr_Coded = librosa.load(test_path,sr=sr)
-    else:
-        if test_path[-4:] == '.OUT': #For ITUT database if applicable
-            speech_Coded, sr_Coded  = sf.read(test_path, format='RAW', channels=1, samplerate=16000,
-                           subtype='PCM_16', endian='LITTLE')
-            if sr_Coded != sr:
-                speech_Coded = librosa.resample(speech_Coded, sr_Coded, sr)
-                sr_Coded = sr
-
-    if sr_Ref != sr_Coded:
-        raise ValueError("Reference and degraded signals should have same sampling rate!")
-
-    # Make sure amplitudes are in the range of [-1, 1] otherwise clipping to -1 to 1
-    # after resampling (if applicable). We experienced this issue for TCD-VOIP database only
-    speech_Ref[speech_Ref>1]=1.0
-    speech_Ref[speech_Ref<-1]=-1.0
-
-    speech_Coded[speech_Coded>1]=1.0
-    speech_Coded[speech_Coded<-1]=-1.0
-
-    ###########################################################################
-
-    win_length = int(0.032*sr) #32 ms frame
-    hop_length = int(0.004*sr) #4 ms overlap
-    #hop_length = int(0.016*sr)
-
-    n_fft = 2*win_length
-    lifter = 3
-
-    # DTW Parameters
-    Metric = 'euclidean'
-
-    # VAD Parameters
-    hop_size_vad = 30
-    sr_vad = sr
-    aggresive = 0
-
-    # VAD for Ref speech
-    vact1 = vad(speech_Ref, sr, fs_vad = sr_vad, hop_length = hop_size_vad, vad_mode=aggresive)
-    speech_Ref_vad = speech_Ref[vact1==1]
-
-    # VAD for Coded speech
-    vact2 = vad(speech_Coded, sr, fs_vad = sr_vad, hop_length = hop_size_vad, vad_mode=aggresive)
-    speech_Coded_vad = speech_Coded[vact2==1]
-
-    # Compute MFCC features for the two signals
-
-    mfcc_Ref = librosa.feature.mfcc(y=speech_Ref_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax,
-                                    n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter)
-    mfcc_Coded = librosa.feature.mfcc(y=speech_Coded_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax,
-                                    n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter)
-
-    # Feature Normalisation using CMVNW method
-    mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T,win_size=201,variance_normalization=True).T
-    mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T,win_size=201,variance_normalization=True).T
-
-    # Divid MFCC features of Coded speech into patches
-    cols = int(patch_size/(hop_length/sr))
-    window_shape = (np.size(mfcc_Ref,0), cols)
-    step  = int(cols/2)
-
-    mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step)
-
-    Acc =[]
-    band_rad = 0.25
-    weights_mul=np.array([1, 1, 1])
-
-    # Compute alignment cose between each patch and Ref MFCC
-    for i in range(mfcc_Coded_patch.shape[1]):
-
-        patch = mfcc_Coded_patch[0][i]
-
-        D, P = librosa.sequence.dtw(X=patch, Y=mfcc_Ref, metric=Metric,
-                                    step_sizes_sigma=sigma, weights_mul=weights_mul,
-                                    band_rad=band_rad, subseq=True, backtrack=True)
-
-        P_librosa = P[::-1, :]
-        b_ast = P_librosa[-1, 1]
-
-        Acc.append(D[-1, b_ast] / D.shape[0])
-
-    # Final score
-    return np.median(Acc).item()
--