# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
| 3 | |
| 4 | import numpy as np |
| 5 | |
| 6 | from context import preprocess |
| 7 | |
# Raw integer audio samples shared by the tests below (64 rows of 8 values,
# 512 samples in total). Tests that consume it divide by 2 ** 15 first.
test_wav = [
    -3, 0, 1, -1, 2, 3, -2, 2,
    1, -2, 0, 3, -1, 8, 3, 2,
    -1, -1, 2, 7, 3, 5, 6, 6,
    6, 12, 5, 6, 3, 3, 5, 4,
    4, 6, 7, 7, 7, 3, 7, 2,
    8, 4, 4, 2, -4, -1, -1, -4,
    2, 1, -1, -4, 0, -7, -6, -2,
    -5, 1, -5, -1, -7, -3, -3, -7,
    0, -3, 3, -5, 0, 1, -2, -2,
    -3, -3, -7, -3, -2, -6, -5, -8,
    -2, -8, 4, -9, -4, -9, -5, -5,
    -3, -9, -3, -9, -1, -7, -4, 1,
    -3, 2, -8, -4, -4, -5, 1, -3,
    -1, 0, -1, -2, -3, -2, -4, -1,
    1, -1, 3, 0, 3, 2, 0, 0,
    0, -3, 1, 1, 0, 8, 3, 4,
    1, 5, 6, 4, 7, 3, 3, 0,
    3, 6, 7, 6, 4, 5, 9, 9,
    5, 5, 8, 1, 6, 9, 6, 6,
    7, 1, 8, 1, 5, 0, 5, 5,
    0, 3, 2, 7, 2, -3, 3, 0,
    3, 0, 0, 0, 2, 0, -1, -1,
    -2, -3, -8, 0, 1, 0, -3, -3,
    -3, -2, -3, -3, -4, -6, -2, -8,
    -9, -4, -1, -5, -3, -3, -4, -3,
    -6, 3, 0, -1, -2, -9, -4, -2,
    2, -1, 3, -5, -5, -2, 0, -2,
    0, -1, -3, 1, -2, 9, 4, 5,
    2, 2, 1, 0, -6, -2, 0, 0,
    0, -1, 4, -4, 3, -7, -1, 5,
    -6, -1, -5, 4, 3, 9, -2, 1,
    3, 0, 0, -2, 1, 2, 1, 1,
    0, 3, 2, -1, 3, -3, 7, 0,
    0, 3, 2, 2, -2, 3, -2, 2,
    -3, 4, -1, -1, -5, -1, -3, -2,
    1, -1, 3, 2, 4, 1, 2, -2,
    0, 2, 7, 0, 8, -3, 6, -3,
    6, 1, 2, -3, -1, -1, -1, 1,
    -2, 2, 1, 2, 0, -2, 3, -2,
    3, -2, 1, 0, -3, -1, -2, -4,
    -6, -5, -8, -1, -4, 0, -3, -1,
    -1, -1, 0, -2, -3, -7, -1, 0,
    1, 5, 0, 5, 1, 1, -3, 0,
    -6, 3, -8, 4, -8, 6, -6, 1,
    -6, -2, -5, -6, 0, -5, 4, -1,
    4, -2, 1, 2, 1, 0, -2, 0,
    0, 2, -2, 2, -5, 2, 0, -2,
    1, -2, 0, 5, 1, 0, 1, 5,
    0, 8, 3, 2, 2, 0, 5, -2,
    3, 1, 0, 1, 0, -2, -1, -3,
    1, -1, 3, 0, 3, 0, -2, -1,
    -4, -4, -4, -1, -4, -4, -3, -6,
    -3, -7, -3, -1, -2, 0, -5, -4,
    -7, -3, -2, -2, 1, 2, 2, 8,
    5, 4, 2, 4, 3, 5, 0, 3,
    3, 6, 4, 2, 2, -2, 4, -2,
    3, 3, 2, 1, 1, 4, -5, 2,
    -3, 0, -1, 1, -2, 2, 5, 1,
    4, 2, 3, 1, -1, 1, 0, 6,
    0, -2, -1, 1, -1, 2, -5, -1,
    -5, -1, -6, -3, -3, 2, 4, 0,
    -1, -5, 3, -4, -1, -3, -4, 1,
    -4, 1, -1, -1, 0, -5, -4, -2,
    -1, -1, -3, -7, -3, -3, 4, 4,
]
| 74 | |
def test_mel_scale_function_with_htk_true():
    """Check ``MFCC.mel_scale(16, True)`` against a stored reference value."""
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk_in_params = False
    fft_size = 512

    # The params object is built with the HTK flag off; the call under test
    # passes True explicitly as its second argument.
    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk_in_params, fft_size,
    )
    extractor = preprocess.MFCC(params)

    result = extractor.mel_scale(16, True)

    assert np.isclose(result, 25.470010570730597)
| 94 | |
| 95 | |
def test_mel_scale_function_with_htk_false():
    """Check ``MFCC.mel_scale(16, False)`` against a stored reference value."""
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk_in_params = False
    fft_size = 512

    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk_in_params, fft_size,
    )
    extractor = preprocess.MFCC(params)

    result = extractor.mel_scale(16, False)

    assert np.isclose(result, 0.24)
| 115 | |
| 116 | |
def test_inverse_mel_scale_function_with_htk_true():
    """Check ``MFCC.inv_mel_scale(16, True)`` against a stored reference value."""
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk_in_params = False
    fft_size = 512

    # The params object is built with the HTK flag off; the call under test
    # passes True explicitly as its second argument.
    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk_in_params, fft_size,
    )
    extractor = preprocess.MFCC(params)

    result = extractor.inv_mel_scale(16, True)

    assert np.isclose(result, 10.008767240008943)
| 136 | |
| 137 | |
def test_inverse_mel_scale_function_with_htk_false():
    """Check ``MFCC.inv_mel_scale(16, False)`` against a stored reference value."""
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk_in_params = False
    fft_size = 512

    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk_in_params, fft_size,
    )
    extractor = preprocess.MFCC(params)

    result = extractor.inv_mel_scale(16, False)

    assert np.isclose(result, 1071.170287494467)
| 157 | |
| 158 | |
def test_create_mel_filter_bank():
    """Check the size and selected rows of ``MFCC.create_mel_filter_bank()``.

    The bank is expected to hold 128 entries (one per requested filter bank
    bin). A handful of rows are compared numerically against reference
    coefficients recorded to 8 decimal places.

    The rows are compared as numbers rather than via ``str(ndarray)`` so the
    test does not depend on numpy's global print options or on the array
    formatting rules of a particular numpy release.
    """
    samp_freq = 16000
    frame_len_ms = 32
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)

    mel_filter_bank = mfcc_inst.create_mel_filter_bank()

    assert len(mel_filter_bank) == 128

    # Row index -> reference coefficients (as previously recorded, 8 decimals).
    expected_rows = {
        0: [0.02837754],
        1: [0.01438901, 0.01398853],
        2: [0.02877802],
        3: [0.04236608],
        4: [0.00040047, 0.02797707],
        5: [0.01478948, 0.01358806],
        50: [0.03298853],
        100: [0.00260166, 0.00588759, 0.00914814, 0.00798015, 0.00476919, 0.00158245],
    }

    for row, expected in expected_rows.items():
        actual = mel_filter_bank[row]
        # The string comparison implicitly pinned the number of coefficients
        # per row; keep that check explicit.
        assert len(actual) == len(expected)
        # References are rounded to 8 decimals, so allow an absolute error of
        # 1e-8 (rounding error is at most 5e-9) and no relative slack.
        np.testing.assert_allclose(actual, expected, rtol=0, atol=1e-8)
| 187 | |
| 188 | |
def test_mfcc_compute():
    """Check the first 13 values returned by ``MFCC.mfcc_compute``.

    The input is the module-level ``test_wav`` sample list divided by 2 ** 15;
    each output value is compared to a stored reference with ``np.isclose``.
    """
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk = False
    fft_size = 512

    samples = np.array(test_wav) / (2 ** 15)

    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk, fft_size,
    )
    extractor = preprocess.MFCC(params)
    features = extractor.mfcc_compute(samples)

    reference = (
        -834.9656973095651,
        21.026915475076322,
        18.628541708201688,
        7.341153529494758,
        18.907974386153214,
        -5.360387487466194,
        6.523572638527085,
        -11.270643644983316,
        8.375177203773777,
        12.06721844362991,
        8.30815892468875,
        -13.499911910889917,
        -18.176121251436165,
    )
    # Compare element by element, in index order, exactly as separate asserts.
    for position, wanted in enumerate(reference):
        assert np.isclose((features[position]), wanted)
| 220 | |
| 221 | |
def test_sliding_window_for_small_num_samples():
    """Check selected values of row 0 from ``Preprocessor.extract_features``.

    The audio is ``test_wav`` divided by 2 ** 15 and tiled 9 times; the
    preprocessor is built with an input size of 9 and a stride of 160.
    """
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    input_rows = 9
    hop = 160
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk = False
    fft_size = 512

    samples = np.array(test_wav) / (2 ** 15)
    tiled_samples = np.tile(samples, 9)

    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk, fft_size,
    )
    extractor = preprocess.MFCC(params)
    pipeline = preprocess.Preprocessor(extractor, input_rows, hop)

    input_tensor = pipeline.extract_features(tiled_samples)

    # Column index within row 0 -> stored reference value.
    reference = {
        0: -3.4660944830426454,
        1: 0.3587718932127629,
        2: 0.3480551325669172,
        3: 0.2976191917228921,
        4: 0.3493037340849936,
        5: 0.2408643285767937,
        6: 0.2939659585037282,
        7: 0.2144552669573928,
        8: 0.302239565899944,
        9: 0.3187368787077345,
        10: 0.3019401051295793,
        11: 0.20449412797602678,
        38: -0.18751440767749533,
    }
    for column, wanted in reference.items():
        assert np.isclose(input_tensor[0][column], wanted)
| 260 | |
| 261 | |
def test_sliding_window_for_wav_2_letter_sized_input():
    """Check the output dimensions of ``Preprocessor.extract_features``.

    For 47712 zero-valued integer samples, an input size of 296 and a stride
    of 160, the result must have 296 rows with 39 entries in the first row.
    """
    sample_rate = 16000
    window_ms = 32
    window_samples = sample_rate * window_ms * 0.001
    n_mfcc = 13
    input_rows = 296
    hop = 160
    n_mel_bins = 128
    low_freq, high_freq = 0, 8000
    htk = False
    fft_size = 512

    silence = np.zeros(47712, dtype=int)

    params = preprocess.MFCCParams(
        sample_rate, n_mel_bins, low_freq, high_freq, n_mfcc,
        window_samples, htk, fft_size,
    )
    extractor = preprocess.MFCC(params)
    pipeline = preprocess.Preprocessor(extractor, input_rows, hop)

    input_tensor = pipeline.extract_features(silence)

    first_row_len = len(input_tensor[0])
    assert first_row_len == 39

    row_count = len(input_tensor)
    assert row_count == 296