blob: d692ab51c8771f4f292e13236087cfbfcc19ffa2 [file] [log] [blame]
# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
3
4import numpy as np
5
6from context import preprocess
7
# Fixed audio fixture: 512 integer samples (64 rows of 8). The tests in this
# module divide it by 2 ** 15 before passing it to the MFCC code.
# NOTE(review): the 2 ** 15 scaling suggests 16-bit PCM samples - confirm.
test_wav = [
    -3, 0, 1, -1, 2, 3, -2, 2,
    1, -2, 0, 3, -1, 8, 3, 2,
    -1, -1, 2, 7, 3, 5, 6, 6,
    6, 12, 5, 6, 3, 3, 5, 4,
    4, 6, 7, 7, 7, 3, 7, 2,
    8, 4, 4, 2, -4, -1, -1, -4,
    2, 1, -1, -4, 0, -7, -6, -2,
    -5, 1, -5, -1, -7, -3, -3, -7,
    0, -3, 3, -5, 0, 1, -2, -2,
    -3, -3, -7, -3, -2, -6, -5, -8,
    -2, -8, 4, -9, -4, -9, -5, -5,
    -3, -9, -3, -9, -1, -7, -4, 1,
    -3, 2, -8, -4, -4, -5, 1, -3,
    -1, 0, -1, -2, -3, -2, -4, -1,
    1, -1, 3, 0, 3, 2, 0, 0,
    0, -3, 1, 1, 0, 8, 3, 4,
    1, 5, 6, 4, 7, 3, 3, 0,
    3, 6, 7, 6, 4, 5, 9, 9,
    5, 5, 8, 1, 6, 9, 6, 6,
    7, 1, 8, 1, 5, 0, 5, 5,
    0, 3, 2, 7, 2, -3, 3, 0,
    3, 0, 0, 0, 2, 0, -1, -1,
    -2, -3, -8, 0, 1, 0, -3, -3,
    -3, -2, -3, -3, -4, -6, -2, -8,
    -9, -4, -1, -5, -3, -3, -4, -3,
    -6, 3, 0, -1, -2, -9, -4, -2,
    2, -1, 3, -5, -5, -2, 0, -2,
    0, -1, -3, 1, -2, 9, 4, 5,
    2, 2, 1, 0, -6, -2, 0, 0,
    0, -1, 4, -4, 3, -7, -1, 5,
    -6, -1, -5, 4, 3, 9, -2, 1,
    3, 0, 0, -2, 1, 2, 1, 1,
    0, 3, 2, -1, 3, -3, 7, 0,
    0, 3, 2, 2, -2, 3, -2, 2,
    -3, 4, -1, -1, -5, -1, -3, -2,
    1, -1, 3, 2, 4, 1, 2, -2,
    0, 2, 7, 0, 8, -3, 6, -3,
    6, 1, 2, -3, -1, -1, -1, 1,
    -2, 2, 1, 2, 0, -2, 3, -2,
    3, -2, 1, 0, -3, -1, -2, -4,
    -6, -5, -8, -1, -4, 0, -3, -1,
    -1, -1, 0, -2, -3, -7, -1, 0,
    1, 5, 0, 5, 1, 1, -3, 0,
    -6, 3, -8, 4, -8, 6, -6, 1,
    -6, -2, -5, -6, 0, -5, 4, -1,
    4, -2, 1, 2, 1, 0, -2, 0,
    0, 2, -2, 2, -5, 2, 0, -2,
    1, -2, 0, 5, 1, 0, 1, 5,
    0, 8, 3, 2, 2, 0, 5, -2,
    3, 1, 0, 1, 0, -2, -1, -3,
    1, -1, 3, 0, 3, 0, -2, -1,
    -4, -4, -4, -1, -4, -4, -3, -6,
    -3, -7, -3, -1, -2, 0, -5, -4,
    -7, -3, -2, -2, 1, 2, 2, 8,
    5, 4, 2, 4, 3, 5, 0, 3,
    3, 6, 4, 2, 2, -2, 4, -2,
    3, 3, 2, 1, 1, 4, -5, 2,
    -3, 0, -1, 1, -2, 2, 5, 1,
    4, 2, 3, 1, -1, 1, 0, 6,
    0, -2, -1, 1, -1, 2, -5, -1,
    -5, -1, -6, -3, -3, 2, 4, 0,
    -1, -5, 3, -4, -1, -3, -4, 1,
    -4, 1, -1, -1, 0, -5, -4, -2,
    -1, -1, -3, -7, -3, -3, 4, 4,
]
74
def test_mel_scale_function_with_htk_true():
    """Check ``MFCC.mel_scale`` for the input 16 with the HTK flag set to True."""
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    # NOTE(review): the params flag is False although this test targets the HTK
    # variant; the flag actually under test is the explicit argument passed to
    # mel_scale() below. Presumably the params flag does not affect that call -
    # confirm against preprocess.MFCC.
    use_htk = False
    n_FFT = 512

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)

    mel = mfcc_inst.mel_scale(16, True)

    assert np.isclose(mel, 25.470010570730597)
94
95
def test_mel_scale_function_with_htk_false():
    """Check ``MFCC.mel_scale`` for the input 16 with the HTK flag set to False."""
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)

    mel = mfcc_inst.mel_scale(16, False)

    assert np.isclose(mel, 0.24)
115
116
def test_inverse_mel_scale_function_with_htk_true():
    """Check ``MFCC.inv_mel_scale`` for the input 16 with the HTK flag set to True."""
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    # NOTE(review): the params flag is False although this test targets the HTK
    # variant; the flag actually under test is the explicit argument passed to
    # inv_mel_scale() below. Presumably the params flag does not affect that
    # call - confirm against preprocess.MFCC.
    use_htk = False
    n_FFT = 512

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)

    inverse_mel = mfcc_inst.inv_mel_scale(16, True)

    assert np.isclose(inverse_mel, 10.008767240008943)
136
137
def test_inverse_mel_scale_function_with_htk_false():
    """Check ``MFCC.inv_mel_scale`` for the input 16 with the HTK flag set to False."""
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)

    inverse_mel = mfcc_inst.inv_mel_scale(16, False)

    assert np.isclose(inverse_mel, 1071.170287494467)
157
158
def test_create_mel_filter_bank():
    """Check the size of the mel filter bank and the weights of selected filters.

    The expected weights are compared numerically (shape plus ``np.allclose``)
    rather than through ``str(ndarray)``, because the string form of an array
    depends on NumPy's print options and version and is not a stable contract.
    """
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)

    mel_filter_bank = mfcc_inst.create_mel_filter_bank()

    # One filter per requested filter-bank bin.
    assert len(mel_filter_bank) == num_fbank_bins

    # Filter index -> expected weights, quoted to 8 decimal places.
    expected_filters = {
        0: [0.02837754],
        1: [0.01438901, 0.01398853],
        2: [0.02877802],
        3: [0.04236608],
        4: [0.00040047, 0.02797707],
        5: [0.01478948, 0.01358806],
        50: [0.03298853],
        100: [0.00260166, 0.00588759, 0.00914814, 0.00798015, 0.00476919, 0.00158245],
    }

    for index, expected in expected_filters.items():
        actual = np.asarray(mel_filter_bank[index])
        # Guard the length explicitly: allclose would otherwise broadcast.
        assert actual.shape == (len(expected),)
        # atol matches the 8-decimal resolution of the expected literals.
        assert np.allclose(actual, expected, rtol=0.0, atol=1e-8)
187
188
def test_mfcc_compute():
    """Check the first 13 values returned by ``MFCC.mfcc_compute`` for the ``test_wav`` fixture."""
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    # Scale the integer fixture by 2 ** 15 before feature extraction.
    audio_data = np.array(test_wav) / (2 ** 15)

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)
    mfcc_inst = preprocess.MFCC(mfcc_params)
    mfcc_feats = mfcc_inst.mfcc_compute(audio_data)

    # Reference values for mfcc_feats[0] .. mfcc_feats[12], in order.
    expected_feats = [
        -834.9656973095651,
        21.026915475076322,
        18.628541708201688,
        7.341153529494758,
        18.907974386153214,
        -5.360387487466194,
        6.523572638527085,
        -11.270643644983316,
        8.375177203773777,
        12.06721844362991,
        8.30815892468875,
        -13.499911910889917,
        -18.176121251436165,
    ]

    # Compared element by element so a failure pinpoints the offending index.
    for index, expected in enumerate(expected_feats):
        assert np.isclose(mfcc_feats[index], expected)
220
221
def test_sliding_window_for_small_num_samples():
    """Check selected values of the first row from ``Preprocessor.extract_features``.

    The input is the ``test_wav`` fixture, scaled by 2 ** 15 and tiled 9 times.
    """
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    mode_input_size = 9
    stride = 160
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    audio_data = np.array(test_wav) / (2 ** 15)

    # Repeat the fixture 9 times to build a longer signal.
    full_audio_data = np.tile(audio_data, 9)

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)
    mfcc_inst = preprocess.MFCC(mfcc_params)
    preprocessor = preprocess.Preprocessor(mfcc_inst, mode_input_size, stride)

    input_tensor = preprocessor.extract_features(full_audio_data)

    # Column index in the first row -> reference value.
    expected_first_row = {
        0: -3.4660944830426454,
        1: 0.3587718932127629,
        2: 0.3480551325669172,
        3: 0.2976191917228921,
        4: 0.3493037340849936,
        5: 0.2408643285767937,
        6: 0.2939659585037282,
        7: 0.2144552669573928,
        8: 0.302239565899944,
        9: 0.3187368787077345,
        10: 0.3019401051295793,
        11: 0.20449412797602678,
        38: -0.18751440767749533,
    }

    first_row = input_tensor[0]
    for index, expected in expected_first_row.items():
        assert np.isclose(first_row[index], expected)
260
261
def test_sliding_window_for_wav_2_letter_sized_input():
    """Check the output dimensions of ``Preprocessor.extract_features`` for 47712 zero samples.

    NOTE(review): the test name suggests these are the input dimensions of a
    Wav2Letter model - confirm.
    """
    samp_freq = 16000
    frame_len_ms = 32
    # Frame length in samples; deliberately left as the float the product yields.
    frame_len_samples = samp_freq * frame_len_ms * 0.001
    num_mfcc_feats = 13
    mode_input_size = 296
    stride = 160
    num_fbank_bins = 128
    mel_lo_freq = 0
    mel_hi_freq = 8000
    use_htk = False
    n_FFT = 512

    # All-zero integer signal: only the output dimensions are checked here.
    audio_data = np.zeros(47712, dtype=int)

    mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mel_hi_freq, num_mfcc_feats,
                                        frame_len_samples, use_htk, n_FFT)

    mfcc_inst = preprocess.MFCC(mfcc_params)
    preprocessor = preprocess.Preprocessor(mfcc_inst, mode_input_size, stride)

    input_tensor = preprocessor.extract_features(audio_data)

    # NOTE(review): 39 equals 3 * num_mfcc_feats - presumably the MFCCs plus two
    # further feature sets of the same width; confirm against Preprocessor.
    assert len(input_tensor[0]) == 39
    # One output row per unit of mode_input_size.
    assert len(input_tensor) == mode_input_size