Blame - python/pyarmnn/examples/speech_recognition/tests/test_mfcc.py - ml/armnn

blob: d692ab51c8771f4f292e13236087cfbfcc19ffa2 [file] [log] [blame]

Éanna Ó Catháin	145c88f	2020-11-16 14:12:11 +0000	[diff] [blame]	1	# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
				2	# SPDX-License-Identifier: MIT
				3
				4	import numpy as np
				5
				6	from context import preprocess
				7
# Shared audio fixture: 512 raw integer samples (64 rows of 8 values).
# The tests below divide these values by 2 ** 15 before feature extraction,
# so they are presumably 16-bit PCM amplitudes -- TODO confirm.
test_wav = [
-3,0,1,-1,2,3,-2,2,
1,-2,0,3,-1,8,3,2,
-1,-1,2,7,3,5,6,6,
6,12,5,6,3,3,5,4,
4,6,7,7,7,3,7,2,
8,4,4,2,-4,-1,-1,-4,
2,1,-1,-4,0,-7,-6,-2,
-5,1,-5,-1,-7,-3,-3,-7,
0,-3,3,-5,0,1,-2,-2,
-3,-3,-7,-3,-2,-6,-5,-8,
-2,-8,4,-9,-4,-9,-5,-5,
-3,-9,-3,-9,-1,-7,-4,1,
-3,2,-8,-4,-4,-5,1,-3,
-1,0,-1,-2,-3,-2,-4,-1,
1,-1,3,0,3,2,0,0,
0,-3,1,1,0,8,3,4,
1,5,6,4,7,3,3,0,
3,6,7,6,4,5,9,9,
5,5,8,1,6,9,6,6,
7,1,8,1,5,0,5,5,
0,3,2,7,2,-3,3,0,
3,0,0,0,2,0,-1,-1,
-2,-3,-8,0,1,0,-3,-3,
-3,-2,-3,-3,-4,-6,-2,-8,
-9,-4,-1,-5,-3,-3,-4,-3,
-6,3,0,-1,-2,-9,-4,-2,
2,-1,3,-5,-5,-2,0,-2,
0,-1,-3,1,-2,9,4,5,
2,2,1,0,-6,-2,0,0,
0,-1,4,-4,3,-7,-1,5,
-6,-1,-5,4,3,9,-2,1,
3,0,0,-2,1,2,1,1,
0,3,2,-1,3,-3,7,0,
0,3,2,2,-2,3,-2,2,
-3,4,-1,-1,-5,-1,-3,-2,
1,-1,3,2,4,1,2,-2,
0,2,7,0,8,-3,6,-3,
6,1,2,-3,-1,-1,-1,1,
-2,2,1,2,0,-2,3,-2,
3,-2,1,0,-3,-1,-2,-4,
-6,-5,-8,-1,-4,0,-3,-1,
-1,-1,0,-2,-3,-7,-1,0,
1,5,0,5,1,1,-3,0,
-6,3,-8,4,-8,6,-6,1,
-6,-2,-5,-6,0,-5,4,-1,
4,-2,1,2,1,0,-2,0,
0,2,-2,2,-5,2,0,-2,
1,-2,0,5,1,0,1,5,
0,8,3,2,2,0,5,-2,
3,1,0,1,0,-2,-1,-3,
1,-1,3,0,3,0,-2,-1,
-4,-4,-4,-1,-4,-4,-3,-6,
-3,-7,-3,-1,-2,0,-5,-4,
-7,-3,-2,-2,1,2,2,8,
5,4,2,4,3,5,0,3,
3,6,4,2,2,-2,4,-2,
3,3,2,1,1,4,-5,2,
-3,0,-1,1,-2,2,5,1,
4,2,3,1,-1,1,0,6,
0,-2,-1,1,-1,2,-5,-1,
-5,-1,-6,-3,-3,2,4,0,
-1,-5,3,-4,-1,-3,-4,1,
-4,1,-1,-1,0,-5,-4,-2,
-1,-1,-3,-7,-3,-3,4,4,
]
				74
				75	def test_mel_scale_function_with_htk_true():
				76	samp_freq = 16000
				77	frame_len_ms = 32
				78	frame_len_samples = samp_freq * frame_len_ms * 0.001
				79	num_mfcc_feats = 13
				80	num_fbank_bins = 128
				81	mel_lo_freq = 0
				82	mil_hi_freq = 8000
				83	use_htk = False
				84	n_FFT = 512
				85
				86	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				87	frame_len_samples, use_htk, n_FFT)
				88
				89	mfcc_inst = preprocess.MFCC(mfcc_params)
				90
				91	mel = mfcc_inst.mel_scale(16, True)
				92
				93	assert np.isclose(mel, 25.470010570730597)
				94
				95
				96	def test_mel_scale_function_with_htk_false():
				97	samp_freq = 16000
				98	frame_len_ms = 32
				99	frame_len_samples = samp_freq * frame_len_ms * 0.001
				100	num_mfcc_feats = 13
				101	num_fbank_bins = 128
				102	mel_lo_freq = 0
				103	mil_hi_freq = 8000
				104	use_htk = False
				105	n_FFT = 512
				106
				107	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				108	frame_len_samples, use_htk, n_FFT)
				109
				110	mfcc_inst = preprocess.MFCC(mfcc_params)
				111
				112	mel = mfcc_inst.mel_scale(16, False)
				113
				114	assert np.isclose(mel, 0.24)
				115
				116
				117	def test_inverse_mel_scale_function_with_htk_true():
				118	samp_freq = 16000
				119	frame_len_ms = 32
				120	frame_len_samples = samp_freq * frame_len_ms * 0.001
				121	num_mfcc_feats = 13
				122	num_fbank_bins = 128
				123	mel_lo_freq = 0
				124	mil_hi_freq = 8000
				125	use_htk = False
				126	n_FFT = 512
				127
				128	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				129	frame_len_samples, use_htk, n_FFT)
				130
				131	mfcc_inst = preprocess.MFCC(mfcc_params)
				132
				133	mel = mfcc_inst.inv_mel_scale(16, True)
				134
				135	assert np.isclose(mel, 10.008767240008943)
				136
				137
				138	def test_inverse_mel_scale_function_with_htk_false():
				139	samp_freq = 16000
				140	frame_len_ms = 32
				141	frame_len_samples = samp_freq * frame_len_ms * 0.001
				142	num_mfcc_feats = 13
				143	num_fbank_bins = 128
				144	mel_lo_freq = 0
				145	mil_hi_freq = 8000
				146	use_htk = False
				147	n_FFT = 512
				148
				149	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				150	frame_len_samples, use_htk, n_FFT)
				151
				152	mfcc_inst = preprocess.MFCC(mfcc_params)
				153
				154	mel = mfcc_inst.inv_mel_scale(16, False)
				155
				156	assert np.isclose(mel, 1071.170287494467)
				157
				158
				159	def test_create_mel_filter_bank():
				160	samp_freq = 16000
				161	frame_len_ms = 32
				162	frame_len_samples = samp_freq * frame_len_ms * 0.001
				163	num_mfcc_feats = 13
				164	num_fbank_bins = 128
				165	mel_lo_freq = 0
				166	mil_hi_freq = 8000
				167	use_htk = False
				168	n_FFT = 512
				169
				170	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				171	frame_len_samples, use_htk, n_FFT)
				172
				173	mfcc_inst = preprocess.MFCC(mfcc_params)
				174
				175	mel_filter_bank = mfcc_inst.create_mel_filter_bank()
				176
				177	assert len(mel_filter_bank) == 128
				178
				179	assert str(mel_filter_bank[0]) == "[0.02837754]"
				180	assert str(mel_filter_bank[1]) == "[0.01438901 0.01398853]"
				181	assert str(mel_filter_bank[2]) == "[0.02877802]"
				182	assert str(mel_filter_bank[3]) == "[0.04236608]"
				183	assert str(mel_filter_bank[4]) == "[0.00040047 0.02797707]"
				184	assert str(mel_filter_bank[5]) == "[0.01478948 0.01358806]"
				185	assert str(mel_filter_bank[50]) == "[0.03298853]"
				186	assert str(mel_filter_bank[100]) == "[0.00260166 0.00588759 0.00914814 0.00798015 0.00476919 0.00158245]"
				187
				188
				189	def test_mfcc_compute():
				190	samp_freq = 16000
				191	frame_len_ms = 32
				192	frame_len_samples = samp_freq * frame_len_ms * 0.001
				193	num_mfcc_feats = 13
				194	num_fbank_bins = 128
				195	mel_lo_freq = 0
				196	mil_hi_freq = 8000
				197	use_htk = False
				198	n_FFT = 512
				199
				200	audio_data = np.array(test_wav) / (2 ** 15)
				201
				202	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				203	frame_len_samples, use_htk, n_FFT)
				204	mfcc_inst = preprocess.MFCC(mfcc_params)
				205	mfcc_feats = mfcc_inst.mfcc_compute(audio_data)
				206
				207	assert np.isclose((mfcc_feats[0]), -834.9656973095651)
				208	assert np.isclose((mfcc_feats[1]), 21.026915475076322)
				209	assert np.isclose((mfcc_feats[2]), 18.628541708201688)
				210	assert np.isclose((mfcc_feats[3]), 7.341153529494758)
				211	assert np.isclose((mfcc_feats[4]), 18.907974386153214)
				212	assert np.isclose((mfcc_feats[5]), -5.360387487466194)
				213	assert np.isclose((mfcc_feats[6]), 6.523572638527085)
				214	assert np.isclose((mfcc_feats[7]), -11.270643644983316)
				215	assert np.isclose((mfcc_feats[8]), 8.375177203773777)
				216	assert np.isclose((mfcc_feats[9]), 12.06721844362991)
				217	assert np.isclose((mfcc_feats[10]), 8.30815892468875)
				218	assert np.isclose((mfcc_feats[11]), -13.499911910889917)
				219	assert np.isclose((mfcc_feats[12]), -18.176121251436165)
				220
				221
				222	def test_sliding_window_for_small_num_samples():
				223	samp_freq = 16000
				224	frame_len_ms = 32
				225	frame_len_samples = samp_freq * frame_len_ms * 0.001
				226	num_mfcc_feats = 13
				227	mode_input_size = 9
				228	stride = 160
				229	num_fbank_bins = 128
				230	mel_lo_freq = 0
				231	mil_hi_freq = 8000
				232	use_htk = False
				233	n_FFT = 512
				234
				235	audio_data = np.array(test_wav) / (2 ** 15)
				236
				237	full_audio_data = np.tile(audio_data, 9)
				238
				239	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				240	frame_len_samples, use_htk, n_FFT)
				241	mfcc_inst = preprocess.MFCC(mfcc_params)
				242	preprocessor = preprocess.Preprocessor(mfcc_inst, mode_input_size, stride)
				243
				244	input_tensor = preprocessor.extract_features(full_audio_data)
				245
				246	assert np.isclose(input_tensor[0][0], -3.4660944830426454)
				247	assert np.isclose(input_tensor[0][1], 0.3587718932127629)
				248	assert np.isclose(input_tensor[0][2], 0.3480551325669172)
				249	assert np.isclose(input_tensor[0][3], 0.2976191917228921)
				250	assert np.isclose(input_tensor[0][4], 0.3493037340849936)
				251	assert np.isclose(input_tensor[0][5], 0.2408643285767937)
				252	assert np.isclose(input_tensor[0][6], 0.2939659585037282)
				253	assert np.isclose(input_tensor[0][7], 0.2144552669573928)
				254	assert np.isclose(input_tensor[0][8], 0.302239565899944)
				255	assert np.isclose(input_tensor[0][9], 0.3187368787077345)
				256	assert np.isclose(input_tensor[0][10], 0.3019401051295793)
				257	assert np.isclose(input_tensor[0][11], 0.20449412797602678)
				258
				259	assert np.isclose(input_tensor[0][38], -0.18751440767749533)
				260
				261
				262	def test_sliding_window_for_wav_2_letter_sized_input():
				263	samp_freq = 16000
				264	frame_len_ms = 32
				265	frame_len_samples = samp_freq * frame_len_ms * 0.001
				266	num_mfcc_feats = 13
				267	mode_input_size = 296
				268	stride = 160
				269	num_fbank_bins = 128
				270	mel_lo_freq = 0
				271	mil_hi_freq = 8000
				272	use_htk = False
				273	n_FFT = 512
				274
				275	audio_data = np.zeros(47712, dtype=int)
				276
				277	mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
				278	frame_len_samples, use_htk, n_FFT)
				279
				280	mfcc_inst = preprocess.MFCC(mfcc_params)
				281	preprocessor = preprocess.Preprocessor(mfcc_inst, mode_input_size, stride)
				282
				283	input_tensor = preprocessor.extract_features(audio_data)
				284
				285	assert len(input_tensor[0]) == 39
				286	assert len(input_tensor) == 296