Blame - python/pyarmnn/examples/speech_recognition/wav2letter_mfcc.py - ml/armnn

blob: 1cac24d588853b9f0163c8f9e3d8a7808e630263 [file] [log] [blame]

alexander	f42f568	2021-07-16 11:30:56 +0100	[diff] [blame]	1	# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
				2	# SPDX-License-Identifier: MIT
				3
				4	import numpy as np
				5	import os
				6	import sys
				7
				8	script_dir = os.path.dirname(__file__)
				9	sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
				10
				11	from mfcc import MFCC, AudioPreprocessor
				12
				13
				14	class Wav2LetterMFCC(MFCC):
				15	"""Extends base MFCC class to provide Wav2Letter-specific MFCC requirements."""
				16
				17	def __init__(self, mfcc_params):
				18	super().__init__(mfcc_params)
				19
				20	def spectrum_calc(self, audio_data):
				21	return np.abs(np.fft.rfft(np.hanning(self.mfcc_params.frame_len + 1)[0:self.mfcc_params.frame_len] * audio_data,
				22	self.mfcc_params.n_fft)) ** 2
				23
				24	def log_mel(self, mel_energy):
				25	mel_energy += 1e-10
				26	log_mel_energy = 10.0 * np.log10(mel_energy)
				27	top_db = 80.0
				28	return np.maximum(log_mel_energy, log_mel_energy.max() - top_db)
				29
				30	def create_dct_matrix(self, num_fbank_bins, num_mfcc_feats):
				31	"""
				32	Creates the Discrete Cosine Transform matrix to be used in the compute function.
				33
				34	Args:
				35	num_fbank_bins: The number of filter bank bins
				36	num_mfcc_feats: the number of MFCC features
				37
				38	Returns:
				39	the DCT matrix
				40	"""
				41	dct_m = np.zeros(num_fbank_bins * num_mfcc_feats)
				42	for k in range(num_mfcc_feats):
				43	for n in range(num_fbank_bins):
				44	if k == 0:
				45	dct_m[(k * num_fbank_bins) + n] = 2 * np.sqrt(1 / (4 * num_fbank_bins)) * np.cos(
				46	(np.pi / num_fbank_bins) * (n + 0.5) * k)
				47	else:
				48	dct_m[(k * num_fbank_bins) + n] = 2 * np.sqrt(1 / (2 * num_fbank_bins)) * np.cos(
				49	(np.pi / num_fbank_bins) * (n + 0.5) * k)
				50
				51	dct_m = np.reshape(dct_m, [self.mfcc_params.num_mfcc_feats, self.mfcc_params.num_fbank_bins])
				52	return dct_m
				53
				54	def mel_norm(self, weight, right_mel, left_mel):
				55	"""Over-riding parent class with ASR specific weight normalisation."""
				56	enorm = 2.0 / (self.inv_mel_scale(right_mel, False) - self.inv_mel_scale(left_mel, False))
				57	return weight * enorm
				58
				59
				60	class W2LAudioPreprocessor(AudioPreprocessor):
				61
				62	def __init__(self, mfcc, model_input_size, stride):
				63	self.model_input_size = model_input_size
				64	self.stride = stride
				65
				66	super().__init__(self, model_input_size, stride)
				67	# Savitzky - Golay differential filters
				68	self.savgol_order1_coeffs = np.array([6.66666667e-02, 5.00000000e-02, 3.33333333e-02,
				69	1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
				70	-3.33333333e-02, -5.00000000e-02, -6.66666667e-02])
				71
				72	self.savgol_order2_coeffs = np.array([0.06060606, 0.01515152, -0.01731602,
				73	-0.03679654, -0.04329004, -0.03679654,
				74	-0.01731602, 0.01515152, 0.06060606])
				75	self._mfcc_calc = mfcc
				76
				77	def mfcc_delta_calc(self, features):
				78	"""Over-riding parent class with ASR specific MFCC derivative features"""
				79	mfcc_delta_np = np.zeros_like(features)
				80	mfcc_delta2_np = np.zeros_like(features)
				81
				82	for i in range(features.shape[1]):
				83	idelta = np.convolve(features[:, i], self.savgol_order1_coeffs, 'same')
				84	mfcc_delta_np[:, i] = idelta
				85	ideltadelta = np.convolve(features[:, i], self.savgol_order2_coeffs, 'same')
				86	mfcc_delta2_np[:, i] = ideltadelta
				87
				88	features = np.concatenate((self._normalize(features), self._normalize(mfcc_delta_np),
				89	self._normalize(mfcc_delta2_np)), axis=1)
				90
				91	return features