# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
| 3 | |
| 4 | import numpy as np |
| 5 | import os |
| 6 | import sys |
| 7 | |
# Put the sibling '../common' directory (relative to this file) on the module search path.
# This has to run before the `mfcc` import that follows; position 1 keeps sys.path[0] untouched.
script_dir = os.path.dirname(__file__)
sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
| 10 | |
| 11 | from mfcc import MFCC, AudioPreprocessor |
| 12 | |
| 13 | |
class Wav2LetterMFCC(MFCC):
    """Extends base MFCC class to provide Wav2Letter-specific MFCC requirements."""

    def __init__(self, mfcc_params):
        """
        Forwards the MFCC parameters unchanged to the base class constructor.

        Args:
            mfcc_params: the MFCC parameter object; this class reads its frame_len and n_fft attributes
        """
        super().__init__(mfcc_params)

    def spectrum_calc(self, audio_data):
        """
        Calculates the power spectrum of one Hann-windowed audio frame.

        Args:
            audio_data: the audio samples of one frame; multiplied element-wise with a window of
                mfcc_params.frame_len samples

        Returns:
            the squared magnitude of the real FFT taken over mfcc_params.n_fft points
        """
        frame_len = self.mfcc_params.frame_len
        # An (N + 1)-point symmetric Hann window with its last sample dropped, i.e. the periodic variant
        window = np.hanning(frame_len + 1)[0:frame_len]
        return np.abs(np.fft.rfft(window * audio_data, self.mfcc_params.n_fft)) ** 2

    def log_mel(self, mel_energy):
        """
        Converts mel energies to decibels and limits the dynamic range of the result.

        Args:
            mel_energy: the mel filter bank energies; not modified by this call

        Returns:
            10 * log10(mel_energy + 1e-10), floored at 80 dB below the largest value
        """
        top_db = 80.0
        # The small offset avoids log10(0). It is added out of place so the caller's array is not
        # altered (an in-place += would also fail on integer arrays).
        log_mel_energy = 10.0 * np.log10(mel_energy + 1e-10)
        return np.maximum(log_mel_energy, log_mel_energy.max() - top_db)

    def create_dct_matrix(self, num_fbank_bins, num_mfcc_feats):
        """
        Creates the Discrete Cosine Transform matrix to be used in the compute function.

        Args:
            num_fbank_bins: The number of filter bank bins
            num_mfcc_feats: the number of MFCC features

        Returns:
            the DCT matrix, of shape (num_mfcc_feats, num_fbank_bins)
        """
        if num_fbank_bins == 0 or num_mfcc_feats == 0:
            # Degenerate request: there is nothing to fill in, and no bin count to divide by
            return np.zeros((num_mfcc_feats, num_fbank_bins))

        # Row k, column n holds scale * cos(pi / N * (n + 0.5) * k); broadcasting builds every
        # entry at once. The shape follows the arguments rather than self.mfcc_params so the two
        # can never disagree.
        bin_centres = np.arange(num_fbank_bins) + 0.5
        coeff_idx = np.arange(num_mfcc_feats).reshape(-1, 1)
        dct_m = 2 * np.sqrt(1 / (2 * num_fbank_bins)) * np.cos(
            (np.pi / num_fbank_bins) * bin_centres * coeff_idx)

        # The k == 0 row is constant (cos(0) == 1) and uses its own scale factor
        dct_m[0] = 2 * np.sqrt(1 / (4 * num_fbank_bins))
        return dct_m

    def mel_norm(self, weight, right_mel, left_mel):
        """
        Over-riding parent class with ASR specific weight normalisation.

        Args:
            weight: the filter bank weight to normalise
            right_mel: the right edge of the filter, passed to inv_mel_scale
            left_mel: the left edge of the filter, passed to inv_mel_scale

        Returns:
            weight scaled by 2 / (inv_mel_scale(right_mel) - inv_mel_scale(left_mel))
        """
        # NOTE(review): the meaning of the False flag is defined by the base class inv_mel_scale — confirm there
        enorm = 2.0 / (self.inv_mel_scale(right_mel, False) - self.inv_mel_scale(left_mel, False))
        return weight * enorm
| 58 | |
| 59 | |
class W2LAudioPreprocessor(AudioPreprocessor):
    """Extends base AudioPreprocessor class with Savitzky-Golay based MFCC derivative features."""

    def __init__(self, mfcc, model_input_size, stride):
        """
        Stores the preprocessing configuration and the derivative filter coefficients.

        Args:
            mfcc: the MFCC calculator, kept as self._mfcc_calc
            model_input_size: stored as self.model_input_size and forwarded to the base class
            stride: stored as self.stride and forwarded to the base class
        """
        self.model_input_size = model_input_size
        self.stride = stride

        # super() is already bound to this instance, so self must not be passed again: hand the
        # base class the MFCC calculator instead.
        # NOTE(review): assumes the base constructor's first parameter is the MFCC calculator,
        # mirroring this signature — confirm against AudioPreprocessor.
        super().__init__(mfcc, model_input_size, stride)

        # Savitzky - Golay differential filters (9 taps each)
        self.savgol_order1_coeffs = np.array([6.66666667e-02, 5.00000000e-02, 3.33333333e-02,
                                              1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
                                              -3.33333333e-02, -5.00000000e-02, -6.66666667e-02])

        self.savgol_order2_coeffs = np.array([0.06060606, 0.01515152, -0.01731602,
                                              -0.03679654, -0.04329004, -0.03679654,
                                              -0.01731602, 0.01515152, 0.06060606])
        self._mfcc_calc = mfcc

    def mfcc_delta_calc(self, features):
        """
        Over-riding parent class with ASR specific MFCC derivative features.

        Args:
            features: 2D array of MFCC features; each column is filtered independently along axis 0

        Returns:
            the normalised features, first-filter output and second-filter output joined along axis 1
        """
        mfcc_delta_np = np.zeros_like(features)
        mfcc_delta2_np = np.zeros_like(features)

        # Filter every column with both coefficient sets; 'same' keeps the output centred on the input
        for col, column in enumerate(features.T):
            mfcc_delta_np[:, col] = np.convolve(column, self.savgol_order1_coeffs, 'same')
            mfcc_delta2_np[:, col] = np.convolve(column, self.savgol_order2_coeffs, 'same')

        # Each group is normalised on its own (by the base class _normalize) before being joined
        features = np.concatenate((self._normalize(features), self._normalize(mfcc_delta_np),
                                   self._normalize(mfcc_delta2_np)), axis=1)

        return features