blob: 234b14d3befe96c4eba27650e0c2208f47cb853f [file] [log] [blame]
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +01001//
2// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
3// SPDX-License-Identifier: MIT
4//
5
6#include <cstdio>
7#include <float.h>
8
9#include "MFCC.hpp"
10#include "MathUtils.hpp"
11
12
13MfccParams::MfccParams(
14 const float samplingFreq,
15 const int numFbankBins,
16 const float melLoFreq,
17 const float melHiFreq,
18 const int numMfccFeats,
19 const int frameLen,
20 const bool useHtkMethod,
21 const int numMfccVectors):
22 m_samplingFreq(samplingFreq),
23 m_numFbankBins(numFbankBins),
24 m_melLoFreq(melLoFreq),
25 m_melHiFreq(melHiFreq),
26 m_numMfccFeatures(numMfccFeats),
27 m_frameLen(frameLen),
28 m_numMfccVectors(numMfccVectors),
29
30 /* Smallest power of 2 >= frame length. */
31 m_frameLenPadded(pow(2, ceil((log(frameLen)/log(2))))),
32 m_useHtkMethod(useHtkMethod)
33{}
34
35std::string MfccParams::Str()
36{
37 char strC[1024];
38 snprintf(strC, sizeof(strC) - 1, "\n \
39 \n\t Sampling frequency: %f\
40 \n\t Number of filter banks: %u\
41 \n\t Mel frequency limit (low): %f\
42 \n\t Mel frequency limit (high): %f\
43 \n\t Number of MFCC features: %u\
44 \n\t Frame length: %u\
45 \n\t Padded frame length: %u\
46 \n\t Using HTK for Mel scale: %s\n",
47 this->m_samplingFreq, this->m_numFbankBins, this->m_melLoFreq,
48 this->m_melHiFreq, this->m_numMfccFeatures, this->m_frameLen,
49 this->m_frameLenPadded, this->m_useHtkMethod ? "yes" : "no");
50 return std::string{strC};
51}
52
53MFCC::MFCC(const MfccParams& params):
54 _m_params(params),
55 _m_filterBankInitialised(false)
56{
57 this->_m_buffer = std::vector<float>(
58 this->_m_params.m_frameLenPadded, 0.0);
59 this->_m_frame = std::vector<float>(
60 this->_m_params.m_frameLenPadded, 0.0);
61 this->_m_melEnergies = std::vector<float>(
62 this->_m_params.m_numFbankBins, 0.0);
63
64 this->_m_windowFunc = std::vector<float>(this->_m_params.m_frameLen);
65 const float multiplier = 2 * M_PI / this->_m_params.m_frameLen;
66
67 /* Create window function. */
68 for (size_t i = 0; i < this->_m_params.m_frameLen; i++)
69 {
70 this->_m_windowFunc[i] = (0.5 - (0.5 * cos(static_cast<float>(i) * multiplier)));
71 }
72}
73
74void MFCC::Init()
75{
76 this->_InitMelFilterBank();
77}
78
79float MFCC::MelScale(const float freq, const bool useHTKMethod)
80{
81 if (useHTKMethod)
82 {
83 return 1127.0f * logf (1.0f + freq / 700.0f);
84 }
85 else
86 {
87 /* Slaney formula for mel scale. */
88 float mel = freq / freqStep;
89
90 if (freq >= minLogHz)
91 {
92 mel = minLogMel + logf(freq / minLogHz) / logStep;
93 }
94 return mel;
95 }
96}
97
98float MFCC::InverseMelScale(const float melFreq, const bool useHTKMethod)
99{
100 if (useHTKMethod)
101 {
102 return 700.0f * (expf (melFreq / 1127.0f) - 1.0f);
103 }
104 else
105 {
106 /* Slaney formula for mel scale. */
107 float freq = freqStep * melFreq;
108
109 if (melFreq >= minLogMel)
110 {
111 freq = minLogHz * expf(logStep * (melFreq - minLogMel));
112 }
113 return freq;
114 }
115}
116
117
118bool MFCC::ApplyMelFilterBank(
119 std::vector<float>& fftVec,
120 std::vector<std::vector<float>>& melFilterBank,
121 std::vector<int32_t>& filterBankFilterFirst,
122 std::vector<int32_t>& filterBankFilterLast,
123 std::vector<float>& melEnergies)
124{
125 const size_t numBanks = melEnergies.size();
126
127 if (numBanks != filterBankFilterFirst.size() ||
128 numBanks != filterBankFilterLast.size())
129 {
130 printf("unexpected filter bank lengths\n");
131 return false;
132 }
133
134 for (size_t bin = 0; bin < numBanks; ++bin)
135 {
136 auto filterBankIter = melFilterBank[bin].begin();
137 float melEnergy = 1e-10; /* Avoid log of zero at later stages */
138 const int32_t firstIndex = filterBankFilterFirst[bin];
139 const int32_t lastIndex = filterBankFilterLast[bin];
140
141 for (int32_t i = firstIndex; i <= lastIndex; ++i)
142 {
143 melEnergy += (*filterBankIter++ * fftVec[i]);
144 }
145
146 melEnergies[bin] = melEnergy;
147 }
148
149 return true;
150}
151
152void MFCC::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
153{
154 float maxMelEnergy = -FLT_MAX;
155
156 /* Container for natural logarithms of mel energies */
157 std::vector <float> vecLogEnergies(melEnergies.size(), 0.f);
158
159 /* Because we are taking natural logs, we need to multiply by log10(e).
160 * Also, for wav2letter model, we scale our log10 values by 10 */
161 constexpr float multiplier = 10.0 * /* default scalar */
162 0.4342944819032518; /* log10f(std::exp(1.0))*/
163
164 /* Take log of the whole vector */
165 MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies);
166
167 /* Scale the log values and get the max */
168 for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin();
169 iterM != melEnergies.end(); ++iterM, ++iterL)
170 {
171 *iterM = *iterL * multiplier;
172
173 /* Save the max mel energy. */
174 if (*iterM > maxMelEnergy)
175 {
176 maxMelEnergy = *iterM;
177 }
178 }
179
180 /* Clamp the mel energies */
181 constexpr float maxDb = 80.0;
182 const float clampLevelLowdB = maxMelEnergy - maxDb;
183 for (auto iter = melEnergies.begin(); iter != melEnergies.end(); ++iter)
184 {
185 *iter = std::max(*iter, clampLevelLowdB);
186 }
187}
188
189void MFCC::_ConvertToPowerSpectrum()
190{
191 const uint32_t halfDim = this->_m_params.m_frameLenPadded / 2;
192
193 /* Handle this special case. */
194 float firstEnergy = this->_m_buffer[0] * this->_m_buffer[0];
195 float lastEnergy = this->_m_buffer[1] * this->_m_buffer[1];
196
197 MathUtils::ComplexMagnitudeSquaredF32(
198 this->_m_buffer.data(),
199 this->_m_buffer.size(),
200 this->_m_buffer.data(),
201 this->_m_buffer.size()/2);
202
203 this->_m_buffer[0] = firstEnergy;
204 this->_m_buffer[halfDim] = lastEnergy;
205}
206
207std::vector<float> MFCC::CreateDCTMatrix(
208 const int32_t inputLength,
209 const int32_t coefficientCount)
210{
211 std::vector<float> dctMatix(inputLength * coefficientCount);
212
213 /* Orthonormal normalization. */
214 const float normalizerK0 = 2 * sqrt(1.0 / static_cast<float>(4*inputLength));
215 const float normalizer = 2 * sqrt(1.0 / static_cast<float>(2*inputLength));
216
217 const float angleIncr = M_PI/inputLength;
218 float angle = angleIncr; /* we start using it at k = 1 loop */
219
220 /* First row of DCT will use normalizer K0 */
221 for (int32_t n = 0; n < inputLength; ++n)
222 {
223 dctMatix[n] = normalizerK0;
224 }
225
226 /* Second row (index = 1) onwards, we use standard normalizer */
227 for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength)
228 {
229 for (int32_t n = 0; n < inputLength; ++n)
230 {
231 dctMatix[m+n] = normalizer *
232 cos((n + 0.5) * angle);
233 }
234 angle += angleIncr;
235 }
236 return dctMatix;
237}
238
239float MFCC::GetMelFilterBankNormaliser(
240 const float& leftMel,
241 const float& rightMel,
242 const bool useHTKMethod)
243{
244/* Slaney normalization for mel weights. */
245 return (2.0f / (MFCC::InverseMelScale(rightMel, useHTKMethod) -
246 MFCC::InverseMelScale(leftMel, useHTKMethod)));
247}
248
249void MFCC::_InitMelFilterBank()
250{
251 if (!this->_IsMelFilterBankInited())
252 {
253 this->_m_melFilterBank = this->_CreateMelFilterBank();
254 this->_m_dctMatrix = this->CreateDCTMatrix(
255 this->_m_params.m_numFbankBins,
256 this->_m_params.m_numMfccFeatures);
257 this->_m_filterBankInitialised = true;
258 }
259}
260
261bool MFCC::_IsMelFilterBankInited()
262{
263 return this->_m_filterBankInitialised;
264}
265
266void MFCC::_MfccComputePreFeature(const std::vector<float>& audioData)
267{
268 this->_InitMelFilterBank();
269
270 /* TensorFlow way of normalizing .wav data to (-1, 1). */
271 constexpr float normaliser = 1.0;
272 for (size_t i = 0; i < this->_m_params.m_frameLen; i++)
273 {
274 this->_m_frame[i] = static_cast<float>(audioData[i]) * normaliser;
275 }
276
277 /* Apply window function to input frame. */
278 for(size_t i = 0; i < this->_m_params.m_frameLen; i++)
279 {
280 this->_m_frame[i] *= this->_m_windowFunc[i];
281 }
282
283 /* Set remaining frame values to 0. */
284 std::fill(this->_m_frame.begin() + this->_m_params.m_frameLen,this->_m_frame.end(), 0);
285
286 /* Compute FFT. */
287 MathUtils::FftF32(this->_m_frame, this->_m_buffer);
288
289 /* Convert to power spectrum. */
290 this->_ConvertToPowerSpectrum();
291
292 /* Apply mel filterbanks. */
293 if (!this->ApplyMelFilterBank(this->_m_buffer,
294 this->_m_melFilterBank,
295 this->_m_filterBankFilterFirst,
296 this->_m_filterBankFilterLast,
297 this->_m_melEnergies))
298 {
299 printf("Failed to apply MEL filter banks\n");
300 }
301
302 /* Convert to logarithmic scale */
303 this->ConvertToLogarithmicScale(this->_m_melEnergies);
304}
305
306std::vector<float> MFCC::MfccCompute(const std::vector<float>& audioData)
307{
308 this->_MfccComputePreFeature(audioData);
309
310 std::vector<float> mfccOut(this->_m_params.m_numMfccFeatures);
311
312 float * ptrMel = this->_m_melEnergies.data();
313 float * ptrDct = this->_m_dctMatrix.data();
314 float * ptrMfcc = mfccOut.data();
315
316 /* Take DCT. Uses matrix mul. */
317 for (size_t i = 0, j = 0; i < mfccOut.size();
318 ++i, j += this->_m_params.m_numFbankBins)
319 {
320 *ptrMfcc++ = MathUtils::DotProductF32(
321 ptrDct + j,
322 ptrMel,
323 this->_m_params.m_numFbankBins);
324 }
325
326 return mfccOut;
327}
328
329std::vector<std::vector<float>> MFCC::_CreateMelFilterBank()
330{
331 size_t numFftBins = this->_m_params.m_frameLenPadded / 2;
332 float fftBinWidth = static_cast<float>(this->_m_params.m_samplingFreq) / this->_m_params.m_frameLenPadded;
333
334 float melLowFreq = MFCC::MelScale(this->_m_params.m_melLoFreq,
335 this->_m_params.m_useHtkMethod);
336 float melHighFreq = MFCC::MelScale(this->_m_params.m_melHiFreq,
337 this->_m_params.m_useHtkMethod);
338 float melFreqDelta = (melHighFreq - melLowFreq) / (this->_m_params.m_numFbankBins + 1);
339
340 std::vector<float> thisBin = std::vector<float>(numFftBins);
341 std::vector<std::vector<float>> melFilterBank(
342 this->_m_params.m_numFbankBins);
343 this->_m_filterBankFilterFirst =
344 std::vector<int32_t>(this->_m_params.m_numFbankBins);
345 this->_m_filterBankFilterLast =
346 std::vector<int32_t>(this->_m_params.m_numFbankBins);
347
348 for (size_t bin = 0; bin < this->_m_params.m_numFbankBins; bin++)
349 {
350 float leftMel = melLowFreq + bin * melFreqDelta;
351 float centerMel = melLowFreq + (bin + 1) * melFreqDelta;
352 float rightMel = melLowFreq + (bin + 2) * melFreqDelta;
353
354 int32_t firstIndex = -1;
355 int32_t lastIndex = -1;
356 const float normaliser = this->GetMelFilterBankNormaliser(leftMel, rightMel, this->_m_params.m_useHtkMethod);
357
358 for (size_t i = 0; i < numFftBins; i++)
359 {
360 float freq = (fftBinWidth * i); /* Center freq of this fft bin. */
361 float mel = MFCC::MelScale(freq, this->_m_params.m_useHtkMethod);
362 thisBin[i] = 0.0;
363
364 if (mel > leftMel && mel < rightMel)
365 {
366 float weight;
367 if (mel <= centerMel)
368 {
369 weight = (mel - leftMel) / (centerMel - leftMel);
370 }
371 else
372 {
373 weight = (rightMel - mel) / (rightMel - centerMel);
374 }
375
376 thisBin[i] = weight * normaliser;
377 if (firstIndex == -1)
378 {
379 firstIndex = i;
380 }
381 lastIndex = i;
382 }
383 }
384
385 this->_m_filterBankFilterFirst[bin] = firstIndex;
386 this->_m_filterBankFilterLast[bin] = lastIndex;
387
388 /* Copy the part we care about. */
389 for (int32_t i = firstIndex; i <= lastIndex; i++)
390 {
391 melFilterBank[bin].push_back(thisBin[i]);
392 }
393 }
394
395 return melFilterBank;
396}
397