Blame - samples/SpeechRecognition/src/MFCC.cpp - ml/armnn

blob: 234b14d3befe96c4eba27650e0c2208f47cb853f [file] [log] [blame]

Éanna Ó Catháin	c6ab02a	2021-04-07 14:35:25 +0100	[diff] [blame^]	1	//
				2	// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
				3	// SPDX-License-Identifier: MIT
				4	//
				5
				6	#include <cstdio>
				7	#include <float.h>
				8
				9	#include "MFCC.hpp"
				10	#include "MathUtils.hpp"
				11
				12
				13	MfccParams::MfccParams(
				14	const float samplingFreq,
				15	const int numFbankBins,
				16	const float melLoFreq,
				17	const float melHiFreq,
				18	const int numMfccFeats,
				19	const int frameLen,
				20	const bool useHtkMethod,
				21	const int numMfccVectors):
				22	m_samplingFreq(samplingFreq),
				23	m_numFbankBins(numFbankBins),
				24	m_melLoFreq(melLoFreq),
				25	m_melHiFreq(melHiFreq),
				26	m_numMfccFeatures(numMfccFeats),
				27	m_frameLen(frameLen),
				28	m_numMfccVectors(numMfccVectors),
				29
				30	/* Smallest power of 2 >= frame length. */
				31	m_frameLenPadded(pow(2, ceil((log(frameLen)/log(2))))),
				32	m_useHtkMethod(useHtkMethod)
				33	{}
				34
				35	std::string MfccParams::Str()
				36	{
				37	char strC[1024];
				38	snprintf(strC, sizeof(strC) - 1, "\n \
				39	\n\t Sampling frequency: %f\
				40	\n\t Number of filter banks: %u\
				41	\n\t Mel frequency limit (low): %f\
				42	\n\t Mel frequency limit (high): %f\
				43	\n\t Number of MFCC features: %u\
				44	\n\t Frame length: %u\
				45	\n\t Padded frame length: %u\
				46	\n\t Using HTK for Mel scale: %s\n",
				47	this->m_samplingFreq, this->m_numFbankBins, this->m_melLoFreq,
				48	this->m_melHiFreq, this->m_numMfccFeatures, this->m_frameLen,
				49	this->m_frameLenPadded, this->m_useHtkMethod ? "yes" : "no");
				50	return std::string{strC};
				51	}
				52
				53	MFCC::MFCC(const MfccParams& params):
				54	_m_params(params),
				55	_m_filterBankInitialised(false)
				56	{
				57	this->_m_buffer = std::vector<float>(
				58	this->_m_params.m_frameLenPadded, 0.0);
				59	this->_m_frame = std::vector<float>(
				60	this->_m_params.m_frameLenPadded, 0.0);
				61	this->_m_melEnergies = std::vector<float>(
				62	this->_m_params.m_numFbankBins, 0.0);
				63
				64	this->_m_windowFunc = std::vector<float>(this->_m_params.m_frameLen);
				65	const float multiplier = 2 * M_PI / this->_m_params.m_frameLen;
				66
				67	/* Create window function. */
				68	for (size_t i = 0; i < this->_m_params.m_frameLen; i++)
				69	{
				70	this->_m_windowFunc[i] = (0.5 - (0.5 * cos(static_cast<float>(i) * multiplier)));
				71	}
				72	}
				73
				74	void MFCC::Init()
				75	{
				76	this->_InitMelFilterBank();
				77	}
				78
				79	float MFCC::MelScale(const float freq, const bool useHTKMethod)
				80	{
				81	if (useHTKMethod)
				82	{
				83	return 1127.0f * logf (1.0f + freq / 700.0f);
				84	}
				85	else
				86	{
				87	/* Slaney formula for mel scale. */
				88	float mel = freq / freqStep;
				89
				90	if (freq >= minLogHz)
				91	{
				92	mel = minLogMel + logf(freq / minLogHz) / logStep;
				93	}
				94	return mel;
				95	}
				96	}
				97
				98	float MFCC::InverseMelScale(const float melFreq, const bool useHTKMethod)
				99	{
				100	if (useHTKMethod)
				101	{
				102	return 700.0f * (expf (melFreq / 1127.0f) - 1.0f);
				103	}
				104	else
				105	{
				106	/* Slaney formula for mel scale. */
				107	float freq = freqStep * melFreq;
				108
				109	if (melFreq >= minLogMel)
				110	{
				111	freq = minLogHz * expf(logStep * (melFreq - minLogMel));
				112	}
				113	return freq;
				114	}
				115	}
				116
				117
				118	bool MFCC::ApplyMelFilterBank(
				119	std::vector<float>& fftVec,
				120	std::vector<std::vector<float>>& melFilterBank,
				121	std::vector<int32_t>& filterBankFilterFirst,
				122	std::vector<int32_t>& filterBankFilterLast,
				123	std::vector<float>& melEnergies)
				124	{
				125	const size_t numBanks = melEnergies.size();
				126
				127	if (numBanks != filterBankFilterFirst.size() \|\|
				128	numBanks != filterBankFilterLast.size())
				129	{
				130	printf("unexpected filter bank lengths\n");
				131	return false;
				132	}
				133
				134	for (size_t bin = 0; bin < numBanks; ++bin)
				135	{
				136	auto filterBankIter = melFilterBank[bin].begin();
				137	float melEnergy = 1e-10; /* Avoid log of zero at later stages */
				138	const int32_t firstIndex = filterBankFilterFirst[bin];
				139	const int32_t lastIndex = filterBankFilterLast[bin];
				140
				141	for (int32_t i = firstIndex; i <= lastIndex; ++i)
				142	{
				143	melEnergy += (filterBankIter++ fftVec[i]);
				144	}
				145
				146	melEnergies[bin] = melEnergy;
				147	}
				148
				149	return true;
				150	}
				151
				152	void MFCC::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
				153	{
				154	float maxMelEnergy = -FLT_MAX;
				155
				156	/* Container for natural logarithms of mel energies */
				157	std::vector <float> vecLogEnergies(melEnergies.size(), 0.f);
				158
				159	/* Because we are taking natural logs, we need to multiply by log10(e).
				160	* Also, for wav2letter model, we scale our log10 values by 10 */
				161	constexpr float multiplier = 10.0 * /* default scalar */
				162	0.4342944819032518; /* log10f(std::exp(1.0))*/
				163
				164	/* Take log of the whole vector */
				165	MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies);
				166
				167	/* Scale the log values and get the max */
				168	for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin();
				169	iterM != melEnergies.end(); ++iterM, ++iterL)
				170	{
				171	iterM = iterL * multiplier;
				172
				173	/* Save the max mel energy. */
				174	if (*iterM > maxMelEnergy)
				175	{
				176	maxMelEnergy = *iterM;
				177	}
				178	}
				179
				180	/* Clamp the mel energies */
				181	constexpr float maxDb = 80.0;
				182	const float clampLevelLowdB = maxMelEnergy - maxDb;
				183	for (auto iter = melEnergies.begin(); iter != melEnergies.end(); ++iter)
				184	{
				185	iter = std::max(iter, clampLevelLowdB);
				186	}
				187	}
				188
				189	void MFCC::_ConvertToPowerSpectrum()
				190	{
				191	const uint32_t halfDim = this->_m_params.m_frameLenPadded / 2;
				192
				193	/* Handle this special case. */
				194	float firstEnergy = this->_m_buffer[0] * this->_m_buffer[0];
				195	float lastEnergy = this->_m_buffer[1] * this->_m_buffer[1];
				196
				197	MathUtils::ComplexMagnitudeSquaredF32(
				198	this->_m_buffer.data(),
				199	this->_m_buffer.size(),
				200	this->_m_buffer.data(),
				201	this->_m_buffer.size()/2);
				202
				203	this->_m_buffer[0] = firstEnergy;
				204	this->_m_buffer[halfDim] = lastEnergy;
				205	}
				206
				207	std::vector<float> MFCC::CreateDCTMatrix(
				208	const int32_t inputLength,
				209	const int32_t coefficientCount)
				210	{
				211	std::vector<float> dctMatix(inputLength * coefficientCount);
				212
				213	/* Orthonormal normalization. */
				214	const float normalizerK0 = 2 * sqrt(1.0 / static_cast<float>(4*inputLength));
				215	const float normalizer = 2 * sqrt(1.0 / static_cast<float>(2*inputLength));
				216
				217	const float angleIncr = M_PI/inputLength;
				218	float angle = angleIncr; /* we start using it at k = 1 loop */
				219
				220	/* First row of DCT will use normalizer K0 */
				221	for (int32_t n = 0; n < inputLength; ++n)
				222	{
				223	dctMatix[n] = normalizerK0;
				224	}
				225
				226	/* Second row (index = 1) onwards, we use standard normalizer */
				227	for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength)
				228	{
				229	for (int32_t n = 0; n < inputLength; ++n)
				230	{
				231	dctMatix[m+n] = normalizer *
				232	cos((n + 0.5) * angle);
				233	}
				234	angle += angleIncr;
				235	}
				236	return dctMatix;
				237	}
				238
				239	float MFCC::GetMelFilterBankNormaliser(
				240	const float& leftMel,
				241	const float& rightMel,
				242	const bool useHTKMethod)
				243	{
				244	/* Slaney normalization for mel weights. */
				245	return (2.0f / (MFCC::InverseMelScale(rightMel, useHTKMethod) -
				246	MFCC::InverseMelScale(leftMel, useHTKMethod)));
				247	}
				248
				249	void MFCC::_InitMelFilterBank()
				250	{
				251	if (!this->_IsMelFilterBankInited())
				252	{
				253	this->_m_melFilterBank = this->_CreateMelFilterBank();
				254	this->_m_dctMatrix = this->CreateDCTMatrix(
				255	this->_m_params.m_numFbankBins,
				256	this->_m_params.m_numMfccFeatures);
				257	this->_m_filterBankInitialised = true;
				258	}
				259	}
				260
				261	bool MFCC::_IsMelFilterBankInited()
				262	{
				263	return this->_m_filterBankInitialised;
				264	}
				265
				266	void MFCC::_MfccComputePreFeature(const std::vector<float>& audioData)
				267	{
				268	this->_InitMelFilterBank();
				269
				270	/* TensorFlow way of normalizing .wav data to (-1, 1). */
				271	constexpr float normaliser = 1.0;
				272	for (size_t i = 0; i < this->_m_params.m_frameLen; i++)
				273	{
				274	this->_m_frame[i] = static_cast<float>(audioData[i]) * normaliser;
				275	}
				276
				277	/* Apply window function to input frame. */
				278	for(size_t i = 0; i < this->_m_params.m_frameLen; i++)
				279	{
				280	this->_m_frame[i] *= this->_m_windowFunc[i];
				281	}
				282
				283	/* Set remaining frame values to 0. */
				284	std::fill(this->_m_frame.begin() + this->_m_params.m_frameLen,this->_m_frame.end(), 0);
				285
				286	/* Compute FFT. */
				287	MathUtils::FftF32(this->_m_frame, this->_m_buffer);
				288
				289	/* Convert to power spectrum. */
				290	this->_ConvertToPowerSpectrum();
				291
				292	/* Apply mel filterbanks. */
				293	if (!this->ApplyMelFilterBank(this->_m_buffer,
				294	this->_m_melFilterBank,
				295	this->_m_filterBankFilterFirst,
				296	this->_m_filterBankFilterLast,
				297	this->_m_melEnergies))
				298	{
				299	printf("Failed to apply MEL filter banks\n");
				300	}
				301
				302	/* Convert to logarithmic scale */
				303	this->ConvertToLogarithmicScale(this->_m_melEnergies);
				304	}
				305
				306	std::vector<float> MFCC::MfccCompute(const std::vector<float>& audioData)
				307	{
				308	this->_MfccComputePreFeature(audioData);
				309
				310	std::vector<float> mfccOut(this->_m_params.m_numMfccFeatures);
				311
				312	float * ptrMel = this->_m_melEnergies.data();
				313	float * ptrDct = this->_m_dctMatrix.data();
				314	float * ptrMfcc = mfccOut.data();
				315
				316	/* Take DCT. Uses matrix mul. */
				317	for (size_t i = 0, j = 0; i < mfccOut.size();
				318	++i, j += this->_m_params.m_numFbankBins)
				319	{
				320	*ptrMfcc++ = MathUtils::DotProductF32(
				321	ptrDct + j,
				322	ptrMel,
				323	this->_m_params.m_numFbankBins);
				324	}
				325
				326	return mfccOut;
				327	}
				328
				329	std::vector<std::vector<float>> MFCC::_CreateMelFilterBank()
				330	{
				331	size_t numFftBins = this->_m_params.m_frameLenPadded / 2;
				332	float fftBinWidth = static_cast<float>(this->_m_params.m_samplingFreq) / this->_m_params.m_frameLenPadded;
				333
				334	float melLowFreq = MFCC::MelScale(this->_m_params.m_melLoFreq,
				335	this->_m_params.m_useHtkMethod);
				336	float melHighFreq = MFCC::MelScale(this->_m_params.m_melHiFreq,
				337	this->_m_params.m_useHtkMethod);
				338	float melFreqDelta = (melHighFreq - melLowFreq) / (this->_m_params.m_numFbankBins + 1);
				339
				340	std::vector<float> thisBin = std::vector<float>(numFftBins);
				341	std::vector<std::vector<float>> melFilterBank(
				342	this->_m_params.m_numFbankBins);
				343	this->_m_filterBankFilterFirst =
				344	std::vector<int32_t>(this->_m_params.m_numFbankBins);
				345	this->_m_filterBankFilterLast =
				346	std::vector<int32_t>(this->_m_params.m_numFbankBins);
				347
				348	for (size_t bin = 0; bin < this->_m_params.m_numFbankBins; bin++)
				349	{
				350	float leftMel = melLowFreq + bin * melFreqDelta;
				351	float centerMel = melLowFreq + (bin + 1) * melFreqDelta;
				352	float rightMel = melLowFreq + (bin + 2) * melFreqDelta;
				353
				354	int32_t firstIndex = -1;
				355	int32_t lastIndex = -1;
				356	const float normaliser = this->GetMelFilterBankNormaliser(leftMel, rightMel, this->_m_params.m_useHtkMethod);
				357
				358	for (size_t i = 0; i < numFftBins; i++)
				359	{
				360	float freq = (fftBinWidth * i); /* Center freq of this fft bin. */
				361	float mel = MFCC::MelScale(freq, this->_m_params.m_useHtkMethod);
				362	thisBin[i] = 0.0;
				363
				364	if (mel > leftMel && mel < rightMel)
				365	{
				366	float weight;
				367	if (mel <= centerMel)
				368	{
				369	weight = (mel - leftMel) / (centerMel - leftMel);
				370	}
				371	else
				372	{
				373	weight = (rightMel - mel) / (rightMel - centerMel);
				374	}
				375
				376	thisBin[i] = weight * normaliser;
				377	if (firstIndex == -1)
				378	{
				379	firstIndex = i;
				380	}
				381	lastIndex = i;
				382	}
				383	}
				384
				385	this->_m_filterBankFilterFirst[bin] = firstIndex;
				386	this->_m_filterBankFilterLast[bin] = lastIndex;
				387
				388	/* Copy the part we care about. */
				389	for (int32_t i = firstIndex; i <= lastIndex; i++)
				390	{
				391	melFilterBank[bin].push_back(thisBin[i]);
				392	}
				393	}
				394
				395	return melFilterBank;
				396	}
				397