Blame - samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp - ml/armnn

blob: ebc9e864e31ada157dafb9d7a54e6cf57781ecc1 [file] [log] [blame]

George Gekov	23c2627	2021-08-16 11:32:10 +0100	[diff] [blame^]	1	//
				2	// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
				3	// SPDX-License-Identifier: MIT
				4	//
				5	#ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
				6	#define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
				7
				#include <cstdint>
				#include <limits>
				#include <memory>
				#include <numeric>
				#include <vector>

				#include "DataStructures.hpp"
				#include "SlidingWindow.hpp"
				#include "MFCC.hpp"
				#include "Wav2LetterMFCC.hpp"
				13	// Declarations supporting the pre-processing calculations for the Wav2Letter ASR model.
				14	using AudioWindow = SlidingWindow<const float>; // Sliding window over read-only float elements; this is the type of Wav2LetterPreprocessor::m_window.
				15
				16	class Wav2LetterPreprocessor
				17	{
				18	public:
				19	Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride,
				20	std::unique_ptr<Wav2LetterMFCC> mfccInst);
				21
				22	/**
				23	* @brief Calculates the features required from audio data. This
				24	* includes MFCC, first and second order deltas,
				25	* normalisation and finally, quantisation. The tensor is
				26	* populated with feature from a given window placed along
				27	* in a single row.
				28	* @param[in] audioData pointer to the first element of audio data
				29	* @param[in] audioDataLen number of elements in the audio data
				30	* @param[in] tensor tensor to be populated
				31	* @return true if successful, false in case of error.
				32	*/
				33	bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset,
				34	float quantScale);
				35
				36	std::unique_ptr<MFCC> m_mfcc;
				37
				38	// Actual buffers to be populated
				39	Array2d<float> m_mfccBuf; // Contiguous buffer 1D: MFCC
				40	Array2d<float> m_delta1Buf; // Contiguous buffer 1D: Delta 1
				41	Array2d<float> m_delta2Buf; // Contiguous buffer 1D: Delta 2
				42
				43	uint32_t m_windowLen; // Window length for MFCC
				44	uint32_t m_windowStride; // Window stride len for MFCC
				45	AudioWindow m_window; // Sliding window
				46
				47	protected:
				48	/**
				49	* @brief Computes the first and second order deltas for the
				50	* MFCC buffers - they are assumed to be populated.
				51	*
				52	* @param[in] mfcc MFCC buffers
				53	* @param[out] delta1 result of the first diff computation
				54	* @param[out] delta2 result of the second diff computation
				55	*
				56	* @return true if successful, false otherwise
				57	*/
				58	static bool ComputeDeltas(Array2d<float>& mfcc,
				59	Array2d<float>& delta1,
				60	Array2d<float>& delta2);
				61
				62	protected:
				63
				64	/**
				65	* @brief Given a 2D vector of floats, computes the mean
				66	* @param[in] vec vector of vector of floats
				67	* @return mean value
				68	*/
				69	static float GetMean(Array2d<float>& vec);
				70
				71	/**
				72	* @brief Given a 2D vector of floats, computes the stddev
				73	* @param[in] vec vector of vector of floats
				74	* @param[in] mean mean value of the vector passed in
				75	* @return stddev value
				76	*/
				77	static float GetStdDev(Array2d<float>& vec, float mean);
				78
				79	/**
				80	* @brief Given a 2D vector of floats, normalises it using
				81	* the mean and the stddev
				82	* @param[in/out] vec vector of vector of floats
				83	* @return
				84	*/
				85	static void NormaliseVec(Array2d<float>& vec);
				86
				87	/**
				88	* @brief Normalises the MFCC and delta buffers
				89	* @return
				90	*/
				91	void Normalise();
				92
				93	/**
				94	* @brief Given the quantisation and data type limits, computes
				95	* the quantised values of a floating point input data.
				96	* @param[in] elem Element to be quantised
				97	* @param[in] quantScale Scale
				98	* @param[in] quantOffset Offset
				99	* @param[in] minVal Numerical limit - minimum
				100	* @param[in] maxVal Numerical limit - maximum
				101	* @return floating point quantised value
				102	*/
				103	static float GetQuantElem(
				104	float elem,
				105	float quantScale,
				106	int quantOffset,
				107	float minVal,
				108	float maxVal);
				109
				110	/**
				111	* @brief Quantises the MFCC and delta buffers, and places them
				112	* in the output buffer. While doing so, it transposes
				113	* the data. Reason: Buffers in this class are arranged
				114	* for "time" axis to be row major. Primary reason for
				115	* this being the convolution speed up (as we can use
				116	* contiguous memory). The output, however, requires the
				117	* time axis to be in column major arrangement.
				118	* @param[in] outputBuf pointer to the output buffer
				119	* @param[in] outputBufSz output buffer's size
				120	* @param[in] quantScale quantisation scale
				121	* @param[in] quantOffset quantisation offset
				122	*/
				123	template<typename T>
				124	bool Quantise(T*outputBuf, int quantOffset, float quantScale)
				125	{
				126	// Populate
				127	T* outputBufMfcc = outputBuf;
				128	T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures;
				129	T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures;
				130	const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // (3 vectors - 1 vector)
				131
				132	const float minVal = std::numeric_limits<T>::min();
				133	const float maxVal = std::numeric_limits<T>::max();
				134
				135	// We need to do a transpose while copying and concatenating the tensor
				136	for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j)
				137	{
				138	for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i)
				139	{
				140	*outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
				141	this->m_mfccBuf(i, j), quantScale,
				142	quantOffset, minVal, maxVal));
				143	*outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
				144	this->m_delta1Buf(i, j), quantScale,
				145	quantOffset, minVal, maxVal));
				146	*outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
				147	this->m_delta2Buf(i, j), quantScale,
				148	quantOffset, minVal, maxVal));
				149	}
				150	outputBufMfcc += ptrIncr;
				151	outputBufD1 += ptrIncr;
				152	outputBufD2 += ptrIncr;
				153	}
				154	return true;
				155	}
				156	};
				157
				158	#endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP