blob: ebc9e864e31ada157dafb9d7a54e6cf57781ecc1 [file] [log] [blame]
//
// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
#define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
#include <numeric>
#include "DataStructures.hpp"
#include "SlidingWindow.hpp"
#include "MFCC.hpp"
#include "Wav2LetterMFCC.hpp"
// Class to facilitate pre-processing calculation for Wav2Letter model for ASR
// Sliding window over const float audio samples; steps through the input
// waveform in (windowLen, windowStride) increments (see SlidingWindow.hpp).
using AudioWindow = SlidingWindow<const float>;
class Wav2LetterPreprocessor
{
public:
    /**
     * @brief Constructor.
     * @param[in] windowLen    number of audio samples per MFCC window
     * @param[in] windowStride number of audio samples between window starts
     * @param[in] mfccInst     MFCC calculator instance; ownership is taken
     */
    Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride,
                           std::unique_ptr<Wav2LetterMFCC> mfccInst);

    /**
     * @brief Calculates the features required from audio data. This
     *        includes MFCC, first and second order deltas,
     *        normalisation and finally, quantisation. The output
     *        vector is populated with features from a given window
     *        placed along in a single row.
     * @param[in]  audioData    pointer to the first element of audio data
     * @param[in]  audioDataLen number of elements in the audio data
     * @param[out] output       quantised (int8) feature buffer to populate
     * @param[in]  quantOffset  quantisation zero point of the model input
     * @param[in]  quantScale   quantisation scale of the model input
     * @return true if successful, false in case of error.
     */
    bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset,
                float quantScale);

    std::unique_ptr<MFCC> m_mfcc;

    // Actual buffers to be populated. The "time" axis is row-major here;
    // Quantise() transposes into the column-major layout the model expects.
    Array2d<float> m_mfccBuf;   // Contiguous buffer 1D: MFCC
    Array2d<float> m_delta1Buf; // Contiguous buffer 1D: Delta 1
    Array2d<float> m_delta2Buf; // Contiguous buffer 1D: Delta 2

    uint32_t m_windowLen;    // Window length for MFCC
    uint32_t m_windowStride; // Window stride len for MFCC
    AudioWindow m_window;    // Sliding window

protected:
    /**
     * @brief Computes the first and second order deltas for the
     *        MFCC buffers - they are assumed to be populated.
     *
     * @param[in]  mfcc   MFCC buffers
     * @param[out] delta1 result of the first diff computation
     * @param[out] delta2 result of the second diff computation
     *
     * @return true if successful, false otherwise
     */
    static bool ComputeDeltas(Array2d<float>& mfcc,
                              Array2d<float>& delta1,
                              Array2d<float>& delta2);

    /**
     * @brief Given a 2D vector of floats, computes the mean
     * @param[in] vec vector of vector of floats
     * @return mean value
     */
    static float GetMean(Array2d<float>& vec);

    /**
     * @brief Given a 2D vector of floats, computes the stddev
     * @param[in] vec  vector of vector of floats
     * @param[in] mean mean value of the vector passed in
     * @return stddev value
     */
    static float GetStdDev(Array2d<float>& vec, float mean);

    /**
     * @brief Given a 2D vector of floats, normalises it in place
     *        using the mean and the stddev.
     * @param[in,out] vec vector of vector of floats
     */
    static void NormaliseVec(Array2d<float>& vec);

    /**
     * @brief Normalises the MFCC and delta buffers in place.
     */
    void Normalise();

    /**
     * @brief Given the quantisation and data type limits, computes
     *        the quantised value of a floating point input datum.
     * @param[in] elem        Element to be quantised
     * @param[in] quantScale  Scale
     * @param[in] quantOffset Offset
     * @param[in] minVal      Numerical limit - minimum
     * @param[in] maxVal      Numerical limit - maximum
     * @return floating point quantised value
     */
    static float GetQuantElem(
        float elem,
        float quantScale,
        int quantOffset,
        float minVal,
        float maxVal);

    /**
     * @brief Quantises the MFCC and delta buffers, and places them
     *        in the output buffer. While doing so, it transposes
     *        the data. Reason: Buffers in this class are arranged
     *        for "time" axis to be row major. Primary reason for
     *        this being the convolution speed up (as we can use
     *        contiguous memory). The output, however, requires the
     *        time axis to be in column major arrangement.
     * @param[out] outputBuf   pointer to the output buffer; must hold
     *                         3 * numMfccFeatures * numMfccVectors elements
     * @param[in]  quantOffset quantisation offset (zero point)
     * @param[in]  quantScale  quantisation scale
     * @return true (always succeeds)
     */
    template<typename T>
    bool Quantise(T* outputBuf, int quantOffset, float quantScale)
    {
        // The three feature groups (MFCC, delta1, delta2) are interleaved
        // per time step: each column of the output holds numMfccFeatures
        // values of each group, back to back.
        T* outputBufMfcc = outputBuf;
        T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures;
        T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures;

        // After a group's features are written, skip over the slots of the
        // other two groups to land on this group's slot in the next column.
        const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // (3 vectors - 1 vector)

        // lowest() rather than min(): identical for integral T, but min()
        // would be the smallest *positive* value for floating point T.
        const float minVal = std::numeric_limits<T>::lowest();
        const float maxVal = std::numeric_limits<T>::max();

        // We need to do a transpose while copying and concatenating the tensor.
        for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j)
        {
            for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i)
            {
                *outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
                    this->m_mfccBuf(i, j), quantScale,
                    quantOffset, minVal, maxVal));
                *outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
                    this->m_delta1Buf(i, j), quantScale,
                    quantOffset, minVal, maxVal));
                *outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
                    this->m_delta2Buf(i, j), quantScale,
                    quantOffset, minVal, maxVal));
            }
            outputBufMfcc += ptrIncr;
            outputBufD1 += ptrIncr;
            outputBufD2 += ptrIncr;
        }
        return true;
    }
};
#endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP