samples/SpeechRecognition/include/Preprocess.hpp - ml/armnn - Gitiles

 //
 // Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //

 #pragma once

 #include <cstdint>
 #include <limits>
 #include <numeric>
 #include <vector>

 #include "DataStructures.hpp"
 #include "SlidingWindow.hpp"
 #include "MFCC.hpp"

 /* Sliding window type over const float elements; it is the type of the
  * window member of the Preprocess class declared below, which
  * facilitates pre-processing calculation for the Wav2Letter model
  * for ASR */
 using AudioWindow = SlidingWindow <const float>;

 /**
  * @brief   Pre-processing helper for the Wav2Letter ASR model. Holds the
  *          MFCC calculator, the MFCC / delta buffers and the sliding
  *          window, and exposes Invoke() to turn audio samples into a
  *          quantised feature buffer.
  */
 class Preprocess
 {
 public:

     MFCC                _m_mfcc;            /* MFCC instance */

     /* Actual buffers to be populated */
     Array2d<float>      _m_mfccBuf;         /* Contiguous buffer 1D: MFCC */
     Array2d<float>      _m_delta1Buf;       /* Contiguous buffer 1D: Delta 1 */
     Array2d<float>      _m_delta2Buf;       /* Contiguous buffer 1D: Delta 2 */

     uint32_t            _m_windowLen;       /* Window length for MFCC */
     uint32_t            _m_windowStride;    /* Window stride len for MFCC */
     AudioWindow         _m_window;          /* Sliding window */

     /**
      * @brief       Constructor
      * @param[in]   windowLen       number of elements in a window
      * @param[in]   windowStride    stride (in number of elements) for
      *                              moving the window
      * @param[in]   mfccInst        MFCC instance, passed by value.
      *                              NOTE(review): presumably stored in
      *                              _m_mfcc - the definition is not part
      *                              of this header, confirm there.
      */
     Preprocess(
             const uint32_t  windowLen,
             const uint32_t  windowStride,
             const MFCC mfccInst);
     Preprocess() = delete;
     ~Preprocess();

     /**
      * @brief       Calculates the features required from audio data. This
      *              includes MFCC, first and second order deltas,
      *              normalisation and finally, quantisation. The output is
      *              populated with features from a given window placed
      *              along a single row.
      * @param[in]   audioData     pointer to the first element of audio data
      * @param[in]   audioDataLen  number of elements in the audio data
      * @param[out]  output        buffer to be populated with the
      *                            quantised features
      * @param[in]   quantOffset   quantisation offset
      * @param[in]   quantScale    quantisation scale
      * @return      true if successful, false in case of error.
      */
     bool Invoke(const float* audioData,
                 const uint32_t  audioDataLen,
                 std::vector<int8_t>& output,
                 int quantOffset,
                 float quantScale);


 protected:
     /**
      * @brief Computes the first and second order deltas for the
      *        MFCC buffers - they are assumed to be populated.
      *
      * @param[in]  mfcc   MFCC buffers
      * @param[out] delta1 result of the first diff computation
      * @param[out] delta2 result of the second diff computation
      *
      * @return true if successful, false otherwise
      */
     static bool _ComputeDeltas(Array2d<float>& mfcc,
                                Array2d<float>& delta1,
                                Array2d<float>& delta2);

     /**
      * @brief       Given a 2D vector of floats, computes the mean
      * @param[in]   vec      2D array of floats
      * @return      mean value
      */
     static float _GetMean(Array2d<float>& vec);

     /**
      * @brief       Given a 2D vector of floats, computes the stddev
      * @param[in]   vec      2D array of floats
      * @param[in]   mean     mean value of the array passed in
      * @return      stddev value
      */
     static float _GetStdDev(Array2d<float>& vec,
                             const float mean);

     /**
      * @brief           Given a 2D vector of floats, normalises it using
      *                  the mean and the stddev
      * @param[in,out]   vec      2D array of floats, normalised in place
      */
     static void _NormaliseVec(Array2d<float>& vec);

     /**
      * @brief       Normalises the MFCC and delta buffers
      */
     void _Normalise();

     /**
      * @brief       Given the quantisation and data type limits, computes
      *              the quantised values of a floating point input data.
      * @param[in]   elem            Element to be quantised
      * @param[in]   quantScale      Scale
      * @param[in]   quantOffset     Offset
      * @param[in]   minVal          Numerical limit - minimum
      * @param[in]   maxVal          Numerical limit - maximum
      * @return      floating point quantised value
      */
     static float _GetQuantElem(
             const float     elem,
             const float     quantScale,
             const int       quantOffset,
             const float     minVal,
             const float     maxVal);

     /**
      * @brief       Quantises the MFCC and delta buffers, and places them
      *              in the output buffer. While doing so, it transposes
      *              the data. Reason: Buffers in this class are arranged
      *              for "time" axis to be row major. Primary reason for
      *              this being the convolution speed up (as we can use
      *              contiguous memory). The output, however, requires the
      *              time axis to be in column major arrangement.
      *
      *              For every MFCC vector j the output receives, in order,
      *              numMfccFeatures MFCC values, numMfccFeatures delta 1
      *              values and numMfccFeatures delta 2 values, so the
      *              function writes 3 * numMfccFeatures * numMfccVectors
      *              elements in total.
      *
      * @tparam      T               element type of the output buffer
      * @param[out]  outputBuf       pointer to the output buffer; the
      *                              caller must provide room for all the
      *                              elements described above (the size is
      *                              not checked here)
      * @param[in]   quantOffset     quantisation offset
      * @param[in]   quantScale      quantisation scale
      * @return      always true
      */
     template <typename T>
     bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
     {
         const uint32_t numFeatures = this->_m_mfcc._m_params.m_numMfccFeatures;
         const uint32_t numVectors  = this->_m_mfcc._m_params.m_numMfccVectors;

         /* lowest() and not min(): for floating point T, min() is the
          * smallest positive value rather than the most negative one.
          * For integral T both are the same value. */
         const float minVal = static_cast<float>(std::numeric_limits<T>::lowest());
         const float maxVal = static_cast<float>(std::numeric_limits<T>::max());

         /* We need to do a transpose while copying and concatenating
          * the tensor. The three segment pointers are derived from the
          * start of the current output row, so that no pointer beyond
          * one-past-the-end of the buffer is ever formed. */
         T* outputRow = outputBuf;
         for (uint32_t j = 0; j < numVectors; ++j)
         {
             T* const outputBufMfcc = outputRow;
             T* const outputBufD1   = outputBufMfcc + numFeatures;
             T* const outputBufD2   = outputBufD1 + numFeatures;

             for (uint32_t i = 0; i < numFeatures; ++i)
             {
                 outputBufMfcc[i] = static_cast<T>(_GetQuantElem(
                         this->_m_mfccBuf(i, j), quantScale,
                         quantOffset, minVal, maxVal));
                 outputBufD1[i] = static_cast<T>(_GetQuantElem(
                         this->_m_delta1Buf(i, j), quantScale,
                         quantOffset, minVal, maxVal));
                 outputBufD2[i] = static_cast<T>(_GetQuantElem(
                         this->_m_delta2Buf(i, j), quantScale,
                         quantOffset, minVal, maxVal));
             }

             /* Move on to the next row: MFCC + delta 1 + delta 2 */
             outputRow = outputBufD2 + numFeatures;
         }

         return true;
     }
 };
	//
	// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
	// SPDX-License-Identifier: MIT
	//

	#pragma once

	#include "DataStructures.hpp"
	#include "SlidingWindow.hpp"
	#include <numeric>
	#include "MFCC.hpp"

	/* Sliding window type over const float elements; it is the type of the
	* window member of the Preprocess class declared below, which
	* facilitates pre-processing calculation for the Wav2Letter model
	* for ASR */
	using AudioWindow = SlidingWindow <const float>;

	/**
	 * @brief   Pre-processing helper for the Wav2Letter ASR model. Holds the
	 *          MFCC calculator, the MFCC / delta buffers and the sliding
	 *          window, and exposes Invoke() to turn audio samples into a
	 *          quantised feature buffer.
	 */
	class Preprocess
	{
	public:

	    MFCC                _m_mfcc;            /* MFCC instance */

	    /* Actual buffers to be populated */
	    Array2d<float>      _m_mfccBuf;         /* Contiguous buffer 1D: MFCC */
	    Array2d<float>      _m_delta1Buf;       /* Contiguous buffer 1D: Delta 1 */
	    Array2d<float>      _m_delta2Buf;       /* Contiguous buffer 1D: Delta 2 */

	    uint32_t            _m_windowLen;       /* Window length for MFCC */
	    uint32_t            _m_windowStride;    /* Window stride len for MFCC */
	    AudioWindow         _m_window;          /* Sliding window */

	    /**
	     * @brief       Constructor
	     * @param[in]   windowLen       number of elements in a window
	     * @param[in]   windowStride    stride (in number of elements) for
	     *                              moving the window
	     * @param[in]   mfccInst        MFCC instance, passed by value.
	     *                              NOTE(review): presumably stored in
	     *                              _m_mfcc - the definition is not part
	     *                              of this header, confirm there.
	     */
	    Preprocess(
	            const uint32_t  windowLen,
	            const uint32_t  windowStride,
	            const MFCC mfccInst);
	    Preprocess() = delete;
	    ~Preprocess();

	    /**
	     * @brief       Calculates the features required from audio data. This
	     *              includes MFCC, first and second order deltas,
	     *              normalisation and finally, quantisation. The output is
	     *              populated with features from a given window placed
	     *              along a single row.
	     * @param[in]   audioData     pointer to the first element of audio data
	     * @param[in]   audioDataLen  number of elements in the audio data
	     * @param[out]  output        buffer to be populated with the
	     *                            quantised features
	     * @param[in]   quantOffset   quantisation offset
	     * @param[in]   quantScale    quantisation scale
	     * @return      true if successful, false in case of error.
	     */
	    bool Invoke(const float* audioData,
	                const uint32_t  audioDataLen,
	                std::vector<int8_t>& output,
	                int quantOffset,
	                float quantScale);


	protected:
	    /**
	     * @brief Computes the first and second order deltas for the
	     *        MFCC buffers - they are assumed to be populated.
	     *
	     * @param[in]  mfcc   MFCC buffers
	     * @param[out] delta1 result of the first diff computation
	     * @param[out] delta2 result of the second diff computation
	     *
	     * @return true if successful, false otherwise
	     */
	    static bool _ComputeDeltas(Array2d<float>& mfcc,
	                               Array2d<float>& delta1,
	                               Array2d<float>& delta2);

	    /**
	     * @brief       Given a 2D vector of floats, computes the mean
	     * @param[in]   vec      2D array of floats
	     * @return      mean value
	     */
	    static float _GetMean(Array2d<float>& vec);

	    /**
	     * @brief       Given a 2D vector of floats, computes the stddev
	     * @param[in]   vec      2D array of floats
	     * @param[in]   mean     mean value of the array passed in
	     * @return      stddev value
	     */
	    static float _GetStdDev(Array2d<float>& vec,
	                            const float mean);

	    /**
	     * @brief           Given a 2D vector of floats, normalises it using
	     *                  the mean and the stddev
	     * @param[in,out]   vec      2D array of floats, normalised in place
	     */
	    static void _NormaliseVec(Array2d<float>& vec);

	    /**
	     * @brief       Normalises the MFCC and delta buffers
	     */
	    void _Normalise();

	    /**
	     * @brief       Given the quantisation and data type limits, computes
	     *              the quantised values of a floating point input data.
	     * @param[in]   elem            Element to be quantised
	     * @param[in]   quantScale      Scale
	     * @param[in]   quantOffset     Offset
	     * @param[in]   minVal          Numerical limit - minimum
	     * @param[in]   maxVal          Numerical limit - maximum
	     * @return      floating point quantised value
	     */
	    static float _GetQuantElem(
	            const float     elem,
	            const float     quantScale,
	            const int       quantOffset,
	            const float     minVal,
	            const float     maxVal);

	    /**
	     * @brief       Quantises the MFCC and delta buffers, and places them
	     *              in the output buffer. While doing so, it transposes
	     *              the data. Reason: Buffers in this class are arranged
	     *              for "time" axis to be row major. Primary reason for
	     *              this being the convolution speed up (as we can use
	     *              contiguous memory). The output, however, requires the
	     *              time axis to be in column major arrangement.
	     *
	     *              For every MFCC vector j the output receives, in order,
	     *              numMfccFeatures MFCC values, numMfccFeatures delta 1
	     *              values and numMfccFeatures delta 2 values, so the
	     *              function writes 3 * numMfccFeatures * numMfccVectors
	     *              elements in total.
	     *
	     * @tparam      T               element type of the output buffer
	     * @param[out]  outputBuf       pointer to the output buffer; the
	     *                              caller must provide room for all the
	     *                              elements described above (the size is
	     *                              not checked here)
	     * @param[in]   quantOffset     quantisation offset
	     * @param[in]   quantScale      quantisation scale
	     * @return      always true
	     */
	    template <typename T>
	    bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
	    {
	        const uint32_t numFeatures = this->_m_mfcc._m_params.m_numMfccFeatures;
	        const uint32_t numVectors  = this->_m_mfcc._m_params.m_numMfccVectors;

	        /* lowest() and not min(): for floating point T, min() is the
	         * smallest positive value rather than the most negative one.
	         * For integral T both are the same value. */
	        const float minVal = static_cast<float>(std::numeric_limits<T>::lowest());
	        const float maxVal = static_cast<float>(std::numeric_limits<T>::max());

	        /* We need to do a transpose while copying and concatenating
	         * the tensor. The three segment pointers are derived from the
	         * start of the current output row, so that no pointer beyond
	         * one-past-the-end of the buffer is ever formed. */
	        T* outputRow = outputBuf;
	        for (uint32_t j = 0; j < numVectors; ++j)
	        {
	            T* const outputBufMfcc = outputRow;
	            T* const outputBufD1   = outputBufMfcc + numFeatures;
	            T* const outputBufD2   = outputBufD1 + numFeatures;

	            for (uint32_t i = 0; i < numFeatures; ++i)
	            {
	                outputBufMfcc[i] = static_cast<T>(_GetQuantElem(
	                        this->_m_mfccBuf(i, j), quantScale,
	                        quantOffset, minVal, maxVal));
	                outputBufD1[i] = static_cast<T>(_GetQuantElem(
	                        this->_m_delta1Buf(i, j), quantScale,
	                        quantOffset, minVal, maxVal));
	                outputBufD2[i] = static_cast<T>(_GetQuantElem(
	                        this->_m_delta2Buf(i, j), quantScale,
	                        quantOffset, minVal, maxVal));
	            }

	            /* Move on to the next row: MFCC + delta 1 + delta 2 */
	            outputRow = outputBufD2 + numFeatures;
	        }

	        return true;
	    }
	};