blob: 80c568439bf5a598bd171ceb987a5c08a790ce6a [file] [log] [blame]
//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
#include <cstdint>
#include <limits>
#include <numeric>
#include <vector>

#include "DataStructures.hpp"
#include "SlidingWindow.hpp"
#include "MFCC.hpp"
/* Class to facilitate pre-processing calculation for Wav2Letter model
* for ASR */
/* Sliding window over read-only float samples; used as the type of Preprocess::_m_window */
using AudioWindow = SlidingWindow<const float>;
/**
 * @brief Pre-processing helper for the Wav2Letter ASR model. Holds an MFCC
 *        instance together with the MFCC, first order delta and second
 *        order delta buffers, and exposes Invoke() to turn raw audio into
 *        quantised features.
 */
class Preprocess
{
public:
    MFCC _m_mfcc;                   /* MFCC instance */

    /* Actual buffers to be populated. _Quantise reads them as
     * (feature index, vector index). */
    Array2d<float> _m_mfccBuf;      /* MFCC features */
    Array2d<float> _m_delta1Buf;    /* First order deltas */
    Array2d<float> _m_delta2Buf;    /* Second order deltas */

    uint32_t _m_windowLen;          /* Window length for MFCC */
    uint32_t _m_windowStride;       /* Window stride len for MFCC */
    AudioWindow _m_window;          /* Sliding window */

    /**
     * @brief       Constructor
     * @param[in]   windowLen      number of elements in a window
     * @param[in]   windowStride   stride (in number of elements) for
     *                             moving the window
     * @param[in]   mfccInst       MFCC instance to use for the feature
     *                             computation (taken by value)
     */
    Preprocess(
        const uint32_t windowLen,
        const uint32_t windowStride,
        const MFCC mfccInst);

    /* A Preprocess object cannot be created without its parameters. */
    Preprocess() = delete;

    ~Preprocess();

    /**
     * @brief       Calculates the features required from audio data. This
     *              includes MFCC, first and second order deltas,
     *              normalisation and finally, quantisation. The output is
     *              populated with features from a given window placed
     *              along in a single row.
     * @param[in]   audioData      pointer to the first element of audio data
     * @param[in]   audioDataLen   number of elements in the audio data
     * @param[out]  output         vector to be populated with the
     *                             quantised features
     * @param[in]   quantOffset    quantisation offset
     * @param[in]   quantScale     quantisation scale
     * @return      true if successful, false in case of error.
     */
    bool Invoke(const float* audioData,
                const uint32_t audioDataLen,
                std::vector<int8_t>& output,
                int quantOffset,
                float quantScale);

protected:
    /**
     * @brief       Computes the first and second order deltas for the
     *              MFCC buffers - they are assumed to be populated.
     *
     * @param[in]   mfcc     MFCC buffers
     * @param[out]  delta1   result of the first diff computation
     * @param[out]  delta2   result of the second diff computation
     *
     * @return      true if successful, false otherwise
     */
    static bool _ComputeDeltas(Array2d<float>& mfcc,
                               Array2d<float>& delta1,
                               Array2d<float>& delta2);

    /**
     * @brief       Given a 2D array of floats, computes the mean
     * @param[in]   vec   2D array of floats
     * @return      mean value
     */
    static float _GetMean(Array2d<float>& vec);

    /**
     * @brief       Given a 2D array of floats, computes the stddev
     * @param[in]   vec    2D array of floats
     * @param[in]   mean   mean value of the array passed in
     * @return      stddev value
     */
    static float _GetStdDev(Array2d<float>& vec,
                            const float mean);

    /**
     * @brief           Given a 2D array of floats, normalises it using
     *                  the mean and the stddev
     * @param[in,out]   vec   2D array of floats
     */
    static void _NormaliseVec(Array2d<float>& vec);

    /**
     * @brief       Normalises the MFCC and delta buffers
     */
    void _Normalise();

    /**
     * @brief       Given the quantisation and data type limits, computes
     *              the quantised values of a floating point input data.
     * @param[in]   elem          Element to be quantised
     * @param[in]   quantScale    Scale
     * @param[in]   quantOffset   Offset
     * @param[in]   minVal        Numerical limit - minimum
     * @param[in]   maxVal        Numerical limit - maximum
     * @return      floating point quantised value
     */
    static float _GetQuantElem(
        const float elem,
        const float quantScale,
        const int quantOffset,
        const float minVal,
        const float maxVal);

    /**
     * @brief       Quantises the MFCC and delta buffers, and places them
     *              in the output buffer. While doing so, it transposes
     *              the data. Reason: Buffers in this class are arranged
     *              for "time" axis to be row major. Primary reason for
     *              this being the convolution speed up (as we can use
     *              contiguous memory). The output, however, requires the
     *              time axis to be in column major arrangement.
     *
     *              For every MFCC vector j the output receives
     *              3 * m_numMfccFeatures consecutive elements: the MFCC
     *              values, then the delta 1 values, then the delta 2
     *              values of that vector.
     *
     * @tparam      T             output element type; its numeric limits
     *                            are passed to _GetQuantElem as the
     *                            minimum and maximum values
     * @param[out]  outputBuf     pointer to the output buffer; must have
     *                            room for m_numMfccVectors *
     *                            m_numMfccFeatures * 3 elements
     * @param[in]   quantOffset   quantisation offset
     * @param[in]   quantScale    quantisation scale
     * @return      false if outputBuf is null, true otherwise
     */
    template <typename T>
    bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
    {
        if (outputBuf == nullptr) {
            return false;
        }

        const uint32_t numFeatures =
            static_cast<uint32_t>(this->_m_mfcc._m_params.m_numMfccFeatures);
        const uint32_t numVectors =
            static_cast<uint32_t>(this->_m_mfcc._m_params.m_numMfccVectors);
        const uint32_t rowLen = numFeatures * 3; /* MFCC + delta 1 + delta 2 */

        /* lowest() rather than min(): for floating point types min() is
         * the smallest positive value, not the most negative one. For
         * integer types both are identical. */
        const float minVal = static_cast<float>(std::numeric_limits<T>::lowest());
        const float maxVal = static_cast<float>(std::numeric_limits<T>::max());

        /* We need to do a transpose while copying and concatenating
         * the tensor. A single row cursor is advanced so that no pointer
         * beyond one-past-the-end of the output buffer is ever formed. */
        T* row = outputBuf;
        for (uint32_t j = 0; j < numVectors; ++j) {
            T* const rowMfcc = row;
            T* const rowD1   = rowMfcc + numFeatures;
            T* const rowD2   = rowD1 + numFeatures;

            for (uint32_t i = 0; i < numFeatures; ++i) {
                rowMfcc[i] = static_cast<T>(_GetQuantElem(
                    this->_m_mfccBuf(i, j), quantScale,
                    quantOffset, minVal, maxVal));
                rowD1[i] = static_cast<T>(_GetQuantElem(
                    this->_m_delta1Buf(i, j), quantScale,
                    quantOffset, minVal, maxVal));
                rowD2[i] = static_cast<T>(_GetQuantElem(
                    this->_m_delta2Buf(i, j), quantScale,
                    quantOffset, minVal, maxVal));
            }

            row += rowLen;
        }

        return true;
    }
};