//
// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
5#ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
6#define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
7
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <numeric>
#include <vector>
#include "DataStructures.hpp"
#include "SlidingWindow.hpp"
#include "MFCC.hpp"
#include "Wav2LetterMFCC.hpp"
13// Class to facilitate pre-processing calculation for Wav2Letter model for ASR
14using AudioWindow = SlidingWindow<const float>;
15
16class Wav2LetterPreprocessor
17{
18public:
19 Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride,
20 std::unique_ptr<Wav2LetterMFCC> mfccInst);
21
22 /**
23 * @brief Calculates the features required from audio data. This
24 * includes MFCC, first and second order deltas,
25 * normalisation and finally, quantisation. The tensor is
26 * populated with feature from a given window placed along
27 * in a single row.
28 * @param[in] audioData pointer to the first element of audio data
29 * @param[in] audioDataLen number of elements in the audio data
30 * @param[in] tensor tensor to be populated
31 * @return true if successful, false in case of error.
32 */
33 bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset,
34 float quantScale);
35
36 std::unique_ptr<MFCC> m_mfcc;
37
38 // Actual buffers to be populated
39 Array2d<float> m_mfccBuf; // Contiguous buffer 1D: MFCC
40 Array2d<float> m_delta1Buf; // Contiguous buffer 1D: Delta 1
41 Array2d<float> m_delta2Buf; // Contiguous buffer 1D: Delta 2
42
43 uint32_t m_windowLen; // Window length for MFCC
44 uint32_t m_windowStride; // Window stride len for MFCC
45 AudioWindow m_window; // Sliding window
46
47protected:
48 /**
49 * @brief Computes the first and second order deltas for the
50 * MFCC buffers - they are assumed to be populated.
51 *
52 * @param[in] mfcc MFCC buffers
53 * @param[out] delta1 result of the first diff computation
54 * @param[out] delta2 result of the second diff computation
55 *
56 * @return true if successful, false otherwise
57 */
58 static bool ComputeDeltas(Array2d<float>& mfcc,
59 Array2d<float>& delta1,
60 Array2d<float>& delta2);
61
62protected:
63
64 /**
65 * @brief Given a 2D vector of floats, computes the mean
66 * @param[in] vec vector of vector of floats
67 * @return mean value
68 */
69 static float GetMean(Array2d<float>& vec);
70
71 /**
72 * @brief Given a 2D vector of floats, computes the stddev
73 * @param[in] vec vector of vector of floats
74 * @param[in] mean mean value of the vector passed in
75 * @return stddev value
76 */
77 static float GetStdDev(Array2d<float>& vec, float mean);
78
79 /**
80 * @brief Given a 2D vector of floats, normalises it using
81 * the mean and the stddev
82 * @param[in/out] vec vector of vector of floats
83 * @return
84 */
85 static void NormaliseVec(Array2d<float>& vec);
86
87 /**
88 * @brief Normalises the MFCC and delta buffers
89 * @return
90 */
91 void Normalise();
92
93 /**
94 * @brief Given the quantisation and data type limits, computes
95 * the quantised values of a floating point input data.
96 * @param[in] elem Element to be quantised
97 * @param[in] quantScale Scale
98 * @param[in] quantOffset Offset
99 * @param[in] minVal Numerical limit - minimum
100 * @param[in] maxVal Numerical limit - maximum
101 * @return floating point quantised value
102 */
103 static float GetQuantElem(
104 float elem,
105 float quantScale,
106 int quantOffset,
107 float minVal,
108 float maxVal);
109
110 /**
111 * @brief Quantises the MFCC and delta buffers, and places them
112 * in the output buffer. While doing so, it transposes
113 * the data. Reason: Buffers in this class are arranged
114 * for "time" axis to be row major. Primary reason for
115 * this being the convolution speed up (as we can use
116 * contiguous memory). The output, however, requires the
117 * time axis to be in column major arrangement.
118 * @param[in] outputBuf pointer to the output buffer
119 * @param[in] outputBufSz output buffer's size
120 * @param[in] quantScale quantisation scale
121 * @param[in] quantOffset quantisation offset
122 */
123 template<typename T>
124 bool Quantise(T*outputBuf, int quantOffset, float quantScale)
125 {
126 // Populate
127 T* outputBufMfcc = outputBuf;
128 T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures;
129 T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures;
130 const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // (3 vectors - 1 vector)
131
132 const float minVal = std::numeric_limits<T>::min();
133 const float maxVal = std::numeric_limits<T>::max();
134
135 // We need to do a transpose while copying and concatenating the tensor
136 for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j)
137 {
138 for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i)
139 {
140 *outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
141 this->m_mfccBuf(i, j), quantScale,
142 quantOffset, minVal, maxVal));
143 *outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
144 this->m_delta1Buf(i, j), quantScale,
145 quantOffset, minVal, maxVal));
146 *outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
147 this->m_delta2Buf(i, j), quantScale,
148 quantOffset, minVal, maxVal));
149 }
150 outputBufMfcc += ptrIncr;
151 outputBufD1 += ptrIncr;
152 outputBufD2 += ptrIncr;
153 }
154 return true;
155 }
156};
157
158#endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP