MLECO-1252 ASR sample application using the public ArmNN C++ API.

Change-Id: I98cd505b8772a8c8fa88308121bc94135bb45068
Signed-off-by: Éanna Ó Catháin <eanna.ocathain@arm.com>
diff --git a/samples/SpeechRecognition/include/AudioCapture.hpp b/samples/SpeechRecognition/include/AudioCapture.hpp
new file mode 100644
index 0000000..90c2ecc
--- /dev/null
+++ b/samples/SpeechRecognition/include/AudioCapture.hpp
@@ -0,0 +1,62 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <string>
+#include <iostream>
+#include <cmath>
+#include <vector>
+#include <exception>
+
+#include "SlidingWindow.hpp"
+
+namespace asr
+{
+
+/**
+* @brief Class used to capture the audio data loaded from file, and to provide a method of
+* extracting correctly positioned and appropriately sized audio windows.
+*/
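+// Illustrative usage (minSamples and stride values are application-specific):
+//     asr::AudioCapture capture;
+//     std::vector<float> audio = capture.LoadAudioFile("speech.wav");
+//     capture.InitSlidingWindow(audio.data(), audio.size(), minSamples, stride);
+//     while (capture.HasNext()) { std::vector<float> block = capture.Next(); }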
+    class AudioCapture
+    {
+    public:
+
+        SlidingWindow<const float> m_window;
+        int lastReadIdx = 0;
+
+        /**
+        * @brief Default constructor
+        */
+        AudioCapture() = default;
+
+        /**
+        * @brief Function to load the audio data from the input file into memory.
+        */
+        std::vector<float> LoadAudioFile(std::string filePath);
+
+        /**
+        * @brief Function to initialize the sliding window. This will set its position in memory,
+        * its window size and its stride.
+        */
+        void InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride);
+
+        /**
+        * Checks whether there is another block of audio in memory to read
+        */
+        bool HasNext();
+
+        /**
+        * Retrieves the next block of audio if it is available
+        */
+        std::vector<float> Next();
+    };
+} // namespace asr
\ No newline at end of file
diff --git a/samples/SpeechRecognition/include/DataStructures.hpp b/samples/SpeechRecognition/include/DataStructures.hpp
new file mode 100644
index 0000000..9922265
--- /dev/null
+++ b/samples/SpeechRecognition/include/DataStructures.hpp
@@ -0,0 +1,102 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <cstdio>
+#include <iterator>
+
+/**
+ * Class Array2d is a data structure that represents a two dimensional array.
+ * The data is allocated in contiguous memory, arranged row-wise
+ * and individual elements can be accessed with the () operator.
+ * For example a two dimensional array D of size (M, N) can be accessed:
+ *
+ *               _|<------------- col size = N  -------->|
+ *               |  D(r=0, c=0)   D(r=0, c=1)   ... D(r=0, c=N-1)
+ *               |  D(r=1, c=0)   D(r=1, c=1)   ... D(r=1, c=N-1)
+ *               |  ...
+ *    row size = M  ...
+ *               |  ...
+ *               _  D(r=M-1, c=0) D(r=M-1, c=1) ... D(r=M-1, c=N-1)
+ *
+ */
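+// Illustrative usage:
+//     Array2d<float> d(2, 3);   // 2 rows, 3 columns, stored contiguously row-wise
+//     d(1, 2) = 42.0f;          // write an element via the () operator
+//     float v = d(1, 2);        // read the same element back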
+template<typename T>
+class Array2d
+{
+private:
+    size_t m_rows;
+    size_t m_cols;
+    T* m_data;
+
+public:
+    /**
+     * Creates the array2d with the given sizes.
+     *
+     * @param rows  number of rows.
+     * @param cols  number of columns.
+     */
+    Array2d(unsigned rows, unsigned cols)
+    {
+        if (rows == 0 || cols == 0) {
+            printf("Array2d constructor has 0 size.\n");
+            m_rows = 0;
+            m_cols = 0;
+            m_data = nullptr;
+            return;
+        }
+        m_rows = rows;
+        m_cols = cols;
+        m_data = new T[rows * cols];
+    }
+
+    ~Array2d()
+    {
+        delete[] m_data;
+    }
+
+    T& operator() (unsigned int row, unsigned int col)
+    {
+        return m_data[m_cols * row + col];
+    }
+
+    T operator() (unsigned int row, unsigned int col) const
+    {
+        return m_data[m_cols * row + col];
+    }
+
+    /**
+     * Gets the size of the given dimension.
+     * @param dim  dimension index: 0 for rows, 1 for columns.
+     * @return size of the requested dimension, or 0 for an invalid dimension.
+     */
+    size_t size(size_t dim) const
+    {
+        switch (dim)
+        {
+            case 0:
+                return m_rows;
+            case 1:
+                return m_cols;
+            default:
+                return 0;
+        }
+    }
+
+    /**
+     * Gets the array2d total size.
+     */
+    size_t totalSize() const
+    {
+        return m_rows * m_cols;
+    }
+
+    /**
+     * array2d iterator.
+     */
+    using iterator = T*;
+    using const_iterator = T const*;
+
+    iterator begin() { return m_data; }
+    iterator end() { return m_data + totalSize(); }
+    const_iterator begin() const { return m_data; }
+    const_iterator end() const { return m_data + totalSize(); }
+};
diff --git a/samples/SpeechRecognition/include/Decoder.hpp b/samples/SpeechRecognition/include/Decoder.hpp
new file mode 100644
index 0000000..69d97cc
--- /dev/null
+++ b/samples/SpeechRecognition/include/Decoder.hpp
@@ -0,0 +1,63 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+
+namespace asr
+{
+/**
+* @brief Class used to decode the output of the ASR inference
+*/
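+// Illustrative usage (labels and outputVector are supplied by the application):
+//     asr::Decoder decoder(labels);                                   // labels: std::map<int, std::string>
+//     std::string text = decoder.DecodeOutput<int8_t>(outputVector);  // outputVector: raw inference output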
+    class Decoder
+    {
+    public:
+        std::map<int, std::string> m_labels;
+        /**
+        * @brief Constructor
+        * @param[in] labels - map of labels to be used for decoding to text.
+        */
+        Decoder(std::map<int, std::string>& labels);
+
+        /**
+        * @brief Function to decode the output into a text string
+        * @param[in] contextToProcess - the output vector to decode.
+        */
+        template<typename T>
+        std::string DecodeOutput(std::vector<T>& contextToProcess)
+        {
+            const int rowLength = 29; // One score per label in each output row
+
+            std::vector<char> unfilteredText;
+
+            for(size_t row = 0; row < contextToProcess.size() / rowLength; ++row)
+            {
+                std::vector<int16_t> rowVector;
+                for(int j = 0; j < rowLength; ++j)
+                {
+                    rowVector.emplace_back(static_cast<int16_t>(contextToProcess[row * rowLength + j]));
+                }
+
+                int maxIndex = std::distance(rowVector.begin(), std::max_element(rowVector.begin(), rowVector.end()));
+                unfilteredText.emplace_back(this->m_labels.at(maxIndex)[0]);
+            }
+
+            std::string filteredText = FilterCharacters(unfilteredText);
+            return filteredText;
+        }
+
+        /**
+        * @brief Function to filter out unwanted characters
+        * @param[in] unfiltered - the unfiltered output to be processed.
+        */
+        std::string FilterCharacters(std::vector<char>& unfiltered);
+    };
+} // namespace asr
diff --git a/samples/SpeechRecognition/include/MFCC.hpp b/samples/SpeechRecognition/include/MFCC.hpp
new file mode 100644
index 0000000..14b6d9f
--- /dev/null
+++ b/samples/SpeechRecognition/include/MFCC.hpp
@@ -0,0 +1,244 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <vector>
+#include <cstdint>
+#include <cmath>
+#include <limits>
+#include <string>
+
+/* MFCC's consolidated parameters */
+class MfccParams
+{
+public:
+    float       m_samplingFreq;
+    int         m_numFbankBins;
+    float       m_melLoFreq;
+    float       m_melHiFreq;
+    int         m_numMfccFeatures;
+    int         m_frameLen;
+    int         m_frameLenPadded;
+    bool        m_useHtkMethod;
+    int         m_numMfccVectors;
+
+    /** @brief  Constructor */
+    MfccParams(const float samplingFreq, const int numFbankBins,
+               const float melLoFreq, const float melHiFreq,
+               const int numMfccFeats, const int frameLen,
+               const bool useHtkMethod, const int numMfccVectors);
+
+    /* Delete the default constructor */
+    MfccParams()  = delete;
+
+    /* Default destructor */
+    ~MfccParams() = default;
+
+    /** @brief  String representation of parameters */
+    std::string Str();
+};
+
+/**
+ * @brief   Class for MFCC feature extraction.
+ *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
+ *          This class is designed to be generic and self-sufficient but
+ *          certain calculation routines can be overridden to accommodate
+ *          use-case specific requirements.
+ */
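+// Illustrative usage (all parameter values below are hypothetical):
+//     MfccParams params(16000.0f, 128, 0.0f, 8000.0f, 13, 512, false, 296);
+//     MFCC mfcc(params);
+//     mfcc.Init();
+//     std::vector<float> features = mfcc.MfccCompute(audioFrame);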
+class MFCC
+{
+
+public:
+
+    /**
+    * @brief        Extract MFCC features for a single small frame of
+    *               audio data, e.g. 640 samples.
+    * @param[in]    audioData - Vector of audio samples to calculate
+    *               features for.
+    * @return       Vector of extracted MFCC features.
+    **/
+    std::vector<float> MfccCompute(const std::vector<float>& audioData);
+
+    MfccParams _m_params;
+
+    /**
+     * @brief       Constructor
+     * @param[in]   params - MFCC parameters
+    */
+    MFCC(const MfccParams& params);
+
+    /* Delete the default constructor */
+    MFCC() = delete;
+
+    /** @brief  Default destructor */
+    ~MFCC() = default;
+
+    /** @brief  Initialise */
+    void Init();
+
+    /**
+     * @brief        Extract and quantise MFCC features for a single small
+     *               frame of audio data, e.g. 640 samples.
+     * @param[in]    audioData - Vector of audio samples to calculate
+     *               features for.
+     * @param[in]    quantScale - quantisation scale.
+     * @param[in]    quantOffset - quantisation offset
+     * @return      Vector of extracted quantised MFCC features.
+     **/
+    template<typename T>
+    std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
+                                    const float quantScale,
+                                    const int quantOffset)
+    {
+        this->_MfccComputePreFeature(audioData);
+        float minVal = std::numeric_limits<T>::min();
+        float maxVal = std::numeric_limits<T>::max();
+
+        std::vector<T> mfccOut(this->_m_params.m_numMfccFeatures);
+        const size_t numFbankBins = this->_m_params.m_numFbankBins;
+
+        /* Take DCT. Uses matrix mul. */
+        for (size_t i = 0, j = 0; i < mfccOut.size(); ++i, j += numFbankBins)
+        {
+            float sum = 0;
+            for (size_t k = 0; k < numFbankBins; ++k)
+            {
+                sum += this->_m_dctMatrix[j + k] * this->_m_melEnergies[k];
+            }
+            /* Quantize to T. */
+            sum = std::round((sum / quantScale) + quantOffset);
+            mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
+        }
+
+        return mfccOut;
+    }
+
+    /* Constants */
+    static constexpr float logStep = 1.8562979903656 / 27.0;
+    static constexpr float freqStep = 200.0 / 3;
+    static constexpr float minLogHz = 1000.0;
+    static constexpr float minLogMel = minLogHz / freqStep;
+
+protected:
+    /**
+     * @brief       Project input frequency to Mel Scale.
+     * @param[in]   freq - input frequency in floating point
+     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+     *              used for calculation
+     * @return      Mel transformed frequency in floating point
+     **/
+    static float MelScale(const float    freq,
+                          const bool     useHTKMethod = true);
+
+    /**
+     * @brief       Inverse Mel transform - convert MEL warped frequency
+     *              back to normal frequency
+     * @param[in]   melFreq - Mel frequency in floating point
+     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+     *              used for calculation
+     * @return      Real world frequency in floating point
+     **/
+    static float InverseMelScale(const float melFreq,
+                                 const bool  useHTKMethod = true);
+
+    /**
+     * @brief       Populates MEL energies after applying the MEL filter
+     *              bank weights and adding them up to be placed into
+     *              bins, according to the filter bank's first and last
+     *              indices (pre-computed for each filter bank element
+     *              by _CreateMelFilterBank function).
+     * @param[in]   fftVec                  Vector populated with FFT magnitudes
+     * @param[in]   melFilterBank           2D Vector with filter bank weights
+     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
+     *                                      to be used for each bin.
+     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
+     *                                      to be used for each bin.
+     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
+     *                                      populated.
+     * @return      true if successful, false otherwise
+     */
+    virtual bool ApplyMelFilterBank(
+            std::vector<float>&                 fftVec,
+            std::vector<std::vector<float>>&    melFilterBank,
+            std::vector<int32_t>&               filterBankFilterFirst,
+            std::vector<int32_t>&               filterBankFilterLast,
+            std::vector<float>&                 melEnergies);
+
+    /**
+     * @brief           Converts the Mel energies to logarithmic scale
+     * @param[in/out]   melEnergies - 1D vector of Mel energies
+     **/
+    virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
+
+    /**
+     * @brief       Create a matrix used to calculate Discrete Cosine
+     *              Transform.
+     * @param[in]   inputLength - input length of the buffer on which
+     *              DCT will be performed
+     * @param[in]   coefficientCount - Total coefficients per input
+     *              length
+     * @return      1D vector with inputLength x coefficientCount elements
+     *              populated with DCT coefficients.
+     */
+    virtual std::vector<float> CreateDCTMatrix(
+            const int32_t inputLength,
+            const int32_t coefficientCount);
+
+    /**
+     * @brief       Given the low and high Mel values, get the normaliser
+     *              for weights to be applied when populating the filter
+     *              bank.
+     * @param[in]   leftMel - low Mel frequency value
+     * @param[in]   rightMel - high Mel frequency value
+     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+     *              used for calculation
+     * @return      normaliser value
+     */
+    virtual float GetMelFilterBankNormaliser(
+            const float&   leftMel,
+            const float&   rightMel,
+            const bool     useHTKMethod);
+
+private:
+
+    std::vector<float>              _m_frame;
+    std::vector<float>              _m_buffer;
+    std::vector<float>              _m_melEnergies;
+    std::vector<float>              _m_windowFunc;
+    std::vector<std::vector<float>> _m_melFilterBank;
+    std::vector<float>              _m_dctMatrix;
+    std::vector<int32_t>            _m_filterBankFilterFirst;
+    std::vector<int32_t>            _m_filterBankFilterLast;
+    bool                            _m_filterBankInitialised;
+
+    /** @brief      Initialises the filter banks and the DCT matrix. */
+    void _InitMelFilterBank();
+
+    /**
+     * @brief       Signals whether the instance of MFCC has had its
+     *              required buffers initialised
+     * @return      True if initialised, false otherwise
+     **/
+    bool _IsMelFilterBankInited();
+
+    /**
+     * @brief       Create mel filter banks for MFCC calculation.
+     * @return      2D vector of floats
+     **/
+    std::vector<std::vector<float>> _CreateMelFilterBank();
+
+    /**
+     * @brief       Computes and populates internal member buffers used
+     *              in MFCC feature calculation
+     * @param[in]   audioData - 1D vector of floating point audio samples
+     */
+    void _MfccComputePreFeature(const std::vector<float>& audioData);
+
+    /** @brief       Computes the magnitude from an interleaved complex array */
+    void _ConvertToPowerSpectrum();
+
+};
+
diff --git a/samples/SpeechRecognition/include/MathUtils.hpp b/samples/SpeechRecognition/include/MathUtils.hpp
new file mode 100644
index 0000000..5f81fb6
--- /dev/null
+++ b/samples/SpeechRecognition/include/MathUtils.hpp
@@ -0,0 +1,85 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <vector>
+#include <cmath>
+#include <cstdint>
+#include <numeric>
+
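+// Collection of static DSP helper routines used during feature extraction.
+// Illustrative usage (buf is any populated std::vector<float>):
+//     float mean   = MathUtils::MeanF32(buf.data(), buf.size());
+//     float stdDev = MathUtils::StdDevF32(buf.data(), buf.size(), mean);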
+class MathUtils
+{
+
+public:
+
+    /**
+     * @brief       Computes the FFT for the input vector
+     * @param[in]   input       Floating point vector of input elements
+     * @param[out]  fftOutput   Output buffer to be populated by computed
+     *                          FFTs
+     * @return      none
+     */
+    static void FftF32(std::vector<float>& input,
+                       std::vector<float>& fftOutput);
+
+
+    /**
+     * @brief       Computes the dot product of two 1D floating point
+     *              vectors.
+     *              result = sum(srcA[0]*srcB[0] + srcA[1]*srcB[1] + ..)
+     * @param[in]   srcPtrA     pointer to the first element of first
+     *                          array
+     * @param[in]   srcPtrB     pointer to the first element of second
+     *                          array
+     * @param[in]   srcLen      Number of elements in the array/vector
+     * @return      dot product
+     */
+    static float DotProductF32(float* srcPtrA, float* srcPtrB,
+                               const int srcLen);
+
+    /**
+     * @brief       Computes the squared magnitude of floating point
+     *              complex number array.
+     * @param[in]   ptrSrc      pointer to the first element of input
+     *                          array
+     * @param[in]   srcLen      Number of elements in the array/vector
+     * @param[out]  ptrDst      Output buffer to be populated
+     * @param[in]   dstLen      output buffer len (for sanity check only)
+     * @return      true if successful, false otherwise
+     */
+    static bool ComplexMagnitudeSquaredF32(float* ptrSrc,
+                                           const int srcLen,
+                                           float* ptrDst,
+                                           const int dstLen);
+
+    /**
+     * @brief       Computes the natural logarithms of input floating point
+     *              vector
+     * @param[in]   input   Floating point input vector
+     * @param[out]  output  Pre-allocated buffer to be populated with
+     *                      natural log values of each input element
+     * @return      none
+     */
+    static void VecLogarithmF32(std::vector <float>& input,
+                                std::vector <float>& output);
+
+    /**
+     * @brief       Gets the mean of a floating point array of elements
+     * @param[in]   ptrSrc  pointer to the first element
+     * @param[in]   srcLen  Number of elements in the array/vector
+     * @return      average value
+     */
+    static float MeanF32(float* ptrSrc, const uint32_t srcLen);
+
+    /**
+     * @brief       Gets the standard deviation of a floating point array
+     *              of elements
+     * @param[in]   ptrSrc  pointer to the first element
+     * @param[in]   srcLen  Number of elements in the array/vector
+     * @param[in]   mean    pre-computed mean value
+     * @return      standard deviation value
+     */
+    static float StdDevF32(float* ptrSrc, const uint32_t srcLen,
+                           const float mean);
+};
diff --git a/samples/SpeechRecognition/include/Preprocess.hpp b/samples/SpeechRecognition/include/Preprocess.hpp
new file mode 100644
index 0000000..80c5684
--- /dev/null
+++ b/samples/SpeechRecognition/include/Preprocess.hpp
@@ -0,0 +1,175 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "DataStructures.hpp"
+#include "SlidingWindow.hpp"
+#include <numeric>
+#include "MFCC.hpp"
+
+using AudioWindow = SlidingWindow<const float>;
+
+/* Class to facilitate pre-processing calculation for the Wav2Letter ASR model */
+
+class Preprocess
+{
+public:
+
+    MFCC                _m_mfcc;            /* MFCC instance */
+
+    /* Actual buffers to be populated */
+    Array2d<float>      _m_mfccBuf;         /* Contiguous buffer 1D: MFCC */
+    Array2d<float>      _m_delta1Buf;       /* Contiguous buffer 1D: Delta 1 */
+    Array2d<float>      _m_delta2Buf;       /* Contiguous buffer 1D: Delta 2 */
+
+    uint32_t            _m_windowLen;       /* Window length for MFCC */
+    uint32_t            _m_windowStride;    /* Window stride len for MFCC */
+    AudioWindow         _m_window;          /* Sliding window */
+
+    /**
+     * @brief       Constructor
+     * @param[in]   windowLen           number of elements in a window
+     * @param[in]   windowStride        stride (in number of elements) for
+     *                                  moving the window
+     * @param[in]   mfccInst            MFCC instance to use for feature extraction
+     */
+    Preprocess(
+            const uint32_t  windowLen,
+            const uint32_t  windowStride,
+            const MFCC mfccInst);
+    Preprocess() = delete;
+    ~Preprocess();
+
+    /**
+     * @brief       Calculates the features required from audio data. This
+     *              includes MFCC, first and second order deltas,
+     *              normalisation and finally, quantisation. The features
+     *              extracted for each window are placed along a single row
+     *              of the output.
+     * @param[in]   audioData     pointer to the first element of audio data
+     * @param[in]   audioDataLen  number of elements in the audio data
+     * @param[out]  output        output vector to be populated with quantised features
+     * @param[in]   quantOffset   quantisation offset
+     * @param[in]   quantScale    quantisation scale
+     * @return      true if successful, false in case of error.
+     */
+    bool Invoke(const float* audioData,
+                const uint32_t  audioDataLen,
+                std::vector<int8_t>& output,
+                int quantOffset,
+                float quantScale);
+
+
+protected:
+    /**
+     * @brief Computes the first and second order deltas for the
+     *        MFCC buffers - they are assumed to be populated.
+     *
+     * @param[in]  mfcc   MFCC buffers
+     * @param[out] delta1 result of the first diff computation
+     * @param[out] delta2 result of the second diff computation
+     *
+     * @return true if successful, false otherwise
+     */
+    static bool _ComputeDeltas(Array2d<float>& mfcc,
+                               Array2d<float>& delta1,
+                               Array2d<float>& delta2);
+
+    /**
+     * @brief      Given a 2D vector of floats, computes the mean
+     * @param[in]   vec      2D array of floats
+     * @return      mean value
+     */
+    static float _GetMean(Array2d<float>& vec);
+
+    /**
+     * @brief       Given a 2D vector of floats, computes the stddev
+     * @param[in]   vec      2D array of floats
+     * @param[in]   mean     pre-computed mean value of the array
+     * @return      stddev value
+     */
+    static float _GetStdDev(Array2d<float>& vec,
+                            const float mean);
+
+    /**
+     * @brief           Given a 2D array of floats, normalises it using
+     *                  the mean and the stddev
+     * @param[in/out]   vec      2D array of floats
+     */
+    static void _NormaliseVec(Array2d<float>& vec);
+
+    /**
+     * @brief       Normalises the MFCC and delta buffers
+     */
+    void _Normalise();
+
+    /**
+     * @brief       Given the quantisation and data type limits, computes
+     *              the quantised value of a floating point input element.
+     * @param[in]   elem            Element to be quantised
+     * @param[in]   quantScale      Scale
+     * @param[in]   quantOffset     Offset
+     * @param[in]   minVal          Numerical limit - minimum
+     * @param[in]   maxVal          Numerical limit - maximum
+     * @return      floating point quantised value
+     */
+    static float _GetQuantElem(
+            const float     elem,
+            const float     quantScale,
+            const int       quantOffset,
+            const float     minVal,
+            const float     maxVal);
+
+    /**
+     * @brief       Quantises the MFCC and delta buffers, and places them
+     *              in the output buffer. While doing so, it transposes
+     *              the data: buffers in this class keep the "time" axis
+     *              row major, primarily to speed up the convolution through
+     *              contiguous memory access, whereas the output requires the
+     *              time axis to be column major.
+     * @param[in]   outputBuf       pointer to the output buffer
+     * @param[in]   quantOffset     quantisation offset
+     * @param[in]   quantScale      quantisation scale
+     */
+    template <typename T>
+    bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
+    {
+        /* Populate */
+        T* outputBufMfcc = outputBuf;
+        T* outputBufD1 = outputBuf + this->_m_mfcc._m_params.m_numMfccFeatures;
+        T* outputBufD2 = outputBufD1 + this->_m_mfcc._m_params.m_numMfccFeatures;
+        const uint32_t ptrIncr = this->_m_mfcc._m_params.m_numMfccFeatures * 2; /* (3 vectors - 1 vector) */
+
+        const float minVal = std::numeric_limits<T>::min();
+        const float maxVal = std::numeric_limits<T>::max();
+
+        /* We need to do a transpose while copying and concatenating
+         * the tensor*/
+        for (uint32_t j = 0; j < this->_m_mfcc._m_params.m_numMfccVectors; ++j) {
+            for (uint32_t i = 0; i < this->_m_mfcc._m_params.m_numMfccFeatures; ++i)
+            {
+                *outputBufMfcc++ = static_cast<T>(this->_GetQuantElem(
+                        this->_m_mfccBuf(i, j), quantScale,
+                        quantOffset, minVal, maxVal));
+                *outputBufD1++ = static_cast<T>(this->_GetQuantElem(
+                        this->_m_delta1Buf(i, j), quantScale,
+                        quantOffset, minVal, maxVal));
+                *outputBufD2++ = static_cast<T>(this->_GetQuantElem(
+                        this->_m_delta2Buf(i, j), quantScale,
+                        quantOffset, minVal, maxVal));
+            }
+            outputBufMfcc += ptrIncr;
+            outputBufD1 += ptrIncr;
+            outputBufD2 += ptrIncr;
+        }
+
+        return true;
+    }
+};
+
diff --git a/samples/SpeechRecognition/include/SlidingWindow.hpp b/samples/SpeechRecognition/include/SlidingWindow.hpp
new file mode 100644
index 0000000..791a0b7
--- /dev/null
+++ b/samples/SpeechRecognition/include/SlidingWindow.hpp
@@ -0,0 +1,161 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
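+/**
+ * Slides a fixed-size window, with a configurable stride, over a contiguous buffer of T elements.
+ *
+ * Illustrative usage (window and stride sizes are hypothetical):
+ *     SlidingWindow<const float> slider(audio.data(), audio.size(), 512, 160);
+ *     while (slider.HasNext())
+ *     {
+ *         const float* window = slider.Next();  // points at the next 512 elements
+ *     }
+ */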
+template<class T>
+class SlidingWindow
+{
+protected:
+    T* m_start = nullptr;
+    size_t m_dataSize = 0;
+    size_t m_size = 0;
+    size_t m_stride = 0;
+    size_t m_count = 0;
+public:
+
+    /**
+     * Creates the window slider through the given data.
+     *
+     * @param data          pointer to the data to slide through.
+     * @param dataSize      size of the data, in number of T elements.
+     * @param windowSize    sliding window size, in number of T elements.
+     * @param stride        stride size, in number of T elements.
+     */
+    SlidingWindow(T* data, size_t dataSize,
+                  size_t windowSize, size_t stride)
+    {
+        m_start = data;
+        m_dataSize = dataSize;
+        m_size = windowSize;
+        m_stride = stride;
+    }
+
+    SlidingWindow() = default;
+
+    ~SlidingWindow() = default;
+
+    /**
+     * Get the next data window.
+     * @return pointer to the next window, if next window is not available nullptr is returned.
+     */
+    virtual T* Next()
+    {
+        if (HasNext())
+        {
+            m_count++;
+            return m_start + Index() * m_stride;
+        }
+        else
+        {
+            return nullptr;
+        }
+    }
+
+    /**
+     * Checks if the next data portion is available.
+     * @return true if next data portion is available
+     */
+    bool HasNext()
+    {
+        return this->m_count < 1 + this->FractionalTotalStrides() && (this->NextWindowStartIndex() < this->m_dataSize);
+    }
+
+    /**
+     * Resets the slider to the initial position.
+     */
+    virtual void Reset()
+    {
+        m_count = 0;
+    }
+
+    /**
+     * Gets the window size.
+     * @return the window size, in number of T elements.
+     */
+    virtual size_t GetWindowSize()
+    {
+        return m_size;
+    }
+
+    /**
+     * Resets the slider to the start of the new data.
+     * New data size MUST be the same as the old one.
+     * @param newStart pointer to the new data to slide through.
+     */
+    virtual void Reset(T* newStart)
+    {
+        m_start = newStart;
+        Reset();
+    }
+
+    /**
+     * Gets current index of the sliding window.
+     * @return current position of the sliding window in number of strides
+     */
+    size_t Index()
+    {
+        return m_count == 0 ? 0 : m_count - 1;
+    }
+
+    /**
+     * Gets the index from the start of the data where the next window will begin.
+     * While Index() returns the index of sliding window itself this function returns the index of the data
+     * element itself.
+     * @return Index from the start of the data where the next sliding window will begin.
+     */
+    virtual size_t NextWindowStartIndex()
+    {
+        return m_count == 0 ? 0 : m_count * m_stride;
+    }
+
+    /**
+     * Go to given sliding window index.
+     * @param index new position of the sliding window. if index is invalid (greater than possible range of strides)
+     *              then next call to Next() will return nullptr.
+     */
+    void FastForward(size_t index)
+    {
+        m_count = index;
+    }
+
+    /**
+     * Calculates whole number of times the window can stride through the given data.
+     * @return maximum number of strides.
+     */
+    size_t TotalStrides()
+    {
+        if (m_size > m_dataSize)
+        {
+            return 0;
+        }
+        return ((m_dataSize - m_size)/m_stride);
+    }
+
+    /**
+     * Calculates number of times the window can stride through the given data. May not be a whole number.
+     * @return Number of strides to cover all data.
+     */
+    float FractionalTotalStrides()
+    {
+        if(this->m_size > this->m_dataSize)
+        {
+            return this->m_dataSize / static_cast<float>(this->m_size);
+        }
+        else
+        {
+            return (this->m_dataSize - this->m_size) / static_cast<float>(this->m_stride);
+        }
+    }
+
+    /**
+     * Calculates the remaining data left to be processed
+     * @return The remaining unprocessed data
+     */
+    int RemainingData()
+    {
+        return this->m_dataSize - this->NextWindowStartIndex();
+    }
+};
\ No newline at end of file
diff --git a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
new file mode 100644
index 0000000..47ce304
--- /dev/null
+++ b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
@@ -0,0 +1,139 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ArmnnNetworkExecutor.hpp"
+#include "Decoder.hpp"
+#include "MFCC.hpp"
+#include "Preprocess.hpp"
+
+namespace asr
+{
+/**
+ * Generic Speech Recognition pipeline with 3 steps: data pre-processing, inference execution and inference
+ * result post-processing.
+ *
+ */
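+// Illustrative end-to-end flow (variable names other than the pipeline methods are hypothetical):
+//     asr::IPipelinePtr pipeline = asr::CreatePipeline(options, labels);
+//     std::vector<int8_t> input = pipeline->PreProcessing<float, int8_t>(audioBlock, preprocessor);
+//     common::InferenceResults<int8_t> results;
+//     pipeline->Inference<int8_t>(input, results);
+//     pipeline->PostProcessing<int8_t>(results, isFirstWindow, isLastWindow, currentRContext);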
+class ASRPipeline
+{
+public:
+
+    /**
+     * Creates speech recognition pipeline with given network executor and decoder.
+     * @param executor - unique pointer to inference runner
+     * @param decoder - unique pointer to inference results decoder
+     */
+    ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
+                std::unique_ptr<Decoder> decoder);
+
+    /**
+     * @brief Standard audio pre-processing implementation.
+     *
+     * Preprocesses and prepares the data for inference by
+     * extracting the MFCC features.
+     *
+     * @param[in] audio - the raw audio data
+     * @param[in/out] preprocessor - the preprocessor object, which handles the data preparation
+     */
+    template<typename Tin, typename Tout>
+    std::vector<Tout> PreProcessing(std::vector<Tin>& audio, Preprocess& preprocessor)
+    {
+        int audioDataToPreProcess = preprocessor._m_windowLen +
+                ((preprocessor._m_mfcc._m_params.m_numMfccVectors - 1) * preprocessor._m_windowStride);
+        int outputBufferSize = preprocessor._m_mfcc._m_params.m_numMfccVectors
+                * preprocessor._m_mfcc._m_params.m_numMfccFeatures * 3;
+        std::vector<Tout> outputBuffer(outputBufferSize);
+        preprocessor.Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(),
+                            m_executor->GetQuantizationScale());
+        return outputBuffer;
+    }
+
+    /**
+     * @brief Executes inference
+     *
+     * Calls inference runner provided during instance construction.
+     *
+     * @param[in] preprocessedData - input inference data. Data type should be aligned with input tensor.
+     * @param[out] result - raw inference results.
+     */
+    template<typename T>
+    void Inference(const std::vector<T>& preprocessedData, common::InferenceResults<int8_t>& result)
+    {
+        size_t data_bytes = sizeof(T) * preprocessedData.size();
+        m_executor->Run(preprocessedData.data(), data_bytes, result);
+    }
+
+    /**
+     * @brief Standard inference results post-processing implementation.
+     *
+     * Decodes inference results using decoder provided during construction.
+     *
+     * @param[in] inferenceResult - inference results to be decoded.
+     * @param[in/out] isFirstWindow - whether this is the first window of the sliding window; cleared after the first call.
+     * @param[in] isLastWindow - whether this is the last window of the sliding window.
+     * @param[in] currentRContext - the right context of the output text, printed if this is the last window.
+     */
+    template<typename T>
+    void PostProcessing(common::InferenceResults<int8_t>& inferenceResult,
+                                     bool& isFirstWindow,
+                                     bool isLastWindow,
+                                     std::string currentRContext)
+    {
+        int rowLength = 29;
+        int middleContextStart = 49;
+        int middleContextEnd = 99;
+        int leftContextStart = 0;
+        int rightContextStart = 100;
+        int rightContextEnd = 148;
+
+        std::vector<T> contextToProcess;
+
+        // If isFirstWindow we keep the left context of the output
+        if(isFirstWindow)
+        {
+            std::vector<T> chunk(&inferenceResult[0][leftContextStart],
+                    &inferenceResult[0][middleContextEnd * rowLength]);
+            contextToProcess = chunk;
+        }
+        // Else we only keep the middle context of the output
+        else
+        {
+            std::vector<T> chunk(&inferenceResult[0][middleContextStart * rowLength],
+                    &inferenceResult[0][middleContextEnd * rowLength]);
+            contextToProcess = chunk;
+        }
+        std::string output = this->m_decoder->DecodeOutput<T>(contextToProcess);
+        isFirstWindow = false;
+        std::cout << output << std::flush;
+
+        // If this is the last window, we print the right context of the output
+        if(isLastWindow)
+        {
+            std::vector<T> rContext(&inferenceResult[0][rightContextStart*rowLength],
+                    &inferenceResult[0][rightContextEnd * rowLength]);
+            currentRContext = this->m_decoder->DecodeOutput(rContext);
+            std::cout << currentRContext << std::endl;
+        }
+    }
+
+protected:
+    std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> m_executor;
+    std::unique_ptr<Decoder> m_decoder;
+};
+
+using IPipelinePtr = std::unique_ptr<asr::ASRPipeline>;
+
+/**
+ * Constructs speech recognition pipeline based on configuration provided.
+ *
+ * @param[in] config - speech recognition pipeline configuration.
+ * @param[in] labels - ASR labels
+ *
+ * @return unique pointer to asr pipeline.
+ */
+IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels);
+
+} // namespace asr
\ No newline at end of file