MLECO-3077: Add ASR use case API * Minor adjustments to doc strings in KWS * Remove unused score threshold in KWS Signed-off-by: Richard Burton <richard.burton@arm.com> Change-Id: Ie1c5bf6f7bdbebb853b6a10cb7ba1c4a1d9a76c9

commit: c291144b7f08c21d08cdaf79cc64dc420ca70070 [log] [tgz]
author: Richard Burton <richard.burton@arm.com> Fri Apr 22 09:08:21 2022 +0100
committer: Richard Burton <richard.burton@arm.com> Fri Apr 22 09:08:21 2022 +0100
tree: 1b91c38f7dd479a0c13772a1e1da52079d06237c
parent: b1904b11d15da48c7ead4e6bb85c3e671956ab03 [diff]
diff --git a/source/use_case/asr/include/AsrResult.hpp b/source/use_case/asr/include/AsrResult.hpp
index b12ed7d..ed826d0 100644
--- a/source/use_case/asr/include/AsrResult.hpp
+++ b/source/use_case/asr/include/AsrResult.hpp

@@ -25,7 +25,7 @@
 namespace app {
 namespace asr {
 
-    using ResultVec = std::vector < arm::app::ClassificationResult >;
+    using ResultVec = std::vector<arm::app::ClassificationResult>;
 
     /* Structure for holding ASR result. */
     class AsrResult {

diff --git a/source/use_case/asr/include/Wav2LetterModel.hpp b/source/use_case/asr/include/Wav2LetterModel.hpp
index 55395b9..895df2b 100644
--- a/source/use_case/asr/include/Wav2LetterModel.hpp
+++ b/source/use_case/asr/include/Wav2LetterModel.hpp

@@ -36,6 +36,9 @@
         static constexpr uint32_t ms_outputRowsIdx = 2;
         static constexpr uint32_t ms_outputColsIdx = 3;
 
+        static constexpr uint32_t ms_blankTokenIdx   = 28;
+        static constexpr uint32_t ms_numMfccFeatures = 13;
+
     protected:
         /** @brief   Gets the reference to op resolver interface class. */
         const tflite::MicroOpResolver& GetOpResolver() override;

diff --git a/source/use_case/asr/include/Wav2LetterPostprocess.hpp b/source/use_case/asr/include/Wav2LetterPostprocess.hpp
index 29eb548..45defa5 100644
--- a/source/use_case/asr/include/Wav2LetterPostprocess.hpp
+++ b/source/use_case/asr/include/Wav2LetterPostprocess.hpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,93 +17,90 @@
 #ifndef ASR_WAV2LETTER_POSTPROCESS_HPP
 #define ASR_WAV2LETTER_POSTPROCESS_HPP
 
-#include "TensorFlowLiteMicro.hpp" /* TensorFlow headers. */
+#include "TensorFlowLiteMicro.hpp"   /* TensorFlow headers. */
+#include "BaseProcessing.hpp"
+#include "AsrClassifier.hpp"
+#include "AsrResult.hpp"
 #include "log_macros.h"
 
 namespace arm {
 namespace app {
-namespace audio {
-namespace asr {
 
     /**
      * @brief   Helper class to manage tensor post-processing for "wav2letter"
      *          output.
      */
-    class Postprocess {
+    class ASRPostProcess : public BasePostProcess {
     public:
+        bool m_lastIteration = false;   /* Flag to set if processing the last set of data for a clip. */
+
         /**
-         * @brief       Constructor
-         * @param[in]   contextLen      Left and right context length for
-         *                              output tensor.
-         * @param[in]   innerLen        This is the length of the section
-         *                              between left and right context.
-         * @param[in]   blankTokenIdx  Blank token index.
+         * @brief           Constructor
+         * @param[in]       outputTensor       Pointer to the output Tensor.
+         * @param[in]       labels             Vector of string labels to identify each output of the model.
+         * @param[in/out]   result            Vector of classification results to store decoded outputs.
+         * @param[in]       outputContextLen   Left/right context length for output tensor.
+         * @param[in]       blankTokenIdx      Index in the labels that the "Blank token" takes.
+         * @param[in]       reductionAxis      The axis that the logits of each time step is on.
          **/
-        Postprocess(uint32_t contextLen,
-                    uint32_t innerLen,
-                    uint32_t blankTokenIdx);
-
-        Postprocess() = delete;
-        ~Postprocess() = default;
+        ASRPostProcess(AsrClassifier& classifier, TfLiteTensor* outputTensor,
+                const std::vector<std::string>& labels, asr::ResultVec& result,
+                uint32_t outputContextLen,
+                uint32_t blankTokenIdx, uint32_t reductionAxis);
 
         /**
-         * @brief       Erases the required part of the tensor based
-         *              on context lengths set up during initialisation.
-         * @param[in]   tensor          Pointer to the tensor.
-         * @param[in]   axisIdx         Index of the axis on which erase is
-         *                              performed.
-         * @param[in]   lastIteration   Flag to signal this is the
-         *                              last iteration in which case
-         *                              the right context is preserved.
-         * @return      true if successful, false otherwise.
-         */
-        bool Invoke(TfLiteTensor*  tensor,
-                    uint32_t axisIdx,
-                    bool lastIteration = false);
+         * @brief    Should perform post-processing of the result of inference then
+         *           populate ASR result data for any later use.
+         * @return   true if successful, false otherwise.
+         **/
+        bool DoPostProcess() override;
+
+        /** @brief   Gets the output inner length for post-processing. */
+        static uint32_t GetOutputInnerLen(const TfLiteTensor*, uint32_t outputCtxLen);
+
+        /** @brief   Gets the output context length (left/right) for post-processing. */
+        static uint32_t GetOutputContextLen(const Model& model, uint32_t inputCtxLen);
+
+        /** @brief   Gets the number of feature vectors to be computed. */
+        static uint32_t GetNumFeatureVectors(const Model& model);
 
     private:
-        uint32_t    m_contextLen;      /* lengths of left and right contexts. */
-        uint32_t    m_innerLen;        /* Length of inner context. */
-        uint32_t    m_totalLen;        /* Total length of the required axis. */
-        uint32_t    m_countIterations; /* Current number of iterations. */
-        uint32_t    m_blankTokenIdx;   /* Index of the labels blank token. */
-        /**
-         * @brief       Checks if the tensor and axis index are valid
-         *              inputs to the object - based on how it has been
-         *              initialised.
-         * @return      true if valid, false otherwise.
-         */
-        bool IsInputValid(TfLiteTensor*  tensor,
-                          const uint32_t axisIdx) const;
+        AsrClassifier& m_classifier;                /* ASR Classifier object. */
+        TfLiteTensor* m_outputTensor;               /* Model output tensor. */
+        const std::vector<std::string>& m_labels;   /* ASR Labels. */
+        asr::ResultVec & m_results;                 /* Results vector for a single inference. */
+        uint32_t m_outputContextLen;                /* lengths of left/right contexts for output. */
+        uint32_t m_outputInnerLen;                  /* Length of output inner context. */
+        uint32_t m_totalLen;                        /* Total length of the required axis. */
+        uint32_t m_countIterations;                 /* Current number of iterations. */
+        uint32_t m_blankTokenIdx;                   /* Index of the labels blank token. */
+        uint32_t m_reductionAxisIdx;                /* Axis containing output logits for a single step. */
 
         /**
-         * @brief       Gets the tensor data element size in bytes based
-         *              on the tensor type.
-         * @return      Size in bytes, 0 if not supported.
+         * @brief    Checks if the tensor and axis index are valid
+         *           inputs to the object - based on how it has been initialised.
+         * @return   true if valid, false otherwise.
+         */
+        bool IsInputValid(TfLiteTensor*  tensor,
+                          uint32_t axisIdx) const;
+
+        /**
+         * @brief    Gets the tensor data element size in bytes based
+         *           on the tensor type.
+         * @return   Size in bytes, 0 if not supported.
          */
         static uint32_t GetTensorElementSize(TfLiteTensor* tensor);
 
         /**
-         * @brief       Erases sections from the data assuming row-wise
-         *              arrangement along the context axis.
-         * @return      true if successful, false otherwise.
+         * @brief    Erases sections from the data assuming row-wise
+         *           arrangement along the context axis.
+         * @return   true if successful, false otherwise.
          */
         bool EraseSectionsRowWise(uint8_t* ptrData,
-                                  const uint32_t strideSzBytes,
-                                  const bool lastIteration);
-
-        /**
-         * @brief       Erases sections from the data assuming col-wise
-         *              arrangement along the context axis.
-         * @return      true if successful, false otherwise.
-         */
-        static bool EraseSectionsColWise(const uint8_t* ptrData,
-                                  const uint32_t strideSzBytes,
-                                  const bool lastIteration);
+                                  uint32_t strideSzBytes,
+                                  bool lastIteration);
     };
 
-} /* namespace asr */
-} /* namespace audio */
 } /* namespace app */
 } /* namespace arm */
 

diff --git a/source/use_case/asr/include/Wav2LetterPreprocess.hpp b/source/use_case/asr/include/Wav2LetterPreprocess.hpp
index 13d1589..8c12b3d 100644
--- a/source/use_case/asr/include/Wav2LetterPreprocess.hpp
+++ b/source/use_case/asr/include/Wav2LetterPreprocess.hpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,49 +21,44 @@
 #include "Wav2LetterMfcc.hpp"
 #include "AudioUtils.hpp"
 #include "DataStructures.hpp"
+#include "BaseProcessing.hpp"
 #include "log_macros.h"
 
 namespace arm {
 namespace app {
-namespace audio {
-namespace asr {
 
     /* Class to facilitate pre-processing calculation for Wav2Letter model
      * for ASR. */
-    using AudioWindow = SlidingWindow <const int16_t>;
+    using AudioWindow = audio::SlidingWindow<const int16_t>;
 
-    class Preprocess {
+    class ASRPreProcess : public BasePreProcess {
     public:
         /**
          * @brief       Constructor.
-         * @param[in]   numMfccFeatures   Number of MFCC features per window.
-         * @param[in]   windowLen         Number of elements in a window.
-         * @param[in]   windowStride      Stride (in number of elements) for
-         *                                moving the window.
-         * @param[in]   numMfccVectors    Number of MFCC vectors per window.
+         * @param[in]   inputTensor        Pointer to the TFLite Micro input Tensor.
+         * @param[in]   numMfccFeatures    Number of MFCC features per window.
+         * @param[in]   mfccWindowLen      Number of audio elements to calculate MFCC features per window.
+         * @param[in]   mfccWindowStride   Stride (in number of elements) for moving the MFCC window.
+         * @param[in]   mfccWindowStride   Number of MFCC vectors that need to be calculated
+         *                                 for an inference.
          */
-        Preprocess(
-            uint32_t  numMfccFeatures,
-            uint32_t  windowLen,
-            uint32_t  windowStride,
-            uint32_t  numMfccVectors);
-        Preprocess() = delete;
-        ~Preprocess() = default;
+        ASRPreProcess(TfLiteTensor* inputTensor,
+                uint32_t  numMfccFeatures,
+                uint32_t  audioWindowLen,
+                uint32_t  mfccWindowLen,
+                uint32_t  mfccWindowStride);
 
         /**
          * @brief       Calculates the features required from audio data. This
          *              includes MFCC, first and second order deltas,
          *              normalisation and finally, quantisation. The tensor is
-         *              populated with feature from a given window placed along
+         *              populated with features from a given window placed along
          *              in a single row.
          * @param[in]   audioData      Pointer to the first element of audio data.
          * @param[in]   audioDataLen   Number of elements in the audio data.
-         * @param[in]   tensor         Tensor to be populated.
          * @return      true if successful, false in case of error.
          */
-        bool Invoke(const int16_t * audioData,
-                    uint32_t  audioDataLen,
-                    TfLiteTensor *  tensor);
+        bool DoPreProcess(const void* audioData, size_t audioDataLen) override;
 
     protected:
          /**
@@ -80,32 +75,16 @@
                                    Array2d<float>& delta2);
 
         /**
-         * @brief       Given a 2D vector of floats, computes the mean.
-         * @param[in]   vec   Vctor of vector of floats.
-         * @return      Mean value.
-         */
-        static float GetMean(Array2d<float>& vec);
-
-        /**
-         * @brief       Given a 2D vector of floats, computes the stddev.
-         * @param[in]   vec    Vector of vector of floats.
-         * @param[in]   mean   Mean value of the vector passed in.
-         * @return      stddev value.
-         */
-        static float GetStdDev(Array2d<float>& vec,
-                               const float mean);
-
-        /**
-         * @brief           Given a 2D vector of floats, normalises it using
-         *                  the mean and the stddev.
+         * @brief           Given a 2D vector of floats, rescale it to have mean of 0 and
+        *                   standard deviation of 1.
          * @param[in,out]   vec   Vector of vector of floats.
          */
-        static void NormaliseVec(Array2d<float>& vec);
+        static void StandardizeVecF32(Array2d<float>& vec);
 
         /**
-         * @brief   Normalises the MFCC and delta buffers.
+         * @brief   Standardizes all the MFCC and delta buffers to have mean 0 and std. dev 1.
          */
-        void Normalise();
+        void Standarize();
 
         /**
          * @brief       Given the quantisation and data type limits, computes
@@ -139,7 +118,7 @@
          */
         template <typename T>
         bool Quantise(
-                T *             outputBuf,
+                T*              outputBuf,
                 const uint32_t  outputBufSz,
                 const float     quantScale,
                 const int       quantOffset)
@@ -160,15 +139,15 @@
             const float maxVal = std::numeric_limits<T>::max();
 
             /* Need to transpose while copying and concatenating the tensor. */
-            for (uint32_t j = 0; j < this->m_numFeatVectors; ++j) {
+            for (uint32_t j = 0; j < this->m_numFeatureFrames; ++j) {
                 for (uint32_t i = 0; i < this->m_numMfccFeats; ++i) {
-                    *outputBufMfcc++ = static_cast<T>(Preprocess::GetQuantElem(
+                    *outputBufMfcc++ = static_cast<T>(ASRPreProcess::GetQuantElem(
                             this->m_mfccBuf(i, j), quantScale,
                             quantOffset, minVal, maxVal));
-                    *outputBufD1++ = static_cast<T>(Preprocess::GetQuantElem(
+                    *outputBufD1++ = static_cast<T>(ASRPreProcess::GetQuantElem(
                             this->m_delta1Buf(i, j), quantScale,
                             quantOffset, minVal, maxVal));
-                    *outputBufD2++ = static_cast<T>(Preprocess::GetQuantElem(
+                    *outputBufD2++ = static_cast<T>(ASRPreProcess::GetQuantElem(
                             this->m_delta2Buf(i, j), quantScale,
                             quantOffset, minVal, maxVal));
                 }
@@ -181,23 +160,22 @@
         }
 
     private:
-        Wav2LetterMFCC      m_mfcc;            /* MFCC instance. */
+        audio::Wav2LetterMFCC   m_mfcc;          /* MFCC instance. */
+        TfLiteTensor*           m_inputTensor;   /* Model input tensor. */
 
         /* Actual buffers to be populated. */
-        Array2d<float>      m_mfccBuf;         /* Contiguous buffer 1D: MFCC */
-        Array2d<float>      m_delta1Buf;       /* Contiguous buffer 1D: Delta 1 */
-        Array2d<float>      m_delta2Buf;       /* Contiguous buffer 1D: Delta 2 */
+        Array2d<float>   m_mfccBuf;              /* Contiguous buffer 1D: MFCC */
+        Array2d<float>   m_delta1Buf;            /* Contiguous buffer 1D: Delta 1 */
+        Array2d<float>   m_delta2Buf;            /* Contiguous buffer 1D: Delta 2 */
 
-        uint32_t            m_windowLen;       /* Window length for MFCC. */
-        uint32_t            m_windowStride;    /* Window stride len for MFCC. */
-        uint32_t            m_numMfccFeats;    /* Number of MFCC features per window. */
-        uint32_t            m_numFeatVectors;  /* Number of m_numMfccFeats. */
-        AudioWindow         m_window;          /* Sliding window. */
+        uint32_t         m_mfccWindowLen;        /* Window length for MFCC. */
+        uint32_t         m_mfccWindowStride;     /* Window stride len for MFCC. */
+        uint32_t         m_numMfccFeats;         /* Number of MFCC features per window. */
+        uint32_t         m_numFeatureFrames;     /* How many sets of m_numMfccFeats. */
+        AudioWindow      m_mfccSlidingWindow;    /* Sliding window to calculate MFCCs. */
 
     };
 
-} /* namespace asr */
-} /* namespace audio */
 } /* namespace app */
 } /* namespace arm */
commit	c291144b7f08c21d08cdaf79cc64dc420ca70070	[log] [tgz]
author	Richard Burton <richard.burton@arm.com>	Fri Apr 22 09:08:21 2022 +0100
committer	Richard Burton <richard.burton@arm.com>	Fri Apr 22 09:08:21 2022 +0100
tree	1b91c38f7dd479a0c13772a1e1da52079d06237c
parent	b1904b11d15da48c7ead4e6bb85c3e671956ab03 [diff]