MLECO-2079 Adding the C++ KWS example Signed-off-by: Eanna O Cathain <eanna.ocathain@arm.com> Change-Id: I81899bbfaada32f478c2e2fc6441eabb94d8d0fc

commit: 23c26277086c78704a17f0dae86da947816320c0 [log] [tgz]
author: George Gekov <george.gekov@arm.com> Mon Aug 16 11:32:10 2021 +0100
committer: Jim Flynn <jim.flynn@arm.com> Sat Feb 05 19:49:06 2022 +0000
tree: 88b02fd1fae3130256d059251788a7ef68d2831f
parent: 922b912fd2d462bac0809bac5669310ad1506310 [diff]
diff --git a/samples/KeywordSpotting/CMakeLists.txt b/samples/KeywordSpotting/CMakeLists.txt
new file mode 100644
index 0000000..e8f2631
--- /dev/null
+++ b/samples/KeywordSpotting/CMakeLists.txt

@@ -0,0 +1,64 @@
+# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+cmake_minimum_required(VERSION 3.0.2)
+
+# Language standards: C99 and C++14.
+set(CMAKE_C_STANDARD                99)
+set(CMAKE_CXX_STANDARD              14)
+
+# Make the standard a requirement => prevent fallback to previous
+# supported standard
+set(CMAKE_C_STANDARD_REQUIRED       ON)
+set(CMAKE_CXX_STANDARD_REQUIRED     ON)
+
+# We want to pass standard C/C++ flags, without gnu extensions
+set(CMAKE_C_EXTENSIONS              OFF)
+set(CMAKE_CXX_EXTENSIONS            OFF)
+
+project (keyword-spotting-example)
+
+# Per-configuration compiler flags. Only Debug and Release are tuned here;
+# any other build type falls back to the CMake defaults for that type.
+set(CMAKE_C_FLAGS_DEBUG         "-DDEBUG -O0 -g -fPIC -pthread")
+set(CMAKE_C_FLAGS_RELEASE       "-DNDEBUG -O3 -fPIC -pthread")
+
+set(CMAKE_CXX_FLAGS_DEBUG       "-DDEBUG -O0 -g -fPIC -pthread")
+set(CMAKE_CXX_FLAGS_RELEASE     "-DNDEBUG -O3 -fPIC -pthread")
+
+# ExternalProject_Add() is used by cmake/unit_tests.cmake.
+include(ExternalProject)
+
+# Build in release mode by default, but respect a build type the user
+# selected explicitly (e.g. -DCMAKE_BUILD_TYPE=RelWithDebInfo).
+# FORCE is required because project() may already have created an empty
+# CMAKE_BUILD_TYPE cache entry.
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (defaults to Release)" FORCE)
+endif()
+
+# Libraries go to <build>/lib, executables to <build>/bin.
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+# Location for downloaded third-party dependencies; can be overridden by the user.
+if (NOT DEFINED DEPENDENCIES_DIR)
+    set(DEPENDENCIES_DIR ${CMAKE_BINARY_DIR}/dependencies)
+endif()
+
+# Locates Arm NN. NOTE(review): presumably defines ARMNN_LIBS and
+# ARMNN_INCLUDE_DIR, which are consumed at the bottom of this file - confirm.
+include(../common/cmake/find_armnn.cmake)
+
+include_directories(include)
+include_directories(../common/include/ArmnnUtils)
+include_directories(../common/include/Utils)
+include_directories(../common/include/Audio)
+
+file(GLOB SOURCES "src/*.cpp")
+file(GLOB COMMON_UTILS_SOURCES "../common/src/Utils/*.cpp")
+file(GLOB COMMON_AUDIO_SOURCES "../common/src/Audio/*.cpp")
+# Main.cpp is kept out of SOURCES so that SOURCES can be reused by the unit
+# test executable (see cmake/unit_tests.cmake); it is added back via APP_MAIN.
+list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/Main.cpp)
+file(GLOB TEST_SOURCES "test/*.cpp")
+file(GLOB APP_MAIN "src/Main.cpp")
+
+# Optional unit tests; the included script relies on the source lists above.
+if(BUILD_UNIT_TESTS)
+    include(cmake/unit_tests.cmake)
+endif()
+
+set(APP_TARGET_NAME "${CMAKE_PROJECT_NAME}")
+
+add_executable("${APP_TARGET_NAME}"  ${COMMON_UTILS_SOURCES} ${COMMON_AUDIO_SOURCES} ${SOURCES} ${APP_MAIN})
+
+target_link_libraries("${APP_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} -lsndfile -lsamplerate)
+target_include_directories("${APP_TARGET_NAME}" PUBLIC ${ARMNN_INCLUDE_DIR} )

diff --git a/samples/KeywordSpotting/Readme.md b/samples/KeywordSpotting/Readme.md
new file mode 100644
index 0000000..914d984
--- /dev/null
+++ b/samples/KeywordSpotting/Readme.md

@@ -0,0 +1,283 @@
+# Keyword Spotting Example
+
+## Introduction
+
+This is a sample code showing keyword spotting using Arm NN public C++ API. The compiled application can take
+
+* an audio file
+
+as input and produce
+
+* recognised keyword in the audio file
+
+as output. The application works with the [fully quantised DS CNN Large model](https://github.com/ARM-software/ML-zoo/raw/68b5fbc77ed28e67b2efc915997ea4477c1d9d5b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/) which is trained to recognize 12 keywords, including an unknown word.
+
+## Dependencies
+
+This example utilises `libsndfile`, `libasound` and `libsamplerate` libraries to capture the raw audio data from file, and to re-sample to the expected sample rate. Top level inference API is provided by Arm NN library.
+
+### Arm NN
+
+Keyword spotting example build system does not trigger Arm NN compilation. Thus, before building the application,
+please ensure that Arm NN libraries and header files are available on your build platform.
+The application executable binary dynamically links with the following Arm NN libraries:
+
+* libarmnn.so
+* libarmnnTfLiteParser.so
+
+The build script searches for available Arm NN libraries in the following order:
+
+1. Inside custom user directory specified by ARMNN_LIB_DIR cmake option.
+2. Inside the current Arm NN repository, assuming that Arm NN was built following [these instructions](../../BuildGuideCrossCompilation.md).
+3. Inside default locations for system libraries, assuming Arm NN was installed from deb packages.
+
+Arm NN header files will be searched in parent directory of found libraries files under `include` directory, i.e.
+libraries found in `/usr/lib` or `/usr/lib64` and header files in `/usr/include` (or `${ARMNN_LIB_DIR}/include`).
+
+Please see [find_armnn.cmake](../common/cmake/find_armnn.cmake) for implementation details.
+
+## Building
+
+There is one flow for building this application:
+
+* native build on a host platform
+
+### Build Options
+
+* ARMNN_LIB_DIR - point to the custom location of the Arm NN libs and headers.
+* BUILD_UNIT_TESTS -  set to `1` to build tests. Additionally to the main application, `keyword-spotting-example-tests`
+unit tests executable will be created.
+
+### Native Build
+
+To build this application on a host platform, firstly ensure that required dependencies are installed:
+For example, for raspberry PI:
+
+```commandline
+sudo apt-get update
+sudo apt-get -yq install libsndfile1-dev
+sudo apt-get -yq install libasound2-dev
+sudo apt-get -yq install libsamplerate-dev
+```
+
+To build demo application, create a build directory:
+
+```commandline
+mkdir build
+cd build
+```
+
+If you have already installed Arm NN and the required libraries:
+
+Inside build directory, run cmake and make commands:
+
+```commandline
+cmake  ..
+make
+```
+
+This will build the following in bin directory:
+
+* `keyword-spotting-example` - application executable
+
+If you have custom Arm NN location, use `ARMNN_LIB_DIR` options:
+
+```commandline
+cmake  -DARMNN_LIB_DIR=/path/to/armnn ..
+make
+```
+
+## Executing
+
+Once the application executable is built, it can be executed with the following options:
+
+* --audio-file-path: Path to the audio file to run keyword spotting on **[REQUIRED]**
+* --model-file-path: Path to the Keyword Spotting model to use **[REQUIRED]**
+
+* --preferred-backends: Takes the preferred backends in preference order, separated by comma.
+                        For example: `CpuAcc,GpuAcc,CpuRef`. Accepted options: [`CpuAcc`, `CpuRef`, `GpuAcc`].
+                        Defaults to `CpuRef` **[OPTIONAL]**
+
+### Keyword Spotting on a supplied audio file
+
+A small selection of suitable wav files containing keywords can be found [here](https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/plain/resources/kws/samples/).
+To run keyword spotting on a supplied audio file and output the result to console:
+
+```commandline
+./keyword-spotting-example --audio-file-path /path/to/audio/file --model-file-path /path/to/model/file
+```
+
+# Application Overview
+
+This section provides a walkthrough of the application, explaining in detail the steps:
+
+1. Initialisation
+    1. Reading from Audio Source
+2. Creating a Network
+    1. Creating Parser and Importing Graph
+    2. Optimizing Graph for Compute Device
+    3. Creating Input and Output Binding Information
+3. Keyword spotting pipeline
+    1. Pre-processing the Captured Audio
+    2. Making Input and Output Tensors
+    3. Executing Inference
+    4. Postprocessing
+    5. Decoding and Processing Inference Output
+
+### Initialisation
+
+##### Reading from Audio Source
+
+After parsing user arguments, the chosen audio file is loaded into an AudioCapture object.
+We use [`AudioCapture`](./include/AudioCapture.hpp) in our main function to capture appropriately sized audio blocks from the source using the
+`Next()` function.
+
+The `AudioCapture` object also re-samples the audio input to a desired sample rate, and sets the number of channels used to one channel (i.e `mono`)
+
+### Creating a Network
+
+All operations with Arm NN and networks are encapsulated in [`ArmnnNetworkExecutor`](./include/ArmnnNetworkExecutor.hpp)
+class.
+
+##### Creating Parser and Importing Graph
+
+The first step with Arm NN SDK is to import a graph from file by using the appropriate parser.
+
+The Arm NN SDK provides parsers for reading graphs from a variety of model formats. In our application we specifically
+focus on `.tflite, .pb, .onnx` models.
+
+Based on the extension of the provided model file, the corresponding parser is created and the network file loaded with
+`CreateNetworkFromBinaryFile()` method. The parser will handle the creation of the underlying Arm NN graph.
+
+Currently this example only supports tflite format model files and uses `ITfLiteParser`:
+
+```c++
+#include "armnnTfLiteParser/ITfLiteParser.hpp"
+
+armnnTfLiteParser::ITfLiteParserPtr parser = armnnTfLiteParser::ITfLiteParser::Create();
+armnn::INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath.c_str());
+```
+
+##### Optimizing Graph for Compute Device
+
+Arm NN supports optimized execution on multiple CPU and GPU devices. Prior to executing a graph, we must select the
+appropriate device context. We do this by creating a runtime context with default options with `IRuntime()`.
+
+For example:
+
+```c++
+#include "armnn/ArmNN.hpp"
+
+auto runtime = armnn::IRuntime::Create(armnn::IRuntime::CreationOptions());
+```
+
+We can optimize the imported graph by specifying a list of backends in order of preference and implement
+backend-specific optimizations. The backends are identified by a string unique to the backend,
+for example `CpuAcc, GpuAcc, CpuRef`.
+
+For example:
+
+```c++
+std::vector<armnn::BackendId> backends{"CpuAcc", "GpuAcc", "CpuRef"};
+```
+
+Internally and transparently, Arm NN splits the graph into subgraphs based on backends, calls an optimize-subgraph
+function on each of them and, if possible, substitutes the corresponding subgraph in the original graph with
+its optimized version.
+
+Using the `Optimize()` function we optimize the graph for inference and load the optimized network onto the compute
+device with `LoadNetwork()`. This function creates the backend-specific workloads
+for the layers and a backend specific workload factory which is called to create the workloads.
+
+For example:
+
+```c++
+armnn::IOptimizedNetworkPtr optNet = Optimize(*network,
+                                              backends,
+                                              runtime->GetDeviceSpec(),
+                                              armnn::OptimizerOptions());
+std::string errorMessage;
+runtime->LoadNetwork(0, std::move(optNet), errorMessage);
+std::cerr << errorMessage << std::endl;
+```
+
+##### Creating Input and Output Binding Information
+
+Parsers can also be used to extract the input information for the network. By calling `GetSubgraphInputTensorNames`
+we extract all the input names and, with `GetNetworkInputBindingInfo`, bind the input points of the graph.
+For example:
+
+```c++
+std::vector<std::string> inputNames = parser->GetSubgraphInputTensorNames(0);
+auto inputBindingInfo = parser->GetNetworkInputBindingInfo(0, inputNames[0]);
+```
+
+The input binding information contains all the essential information about the input. It is a tuple consisting of
+integer identifiers for bindable layers (inputs, outputs) and the tensor info (data type, quantization information,
+number of dimensions, total number of elements).
+
+Similarly, we can get the output binding information for an output layer by using the parser to retrieve output
+tensor names and calling `GetNetworkOutputBindingInfo()`.
+
+### Keyword Spotting pipeline
+
+The keyword spotting pipeline has 3 steps to perform: data pre-processing, run inference and decode inference results.
+
+See [`KeywordSpottingPipeline`](include/KeywordSpottingPipeline.hpp) for more details.
+
+#### Pre-processing the Audio Input
+
+Each frame captured from source is read and stored by the AudioCapture object.
+Its `Next()` function provides us with the correctly positioned window of data, sized appropriately for the given model, to pre-process before inference.
+
+```c++
+std::vector<float> audioBlock = capture.Next();
+...
+std::vector<int8_t> preprocessedData = kwsPipeline->PreProcessing(audioBlock);
+```
+
+The `MFCC` class is then used to extract the Mel-frequency Cepstral Coefficients (MFCCs, [see Wikipedia](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum)) from each stored audio frame in the provided window of audio, to be used as features for the network. MFCCs are the result of computing the dot product of the Discrete Cosine Transform (DCT) Matrix and the log of the Mel energy.
+
+After all the MFCCs needed for an inference have been extracted from the audio data they are concatenated to make the input tensor for the model.
+
+#### Executing Inference
+
+```c++
+common::InferenceResults<int8_t> results;
+...
+kwsPipeline->Inference(preprocessedData, results);
+```
+
+Inference step will call `ArmnnNetworkExecutor::Run` method that will prepare input tensors and execute inference.
+A compute device performs inference for the loaded network using the `EnqueueWorkload()` function of the runtime context.
+For example:
+
+```c++
+//const void* inputData = ...;
+//outputTensors were pre-allocated before
+
+armnn::InputTensors inputTensors = {{ inputBindingInfo.first,armnn::ConstTensor(inputBindingInfo.second, inputData)}};
+runtime->EnqueueWorkload(0, inputTensors, outputTensors);
+```
+
+We allocate memory for output data once and map it to output tensor objects. After successful inference, we read data
+from the pre-allocated output data buffer. See [`ArmnnNetworkExecutor::ArmnnNetworkExecutor`](./src/ArmnnNetworkExecutor.cpp)
+and [`ArmnnNetworkExecutor::Run`](./src/ArmnnNetworkExecutor.cpp) for more details.
+
+#### Postprocessing
+
+##### Decoding
+
+The output from the inference is decoded to obtain the spotted keyword: the word with the highest probability is printed to the console.
+
+```c++
+kwsPipeline->PostProcessing(results, labels,
+                            [](int index, std::string& label, float prob) -> void {
+                                printf("Keyword \"%s\", index %d:, probability %f\n",
+                                        label.c_str(),
+                                        index,
+                                        prob);
+                            });
+```
+
+The produced string is displayed on the console.

diff --git a/samples/KeywordSpotting/cmake/unit_tests.cmake b/samples/KeywordSpotting/cmake/unit_tests.cmake
new file mode 100644
index 0000000..97ba4d4
--- /dev/null
+++ b/samples/KeywordSpotting/cmake/unit_tests.cmake

@@ -0,0 +1,65 @@
+# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# Function to download a file from the Arm Model Zoo
+#
+#   model_zoo_version - git revision (commit hash) of the ML-zoo repository
+#   file_sub_path     - path of the file inside the repository
+#   download_path     - local destination file
+#
+# Aborts the configure step with FATAL_ERROR if the download fails.
+function(download_file_from_modelzoo model_zoo_version file_sub_path download_path)
+    set(MODEL_ZOO_REPO "https://github.com/ARM-software/ML-zoo/raw")
+    # Plain interpolation instead of string(JOIN ...): JOIN needs CMake 3.12,
+    # while this project only requires 3.0.2.
+    set(FILE_URL "${MODEL_ZOO_REPO}/${model_zoo_version}/${file_sub_path}")
+    message(STATUS "Downloading ${FILE_URL} to ${download_path}...")
+    file(DOWNLOAD "${FILE_URL}" "${download_path}"
+        STATUS DOWNLOAD_STATE)
+    # DOWNLOAD_STATE is a two element list: numeric code (0 == success) and message.
+    list(GET DOWNLOAD_STATE 0 RET_VAL)
+    if(NOT RET_VAL EQUAL 0)
+        list(GET DOWNLOAD_STATE 1 RET_MSG)
+        message(FATAL_ERROR "Download failed with error code: ${RET_VAL}; "
+                            "Error message: ${RET_MSG}")
+    endif()
+endfunction()
+
+# Directory holding the test resources (audio sample and model). It is derived
+# from CMAKE_CURRENT_SOURCE_DIR so that the path compiled into the tests via
+# TEST_RESOURCE_DIR is the same directory the files below are written to, even
+# when this sample is not the top-level project.
+set(TEST_RESOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
+file(MAKE_DIRECTORY ${TEST_RESOURCES_DIR})
+add_definitions (-DTEST_RESOURCE_DIR="${TEST_RESOURCES_DIR}")
+set(TEST_TARGET_NAME "${CMAKE_PROJECT_NAME}-tests")
+
+file(GLOB TEST_SOURCES "test/*")
+file(GLOB TESTS_AUDIO_COMMON "../common/test/Audio/*")
+
+# NOTE(review): presumably provides the "catch2-headers" target and
+# TEST_TPIP_INCLUDE used below - confirm against find_catch.cmake.
+include(../common/cmake/find_catch.cmake)
+
+# SOURCES, COMMON_UTILS_SOURCES and COMMON_AUDIO_SOURCES come from the including CMakeLists.txt.
+add_executable("${TEST_TARGET_NAME}" ${COMMON_UTILS_SOURCES} ${COMMON_AUDIO_SOURCES} ${SOURCES} ${TEST_SOURCES} ${TESTS_AUDIO_COMMON})
+
+# Downloads the sample wav file at build time and copies it into the test resources directory.
+ExternalProject_Add(passport
+        URL https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master/sampledata/audiofiles/myVoiceIsMyPassportVerifyMe04.wav
+        DOWNLOAD_NO_EXTRACT 1
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ${CMAKE_COMMAND} -E copy <DOWNLOAD_DIR>/myVoiceIsMyPassportVerifyMe04.wav ${TEST_RESOURCES_DIR}
+        INSTALL_COMMAND ""
+        )
+
+# Make sure the audio sample and the Catch2 headers exist before the tests are built.
+add_dependencies(
+        "${TEST_TARGET_NAME}"
+        "passport"
+        "catch2-headers"
+)
+
+
+set(MODEL_FILENAME          ds_cnn_clustered_int8.tflite)
+set(MODEL_RESOURCES_DIR     ${TEST_RESOURCES_DIR})
+set(DEFAULT_MODEL_PATH      ${MODEL_RESOURCES_DIR}/${MODEL_FILENAME})
+
+# Download the default model
+set(ZOO_COMMON_SUBPATH      "models/keyword_spotting/ds_cnn_large/tflite_clustered_int8")
+set(ZOO_MODEL_SUBPATH       "${ZOO_COMMON_SUBPATH}/${MODEL_FILENAME}")
+set(ZOO_MODEL_VERSION       "68b5fbc77ed28e67b2efc915997ea4477c1d9d5b")
+
+# Runs at configure time (file(DOWNLOAD) inside the helper).
+download_file_from_modelzoo(${ZOO_MODEL_VERSION} ${ZOO_MODEL_SUBPATH} ${DEFAULT_MODEL_PATH})
+
+
+target_include_directories("${TEST_TARGET_NAME}" PUBLIC ${TEST_TPIP_INCLUDE}
+     ${ARMNN_INCLUDE_DIR}
+      ${DEPENDENCIES_DIR} ${TEST_RESOURCES_DIR} ${COMMON_INCLUDE_DIR})
+
+target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} -lsndfile -lsamplerate)
\ No newline at end of file

diff --git a/samples/KeywordSpotting/include/Decoder.hpp b/samples/KeywordSpotting/include/Decoder.hpp
new file mode 100644
index 0000000..aca6831
--- /dev/null
+++ b/samples/KeywordSpotting/include/Decoder.hpp

@@ -0,0 +1,32 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+# pragma once
+
+#include <string>
+#include <map>
+#include "ArmnnNetworkExecutor.hpp"
+
+namespace kws
+{
+
+/**
+* @brief Decodes quantised last layer of model output into the index of the
+*        highest scoring element and its de-quantised score.
+*/
+class Decoder
+{
+public:
+
+    /**
+    * @brief Stores the quantisation parameters used when decoding.
+    * @param[in] quantisationOffset   offset subtracted from every output value
+    * @param[in] quantisationScale    scale applied after the offset is subtracted
+    */
+    Decoder(int quantisationOffset, float quantisationScale)
+        : quantisationOffset(quantisationOffset)
+        , quantisationScale(quantisationScale)
+    {}
+
+    /**
+    * @brief De-quantises the model output and picks the largest element.
+    * @param[in] modelOutput   quantised values of the model output
+    * @return pair of (index of the largest element, its de-quantised value)
+    */
+    std::pair<int, float> decodeOutput(std::vector<int8_t>& modelOutput);
+
+private:
+    int quantisationOffset;
+    float quantisationScale;
+
+};
+} // namespace kws
\ No newline at end of file

diff --git a/samples/KeywordSpotting/include/DsCNNPreprocessor.hpp b/samples/KeywordSpotting/include/DsCNNPreprocessor.hpp
new file mode 100644
index 0000000..b635d1a
--- /dev/null
+++ b/samples/KeywordSpotting/include/DsCNNPreprocessor.hpp

@@ -0,0 +1,39 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#ifndef KEYWORD_SPOTTING_EXAMPLE_DSCNNPREPROCESSOR_HPP
+#define KEYWORD_SPOTTING_EXAMPLE_DSCNNPREPROCESSOR_HPP
+
+#include <numeric>
+#include "DsCnnMfcc.hpp"
+
+namespace kws 
+{
+/**
+ * @brief Audio pre-processor for the DS-CNN keyword spotting model: turns raw
+ *        audio samples into the quantised MFCC features fed to the network.
+ */
+class DsCNNPreprocessor
+{
+public:
+    /**
+    * @brief       Creates the pre-processor.
+    * @param[in]   windowLen     number of samples in one MFCC window
+    * @param[in]   windowStride  number of samples between the starts of consecutive windows
+    * @param[in]   mfccInst      MFCC calculator; ownership is transferred to this object
+    */
+    DsCNNPreprocessor(uint32_t windowLen, uint32_t windowStride,
+                      std::unique_ptr<DsCnnMFCC> mfccInst);
+
+    /**
+    * @brief       Calculates the quantised MFCC features for the audio data.
+    *              A window of m_windowLen samples is moved over the data in
+    *              steps of m_windowStride; the quantised MFCCs of every
+    *              window are appended, in order, to the returned vector.
+    * @param[in]   audioData     pointer to the first element of audio data
+    * @param[in]   dataSize      number of elements in audioData
+    * @param[in]   quantOffset   quantisation offset passed to the MFCC quantisation
+    * @param[in]   quantScale    quantisation scale passed to the MFCC quantisation
+    * @return      concatenated quantised MFCC features of all windows.
+    */
+    std::vector<int8_t> Invoke(const float* audioData, 
+                               size_t dataSize,
+                               int quantOffset,
+                               float quantScale) ;
+
+    uint32_t m_windowLen;       // Window length for MFCC
+    uint32_t m_windowStride;    // Window stride len for MFCC
+    std::unique_ptr<MFCC> m_mfcc; // MFCC calculator applied to every window
+};
+} // namespace kws
+#endif //KEYWORD_SPOTTING_EXAMPLE_DSCNNPREPROCESSOR_HPP

diff --git a/samples/KeywordSpotting/include/DsCnnMfcc.hpp b/samples/KeywordSpotting/include/DsCnnMfcc.hpp
new file mode 100644
index 0000000..851e010
--- /dev/null
+++ b/samples/KeywordSpotting/include/DsCnnMfcc.hpp

@@ -0,0 +1,20 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "MFCC.hpp"
+
+/* MFCC calculator for the DS-CNN model. It only forwards the parameters to the
+ * MFCC base class and does not add any behaviour of its own. */
+class DsCnnMFCC : public MFCC
+{
+public:
+    /* An instance cannot be created without MFCC parameters. */
+    DsCnnMFCC() = delete;
+
+    /* Builds the calculator from the given MFCC parameters. */
+    explicit DsCnnMFCC(MfccParams& params) : MFCC(params) {}
+
+    ~DsCnnMFCC() = default;
+};

diff --git a/samples/KeywordSpotting/include/KeywordSpottingPipeline.hpp b/samples/KeywordSpotting/include/KeywordSpottingPipeline.hpp
new file mode 100644
index 0000000..bd47987
--- /dev/null
+++ b/samples/KeywordSpotting/include/KeywordSpottingPipeline.hpp

@@ -0,0 +1,91 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ArmnnNetworkExecutor.hpp"
+#include "Decoder.hpp"
+#include "MFCC.hpp"
+#include "DsCNNPreprocessor.hpp"
+
+namespace kws
+{
+/**
+ * Generic Keyword Spotting pipeline with 3 steps: data pre-processing, inference execution and inference
+ * result post-processing.
+ *
+ */
+class KWSPipeline
+{
+public:
+
+    /**
+     * Creates keyword spotting pipeline with given network executor, decoder and pre-processor.
+     * @param executor - unique pointer to inference runner
+     * @param decoder - unique pointer to inference results decoder
+     * @param preProcessor - unique pointer to the audio pre-processor
+     */
+    KWSPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
+                std::unique_ptr<Decoder> decoder,
+                std::unique_ptr<DsCNNPreprocessor> preProcessor);
+
+    /**
+     * @brief Standard audio pre-processing implementation.
+     *
+     * Preprocesses and prepares the data for inference by
+     * extracting the MFCC features.
+     *
+     * @param[in] audio - the raw audio data
+     * @return quantised features to be passed to Inference()
+     */
+    std::vector<int8_t> PreProcessing(std::vector<float>& audio);
+
+    /**
+     * @brief Executes inference
+     *
+     * Calls inference runner provided during instance construction.
+     *
+     * @param[in] preprocessedData - input inference data. Data type should be aligned with input tensor.
+     * @param[out] result - raw inference results.
+     */
+    void Inference(const std::vector<int8_t>& preprocessedData, common::InferenceResults<int8_t>& result);
+
+    /**
+     * @brief Standard inference results post-processing implementation.
+     *
+     * Decodes the first inference result using decoder provided during construction
+     * and reports the outcome through the callback.
+     *
+     * @param[in] inferenceResults - inference results to be decoded.
+     * @param[in] labels - maps the decoded index to the keyword it stands for
+     * @param[in] callback - invoked once with (keyword index, keyword label, decoded score)
+     */
+    void PostProcessing(common::InferenceResults<int8_t>& inferenceResults,
+                        std::map<int, std::string>& labels,
+                        const std::function<void (int, std::string&, float)>& callback);
+
+    /**
+     * @brief Get the number of samples for the pipeline input
+     *
+     * @return - number of audio samples needed for one inference
+     */
+    int getInputSamplesSize();
+
+protected:
+    std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> m_executor;
+    std::unique_ptr<Decoder> m_decoder;
+    std::unique_ptr<DsCNNPreprocessor> m_preProcessor;
+};
+
+using IPipelinePtr = std::unique_ptr<kws::KWSPipeline>;
+
+/**
+ * Constructs keyword spotting pipeline based on configuration provided.
+ *
+ * @param[in] config - keyword spotting pipeline configuration.
+ *
+ * @return unique pointer to keyword spotting pipeline.
+ */
+IPipelinePtr CreatePipeline(common::PipelineOptions& config);
+
+} // namespace kws
\ No newline at end of file

diff --git a/samples/KeywordSpotting/src/Decoder.cpp b/samples/KeywordSpotting/src/Decoder.cpp
new file mode 100644
index 0000000..107e25c
--- /dev/null
+++ b/samples/KeywordSpotting/src/Decoder.cpp

@@ -0,0 +1,35 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Decoder.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <iterator>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+/**
+ * De-quantises the model output and returns the position and value of its
+ * largest element.
+ *
+ * @param[in] modelOutput - quantised values of the model output
+ * @return pair of (index of the largest element, its de-quantised value rounded to two decimal places)
+ * @throws std::invalid_argument if modelOutput is empty
+ */
+std::pair<int, float> kws::Decoder::decodeOutput(std::vector<int8_t>& modelOutput)
+{
+    // An empty output has no largest element: std::max_element would return
+    // end(), and dereferencing it below would be undefined behaviour.
+    if (modelOutput.empty())
+    {
+        throw std::invalid_argument("Decoder::decodeOutput: model output is empty");
+    }
+
+    //Normalise vector values into new vector
+    std::vector<float> dequantisedOutput;
+    dequantisedOutput.reserve(modelOutput.size());
+    for (const int8_t value : modelOutput)
+    {
+        const float normalisedModelOutput = this->quantisationScale * (static_cast<float>(value) -
+                                                                       static_cast<float>(this->quantisationOffset));
+        dequantisedOutput.push_back(normalisedModelOutput);
+    }
+
+    //Get largest value in modelOutput
+    const auto maxElementIterator = std::max_element(dequantisedOutput.begin(), dequantisedOutput.end());
+
+    //Find the labelMapIndex of the largest value which corresponds to a key in a label map
+    const int labelMapIndex = static_cast<int>(std::distance(dequantisedOutput.begin(), maxElementIterator));
+
+    //Round to two DP
+    const float maxModelOutputProbability = std::roundf((*maxElementIterator) * 100) / 100;
+
+    return std::make_pair(labelMapIndex, maxModelOutputProbability);
+}
+
+
+
+

diff --git a/samples/KeywordSpotting/src/DsCNNPreprocessor.cpp b/samples/KeywordSpotting/src/DsCNNPreprocessor.cpp
new file mode 100644
index 0000000..8215fee
--- /dev/null
+++ b/samples/KeywordSpotting/src/DsCNNPreprocessor.cpp

@@ -0,0 +1,40 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include <cmath>
+#include <numeric>
+#include <algorithm>
+#include <memory>
+#include "MathUtils.hpp"
+#include "SlidingWindow.hpp"
+#include "DsCNNPreprocessor.hpp"
+
+/**
+ * Computes the quantised MFCC features of the given audio data.
+ *
+ * @param[in] audioData   - pointer to the first audio sample
+ * @param[in] dataSize    - number of samples in audioData
+ * @param[in] quantOffset - quantisation offset forwarded to the MFCC quantisation
+ * @param[in] quantScale  - quantisation scale forwarded to the MFCC quantisation
+ * @return quantised MFCCs of all windows, concatenated in window order
+ */
+std::vector<int8_t> kws::DsCNNPreprocessor::Invoke(const float* audioData, size_t dataSize,
+                                                   int quantOffset, float quantScale)
+{
+    auto window = SlidingWindow<const float>(
+            audioData, dataSize,
+            this->m_windowLen, this->m_windowStride);
+
+    std::vector<int8_t> outputBuffer;
+    // While we can slide over the window
+    while (window.HasNext())
+    {
+        // Copy the samples of the current window; the MFCC API takes a vector.
+        const float* mfccWindow = window.Next();
+        auto mfccAudioData = std::vector<float>(mfccWindow, mfccWindow + this->m_windowLen);
+
+        auto mfcc = this->m_mfcc->MfccComputeQuant<int8_t>(mfccAudioData, quantScale, quantOffset);
+
+        // Append the features of this window to the ones collected so far.
+        outputBuffer.insert(outputBuffer.end(), mfcc.begin(), mfcc.end());
+    }
+
+    return outputBuffer;
+}
+
+// Stores the window geometry and takes ownership of the MFCC calculator.
+kws::DsCNNPreprocessor::DsCNNPreprocessor(const uint32_t windowLen,
+                                          const uint32_t windowStride,
+                                          std::unique_ptr<DsCnnMFCC> mfccInst)
+    : m_windowLen{windowLen}
+    , m_windowStride{windowStride}
+    , m_mfcc{std::move(mfccInst)}
+{}

diff --git a/samples/KeywordSpotting/src/KeywordSpottingPipeline.cpp b/samples/KeywordSpotting/src/KeywordSpottingPipeline.cpp
new file mode 100644
index 0000000..e32d947
--- /dev/null
+++ b/samples/KeywordSpotting/src/KeywordSpottingPipeline.cpp

@@ -0,0 +1,94 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "KeywordSpottingPipeline.hpp"
+#include "ArmnnNetworkExecutor.hpp"
+#include "DsCNNPreprocessor.hpp"
+
+namespace kws
+{
+// Takes ownership of the three pipeline stages.
+KWSPipeline::KWSPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
+                         std::unique_ptr<Decoder> decoder,
+                         std::unique_ptr<DsCNNPreprocessor> preProcessor
+                         ) :
+        m_executor(std::move(executor)),
+        m_decoder(std::move(decoder)),
+        m_preProcessor(std::move(preProcessor)) {}
+
+
+// Extracts the quantised features from the raw audio, using the quantisation
+// parameters reported by the executor.
+std::vector<int8_t> KWSPipeline::PreProcessing(std::vector<float>& audio)
+{
+    return m_preProcessor->Invoke(audio.data(), audio.size(), m_executor->GetQuantizationOffset(),
+                                  m_executor->GetQuantizationScale());
+}
+
+// Runs the network on the pre-processed data; the raw output is stored in result.
+void KWSPipeline::Inference(const std::vector<int8_t>& preprocessedData,
+                            common::InferenceResults<int8_t>& result)
+{
+    m_executor->Run(preprocessedData.data(), preprocessedData.size(), result);
+}
+
+// Decodes the first inference result and reports (index, label, score) through the callback.
+void KWSPipeline::PostProcessing(common::InferenceResults<int8_t>& inferenceResults,
+                    std::map<int, std::string>& labels,
+                    const std::function<void (int, std::string&, float)>& callback)
+{
+    // NOTE(review): assumes inferenceResults holds at least one element - confirm against the executor.
+    std::pair<int,float> outputDecoder = this->m_decoder->decodeOutput(inferenceResults[0]);
+    const int keywordIndex = outputDecoder.first;
+
+    // Look the label up with find(): operator[] would silently insert an empty
+    // entry into the caller's map for an index that has no label. An unknown
+    // index is still reported, with an empty label.
+    const auto labelIt = labels.find(keywordIndex);
+    std::string output = (labelIt != labels.end()) ? labelIt->second : std::string();
+
+    callback(keywordIndex, output, outputDecoder.second);
+}
+
+// One full window plus one stride for every further MFCC vector.
+int KWSPipeline::getInputSamplesSize()
+{
+    return this->m_preProcessor->m_windowLen +
+            ((this->m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) *
+              this->m_preProcessor->m_windowStride);
+}
+
+// Builds the pipeline for the model named in config.m_ModelName.
+// Throws std::invalid_argument if the model name is not supported.
+IPipelinePtr CreatePipeline(common::PipelineOptions& config)
+{
+    if (config.m_ModelName == "DS_CNN_CLUSTERED_INT8")
+    {
+        //DS-CNN model settings
+        float SAMP_FREQ = 16000;
+        int MFCC_WINDOW_LEN = 640;
+        int MFCC_WINDOW_STRIDE = 320;
+        int NUM_MFCC_FEATS = 10;
+        int NUM_MFCC_VECTORS = 49;
+        float MEL_LO_FREQ = 20;
+        float MEL_HI_FREQ = 4000;
+        int NUM_FBANK_BIN = 40;
+
+        MfccParams mfccParams(SAMP_FREQ,
+                              NUM_FBANK_BIN,
+                              MEL_LO_FREQ,
+                              MEL_HI_FREQ,
+                              NUM_MFCC_FEATS,
+                              MFCC_WINDOW_LEN, false,
+                              NUM_MFCC_VECTORS);
+
+        std::unique_ptr<DsCnnMFCC> mfccInst = std::make_unique<DsCnnMFCC>(mfccParams);
+        auto preprocessor = std::make_unique<kws::DsCNNPreprocessor>(
+            MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, std::move(mfccInst));
+
+        auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(
+            config.m_ModelFilePath, config.m_backends);
+
+        // The decoder needs the quantisation parameters of the first output.
+        auto decoder = std::make_unique<kws::Decoder>(executor->GetOutputQuantizationOffset(0),
+                                                      executor->GetOutputQuantizationScale(0));
+
+        return std::make_unique<kws::KWSPipeline>(std::move(executor),
+                                                  std::move(decoder), std::move(preprocessor));
+    }
+    else
+    {
+        throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " .");
+    }
+}
+
+} // namespace kws
\ No newline at end of file

diff --git a/samples/KeywordSpotting/src/Main.cpp b/samples/KeywordSpotting/src/Main.cpp
new file mode 100644
index 0000000..10efcd8
--- /dev/null
+++ b/samples/KeywordSpotting/src/Main.cpp

@@ -0,0 +1,128 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include <iostream>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include "KeywordSpottingPipeline.hpp"
+#include "CmdArgsParser.hpp"
+#include "ArmnnNetworkExecutor.hpp"
+#include "AudioCapture.hpp"
+
+const std::string AUDIO_FILE_PATH = "--audio-file-path";
+const std::string MODEL_FILE_PATH = "--model-file-path";
+const std::string LABEL_PATH = "--label-path";
+const std::string PREFERRED_BACKENDS = "--preferred-backends";
+const std::string HELP = "--help";
+
+/*
+ * The accepted options for this Keyword Spotting executable, keyed by option name with its help text
+ */
+static std::map<std::string, std::string> CMD_OPTIONS =
+{
+        {AUDIO_FILE_PATH,    "[REQUIRED] Path to the Audio file to run keyword spotting on"},
+        {MODEL_FILE_PATH,    "[REQUIRED] Path to the Keyword Spotting model to use"},
+        {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+                             " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+                             " Defaults to CpuAcc,CpuRef"}
+};
+
+/*
+ * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector.
+ * Spaces/tabs around each entry are trimmed and empty entries (e.g. from an empty string,
+ * "CpuAcc," or "CpuAcc,,CpuRef") are skipped, so no empty backend id is ever returned.
+ */
+std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
+{
+    std::vector<armnn::BackendId> backends;
+    std::stringstream ss(preferredBackends);
+    std::string backend;
+
+    // Test the result of getline itself: checking ss.good() before reading appends a
+    // spurious empty entry when the input is empty or ends with a comma.
+    while (std::getline(ss, backend, ','))
+    {
+        const std::size_t first = backend.find_first_not_of(" \t");
+        if (first == std::string::npos)
+        {
+            continue; // empty or whitespace-only entry
+        }
+        const std::size_t last = backend.find_last_not_of(" \t");
+        backends.emplace_back(backend.substr(first, last - first + 1));
+    }
+    return backends;
+}
+
+//Labels for this model: class index -> keyword text, passed to PostProcessing in main
+std::map<int, std::string> labels = 
+{
+        {0,  "silence"},
+        {1,  "unknown"},
+        {2,  "yes"},
+        {3,  "no"},
+        {4,  "up"},
+        {5,  "down"},
+        {6,  "left"},
+        {7,  "right"},
+        {8,  "on"},
+        {9,  "off"},
+        {10, "stop"},
+        {11, "go"}
+};
+
+// Entry point: parses the command line, creates the KWS pipeline and runs it over the audio file.
+int main(int argc, char* argv[]) 
+{
+    printf("ArmNN major version: %d\n", ARMNN_MAJOR_VERSION);
+    std::map<std::string, std::string> options;
+
+    //Read command line args; a non-zero result from ParseOptions becomes the exit code
+    int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
+    if (result != 0) 
+    {
+        return result;
+    }
+
+    // Configure the pipeline: fixed model name, user supplied model path, optional backend list
+    common::PipelineOptions pipelineOptions;
+    pipelineOptions.m_ModelName = "DS_CNN_CLUSTERED_INT8";
+    pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
+    if (CheckOptionSpecified(options, PREFERRED_BACKENDS)) 
+    {
+        pipelineOptions.m_backends = GetPreferredBackendList(
+            (GetSpecifiedOption(options, PREFERRED_BACKENDS)));
+    } 
+    else 
+    {
+        pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
+    }
+
+    kws::IPipelinePtr kwsPipeline = kws::CreatePipeline(pipelineOptions);
+
+    //Load the audio samples from the file given by --audio-file-path
+    auto filePath = GetSpecifiedOption(options, AUDIO_FILE_PATH);
+    std::vector<float> audioData = audio::AudioCapture::LoadAudioFile(filePath);
+
+    audio::AudioCapture capture;
+    //todo: read samples and stride from pipeline (last argument is half the input sample count)
+    capture.InitSlidingWindow(audioData.data(), 
+                              audioData.size(), 
+                              kwsPipeline->getInputSamplesSize(), 
+                              kwsPipeline->getInputSamplesSize()/2);
+
+    //Loop through audio data buffer, one block from the capture per iteration
+    while (capture.HasNext()) 
+    {
+        std::vector<float> audioBlock = capture.Next();
+        common::InferenceResults<int8_t> results;
+
+        //Prepare input tensors from the raw audio block
+        std::vector<int8_t> preprocessedData = kwsPipeline->PreProcessing(audioBlock);
+        //Run inference; the outputs are written into results
+        kwsPipeline->Inference(preprocessedData, results);
+        //Decode output and print the keyword, its index and probability via the callback
+        kwsPipeline->PostProcessing(results, labels,
+                                    [](int index, std::string& label, float prob) -> void {
+                                        printf("Keyword \"%s\", index %d:, probability %f\n",
+                                               label.c_str(),
+                                               index,
+                                               prob);
+                                    });
+    }
+
+    return 0;
+}
\ No newline at end of file

diff --git a/samples/KeywordSpotting/test/DecoderTest.cpp b/samples/KeywordSpotting/test/DecoderTest.cpp
new file mode 100644
index 0000000..e44eb29
--- /dev/null
+++ b/samples/KeywordSpotting/test/DecoderTest.cpp

@@ -0,0 +1,28 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <catch.hpp>
+#include <map>
+#include "Decoder.hpp"
+
+
+// Checks that kws::Decoder::decodeOutput returns the expected (index, probability) pair
+// for a fixed int8 model output and quantisation parameters.
+TEST_CASE("Test KWS decoder")
+{
+//    Actual output probability: [0.0, 0.06, 0.02, 0.03, 0.0, 0.0, 0.05, 0.0, 0.83, 0.0, 0.1, 0.0]
+//    int8 quantised Model output [1, 4, 2, 3, 1, 1, 3, 1, 43, 1, 6, 1]
+//    Reconstructed  dequantised probability [0.0, 0.06, 0.02, 0.04, 0.0, 0.0, 0.04, 0.0, 0.84, 0.0, 0.1, 0.0]
+
+    int quantisationOffset = 1;
+    float quantisationScale = 0.02;
+
+    std::vector<int8_t> modelOutput = {1, 4, 2, 3, 1, 1, 3, 1, 43, 1, 6, 1};
+
+    kws::Decoder decoder(quantisationOffset,quantisationScale);
+
+    std::pair<int,float> result =  decoder.decodeOutput(modelOutput);
+
+    // The index is an integer and can be compared exactly.
+    CHECK(result.first == 8);
+    // The probability is a dequantised float: compare with a tolerance, because exact
+    // equality depends on how the multiplication is rounded on the target platform.
+    CHECK(result.second == Approx(0.84f));
+}

diff --git a/samples/KeywordSpotting/test/KeywordSpottingPipelineTest.cpp b/samples/KeywordSpotting/test/KeywordSpottingPipelineTest.cpp
new file mode 100644
index 0000000..9fb87fd
--- /dev/null
+++ b/samples/KeywordSpotting/test/KeywordSpottingPipelineTest.cpp

@@ -0,0 +1,230 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <catch.hpp>
+#include <map>
+#include <cinttypes>
+#include "KeywordSpottingPipeline.hpp"
+#include "DsCNNPreprocessor.hpp"
+
+// Returns the path of a test resource: TEST_RESOURCE_DIR joined with the given filename.
+// Throws std::invalid_argument when TEST_RESOURCE_DIR is an empty string.
+static std::string GetResourceFilePath(const std::string& filename)
+{
+    const std::string resourceDir = TEST_RESOURCE_DIR;
+    if (resourceDir.empty())
+    {
+        throw std::invalid_argument("Invalid test resources directory provided");
+    }
+
+    // Insert a separator only when the directory does not already end with one.
+    const bool endsWithSeparator = (resourceDir.back() == '/');
+    return endsWithSeparator ? resourceDir + filename
+                             : resourceDir + "/" + filename;
+}
+
+TEST_CASE("Test Keyword spotting pipeline")
+{
+    const int8_t ifm0_kws [] = 
+    {
+    -0x1b, 0x4f, 0x7a, -0x55, 0x6, -0x11, 0x6e, -0x6, 0x67, -0x7e, -0xd, 0x6, 0x49, 0x79, -0x1e, 0xe, 
+     0x1d, 0x6e, 0x6f, 0x6f, -0x2e, -0x4b, 0x2, -0x3e, 0x40, -0x4b, -0x7, 0x31, -0x38, -0x64, -0x28, 
+     0xc, -0x1d, 0xf, 0x1c, 0x5a, -0x4b, 0x56, 0x7e, 0x9, -0x29, 0x13, -0x65, -0xa, 0x34, -0x59, 0x41, 
+    -0x6f, 0x75, 0x67, -0x5f, 0x17, 0x4a, -0x76, -0x7a, 0x49, -0x19, -0x41, 0x78, 0x40, 0x44, 0xe, 
+    -0x51, -0x5c, 0x3d, 0x24, 0x76, -0x66, -0x11, 0x5e, 0x7b, -0x4, 0x7a, 0x9, 0x13, 0x8, -0x21, -0x11, 
+     0x13, 0x7a, 0x25, 0x6, -0x68, 0x6a, -0x30, -0x16, -0x43, -0x27, 0x4c, 0x6b, -0x14, -0x12, -0x5f, 
+     0x49, -0x2a, 0x44, 0x57, -0x78, -0x72, 0x62, -0x8, -0x38, -0x73, -0x2, -0x80, 0x79, -0x3f, 0x57, 
+     0x9, -0x7e, -0x34, -0x59, 0x19, -0x66, 0x58, -0x3b, -0x69, -0x1a, 0x13, -0x2f, -0x2f, 0x13, 0x35, 
+    -0x30, 0x1e, 0x3b, -0x71, 0x67, 0x7d, -0x5d, 0x1a, 0x69, -0x53, -0x38, -0xf, 0x76, 0x2, 0x7e, 0x45, 
+    -0xa, 0x59, -0x6b, -0x28, -0x5d, -0x63, -0x7d, -0x3, 0x48, 0x74, -0x75, -0x7a, 0x1f, -0x53, 0x5b, 
+     0x4d, -0x18, -0x4a, 0x39, -0x52, 0x5a, -0x6b, -0x41, -0x3e, -0x61, -0x80, -0x52, 0x67, 0x71, -0x47, 
+     0x79, -0x41, 0x3a, -0x8, -0x1f, 0x4d, -0x7, 0x5b, 0x6b, -0x1b, -0x8, -0x20, -0x21, 0x7c, -0x74, 
+     0x25, -0x68, -0xe, -0x7e, -0x45, -0x28, 0x45, -0x1a, -0x39, 0x78, 0x11, 0x48, -0x6b, -0x7b, -0x43, 
+    -0x21, 0x38, 0x46, 0x7c, -0x5d, 0x59, 0x53, -0x3f, -0x15, 0x59, -0x17, 0x75, 0x2f, 0x7c, 0x68, 0x6a, 
+     0x0, -0x10, 0x5b, 0x61, 0x36, -0x41, 0x33, 0x23, -0x80, -0x1d, -0xb, -0x56, 0x2d, 0x68, -0x68, 
+     0x2f, 0x48, -0x5d, -0x44, 0x64, -0x27, 0x68, -0x13, 0x39, -0x3f, 0x18, 0x31, 0x15, -0x78, -0x2, 
+     0x72, 0x60, 0x59, -0x30, -0x22, 0x73, 0x61, 0x76, -0x4, -0x62, -0x64, -0x80, -0x32, -0x16, 0x51,
+    -0x2, -0x70, 0x71, 0x3f, -0x5f, -0x35, -0x3c, 0x79, 0x48, 0x61, 0x5b, -0x20, -0x1e, -0x68, -0x1c, 
+     0x6c, 0x3a, 0x28, -0x36, -0x3e, 0x5f, -0x75, -0x73, 0x1e, 0x75, -0x66, -0x22, 0x20, -0x64, 0x67, 
+     0x36, 0x14, 0x37, -0xa, -0xe, 0x8, -0x37, -0x43, 0x21, -0x8, 0x54, 0x1, 0x34, -0x2c, -0x73, -0x11, 
+    -0x48, -0x1c, -0x40, 0x14, 0x4e, -0x53, 0x25, 0x5e, 0x14, 0x4f, 0x7c, 0x6d, -0x61, -0x38, 0x35, 
+    -0x5a, -0x44, 0x12, 0x52, -0x60, 0x22, -0x1c, -0x8, -0x4, -0x6b, -0x71, 0x43, 0xb, 0x7b, -0x7, 
+    -0x3c, -0x3b, -0x40, -0xd, 0x44, 0x6, 0x30, 0x38, 0x57, 0x1f, -0x7, 0x2, 0x4f, 0x64, 0x7c, -0x3,
+    -0x13, -0x71, -0x45, -0x53, -0x52, 0x2b, -0x11, -0x1d, -0x2, -0x29, -0x37, 0x3d, 0x19, 0x76, 0x18,
+     0x1d, 0x12, -0x29, -0x5e, -0x54, -0x48, 0x5d, -0x41, -0x3f, 0x7e, -0x2a, 0x41, 0x57, -0x65, -0x15, 
+     0x12, 0x1f, -0x57, 0x79, -0x64, 0x3a, -0x2f, 0x7f, -0x6c, 0xa, 0x52, -0x1f, -0x41, 0x6e, -0x4b, 
+     0x3d, -0x1b, -0x42, 0x22, -0x3c, -0x35, -0xf, 0xc, 0x32, -0x15, -0x68, -0x21, 0x0, -0x16, 0x14,
+    -0x10, -0x5b, 0x2f, 0x21, 0x41, -0x8, -0x12, -0xa, 0x10, 0xf, 0x7e, -0x76, -0x1d, 0x2b, -0x49, 
+     0x42, -0x25, -0x78, -0x69, -0x2c, 0x3f, 0xc, 0x52, 0x6d, 0x2e, -0x13, 0x76, 0x37, -0x36, -0x51,
+    -0x5, -0x63, -0x4f, 0x1c, 0x6b, -0x4b, 0x71, -0x12, 0x72, -0x3f,-0x4a, 0xf, 0x3a, -0xd, 0x38, 0x3b,
+    -0x5d, 0x75, -0x43, -0x10, -0xa, -0x7a, 0x1a, -0x44, 0x1c, 0x6a, 0x43, -0x1b, -0x35, 0x7d, -0x2c,
+    -0x10, 0x5b, -0x42, -0x4f, 0x69, 0x1f, 0x1b, -0x64, -0x21, 0x19, -0x5d, 0x2e, -0x2a, -0x65, -0x13,
+    -0x70, -0x6e
+    };
+
+    const int8_t ofm0_kws [] = 
+    {
+    -0x80, 0x7f, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80
+    };
+
+    // First 640 samples from yes.wav.
+    std::vector<int16_t> testWav = std::vector<int16_t>
+    {
+    139, 143, 164, 163, 157, 156, 151, 148, 172, 171,
+    165, 169, 149, 142, 145, 147, 166, 146, 112, 132,
+    132, 136, 165, 176, 176, 152, 138, 158, 179, 185,
+    183, 148, 121, 130, 167, 204, 163, 132, 165, 184,
+    193, 205, 210, 204, 195, 178, 168, 197, 207, 201,
+    197, 177, 185, 196, 191, 198, 196, 183, 193, 181,
+    157, 170, 167, 159, 164, 152, 146, 167, 180, 171,
+    194, 232, 204, 173, 171, 172, 184, 169, 175, 199,
+    200, 195, 185, 214, 214, 193, 196, 191, 204, 191,
+    172, 187, 183, 192, 203, 172, 182, 228, 232, 205,
+    177, 174, 191, 210, 210, 211, 197, 177, 198, 217,
+    233, 236, 203, 191, 169, 145, 149, 161, 198, 206,
+    176, 137, 142, 181, 200, 215, 201, 188, 166, 162,
+    184, 155, 135, 132, 126, 142, 169, 184, 172, 156,
+    132, 119, 150, 147, 154, 160, 125, 130, 137, 154,
+    161, 168, 195, 182, 160, 134, 138, 146, 130, 120,
+    101, 122, 137, 118, 117, 131, 145, 140, 146, 148,
+    148, 168, 159, 134, 114, 114, 130, 147, 147, 134,
+    125, 98, 107, 127, 99, 79, 84, 107, 117, 114,
+    93, 92, 127, 112, 109, 110, 96, 118, 97, 87,
+    110, 95, 128, 153, 147, 165, 146, 106, 101, 137,
+    139, 96, 73, 90, 91, 51, 69, 102, 100, 103,
+    96, 101, 123, 107, 82, 89, 118, 127, 99, 100,
+    111, 97, 111, 123, 106, 121, 133, 103, 100, 88,
+    85, 111, 114, 125, 102, 91, 97, 84, 139, 157,
+    109, 66, 72, 129, 111, 90, 127, 126, 101, 109,
+    142, 138, 129, 159, 140, 80, 74, 78, 76, 98,
+    68, 42, 106, 143, 112, 102, 115, 114, 82, 75,
+    92, 80, 110, 114, 66, 86, 119, 101, 101, 103,
+    118, 145, 85, 40, 62, 88, 95, 87, 73, 64,
+    86, 71, 71, 105, 80, 73, 96, 92, 85, 90,
+    81, 86, 105, 100, 89, 78, 102, 114, 95, 98,
+    69, 70, 108, 112, 111, 90, 104, 137, 143, 160,
+    145, 121, 98, 86, 91, 87, 115, 123, 109, 99,
+    85, 120, 131, 116, 125, 144, 153, 111, 98, 110,
+    93, 89, 101, 137, 155, 142, 108, 94, 136, 145,
+    129, 129, 122, 109, 90, 76, 81, 110, 119, 96,
+    95, 102, 105, 111, 90, 89, 111, 115, 86, 51,
+    107, 140, 105, 105, 110, 142, 125, 76, 75, 69,
+    65, 52, 61, 69, 55, 42, 47, 58, 37, 35,
+    24, 20, 44, 22, 16, 26, 6, 3, 4, 23,
+    60, 51, 30, 12, 24, 31, -9, -16, -13, 13,
+    19, 9, 37, 55, 70, 36, 23, 57, 45, 33,
+    50, 59, 18, 11, 62, 74, 52, 8, -3, 26,
+    51, 48, -5, -9, 12, -7, -12, -5, 28, 41,
+    -2, -30, -13, 31, 33, -12, -22, -8, -15, -17,
+    2, -6, -25, -27, -24, -8, 4, -9, -52, -47,
+    -9, -32, -45, -5, 41, 15, -32, -14, 2, -1,
+    -10, -30, -32, -25, -21, -17, -14, 8, -4, -13,
+    34, 18, -36, -38, -18, -19, -28, -17, -14, -16,
+    -2, -20, -27, 12, 11, -17, -33, -12, -22, -64,
+    -42, -26, -23, -22, -37, -51, -53, -30, -18, -48,
+    -69, -38, -54, -96, -72, -49, -50, -57, -41, -22,
+    -43, -64, -54, -23, -49, -69, -41, -44, -42, -49,
+    -40, -26, -54, -50, -38, -49, -70, -94, -89, -69,
+    -56, -65, -71, -47, -39, -49, -79, -91, -56, -46,
+    -62, -86, -64, -32, -47, -50, -71, -77, -65, -68,
+    -52, -51, -61, -67, -61, -81, -93, -52, -59, -62,
+    -51, -75, -76, -50, -32, -54, -68, -70, -43, 1,
+    -42, -92, -80, -41, -38, -79, -69, -49, -82, -122,
+    -93, -21, -24, -61, -70, -73, -62, -74, -69, -43,
+    -25, -15, -43, -23, -26, -69, -44, -12, 1, -51,
+    -78, -13, 3, -53, -105, -72, -24, -62, -66, -31,
+    -40, -65, -86, -64, -44, -55, -63, -61, -37, -41,
+    };
+
+    // Golden audio ops mfcc output for the above wav.
+    const std::vector<float> testWavMfcc 
+    {
+    -22.67135, -0.61615, 2.07233, 0.58137, 1.01655, 0.85816, 0.46039, 0.03393, 1.16511, 0.0072,
+    };
+
+    std::vector<float> testWavFloat(640);
+    constexpr float normaliser = 1.0/(1u<<15u);
+    std::transform(testWav.begin(), testWav.end(), testWavFloat.begin(),
+                   std::bind1st(std::multiplies<float>(), normaliser));
+
+    const float DsCNNInputQuantizationScale = 1.107164;
+    const int DsCNNInputQuantizationOffset = 95;
+
+    std::map<int,std::string> labels = 
+    {
+        {0,"silence"},
+        {1, "unknown"},
+        { 2, "yes"},
+        { 3,"no"},
+        { 4, "up"},
+        { 5, "down"},
+        { 6, "left"},
+        { 7, "right"},
+        { 8, "on"},
+        { 9, "off"},
+        { 10, "stop"},
+        {11, "go"}
+    };
+    common::PipelineOptions options;
+    options.m_ModelFilePath = GetResourceFilePath("ds_cnn_clustered_int8.tflite");
+    options.m_ModelName = "DS_CNN_CLUSTERED_INT8";
+    options.m_backends = {"CpuAcc", "CpuRef"};
+    kws::IPipelinePtr kwsPipeline = kws::CreatePipeline(options);
+
+    CHECK(kwsPipeline->getInputSamplesSize() == 16000);
+    std::vector<int8_t> expectedWavMfcc;
+    for(auto& i : testWavMfcc)
+    {
+        expectedWavMfcc.push_back( 
+            (i + DsCNNInputQuantizationScale * DsCNNInputQuantizationOffset) / DsCNNInputQuantizationScale);
+    }
+
+    SECTION("Pre-processing")
+    {
+        testWavFloat.resize(16000);
+        expectedWavMfcc.resize(49 * 10);
+        std::vector<int8_t> preprocessedData = kwsPipeline->PreProcessing(testWavFloat);
+        CHECK(preprocessedData.size() == expectedWavMfcc.size());
+        for(int i = 0; i < 10; ++i)
+        {
+            CHECK(expectedWavMfcc[i] == Approx(preprocessedData[i]).margin(1));
+        }
+    }
+
+    SECTION("Execute inference")
+    {
+        common::InferenceResults<int8_t> result;
+        std::vector<int8_t> IFM(std::begin(ifm0_kws), std::end(ifm0_kws));
+        kwsPipeline->Inference(IFM, result);
+        std::vector<int8_t> OFM(std::begin(ofm0_kws), std::end(ofm0_kws));
+
+        CHECK(1 == result.size());
+        CHECK(OFM.size() == result[0].size());
+
+        int count = 0;
+        for (auto& i : result)
+        {
+            for (signed char& j : i)
+            {
+                CHECK(j == OFM[count++]);
+
+            }
+        }
+    }
+
+    // Feeds a fixed model output through PostProcessing and checks the decoded keyword.
+    SECTION("Convert inference result to keyword")
+    {
+        std::vector< std::vector< int8_t >> modelOutput = {{1, 4, 2, 3, 1, 1, 3, 1, 43, 1, 6, 1}};
+        // Record that the callback ran: with the checks only inside the callback, the
+        // section would pass vacuously if PostProcessing never invoked it.
+        // NOTE(review): assumes PostProcessing accepts a capturing callable
+        // (e.g. std::function) - confirm against KeywordSpottingPipeline.hpp.
+        bool callbackInvoked = false;
+        kwsPipeline->PostProcessing(modelOutput, labels,
+                                    [&callbackInvoked](int index, std::string& label, float prob) -> void {
+                                        callbackInvoked = true;
+                                        CHECK(index == 8);
+                                        CHECK(label == "on");
+                                    });
+        CHECK(callbackInvoked);
+    }
+}

diff --git a/samples/ObjectDetection/CMakeLists.txt b/samples/ObjectDetection/CMakeLists.txt
index 7e587f7..dbcd55f 100644
--- a/samples/ObjectDetection/CMakeLists.txt
+++ b/samples/ObjectDetection/CMakeLists.txt

@@ -47,7 +47,8 @@
 include_directories(../common/include/CVUtils)
 
 file(GLOB SOURCES "src/*.cpp")
-file(GLOB COMMON_SOURCES "../common/src/**/*.cpp")
+file(GLOB CVUTILS_SOURCES "../common/src/CVUtils**/*.cpp")
+file(GLOB UTILS_SOURCES "../common/src/Utils**/*.cpp")
 list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/Main.cpp)
 file(GLOB TEST_SOURCES "test/*.cpp")
 file(GLOB APP_MAIN "src/Main.cpp")
@@ -59,7 +60,7 @@
 
 set(APP_TARGET_NAME "${CMAKE_PROJECT_NAME}")
 
-add_executable("${APP_TARGET_NAME}" ${SOURCES} ${COMMON_SOURCES} ${APP_MAIN})
+add_executable("${APP_TARGET_NAME}" ${SOURCES} ${CVUTILS_SOURCES} ${UTILS_SOURCES} ${APP_MAIN})
 
 if (NOT OPENCV_LIBS_FOUND)
     message("Building OpenCV libs")

diff --git a/samples/ObjectDetection/Readme.md b/samples/ObjectDetection/Readme.md
index bc53373..ba5883a 100644
--- a/samples/ObjectDetection/Readme.md
+++ b/samples/ObjectDetection/Readme.md

@@ -247,8 +247,9 @@
  --model-name [YOLO_V3_TINY | SSD_MOBILE]
 ```
 
-This application has been verified to work against the MobileNet SSD model, which can be downloaded along with it's label set from:
-* https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip
+This application has been verified to work against the MobileNet SSD and the YOLO V3 tiny models, which can be downloaded along with their label sets from the Arm Model Zoo:
+* https://github.com/ARM-software/ML-zoo/tree/master/models/object_detection/ssd_mobilenet_v1
+* https://github.com/ARM-software/ML-zoo/tree/master/models/object_detection/yolo_v3_tiny
 
 ---
 

diff --git a/samples/ObjectDetection/cmake/unit_tests.cmake b/samples/ObjectDetection/cmake/unit_tests.cmake
index 1a8c466..dd3de70 100644
--- a/samples/ObjectDetection/cmake/unit_tests.cmake
+++ b/samples/ObjectDetection/cmake/unit_tests.cmake

@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: MIT
 
 set(TEST_RESOURCES_DIR ${CMAKE_SOURCE_DIR}/test/resources)
+file(MAKE_DIRECTORY ${TEST_RESOURCES_DIR})
 add_definitions (-DTEST_RESOURCE_DIR="${TEST_RESOURCES_DIR}")
 set(TEST_TARGET_NAME "${CMAKE_PROJECT_NAME}-tests")
 
@@ -9,16 +10,6 @@
 
 include(../common/cmake/find_catch.cmake)
 
-file(DOWNLOAD "https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip"
-        ${CMAKE_CURRENT_SOURCE_DIR}/test/resources/models.zip SHOW_PROGRESS)
-
-# Extract
-execute_process(
-        COMMAND ${CMAKE_COMMAND} -E tar xf models.zip
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test/resources/
-        RESULT_VARIABLE return_code
-)
-
 ExternalProject_Add(basketball-image
         URL https://raw.githubusercontent.com/opencv/opencv/4.0.0/samples/data/basketball1.png
         DOWNLOAD_NO_EXTRACT 1
@@ -43,7 +34,15 @@
         INSTALL_COMMAND ""
         )
 
-add_executable("${TEST_TARGET_NAME}" ${SOURCES} ${TEST_SOURCES} ${COMMON_SOURCES})
+ExternalProject_Add(ssd_mobile
+        URL https://github.com/ARM-software/ML-zoo/raw/master/models/object_detection/ssd_mobilenet_v1/tflite_uint8/ssd_mobilenet_v1.tflite
+        DOWNLOAD_NO_EXTRACT 1
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ${CMAKE_COMMAND} -E copy <DOWNLOAD_DIR>/ssd_mobilenet_v1.tflite ${CMAKE_CURRENT_SOURCE_DIR}/test/resources
+        INSTALL_COMMAND ""
+        )
+
+add_executable("${TEST_TARGET_NAME}" ${SOURCES} ${TEST_SOURCES} ${CVUTILS_SOURCES} ${UTILS_SOURCES})
 
 add_dependencies(
     "${TEST_TARGET_NAME}"

diff --git a/samples/ObjectDetection/test/PipelineTest.cpp b/samples/ObjectDetection/test/PipelineTest.cpp
index bc5824e..7af0900 100644
--- a/samples/ObjectDetection/test/PipelineTest.cpp
+++ b/samples/ObjectDetection/test/PipelineTest.cpp

@@ -33,9 +33,9 @@
     REQUIRE(testResources != "");
     // Create the network options
     common::PipelineOptions options;
-    options.m_ModelFilePath = GetResourceFilePath("detect.tflite");
+    options.m_ModelFilePath = GetResourceFilePath("ssd_mobilenet_v1.tflite");
     options.m_ModelName = "SSD_MOBILE";
-    options.m_backends = {"CpuAcc", "CpuRef"};
+    options.m_backends = {"CpuRef"};
 
     od::IPipelinePtr objectDetectionPipeline = od::CreatePipeline(options);
 

diff --git a/samples/SpeechRecognition/CMakeLists.txt b/samples/SpeechRecognition/CMakeLists.txt
index 6c6b0b6..296a251 100644
--- a/samples/SpeechRecognition/CMakeLists.txt
+++ b/samples/SpeechRecognition/CMakeLists.txt

@@ -1,4 +1,4 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 
 cmake_minimum_required(VERSION 3.0.2)
@@ -43,9 +43,11 @@
 include_directories(include)
 include_directories(../common/include/ArmnnUtils)
 include_directories(../common/include/Utils)
+include_directories(../common/include/Audio)
 
 file(GLOB SOURCES "src/*.cpp")
 file(GLOB COMMON_UTILS_SOURCES "../common/src/Utils/*.cpp")
+file(GLOB COMMON_AUDIO_SOURCES "../common/src/Audio/*.cpp")
 list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/Main.cpp)
 file(GLOB TEST_SOURCES "test/*.cpp")
 file(GLOB APP_MAIN "src/Main.cpp")
@@ -56,7 +58,7 @@
 
 set(APP_TARGET_NAME "${CMAKE_PROJECT_NAME}")
 
-add_executable("${APP_TARGET_NAME}"  ${COMMON_UTILS_SOURCES} ${SOURCES} ${APP_MAIN})
+add_executable("${APP_TARGET_NAME}"  ${COMMON_UTILS_SOURCES} ${COMMON_AUDIO_SOURCES} ${SOURCES} ${APP_MAIN})
 
 target_link_libraries("${APP_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} -lsndfile -lsamplerate)
 target_include_directories("${APP_TARGET_NAME}" PUBLIC ${ARMNN_INCLUDE_DIR} )

diff --git a/samples/SpeechRecognition/cmake/unit_tests.cmake b/samples/SpeechRecognition/cmake/unit_tests.cmake
index 47c4f4b..955eed4 100644
--- a/samples/SpeechRecognition/cmake/unit_tests.cmake
+++ b/samples/SpeechRecognition/cmake/unit_tests.cmake

@@ -1,4 +1,4 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 
 set(TEST_RESOURCES_DIR ${CMAKE_SOURCE_DIR}/test/resources)
@@ -7,11 +7,12 @@
 set(TEST_TARGET_NAME "${CMAKE_PROJECT_NAME}-tests")
 
 file(GLOB TEST_SOURCES "test/*")
+file(GLOB TESTS_AUDIO_COMMON "../common/test/Audio/*")
 
 file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
 include(../common/cmake/find_catch.cmake)
 
-add_executable("${TEST_TARGET_NAME}" ${COMMON_UTILS_SOURCES} ${SOURCES} ${TEST_SOURCES} )
+add_executable("${TEST_TARGET_NAME}" ${COMMON_UTILS_SOURCES} ${COMMON_AUDIO_SOURCES} ${SOURCES} ${TEST_SOURCES} ${TESTS_AUDIO_COMMON})
 
 ExternalProject_Add(passport
         URL https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master/sampledata/audiofiles/myVoiceIsMyPassportVerifyMe04.wav

diff --git a/samples/SpeechRecognition/include/Decoder.hpp b/samples/SpeechRecognition/include/Decoder.hpp
index 69d97cc..9dd484a 100644
--- a/samples/SpeechRecognition/include/Decoder.hpp
+++ b/samples/SpeechRecognition/include/Decoder.hpp

@@ -46,8 +46,8 @@
                     rowVector.emplace_back(static_cast<int16_t>(contextToProcess[row * rowLength + j]));
                 }
 
-                int max_index = std::distance(rowVector.begin(),std::max_element(rowVector.begin(), rowVector.end()));
-                unfilteredText.emplace_back(this->m_labels.at(max_index)[0]);
+                int maxIndex = std::distance(rowVector.begin(), std::max_element(rowVector.begin(), rowVector.end()));
+                unfilteredText.emplace_back(this->m_labels.at(maxIndex)[0]);
             }
 
             std::string filteredText = FilterCharacters(unfilteredText);

diff --git a/samples/SpeechRecognition/include/MFCC.hpp b/samples/SpeechRecognition/include/MFCC.hpp
deleted file mode 100644
index 14b6d9f..0000000
--- a/samples/SpeechRecognition/include/MFCC.hpp
+++ /dev/null

@@ -1,244 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include <vector>
-#include <cstdint>
-#include <cmath>
-#include <limits>
-#include <string>
-
-/* MFCC's consolidated parameters */
-class MfccParams
-{
-public:
-    float       m_samplingFreq;
-    int         m_numFbankBins;
-    float       m_melLoFreq;
-    float       m_melHiFreq;
-    int         m_numMfccFeatures;
-    int         m_frameLen;
-    int         m_frameLenPadded;
-    bool        m_useHtkMethod;
-    int         m_numMfccVectors;
-
-    /** @brief  Constructor */
-    MfccParams(const float samplingFreq, const int numFbankBins,
-               const float melLoFreq, const float melHiFreq,
-               const int numMfccFeats, const int frameLen,
-               const bool useHtkMethod, const int numMfccVectors);
-
-    /* Delete the default constructor */
-    MfccParams()  = delete;
-
-    /* Default destructor */
-    ~MfccParams() = default;
-
-    /** @brief  String representation of parameters */
-    std::string Str();
-};
-
-/**
- * @brief   Class for MFCC feature extraction.
- *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
- *          This class is designed to be generic and self-sufficient but
- *          certain calculation routines can be overridden to accommodate
- *          use-case specific requirements.
- */
-class MFCC
-{
-
-public:
-
-    /**
-    * @brief        Extract MFCC  features for one single small frame of
-    *               audio data e.g. 640 samples.
-    * @param[in]    audioData - Vector of audio samples to calculate
-    *               features for.
-    * @return       Vector of extracted MFCC features.
-    **/
-    std::vector<float> MfccCompute(const std::vector<float>& audioData);
-
-    MfccParams _m_params;
-
-    /**
-     * @brief       Constructor
-     * @param[in]   params - MFCC parameters
-    */
-    MFCC(const MfccParams& params);
-
-    /* Delete the default constructor */
-    MFCC() = delete;
-
-    /** @brief  Default destructor */
-    ~MFCC() = default;
-
-    /** @brief  Initialise */
-    void Init();
-
-    /**
-     * @brief        Extract MFCC features and quantise for one single small
-     *               frame of audio data e.g. 640 samples.
-     * @param[in]    audioData - Vector of audio samples to calculate
-     *               features for.
-     * @param[in]    quantScale - quantisation scale.
-     * @param[in]    quantOffset - quantisation offset
-     * @return      Vector of extracted quantised MFCC features.
-     **/
-    template<typename T>
-    std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
-                                    const float quantScale,
-                                    const int quantOffset)
-    {
-        this->_MfccComputePreFeature(audioData);
-        float minVal = std::numeric_limits<T>::min();
-        float maxVal = std::numeric_limits<T>::max();
-
-        std::vector<T> mfccOut(this->_m_params.m_numMfccFeatures);
-        const size_t numFbankBins = this->_m_params.m_numFbankBins;
-
-        /* Take DCT. Uses matrix mul. */
-        for (size_t i = 0, j = 0; i < mfccOut.size(); ++i, j += numFbankBins)
-        {
-            float sum = 0;
-            for (size_t k = 0; k < numFbankBins; ++k)
-            {
-                sum += this->_m_dctMatrix[j + k] * this->_m_melEnergies[k];
-            }
-            /* Quantize to T. */
-            sum = std::round((sum / quantScale) + quantOffset);
-            mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
-        }
-
-        return mfccOut;
-    }
-
-    /* Constants */
-    static constexpr float logStep = 1.8562979903656 / 27.0;
-    static constexpr float freqStep = 200.0 / 3;
-    static constexpr float minLogHz = 1000.0;
-    static constexpr float minLogMel = minLogHz / freqStep;
-
-protected:
-    /**
-     * @brief       Project input frequency to Mel Scale.
-     * @param[in]   freq - input frequency in floating point
-     * @param[in]   useHTKmethod - bool to signal if HTK method is to be
-     *              used for calculation
-     * @return      Mel transformed frequency in floating point
-     **/
-    static float MelScale(const float    freq,
-                          const bool     useHTKMethod = true);
-
-    /**
-     * @brief       Inverse Mel transform - convert MEL warped frequency
-     *              back to normal frequency
-     * @param[in]   freq - Mel frequency in floating point
-     * @param[in]   useHTKmethod - bool to signal if HTK method is to be
-     *              used for calculation
-     * @return      Real world frequency in floating point
-     **/
-    static float InverseMelScale(const float melFreq,
-                                 const bool  useHTKMethod = true);
-
-    /**
-     * @brief       Populates MEL energies after applying the MEL filter
-     *              bank weights and adding them up to be placed into
-     *              bins, according to the filter bank's first and last
-     *              indices (pre-computed for each filter bank element
-     *              by _CreateMelFilterBank function).
-     * @param[in]   fftVec                  Vector populated with FFT magnitudes
-     * @param[in]   melFilterBank           2D Vector with filter bank weights
-     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
-     *                                      to be used for each bin.
-     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
-     *                                      to be used for each bin.
-     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
-     *                                      populated.
-     * @return      true if successful, false otherwise
-     */
-    virtual bool ApplyMelFilterBank(
-            std::vector<float>&                 fftVec,
-            std::vector<std::vector<float>>&    melFilterBank,
-            std::vector<int32_t>&               filterBankFilterFirst,
-            std::vector<int32_t>&               filterBankFilterLast,
-            std::vector<float>&                 melEnergies);
-
-    /**
-     * @brief           Converts the Mel energies for logarithmic scale
-     * @param[in/out]   melEnergies - 1D vector of Mel energies
-     **/
-    virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
-
-    /**
-     * @brief       Create a matrix used to calculate Discrete Cosine
-     *              Transform.
-     * @param[in]   inputLength - input length of the buffer on which
-     *              DCT will be performed
-     * @param[in]   coefficientCount - Total coefficients per input
-     *              length
-     * @return      1D vector with inputLength x coefficientCount elements
-     *              populated with DCT coefficients.
-     */
-    virtual std::vector<float> CreateDCTMatrix(
-            const int32_t inputLength,
-            const int32_t coefficientCount);
-
-    /**
-     * @brief       Given the low and high Mel values, get the normaliser
-     *              for weights to be applied when populating the filter
-     *              bank.
-     * @param[in]   leftMel - low Mel frequency value
-     * @param[in]   rightMel - high Mel frequency value
-     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
-     *              used for calculation
-     */
-    virtual float GetMelFilterBankNormaliser(
-            const float&   leftMel,
-            const float&   rightMel,
-            const bool     useHTKMethod);
-
-private:
-
-    std::vector<float>              _m_frame;
-    std::vector<float>              _m_buffer;
-    std::vector<float>              _m_melEnergies;
-    std::vector<float>              _m_windowFunc;
-    std::vector<std::vector<float>> _m_melFilterBank;
-    std::vector<float>              _m_dctMatrix;
-    std::vector<int32_t>            _m_filterBankFilterFirst;
-    std::vector<int32_t>            _m_filterBankFilterLast;
-    bool                            _m_filterBankInitialised;
-
-    /**
-     * @brief       Initialises the filter banks and the DCT matrix **/
-    void _InitMelFilterBank();
-
-    /**
-     * @brief       Signals whether the instance of MFCC has had its
-     *              required buffers initialised
-     * @return      True if initialised, false otherwise
-     **/
-    bool _IsMelFilterBankInited();
-
-    /**
-     * @brief       Create mel filter banks for MFCC calculation.
-     * @return      2D vector of floats
-     **/
-    std::vector<std::vector<float>> _CreateMelFilterBank();
-
-    /**
-     * @brief       Computes and populates internal memeber buffers used
-     *              in MFCC feature calculation
-     * @param[in]   audioData - 1D vector of 16-bit audio data
-     */
-    void _MfccComputePreFeature(const std::vector<float>& audioData);
-
-    /** @brief       Computes the magnitude from an interleaved complex array */
-    void _ConvertToPowerSpectrum();
-
-};
-

diff --git a/samples/SpeechRecognition/include/Preprocess.hpp b/samples/SpeechRecognition/include/Preprocess.hpp
deleted file mode 100644
index 80c5684..0000000
--- a/samples/SpeechRecognition/include/Preprocess.hpp
+++ /dev/null

@@ -1,175 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "DataStructures.hpp"
-#include "SlidingWindow.hpp"
-#include <numeric>
-#include "MFCC.hpp"
-
-/* Class to facilitate pre-processing calculation for Wav2Letter model
-     * for ASR */
-using AudioWindow = SlidingWindow <const float>;
-
-class Preprocess
-{
-public:
-
-    MFCC                _m_mfcc;            /* MFCC instance */
-
-    /* Actual buffers to be populated */
-    Array2d<float>      _m_mfccBuf;         /* Contiguous buffer 1D: MFCC */
-    Array2d<float>      _m_delta1Buf;       /* Contiguous buffer 1D: Delta 1 */
-    Array2d<float>      _m_delta2Buf;       /* Contiguous buffer 1D: Delta 2 */
-
-    uint32_t            _m_windowLen;       /* Window length for MFCC */
-    uint32_t            _m_windowStride;    /* Window stride len for MFCC */
-    AudioWindow         _m_window;          /* Sliding window */
-
-    /**
-     * @brief       Constructor
-     * @param[in]   numMfccFeatures     number of MFCC features per window
-     * @param[in]   windowLen           number of elements in a window
-     * @param[in]   windowStride        stride (in number of elements) for
-     *                                  moving the window
-     * @param[in]   numMfccVectors      number of MFCC vectors per window
-    */
-    Preprocess(
-            const uint32_t  windowLen,
-            const uint32_t  windowStride,
-            const MFCC mfccInst);
-    Preprocess() = delete;
-    ~Preprocess();
-
-    /**
-     * @brief       Calculates the features required from audio data. This
-     *              includes MFCC, first and second order deltas,
-     *              normalisation and finally, quantisation. The tensor is
-     *              populated with feature from a given window placed along
-     *              in a single row.
-     * @param[in]   audioData     pointer to the first element of audio data
-     * @param[in]   audioDataLen  number of elements in the audio data
-     * @param[in]   tensor        tensor to be populated
-     * @return      true if successful, false in case of error.
-     */
-    bool Invoke(const float* audioData,
-                const uint32_t  audioDataLen,
-                std::vector<int8_t>& output,
-                int quantOffset,
-                float quantScale);
-
-
-protected:
-    /**
-     * @brief Computes the first and second order deltas for the
-     *        MFCC buffers - they are assumed to be populated.
-     *
-     * @param[in]  mfcc   MFCC buffers
-     * @param[out] delta1 result of the first diff computation
-     * @param[out] delta2 result of the second diff computation
-     *
-     * @return true if successful, false otherwise
-     */
-    static bool _ComputeDeltas(Array2d<float>& mfcc,
-                               Array2d<float>& delta1,
-                               Array2d<float>& delta2);
-
-    /**
-     * @brief      Given a 2D vector of floats, computes the mean
-     * @param[in]   vec      vector of vector of floats
-     * @return      mean value
-     */
-    static float _GetMean(Array2d<float>& vec);
-
-    /**
-     * @brief       Given a 2D vector of floats, computes the stddev
-     * @param[in]   vec   vector of vector of floats
-     * @param[in]   mean     mean value of the vector passed in
-     * @return      stddev value
-     */
-    static float _GetStdDev(Array2d<float>& vec,
-                            const float mean);
-
-    /**
-     * @brief           Given a 2D vector of floats, normalises it using
-     *                  the mean and the stddev
-     * @param[in/out]   vec      vector of vector of floats
-     * @return
-     */
-    static void _NormaliseVec(Array2d<float>& vec);
-
-    /**
-     * @brief       Normalises the MFCC and delta buffers
-     * @return
-     */
-    void _Normalise();
-
-    /**
-     * @brief       Given the quantisation and data type limits, computes
-     *              the quantised values of a floating point input data.
-     * @param[in]   elem            Element to be quantised
-     * @param[in]   quantScale      Scale
-     * @param[in]   quantOffset     Offset
-     * @param[in]   minVal          Numerical limit - minimum
-     * @param[in]   maxVal          Numerical limit - maximum
-     * @return      floating point quantised value
-     */
-    static float _GetQuantElem(
-            const float     elem,
-            const float     quantScale,
-            const int       quantOffset,
-            const float     minVal,
-            const float     maxVal);
-
-    /**
-     * @brief       Quantises the MFCC and delta buffers, and places them
-     *              in the output buffer. While doing so, it transposes
-     *              the data. Reason: Buffers in this class are arranged
-     *              for "time" axis to be row major. Primary reason for
-     *              this being the convolution speed up (as we can use
-     *              contiguous memory). The output, however, requires the
-     *              time axis to be in column major arrangement.
-     * @param[in]   outputBuf       pointer to the output buffer
-     * @param[in]   outputBufSz     output buffer's size
-     * @param[in]   quantScale      quantisation scale
-     * @param[in]   quantOffset     quantisation offset
-     */
-    template <typename T>
-    bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
-    {
-        /* Populate */
-        T* outputBufMfcc = outputBuf;
-        T* outputBufD1 = outputBuf + this->_m_mfcc._m_params.m_numMfccFeatures;
-        T* outputBufD2 = outputBufD1 + this->_m_mfcc._m_params.m_numMfccFeatures;
-        const uint32_t ptrIncr = this->_m_mfcc._m_params.m_numMfccFeatures * 2; /* (3 vectors - 1 vector) */
-
-        const float minVal = std::numeric_limits<T>::min();
-        const float maxVal = std::numeric_limits<T>::max();
-
-        /* We need to do a transpose while copying and concatenating
-         * the tensor*/
-        for (uint32_t j = 0; j < this->_m_mfcc._m_params.m_numMfccVectors; ++j) {
-            for (uint32_t i = 0; i < this->_m_mfcc._m_params.m_numMfccFeatures; ++i)
-            {
-                *outputBufMfcc++ = static_cast<T>(this->_GetQuantElem(
-                        this->_m_mfccBuf(i, j), quantScale,
-                        quantOffset, minVal, maxVal));
-                *outputBufD1++ = static_cast<T>(this->_GetQuantElem(
-                        this->_m_delta1Buf(i, j), quantScale,
-                        quantOffset, minVal, maxVal));
-                *outputBufD2++ = static_cast<T>(this->_GetQuantElem(
-                        this->_m_delta2Buf(i, j), quantScale,
-                        quantOffset, minVal, maxVal));
-            }
-            outputBufMfcc += ptrIncr;
-            outputBufD1 += ptrIncr;
-            outputBufD2 += ptrIncr;
-        }
-
-        return true;
-    }
-};
-

diff --git a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
index 47ce304..bc3fbfe 100644
--- a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
+++ b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp

@@ -8,16 +8,16 @@
 #include "ArmnnNetworkExecutor.hpp"
 #include "Decoder.hpp"
 #include "MFCC.hpp"
-#include "Preprocess.hpp"
+#include "Wav2LetterPreprocessor.hpp"
 
-namespace asr
+namespace asr 
 {
 /**
  * Generic Speech Recognition pipeline with 3 steps: data pre-processing, inference execution and inference
  * result post-processing.
  *
  */
-class ASRPipeline
+class ASRPipeline 
 {
 public:
 
@@ -27,7 +27,7 @@
      * @param decoder - unique pointer to inference results decoder
      */
     ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
-                std::unique_ptr<Decoder> decoder);
+                std::unique_ptr<Decoder> decoder, std::unique_ptr<Wav2LetterPreprocessor> preprocessor);
 
     /**
      * @brief Standard audio pre-processing implementation.
@@ -36,20 +36,16 @@
      * extracting the MFCC features.
 
      * @param[in] audio - the raw audio data
-     * @param[out] preprocessor - the preprocessor object, which handles the data prepreration
+     * @return the preprocessed int8 data; the preprocessor is no longer an argument (see member m_preProcessor)
      */
-    template<typename Tin,typename Tout>
-    std::vector<Tout> PreProcessing(std::vector<Tin>& audio, Preprocess& preprocessor)
-    {
-        int audioDataToPreProcess = preprocessor._m_windowLen +
-                ((preprocessor._m_mfcc._m_params.m_numMfccVectors -1) *preprocessor._m_windowStride);
-        int outputBufferSize = preprocessor._m_mfcc._m_params.m_numMfccVectors
-                * preprocessor._m_mfcc._m_params.m_numMfccFeatures * 3;
-        std::vector<Tout> outputBuffer(outputBufferSize);
-        preprocessor.Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(),
-                            m_executor->GetQuantizationScale());
-        return outputBuffer;
-    }
+    std::vector<int8_t> PreProcessing(std::vector<float>& audio);
+
+    int getInputSamplesSize();
+    int getSlidingWindowOffset();
+
+    // Exposing hardcoded constant as it can only be derived from model knowledge and not from model itself
+    // Will need to be refactored so that hard coded values are not defined outside of model settings
+    int SLIDING_WINDOW_OFFSET;
 
     /**
      * @brief Executes inference
@@ -60,9 +56,9 @@
      * @param[out] result - raw inference results.
      */
     template<typename T>
-    void Inference(const std::vector<T>& preprocessedData, common::InferenceResults<int8_t>& result)
+    void Inference(const std::vector<T>& preprocessedData, common::InferenceResults<int8_t>& result) 
     {
-        size_t data_bytes = sizeof(std::vector<T>) + (sizeof(T) * preprocessedData.size());
+        size_t data_bytes = sizeof(T) * preprocessedData.size();
         m_executor->Run(preprocessedData.data(), data_bytes, result);
     }
 
@@ -78,9 +74,9 @@
      */
     template<typename T>
     void PostProcessing(common::InferenceResults<int8_t>& inferenceResult,
-                                     bool& isFirstWindow,
-                                     bool isLastWindow,
-                                     std::string currentRContext)
+                        bool& isFirstWindow,
+                        bool isLastWindow,
+                        std::string currentRContext) 
     {
         int rowLength = 29;
         int middleContextStart = 49;
@@ -92,17 +88,17 @@
         std::vector<T> contextToProcess;
 
         // If isFirstWindow we keep the left context of the output
-        if(isFirstWindow)
+        if (isFirstWindow) 
         {
             std::vector<T> chunk(&inferenceResult[0][leftContextStart],
-                    &inferenceResult[0][middleContextEnd * rowLength]);
+                                 &inferenceResult[0][middleContextEnd * rowLength]);
             contextToProcess = chunk;
         }
-        // Else we only keep the middle context of the output
-        else
+        else 
         {
+            // Else we only keep the middle context of the output
             std::vector<T> chunk(&inferenceResult[0][middleContextStart * rowLength],
-                    &inferenceResult[0][middleContextEnd * rowLength]);
+                                 &inferenceResult[0][middleContextEnd * rowLength]);
             contextToProcess = chunk;
         }
         std::string output = this->m_decoder->DecodeOutput<T>(contextToProcess);
@@ -110,10 +106,10 @@
         std::cout << output << std::flush;
 
         // If this is the last window, we print the right context of the output
-        if(isLastWindow)
+        if (isLastWindow) 
         {
-            std::vector<T> rContext(&inferenceResult[0][rightContextStart*rowLength],
-                    &inferenceResult[0][rightContextEnd * rowLength]);
+            std::vector<T> rContext(&inferenceResult[0][rightContextStart * rowLength],
+                                    &inferenceResult[0][rightContextEnd * rowLength]);
             currentRContext = this->m_decoder->DecodeOutput(rContext);
             std::cout << currentRContext << std::endl;
         }
@@ -122,6 +118,7 @@
 protected:
     std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> m_executor;
     std::unique_ptr<Decoder> m_decoder;
+    std::unique_ptr<Wav2LetterPreprocessor> m_preProcessor;
 };
 
 using IPipelinePtr = std::unique_ptr<asr::ASRPipeline>;
@@ -136,4 +133,4 @@
  */
 IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels);
 
-}// namespace asr
\ No newline at end of file
+} // namespace asr
\ No newline at end of file

diff --git a/samples/SpeechRecognition/include/Wav2LetterMFCC.hpp b/samples/SpeechRecognition/include/Wav2LetterMFCC.hpp
new file mode 100644
index 0000000..aa88aaf
--- /dev/null
+++ b/samples/SpeechRecognition/include/Wav2LetterMFCC.hpp

@@ -0,0 +1,78 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "MFCC.hpp"
+
+/* Class to provide Wav2Letter specific MFCC calculation requirements. */
+class Wav2LetterMFCC : public MFCC 
+{
+
+public:
+    explicit Wav2LetterMFCC(const MfccParams& params)
+        :  MFCC(params)
+    {}
+
+    Wav2LetterMFCC()  = delete;
+    ~Wav2LetterMFCC() = default;
+
+protected:
+
+    /**
+     * @brief       Overrides base class implementation of this function.
+     * @param[in]   fftVec                  Vector populated with FFT magnitudes
+     * @param[in]   melFilterBank           2D Vector with filter bank weights
+     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
+     *                                      to be used for each bin.
+     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
+     *                                      to be used for each bin.
+     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
+     *                                      populated.
+     * @return      true if successful, false otherwise
+     */
+    bool ApplyMelFilterBank(
+        std::vector<float>&                 fftVec,
+        std::vector<std::vector<float>>&    melFilterBank,
+        std::vector<uint32_t>&              filterBankFilterFirst,
+        std::vector<uint32_t>&              filterBankFilterLast,
+        std::vector<float>&                 melEnergies) override;
+
+    /**
+     * @brief           Override for the base class implementation convert mel
+     *                  energies to logarithmic scale. The difference from
+     *                  default behaviour is that the power is converted to dB
+     *                  and subsequently clamped.
+     * @param[in,out]   melEnergies   1D vector of Mel energies
+     **/
+    void ConvertToLogarithmicScale(std::vector<float>& melEnergies) override;
+
+    /**
+     * @brief       Create a matrix used to calculate Discrete Cosine
+     *              Transform. Override for the base class' default
+     *              implementation as the first and last elements
+     *              use a different normaliser.
+     * @param[in]   inputLength        input length of the buffer on which
+     *                                 DCT will be performed
+     * @param[in]   coefficientCount   Total coefficients per input length.
+     * @return      1D vector with inputLength x coefficientCount elements
+     *              populated with DCT coefficients.
+     */
+    std::vector<float> CreateDCTMatrix(int32_t inputLength,
+                                       int32_t coefficientCount) override;
+
+    /**
+     * @brief       Given the low and high Mel values, get the normaliser
+     *              for weights to be applied when populating the filter
+     *              bank. Override for the base class implementation.
+     * @param[in]   leftMel        Low Mel frequency value.
+     * @param[in]   rightMel       High Mel frequency value.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Value to use for normalising.
+     */
+    float GetMelFilterBankNormaliser(const float&   leftMel,
+                                     const float&   rightMel,
+                                     bool     useHTKMethod) override;
+};
\ No newline at end of file

diff --git a/samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp b/samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp
new file mode 100644
index 0000000..ebc9e86
--- /dev/null
+++ b/samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp

@@ -0,0 +1,158 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
+#define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
+
+#include <numeric>
+#include "DataStructures.hpp"
+#include "SlidingWindow.hpp"
+#include "MFCC.hpp"
+#include "Wav2LetterMFCC.hpp"
+// Class to facilitate pre-processing calculation for Wav2Letter model for ASR 
+using AudioWindow = SlidingWindow<const float>;
+
+class Wav2LetterPreprocessor 
+{
+public:
+    Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride,
+                           std::unique_ptr<Wav2LetterMFCC> mfccInst);
+
+    /**
+     * @brief       Calculates the features required from audio data. This includes MFCC, first and
+     *              second order deltas, normalisation and finally, quantisation. The output is
+     *              populated with the features from a given window placed along a single row.
+     * @param[in]   audioData     pointer to the first element of audio data
+     * @param[in]   audioDataLen  number of elements in the audio data
+     * @param[out]  output        vector to be populated with the quantised features
+     * @param[in]   quantOffset   quantisation offset
+     * @param[in]   quantScale    quantisation scale
+     * @return      true if successful, false in case of error.
+     */
+    bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset,
+                float quantScale);
+
+    std::unique_ptr<MFCC> m_mfcc;
+
+    // Actual buffers to be populated 
+    Array2d<float> m_mfccBuf;         // Contiguous buffer 1D: MFCC 
+    Array2d<float> m_delta1Buf;       // Contiguous buffer 1D: Delta 1 
+    Array2d<float> m_delta2Buf;       // Contiguous buffer 1D: Delta 2
+
+    uint32_t m_windowLen;       // Window length for MFCC 
+    uint32_t m_windowStride;    // Window stride len for MFCC 
+    AudioWindow m_window;       // Sliding window 
+
+protected:
+    /**
+     * @brief Computes the first and second order deltas for the
+     *        MFCC buffers - they are assumed to be populated.
+     *
+     * @param[in]  mfcc   MFCC buffers
+     * @param[out] delta1 result of the first diff computation
+     * @param[out] delta2 result of the second diff computation
+     *
+     * @return true if successful, false otherwise
+     */
+    static bool ComputeDeltas(Array2d<float>& mfcc,
+                              Array2d<float>& delta1,
+                              Array2d<float>& delta2);
+
+protected:
+
+    /**
+     * @brief      Given a 2D vector of floats, computes the mean
+     * @param[in]   vec      vector of vector of floats
+     * @return      mean value
+     */
+    static float GetMean(Array2d<float>& vec);
+
+    /**
+     * @brief       Given a 2D vector of floats, computes the stddev
+     * @param[in]   vec   vector of vector of floats
+     * @param[in]   mean     mean value of the vector passed in
+     * @return      stddev value
+     */
+    static float GetStdDev(Array2d<float>& vec, float mean);
+
+    /**
+     * @brief           Given a 2D vector of floats, normalises it using
+     *                  the mean and the stddev
+     * @param[in/out]   vec      vector of vector of floats
+     * @return
+     */
+    static void NormaliseVec(Array2d<float>& vec);
+
+    /**
+     * @brief       Normalises the MFCC and delta buffers
+     * @return
+     */
+    void Normalise();
+
+    /**
+     * @brief       Given the quantisation and data type limits, computes
+     *              the quantised values of a floating point input data.
+     * @param[in]   elem            Element to be quantised
+     * @param[in]   quantScale      Scale
+     * @param[in]   quantOffset     Offset
+     * @param[in]   minVal          Numerical limit - minimum
+     * @param[in]   maxVal          Numerical limit - maximum
+     * @return      floating point quantised value
+     */
+    static float GetQuantElem(
+            float elem,
+            float quantScale,
+            int quantOffset,
+            float minVal,
+            float maxVal);
+
+    /**
+     * @brief       Quantises the MFCC and delta buffers, and places them
+     *              in the output buffer. While doing so, it transposes
+     *              the data. Reason: Buffers in this class are arranged
+     *              for "time" axis to be row major. Primary reason for
+     *              this being the convolution speed up (as we can use
+     *              contiguous memory). The output, however, requires the
+     *              time axis to be in column major arrangement.
+     * @param[out]  outputBuf       pointer to the output buffer
+     * @param[in]   quantOffset     quantisation offset
+     * @param[in]   quantScale      quantisation scale
+     * @return      always true; this implementation has no failure path
+     */
+    template<typename T>
+    bool Quantise(T* outputBuf, int quantOffset, float quantScale)
+    {
+        // Each output row is laid out as [MFCC | delta1 | delta2]; keep one write cursor per section
+        T* outputBufMfcc = outputBuf;
+        T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures;
+        T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures;
+        const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // skip the other two sections
+        // lowest() rather than min(): same for integral T, and the true floor if T is floating point
+        const float minVal = std::numeric_limits<T>::lowest();
+        const float maxVal = std::numeric_limits<T>::max();
+
+        // We need to do a transpose while copying and concatenating the tensor
+        for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j)
+        {
+            for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i)
+            {
+                *outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
+                        this->m_mfccBuf(i, j), quantScale,
+                        quantOffset, minVal, maxVal));
+                *outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
+                        this->m_delta1Buf(i, j), quantScale,
+                        quantOffset, minVal, maxVal));
+                *outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
+                        this->m_delta2Buf(i, j), quantScale,
+                        quantOffset, minVal, maxVal));
+            }
+            outputBufMfcc += ptrIncr;
+            outputBufD1 += ptrIncr;
+            outputBufD2 += ptrIncr;
+        }
+        return true;
+    }
+};
+
+#endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP

diff --git a/samples/SpeechRecognition/src/Decoder.cpp b/samples/SpeechRecognition/src/Decoder.cpp
index 663d4db..b95288e 100644
--- a/samples/SpeechRecognition/src/Decoder.cpp
+++ b/samples/SpeechRecognition/src/Decoder.cpp

@@ -5,33 +5,32 @@
 
 #include "Decoder.hpp"
 
-namespace asr {
+namespace asr 
+{
 
-    Decoder::Decoder(std::map<int, std::string>& labels):
-            m_labels(labels)
-    {}
+Decoder::Decoder(std::map<int, std::string>& labels) :
+            m_labels(labels) {}
 
-    std::string Decoder::FilterCharacters(std::vector<char>& unfiltered)
+std::string Decoder::FilterCharacters(std::vector<char>& unfiltered) 
+{
+    std::string filtered;
+
+    for (int i = 0; i < unfiltered.size(); ++i) 
     {
-        std::string filtered = "";
-
-        for(int i = 0; i < unfiltered.size(); ++i)
+        if (unfiltered.at(i) == '$') 
         {
-            if (unfiltered.at(i) == '$')
-            {
-                continue;
-            }
-
-            else if (i + 1 < unfiltered.size() && unfiltered.at(i) == unfiltered.at(i + 1))
-            {
-                continue;
-            }
-            else
-            {
-                filtered += unfiltered.at(i);
-            }
+            continue;
+        } 
+        else if (i + 1 < unfiltered.size() && unfiltered.at(i) == unfiltered.at(i + 1)) 
+        {
+            continue;
+        } 
+        else 
+        {
+            filtered += unfiltered.at(i);
         }
-        return filtered;
     }
-}// namespace
+    return filtered;
+}
+} // namespace asr
 

diff --git a/samples/SpeechRecognition/src/MFCC.cpp b/samples/SpeechRecognition/src/MFCC.cpp
deleted file mode 100644
index 234b14d..0000000
--- a/samples/SpeechRecognition/src/MFCC.cpp
+++ /dev/null

@@ -1,397 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include <cstdio>
-#include <float.h>
-
-#include "MFCC.hpp"
-#include "MathUtils.hpp"
-
-
-MfccParams::MfccParams(
-        const float samplingFreq,
-        const int numFbankBins,
-        const float melLoFreq,
-        const float melHiFreq,
-        const int numMfccFeats,
-        const int frameLen,
-        const bool useHtkMethod,
-        const int numMfccVectors):
-        m_samplingFreq(samplingFreq),
-        m_numFbankBins(numFbankBins),
-        m_melLoFreq(melLoFreq),
-        m_melHiFreq(melHiFreq),
-        m_numMfccFeatures(numMfccFeats),
-        m_frameLen(frameLen),
-        m_numMfccVectors(numMfccVectors),
-
-        /* Smallest power of 2 >= frame length. */
-        m_frameLenPadded(pow(2, ceil((log(frameLen)/log(2))))),
-        m_useHtkMethod(useHtkMethod)
-{}
-
-std::string MfccParams::Str()
-{
-    char strC[1024];
-    snprintf(strC, sizeof(strC) - 1, "\n   \
-            \n\t Sampling frequency:         %f\
-            \n\t Number of filter banks:     %u\
-            \n\t Mel frequency limit (low):  %f\
-            \n\t Mel frequency limit (high): %f\
-            \n\t Number of MFCC features:    %u\
-            \n\t Frame length:               %u\
-            \n\t Padded frame length:        %u\
-            \n\t Using HTK for Mel scale:    %s\n",
-             this->m_samplingFreq, this->m_numFbankBins, this->m_melLoFreq,
-             this->m_melHiFreq, this->m_numMfccFeatures, this->m_frameLen,
-             this->m_frameLenPadded, this->m_useHtkMethod ? "yes" : "no");
-    return std::string{strC};
-}
-
-MFCC::MFCC(const MfccParams& params):
-        _m_params(params),
-        _m_filterBankInitialised(false)
-{
-    this->_m_buffer = std::vector<float>(
-            this->_m_params.m_frameLenPadded, 0.0);
-    this->_m_frame = std::vector<float>(
-            this->_m_params.m_frameLenPadded, 0.0);
-    this->_m_melEnergies = std::vector<float>(
-            this->_m_params.m_numFbankBins, 0.0);
-
-    this->_m_windowFunc = std::vector<float>(this->_m_params.m_frameLen);
-    const float multiplier = 2 * M_PI / this->_m_params.m_frameLen;
-
-    /* Create window function. */
-    for (size_t i = 0; i < this->_m_params.m_frameLen; i++)
-    {
-        this->_m_windowFunc[i] = (0.5 - (0.5 * cos(static_cast<float>(i) * multiplier)));
-    }
-}
-
-void MFCC::Init()
-{
-    this->_InitMelFilterBank();
-}
-
-float MFCC::MelScale(const float freq, const bool useHTKMethod)
-{
-    if (useHTKMethod)
-    {
-        return 1127.0f * logf (1.0f + freq / 700.0f);
-    }
-    else
-    {
-        /* Slaney formula for mel scale. */
-        float mel = freq / freqStep;
-
-        if (freq >= minLogHz)
-        {
-            mel = minLogMel + logf(freq / minLogHz) / logStep;
-        }
-        return mel;
-    }
-}
-
-float MFCC::InverseMelScale(const float melFreq, const bool useHTKMethod)
-{
-    if (useHTKMethod)
-    {
-        return 700.0f * (expf (melFreq / 1127.0f) - 1.0f);
-    }
-    else
-    {
-        /* Slaney formula for mel scale. */
-        float freq = freqStep * melFreq;
-
-        if (melFreq >= minLogMel)
-        {
-            freq = minLogHz * expf(logStep * (melFreq - minLogMel));
-        }
-        return freq;
-    }
-}
-
-
-bool MFCC::ApplyMelFilterBank(
-        std::vector<float>&                 fftVec,
-        std::vector<std::vector<float>>&    melFilterBank,
-        std::vector<int32_t>&               filterBankFilterFirst,
-        std::vector<int32_t>&               filterBankFilterLast,
-        std::vector<float>&                 melEnergies)
-{
-    const size_t numBanks = melEnergies.size();
-
-    if (numBanks != filterBankFilterFirst.size() ||
-        numBanks != filterBankFilterLast.size())
-    {
-        printf("unexpected filter bank lengths\n");
-        return false;
-    }
-
-    for (size_t bin = 0; bin < numBanks; ++bin)
-    {
-        auto filterBankIter = melFilterBank[bin].begin();
-        float melEnergy = 1e-10; /* Avoid log of zero at later stages */
-        const int32_t firstIndex = filterBankFilterFirst[bin];
-        const int32_t lastIndex = filterBankFilterLast[bin];
-
-        for (int32_t i = firstIndex; i <= lastIndex; ++i)
-        {
-            melEnergy += (*filterBankIter++ * fftVec[i]);
-        }
-
-        melEnergies[bin] = melEnergy;
-    }
-
-    return true;
-}
-
-void MFCC::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
-{
-    float maxMelEnergy = -FLT_MAX;
-
-    /* Container for natural logarithms of mel energies */
-    std::vector <float> vecLogEnergies(melEnergies.size(), 0.f);
-
-    /* Because we are taking natural logs, we need to multiply by log10(e).
-     * Also, for wav2letter model, we scale our log10 values by 10 */
-    constexpr float multiplier = 10.0 * /* default scalar */
-                                 0.4342944819032518; /* log10f(std::exp(1.0))*/
-
-    /* Take log of the whole vector */
-    MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies);
-
-    /* Scale the log values and get the max */
-    for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin();
-         iterM != melEnergies.end(); ++iterM, ++iterL)
-    {
-        *iterM = *iterL * multiplier;
-
-        /* Save the max mel energy. */
-        if (*iterM > maxMelEnergy)
-        {
-            maxMelEnergy = *iterM;
-        }
-    }
-
-    /* Clamp the mel energies */
-    constexpr float maxDb = 80.0;
-    const float clampLevelLowdB = maxMelEnergy - maxDb;
-    for (auto iter = melEnergies.begin(); iter != melEnergies.end(); ++iter)
-    {
-        *iter = std::max(*iter, clampLevelLowdB);
-    }
-}
-
-void MFCC::_ConvertToPowerSpectrum()
-{
-    const uint32_t halfDim = this->_m_params.m_frameLenPadded / 2;
-
-    /* Handle this special case. */
-    float firstEnergy = this->_m_buffer[0] * this->_m_buffer[0];
-    float lastEnergy = this->_m_buffer[1] * this->_m_buffer[1];
-
-    MathUtils::ComplexMagnitudeSquaredF32(
-            this->_m_buffer.data(),
-            this->_m_buffer.size(),
-            this->_m_buffer.data(),
-            this->_m_buffer.size()/2);
-
-    this->_m_buffer[0] = firstEnergy;
-    this->_m_buffer[halfDim] = lastEnergy;
-}
-
-std::vector<float> MFCC::CreateDCTMatrix(
-        const int32_t inputLength,
-        const int32_t coefficientCount)
-{
-    std::vector<float> dctMatix(inputLength * coefficientCount);
-
-    /* Orthonormal normalization. */
-    const float normalizerK0 = 2 * sqrt(1.0 / static_cast<float>(4*inputLength));
-    const float normalizer = 2 * sqrt(1.0 / static_cast<float>(2*inputLength));
-
-    const float angleIncr = M_PI/inputLength;
-    float angle = angleIncr; /* we start using it at k = 1 loop */
-
-    /* First row of DCT will use normalizer K0 */
-    for (int32_t n = 0; n < inputLength; ++n)
-    {
-        dctMatix[n] = normalizerK0;
-    }
-
-    /* Second row (index = 1) onwards, we use standard normalizer */
-    for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength)
-    {
-        for (int32_t n = 0; n < inputLength; ++n)
-        {
-            dctMatix[m+n] = normalizer *
-                            cos((n + 0.5) * angle);
-        }
-        angle += angleIncr;
-    }
-    return dctMatix;
-}
-
-float MFCC::GetMelFilterBankNormaliser(
-        const float&    leftMel,
-        const float&    rightMel,
-        const bool      useHTKMethod)
-{
-/* Slaney normalization for mel weights. */
-    return (2.0f / (MFCC::InverseMelScale(rightMel, useHTKMethod) -
-                    MFCC::InverseMelScale(leftMel, useHTKMethod)));
-}
-
-void MFCC::_InitMelFilterBank()
-{
-    if (!this->_IsMelFilterBankInited())
-    {
-        this->_m_melFilterBank = this->_CreateMelFilterBank();
-        this->_m_dctMatrix = this->CreateDCTMatrix(
-                this->_m_params.m_numFbankBins,
-                this->_m_params.m_numMfccFeatures);
-        this->_m_filterBankInitialised = true;
-    }
-}
-
-bool MFCC::_IsMelFilterBankInited()
-{
-    return this->_m_filterBankInitialised;
-}
-
-void MFCC::_MfccComputePreFeature(const std::vector<float>& audioData)
-{
-    this->_InitMelFilterBank();
-
-    /* TensorFlow way of normalizing .wav data to (-1, 1). */
-    constexpr float normaliser = 1.0;
-    for (size_t i = 0; i < this->_m_params.m_frameLen; i++)
-    {
-        this->_m_frame[i] = static_cast<float>(audioData[i]) * normaliser;
-    }
-
-    /* Apply window function to input frame. */
-    for(size_t i = 0; i < this->_m_params.m_frameLen; i++)
-    {
-        this->_m_frame[i] *= this->_m_windowFunc[i];
-    }
-
-    /* Set remaining frame values to 0. */
-    std::fill(this->_m_frame.begin() + this->_m_params.m_frameLen,this->_m_frame.end(), 0);
-
-    /* Compute FFT. */
-    MathUtils::FftF32(this->_m_frame, this->_m_buffer);
-
-    /* Convert to power spectrum. */
-    this->_ConvertToPowerSpectrum();
-
-    /* Apply mel filterbanks. */
-    if (!this->ApplyMelFilterBank(this->_m_buffer,
-                                  this->_m_melFilterBank,
-                                  this->_m_filterBankFilterFirst,
-                                  this->_m_filterBankFilterLast,
-                                  this->_m_melEnergies))
-    {
-        printf("Failed to apply MEL filter banks\n");
-    }
-
-    /* Convert to logarithmic scale */
-    this->ConvertToLogarithmicScale(this->_m_melEnergies);
-}
-
-std::vector<float> MFCC::MfccCompute(const std::vector<float>& audioData)
-{
-    this->_MfccComputePreFeature(audioData);
-
-    std::vector<float> mfccOut(this->_m_params.m_numMfccFeatures);
-
-    float * ptrMel = this->_m_melEnergies.data();
-    float * ptrDct = this->_m_dctMatrix.data();
-    float * ptrMfcc = mfccOut.data();
-
-    /* Take DCT. Uses matrix mul. */
-    for (size_t i = 0, j = 0; i < mfccOut.size();
-         ++i, j += this->_m_params.m_numFbankBins)
-    {
-        *ptrMfcc++ = MathUtils::DotProductF32(
-                ptrDct + j,
-                ptrMel,
-                this->_m_params.m_numFbankBins);
-    }
-
-    return mfccOut;
-}
-
-std::vector<std::vector<float>> MFCC::_CreateMelFilterBank()
-{
-    size_t numFftBins = this->_m_params.m_frameLenPadded / 2;
-    float fftBinWidth = static_cast<float>(this->_m_params.m_samplingFreq) / this->_m_params.m_frameLenPadded;
-
-    float melLowFreq = MFCC::MelScale(this->_m_params.m_melLoFreq,
-                                      this->_m_params.m_useHtkMethod);
-    float melHighFreq = MFCC::MelScale(this->_m_params.m_melHiFreq,
-                                       this->_m_params.m_useHtkMethod);
-    float melFreqDelta = (melHighFreq - melLowFreq) / (this->_m_params.m_numFbankBins + 1);
-
-    std::vector<float> thisBin = std::vector<float>(numFftBins);
-    std::vector<std::vector<float>> melFilterBank(
-            this->_m_params.m_numFbankBins);
-    this->_m_filterBankFilterFirst =
-            std::vector<int32_t>(this->_m_params.m_numFbankBins);
-    this->_m_filterBankFilterLast =
-            std::vector<int32_t>(this->_m_params.m_numFbankBins);
-
-    for (size_t bin = 0; bin < this->_m_params.m_numFbankBins; bin++)
-    {
-        float leftMel = melLowFreq + bin * melFreqDelta;
-        float centerMel = melLowFreq + (bin + 1) * melFreqDelta;
-        float rightMel = melLowFreq + (bin + 2) * melFreqDelta;
-
-        int32_t firstIndex = -1;
-        int32_t lastIndex = -1;
-        const float normaliser = this->GetMelFilterBankNormaliser(leftMel, rightMel, this->_m_params.m_useHtkMethod);
-
-        for (size_t i = 0; i < numFftBins; i++)
-        {
-            float freq = (fftBinWidth * i); /* Center freq of this fft bin. */
-            float mel = MFCC::MelScale(freq, this->_m_params.m_useHtkMethod);
-            thisBin[i] = 0.0;
-
-            if (mel > leftMel && mel < rightMel)
-            {
-                float weight;
-                if (mel <= centerMel)
-                {
-                    weight = (mel - leftMel) / (centerMel - leftMel);
-                }
-                else
-                {
-                    weight = (rightMel - mel) / (rightMel - centerMel);
-                }
-
-                thisBin[i] = weight * normaliser;
-                if (firstIndex == -1)
-                {
-                    firstIndex = i;
-                }
-                lastIndex = i;
-            }
-        }
-
-        this->_m_filterBankFilterFirst[bin] = firstIndex;
-        this->_m_filterBankFilterLast[bin] = lastIndex;
-
-        /* Copy the part we care about. */
-        for (int32_t i = firstIndex; i <= lastIndex; i++)
-        {
-            melFilterBank[bin].push_back(thisBin[i]);
-        }
-    }
-
-    return melFilterBank;
-}
-

diff --git a/samples/SpeechRecognition/src/Main.cpp b/samples/SpeechRecognition/src/Main.cpp
index de37e23..e2d2930 100644
--- a/samples/SpeechRecognition/src/Main.cpp
+++ b/samples/SpeechRecognition/src/Main.cpp

@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #include <iostream>
@@ -11,10 +11,8 @@
 #include "CmdArgsParser.hpp"
 #include "ArmnnNetworkExecutor.hpp"
 #include "AudioCapture.hpp"
-#include "Preprocess.hpp"
-#include "Decoder.hpp"
 #include "SpeechRecognitionPipeline.hpp"
-
+#include "Wav2LetterMFCC.hpp"
 
 using InferenceResult = std::vector<int8_t>;
 using InferenceResults = std::vector<InferenceResult>;
@@ -25,101 +23,77 @@
 const std::string PREFERRED_BACKENDS = "--preferred-backends";
 const std::string HELP = "--help";
 
-std::map<int, std::string> labels = {
-        {0, "a" },
-        {1, "b" },
-        {2, "c" },
-        {3, "d" },
-        {4, "e" },
-        {5, "f" },
-        {6, "g" },
-        {7, "h" },
-        {8, "i" },
-        {9, "j" },
-        {10,"k" },
-        {11,"l" },
-        {12,"m" },
-        {13,"n" },
-        {14,"o" },
-        {15,"p" },
-        {16,"q" },
-        {17,"r" },
-        {18,"s" },
-        {19,"t" },
-        {20,"u" },
-        {21,"v" },
-        {22,"w" },
-        {23,"x" },
-        {24,"y" },
-        {25,"z" },
-        {26, "\'" },
+std::map<int, std::string> labels = 
+{
+        {0,  "a"},
+        {1,  "b"},
+        {2,  "c"},
+        {3,  "d"},
+        {4,  "e"},
+        {5,  "f"},
+        {6,  "g"},
+        {7,  "h"},
+        {8,  "i"},
+        {9,  "j"},
+        {10, "k"},
+        {11, "l"},
+        {12, "m"},
+        {13, "n"},
+        {14, "o"},
+        {15, "p"},
+        {16, "q"},
+        {17, "r"},
+        {18, "s"},
+        {19, "t"},
+        {20, "u"},
+        {21, "v"},
+        {22, "w"},
+        {23, "x"},
+        {24, "y"},
+        {25, "z"},
+        {26, "\'"},
         {27, " "},
-        {28,"$" }
+        {28, "$"}
 };
 
 /*
  * The accepted options for this Speech Recognition executable
  */
-static std::map<std::string, std::string> CMD_OPTIONS = {
-        {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
-        {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
-        {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
-                             " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
-                             " Defaults to CpuAcc,CpuRef"}
+static std::map<std::string, std::string> CMD_OPTIONS = 
+{
+    {AUDIO_FILE_PATH,    "[REQUIRED] Path to the Audio file to run speech recognition on"},
+    {MODEL_FILE_PATH,    "[REQUIRED] Path to the Speech Recognition model to use"},
+    {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+                         " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+                         " Defaults to CpuAcc,CpuRef"}
 };
 
 /*
  * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector
  */
-std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
+std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends) 
 {
     std::vector<armnn::BackendId> backends;
     std::stringstream ss(preferredBackends);
 
-    while(ss.good())
+    while (ss.good()) 
     {
         std::string backend;
-        std::getline( ss, backend, ',' );
+        std::getline(ss, backend, ',');
         backends.emplace_back(backend);
     }
     return backends;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[]) 
 {
-    // Wav2Letter ASR SETTINGS
-    int             SAMP_FREQ                  = 16000;
-    int             FRAME_LEN_MS               = 32;
-    int             FRAME_LEN_SAMPLES          = SAMP_FREQ * FRAME_LEN_MS * 0.001;
-    int             NUM_MFCC_FEATS             = 13;
-    int             MFCC_WINDOW_LEN            = 512;
-    int             MFCC_WINDOW_STRIDE         = 160;
-    const int       NUM_MFCC_VECTORS           = 296;
-    int             SAMPLES_PER_INFERENCE      = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS -1) * MFCC_WINDOW_STRIDE);
-    int             MEL_LO_FREQ                = 0;
-    int             MEL_HI_FREQ                = 8000;
-    int             NUM_FBANK_BIN              = 128;
-    int             INPUT_WINDOW_LEFT_CONTEXT  = 98;
-    int             INPUT_WINDOW_RIGHT_CONTEXT = 98;
-    int             INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
-            (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
-    int             SLIDING_WINDOW_OFFSET      = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
-
-
-    MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
-            MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
-
-    MFCC mfccInst = MFCC(mfccParams);
-
-    Preprocess preprocessor(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, mfccInst);
-
     bool isFirstWindow = true;
-    std::string currentRContext  = "";
+    std::string currentRContext = "";
 
-    std::map <std::string, std::string> options;
+    std::map<std::string, std::string> options;
 
     int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
-    if (result != 0)
+    if (result != 0) 
     {
         return result;
     }
@@ -127,28 +101,29 @@
     // Create the network options
     common::PipelineOptions pipelineOptions;
     pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
-
-    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
+    pipelineOptions.m_ModelName = "Wav2Letter";
+    if (CheckOptionSpecified(options, PREFERRED_BACKENDS)) 
     {
         pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS)));
-    }
-    else
+    } 
+    else 
     {
         pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
     }
 
     asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels);
 
-    asr::AudioCapture capture;
-    std::vector<float> audioData = capture.LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
-    capture.InitSlidingWindow(audioData.data(), audioData.size(), SAMPLES_PER_INFERENCE, SLIDING_WINDOW_OFFSET);
+    audio::AudioCapture capture;
+    std::vector<float> audioData = audio::AudioCapture::LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
+    capture.InitSlidingWindow(audioData.data(), audioData.size(), asrPipeline->getInputSamplesSize(),
+                              asrPipeline->getSlidingWindowOffset());
 
-    while (capture.HasNext())
+    while (capture.HasNext()) 
     {
         std::vector<float> audioBlock = capture.Next();
         InferenceResults results;
 
-        std::vector<int8_t> preprocessedData = asrPipeline->PreProcessing<float, int8_t>(audioBlock, preprocessor);
+        std::vector<int8_t> preprocessedData = asrPipeline->PreProcessing(audioBlock);
         asrPipeline->Inference<int8_t>(preprocessedData, results);
         asrPipeline->PostProcessing<int8_t>(results, isFirstWindow, !capture.HasNext(), currentRContext);
     }

diff --git a/samples/SpeechRecognition/src/Preprocess.cpp b/samples/SpeechRecognition/src/Preprocess.cpp
deleted file mode 100644
index 8627961..0000000
--- a/samples/SpeechRecognition/src/Preprocess.cpp
+++ /dev/null

@@ -1,192 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include <algorithm>
-#include <numeric>
-#include <math.h>
-#include <string.h>
-
-#include "MathUtils.hpp"
-#include "Preprocess.hpp"
-
-Preprocess::Preprocess(
-        const uint32_t  windowLen,
-        const uint32_t  windowStride,
-        const MFCC mfccInst):
-        _m_mfcc(mfccInst),
-        _m_mfccBuf(mfccInst._m_params.m_numMfccFeatures, mfccInst._m_params.m_numMfccVectors),
-        _m_delta1Buf(mfccInst._m_params.m_numMfccFeatures, mfccInst._m_params.m_numMfccVectors),
-        _m_delta2Buf(mfccInst._m_params.m_numMfccFeatures, mfccInst._m_params.m_numMfccVectors),
-        _m_windowLen(windowLen),
-        _m_windowStride(windowStride)
-{
-    if (mfccInst._m_params.m_numMfccFeatures > 0 && windowLen > 0)
-    {
-        this->_m_mfcc.Init();
-    }
-}
-
-Preprocess::~Preprocess()
-{
-}
-
-bool Preprocess::Invoke( const float*  audioData, const uint32_t  audioDataLen, std::vector<int8_t>& output,
-        int quantOffset, float quantScale)
-{
-    this->_m_window = SlidingWindow<const float>(
-            audioData, audioDataLen,
-            this->_m_windowLen, this->_m_windowStride);
-
-    uint32_t mfccBufIdx = 0;
-
-    // Init buffers with 0
-    std::fill(_m_mfccBuf.begin(), _m_mfccBuf.end(), 0.f);
-    std::fill(_m_delta1Buf.begin(), _m_delta1Buf.end(), 0.f);
-    std::fill(_m_delta2Buf.begin(), _m_delta2Buf.end(), 0.f);
-
-    /* While we can slide over the window */
-    while (this->_m_window.HasNext())
-    {
-        const float*  mfccWindow = this->_m_window.Next();
-        auto mfccAudioData = std::vector<float>(
-                mfccWindow,
-                mfccWindow + this->_m_windowLen);
-
-        auto mfcc = this->_m_mfcc.MfccCompute(mfccAudioData);
-        for (size_t i = 0; i < this->_m_mfccBuf.size(0); ++i)
-        {
-            this->_m_mfccBuf(i, mfccBufIdx) = mfcc[i];
-        }
-        ++mfccBufIdx;
-    }
-
-    /* Pad MFCC if needed by repeating last feature vector */
-    while (mfccBufIdx != this->_m_mfcc._m_params.m_numMfccVectors)
-    {
-        memcpy(&this->_m_mfccBuf(0, mfccBufIdx),
-               &this->_m_mfccBuf(0, mfccBufIdx-1), sizeof(float)*this->_m_mfcc._m_params.m_numMfccFeatures);
-        ++mfccBufIdx;
-    }
-
-    /* Compute first and second order deltas from MFCCs */
-    this->_ComputeDeltas(this->_m_mfccBuf,
-                         this->_m_delta1Buf,
-                         this->_m_delta2Buf);
-
-    /* Normalise */
-    this->_Normalise();
-
-    return this->_Quantise<int8_t>(output.data(), quantOffset, quantScale);
-}
-
-bool Preprocess::_ComputeDeltas(Array2d<float>& mfcc,
-                                Array2d<float>& delta1,
-                                Array2d<float>& delta2)
-{
-    const std::vector <float> delta1Coeffs =
-            {6.66666667e-02,  5.00000000e-02,  3.33333333e-02,
-             1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
-             -3.33333333e-02, -5.00000000e-02, -6.66666667e-02};
-
-    const std::vector <float> delta2Coeffs =
-            {0.06060606,      0.01515152,     -0.01731602,
-             -0.03679654,     -0.04329004,     -0.03679654,
-             -0.01731602,      0.01515152,      0.06060606};
-
-    if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) ||
-        mfcc.size(0) == 0 || mfcc.size(1) == 0)
-    {
-        return false;
-    }
-
-    /* Get the middle index; coeff vec len should always be odd */
-    const size_t coeffLen = delta1Coeffs.size();
-    const size_t fMidIdx = (coeffLen - 1)/2;
-    const size_t numFeatures = mfcc.size(0);
-    const size_t numFeatVectors = mfcc.size(1);
-
-    /* iterate through features in MFCC vector*/
-    for (size_t i = 0; i < numFeatures; ++i)
-    {
-        /* for each feature, iterate through time (t) samples representing feature evolution and
-        * calculate d/dt and d^2/dt^2, using 1d convolution with differential kernels.
-        * Convolution padding = valid, result size is `time length - kernel length + 1`.
-        * The result is padded with 0 from both sides to match the size of initial time samples data.
-        *
-        * For the small filter, conv1d implementation as a simple loop is efficient enough.
-        * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32.
-        */
-
-        for (size_t j = fMidIdx; j < numFeatVectors - fMidIdx; ++j)
-        {
-            float d1 = 0;
-            float d2 = 0;
-            const size_t mfccStIdx = j - fMidIdx;
-
-            for (size_t k = 0, m = coeffLen - 1; k < coeffLen; ++k, --m)
-            {
-
-                d1 +=  mfcc(i,mfccStIdx + k) * delta1Coeffs[m];
-                d2 +=  mfcc(i,mfccStIdx + k) * delta2Coeffs[m];
-            }
-
-            delta1(i,j) = d1;
-            delta2(i,j) = d2;
-        }
-    }
-
-    return true;
-}
-
-float Preprocess::_GetMean(Array2d<float>& vec)
-{
-    return MathUtils::MeanF32(vec.begin(), vec.totalSize());
-}
-
-float Preprocess::_GetStdDev(Array2d<float>& vec, const float mean)
-{
-    return MathUtils::StdDevF32(vec.begin(), vec.totalSize(), mean);
-}
-
-void Preprocess::_NormaliseVec(Array2d<float>& vec)
-{
-    auto mean = Preprocess::_GetMean(vec);
-    auto stddev = Preprocess::_GetStdDev(vec, mean);
-
-    if (stddev == 0)
-    {
-        std::fill(vec.begin(), vec.end(), 0);
-    }
-    else
-    {
-        const float stddevInv = 1.f/stddev;
-        const float normalisedMean = mean/stddev;
-
-        auto NormalisingFunction = [=](float &value) {
-            value = value * stddevInv - normalisedMean;
-        };
-        std::for_each(vec.begin(), vec.end(), NormalisingFunction);
-    }
-}
-
-void Preprocess::_Normalise()
-{
-    Preprocess::_NormaliseVec(this->_m_mfccBuf);
-    Preprocess::_NormaliseVec(this->_m_delta1Buf);
-    Preprocess::_NormaliseVec(this->_m_delta2Buf);
-}
-
-float Preprocess::_GetQuantElem(
-        const float     elem,
-        const float     quantScale,
-        const int       quantOffset,
-        const float     minVal,
-        const float     maxVal)
-{
-    float val = std::round((elem/quantScale) + quantOffset);
-    float maxim = std::max<float>(val, minVal);
-    float returnVal = std::min<float>(std::max<float>(val, minVal), maxVal);
-    return returnVal;
-}
\ No newline at end of file

diff --git a/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp b/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp
index 1b822d6..8b7dd11 100644
--- a/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp
+++ b/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp

@@ -6,21 +6,86 @@
 #include "SpeechRecognitionPipeline.hpp"
 #include "ArmnnNetworkExecutor.hpp"
 
-namespace asr
+namespace asr 
 {
+
 ASRPipeline::ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
-                         std::unique_ptr<Decoder> decoder
-                         ) :
+                         std::unique_ptr<Decoder> decoder, std::unique_ptr<Wav2LetterPreprocessor> preProcessor) :
         m_executor(std::move(executor)),
-        m_decoder(std::move(decoder)){}
+        m_decoder(std::move(decoder)), m_preProcessor(std::move(preProcessor)) {}
 
-IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels)
+int ASRPipeline::getInputSamplesSize() 
 {
-    auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(config.m_ModelFilePath, config.m_backends);
+    return this->m_preProcessor->m_windowLen +
+           ((this->m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) * this->m_preProcessor->m_windowStride);
+}
 
-    auto decoder = std::make_unique<asr::Decoder>(labels);
+// Returns the sliding-window offset stored on the pipeline.
+// Nothing is derived here: the value is whatever was written to
+// SLIDING_WINDOW_OFFSET (CreatePipeline assigns it after construction).
+// Kept as a stored value for now, pending a refactor.
+int ASRPipeline::getSlidingWindowOffset()
+{
+    const auto storedOffset = ASRPipeline::SLIDING_WINDOW_OFFSET;
+    return storedOffset;
+}
 
-    return std::make_unique<asr::ASRPipeline>(std::move(executor), std::move(decoder));
+// Converts one block of raw audio samples into the quantised int8 feature
+// buffer that is fed to the network.
+//
+// audio:   samples for a single inference window.
+// returns: numMfccVectors * numMfccFeatures * 3 quantised values; the factor 3
+//          presumably covers the preprocessor's MFCC, delta1 and delta2 buffers.
+//
+// The number of samples handed to the preprocessor comes from
+// getInputSamplesSize(), so this method and the callers that size the audio
+// window cannot drift apart. A block shorter than that is zero-padded in a
+// local copy, so the preprocessor never reads past the end of `audio`.
+std::vector<int8_t> ASRPipeline::PreProcessing(std::vector<float>& audio)
+{
+    const int samplesPerInference = this->getInputSamplesSize();
+
+    // Default to the caller's buffer; switch to a padded copy only when needed.
+    const float* samples = audio.data();
+    std::vector<float> padded;
+    if (samplesPerInference > 0 && audio.size() < static_cast<size_t>(samplesPerInference))
+    {
+        padded = audio;
+        padded.resize(static_cast<size_t>(samplesPerInference), 0.0f);
+        samples = padded.data();
+    }
+
+    const auto& mfccParams = m_preProcessor->m_mfcc->m_params;
+    int outputBufferSize = mfccParams.m_numMfccVectors * mfccParams.m_numMfccFeatures * 3;
+    std::vector<int8_t> outputBuffer(outputBufferSize);
+
+    // NOTE(review): Invoke() reports success as a bool which is ignored here,
+    // as it was before; confirm whether a failure should be surfaced.
+    m_preProcessor->Invoke(samples, samplesPerInference, outputBuffer, m_executor->GetQuantizationOffset(),
+                           m_executor->GetQuantizationScale());
+    return outputBuffer;
+}
+
+// Builds the speech recognition pipeline for the model named in
+// config.m_ModelName.
+//
+// config: model file path, backend list and model name.
+// labels: index-to-character map handed to the decoder.
+// returns: an owning pointer to the assembled ASRPipeline.
+// throws:  std::invalid_argument when the model name is not "Wav2Letter".
+//
+// For Wav2Letter this creates the MFCC feature extractor, the Arm NN executor,
+// the decoder and the preprocessor, then records the sliding-window offset on
+// the pipeline so callers can query it.
+IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels)
 {
+    // Reject unsupported models up front so the set-up below reads straight through.
+    if (config.m_ModelName != "Wav2Letter")
+    {
+        throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " .");
+    }
+
+    // Wav2Letter ASR SETTINGS
+    const int SAMP_FREQ = 16000;
+    const int FRAME_LEN_MS = 32;
+    // Integer arithmetic (16000 * 32 / 1000 = 512) avoids truncating a double product.
+    const int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS / 1000;
+    const int NUM_MFCC_FEATS = 13;
+    const int MFCC_WINDOW_LEN = 512;
+    const int MFCC_WINDOW_STRIDE = 160;
+    const int NUM_MFCC_VECTORS = 296;
+    const int MEL_LO_FREQ = 0;
+    const int MEL_HI_FREQ = 8000;
+    const int NUM_FBANK_BIN = 128;
+    const int INPUT_WINDOW_LEFT_CONTEXT = 98;
+    const int INPUT_WINDOW_RIGHT_CONTEXT = 98;
+    // MFCC vectors left between the left and right context: 296 - (98 + 98) = 100.
+    const int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
+                                           (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
+    // Offset in samples: 100 inner vectors * 160 samples of stride = 16000.
+    const int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
+
+    MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
+                          MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
+
+    std::unique_ptr<Wav2LetterMFCC> mfccInst = std::make_unique<Wav2LetterMFCC>(mfccParams);
+
+    auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(config.m_ModelFilePath,
+                                                                           config.m_backends);
+
+    auto decoder = std::make_unique<asr::Decoder>(labels);
+
+    // The preprocessor takes ownership of the MFCC instance.
+    auto preprocessor = std::make_unique<Wav2LetterPreprocessor>(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE,
+                                                                 std::move(mfccInst));
+
+    auto ptr = std::make_unique<asr::ASRPipeline>(
+            std::move(executor), std::move(decoder), std::move(preprocessor));
+
+    ptr->SLIDING_WINDOW_OFFSET = SLIDING_WINDOW_OFFSET;
+
+    return ptr;
 }
 
 }// namespace asr
\ No newline at end of file

diff --git a/samples/SpeechRecognition/src/Wav2LetterMFCC.cpp b/samples/SpeechRecognition/src/Wav2LetterMFCC.cpp
new file mode 100644
index 0000000..959bd90
--- /dev/null
+++ b/samples/SpeechRecognition/src/Wav2LetterMFCC.cpp

@@ -0,0 +1,126 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "Wav2LetterMFCC.hpp"
+#include "MathUtils.hpp"
+
+#include <cfloat>
+
+// Applies the mel filter bank to an FFT-derived vector and writes one energy
+// per bank into melEnergies.
+//
+// fftVec:                values the filter weights are multiplied with.
+// melFilterBank:         per-bank weights; melFilterBank[bin][0] pairs with
+//                        fftVec[filterBankFilterFirst[bin]].
+// filterBankFilterFirst: first fftVec index covered by each bank.
+// filterBankFilterLast:  last fftVec index covered by each bank (inclusive).
+// melEnergies:           output; its size defines the number of banks.
+// returns: false (after printing a message) when the container sizes do not
+//          agree with the number of banks, true otherwise.
+//
+// Indices are bounded by fftVec.size() directly, so an empty fftVec or an
+// out-of-range last index leaves the bank at its floor value instead of
+// reading out of bounds.
+bool Wav2LetterMFCC::ApplyMelFilterBank(
+        std::vector<float>&                 fftVec,
+        std::vector<std::vector<float>>&    melFilterBank,
+        std::vector<uint32_t>&               filterBankFilterFirst,
+        std::vector<uint32_t>&               filterBankFilterLast,
+        std::vector<float>&                 melEnergies)
+{
+    const size_t numBanks = melEnergies.size();
+
+    // Every bank needs a first index, a last index and a weight vector.
+    if (numBanks != filterBankFilterFirst.size() ||
+            numBanks != filterBankFilterLast.size() ||
+            numBanks > melFilterBank.size())
+    {
+        printf("Unexpected filter bank lengths\n");
+        return false;
+    }
+
+    const size_t numFftBins = fftVec.size();
+
+    for (size_t bin = 0; bin < numBanks; ++bin)
+    {
+        auto filterBankIter = melFilterBank[bin].begin();
+        auto end = melFilterBank[bin].end();
+        // Avoid log of zero at later stages, same value used in librosa.
+        // The number was used during our default wav2letter model training.
+        float melEnergy = 1e-10;
+        // size_t indices: no wrap-around at the uint32_t limit and no
+        // `size() - 1` underflow when fftVec is empty.
+        const size_t firstIndex = filterBankFilterFirst[bin];
+        const size_t lastIndex = filterBankFilterLast[bin];
+
+        for (size_t i = firstIndex; i <= lastIndex && i < numFftBins && filterBankIter != end; ++i)
+        {
+            melEnergy += (*filterBankIter++ * fftVec[i]);
+        }
+
+        melEnergies[bin] = melEnergy;
+    }
+
+    return true;
+}
+
+// Converts mel energies, in place, to a scaled log10 representation and clamps
+// them to an 80-unit range below the largest resulting value.
+//
+// melEnergies: input energies; overwritten with the scaled, clamped logs.
+void Wav2LetterMFCC::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
+{
+    // Natural logs are taken below, so multiply by log10(e) to get log10 and
+    // by 10, the scalar used for the wav2letter model.
+    constexpr float multiplier = 10.0 *  // Default scalar.
+                                  0.4342944819032518;  // log10f(std::exp(1.0))
+    constexpr float maxDb = 80.0;
+
+    // Natural logarithm of every mel energy.
+    std::vector <float> naturalLogs(melEnergies.size(), 0.f);
+    MathUtils::VecLogarithmF32(melEnergies, naturalLogs);
+
+    // Scale each log value and track the largest result.
+    float peak = -FLT_MAX;
+    const size_t count = std::min(melEnergies.size(), naturalLogs.size());
+    for (size_t idx = 0; idx < count; ++idx)
+    {
+        const float scaled = naturalLogs[idx] * multiplier;
+        melEnergies[idx] = scaled;
+        if (scaled > peak)
+        {
+            peak = scaled;
+        }
+    }
+
+    // Raise anything below (peak - maxDb) up to that floor.
+    const float floorLevel = peak - maxDb;
+    for (auto iter = melEnergies.begin(); iter != melEnergies.end(); ++iter)
+    {
+        if (*iter < floorLevel)
+        {
+            *iter = floorLevel;
+        }
+    }
+}
+
+// Builds the DCT matrix used to turn log mel energies into MFCCs.
+//
+// inputLength:      number of values per row (columns).
+// coefficientCount: number of rows.
+// returns: a row-major vector of coefficientCount * inputLength floats, where
+//          row 0 is the constant 2 * sqrt(1 / (4 * inputLength)) and row k >= 1
+//          holds 2 * sqrt(1 / (2 * inputLength)) * cos((n + 0.5) * k * pi / inputLength).
+//          Returns an empty vector when either dimension is not positive.
+std::vector<float> Wav2LetterMFCC::CreateDCTMatrix(
+                                    const int32_t inputLength,
+                                    const int32_t coefficientCount)
+{
+    // A non-positive dimension would give a bogus allocation size or let the
+    // first-row loop write into an empty vector.
+    if (inputLength <= 0 || coefficientCount <= 0)
+    {
+        return {};
+    }
+
+    // Multiply as size_t so large dimensions cannot overflow int32_t.
+    const size_t rowLength = static_cast<size_t>(inputLength);
+    std::vector<float> dctMatrix(rowLength * static_cast<size_t>(coefficientCount));
+
+    // Orthonormal normalization.
+    const float normalizerK0 = 2 * sqrtf(1.0f /
+                                    static_cast<float>(4 * inputLength));
+    const float normalizer = 2 * sqrtf(1.0f /
+                                    static_cast<float>(2 * inputLength));
+
+    const float angleIncr = M_PI / inputLength;
+    float angle = angleIncr;  // First used by the k = 1 row.
+
+    // First row of DCT will use normalizer K0.
+    for (size_t n = 0; n < rowLength; ++n)
+    {
+        dctMatrix[n] = normalizerK0;  // cos(0) = 1
+    }
+
+    // Second row (index = 1) onwards, we use standard normalizer.
+    for (int32_t k = 1; k < coefficientCount; ++k)
+    {
+        const size_t rowStart = static_cast<size_t>(k) * rowLength;
+        for (int32_t n = 0; n < inputLength; ++n)
+        {
+            dctMatrix[rowStart + n] = normalizer * cosf((n + 0.5f) * angle);
+        }
+        // Accumulated exactly as before so the coefficients stay bit-identical.
+        angle += angleIncr;
+    }
+    return dctMatrix;
+}
+
+// Slaney normalization for mel weights: returns 2 divided by the difference
+// between the inverse-mel-scaled right and left edges of a filter.
+//
+// leftMel / rightMel: edges of the filter on the mel scale.
+// useHTKMethod:       forwarded unchanged to MFCC::InverseMelScale.
+float Wav2LetterMFCC::GetMelFilterBankNormaliser(
+                                const float&    leftMel,
+                                const float&    rightMel,
+                                const bool      useHTKMethod)
+{
+    const auto upperEdge = MFCC::InverseMelScale(rightMel, useHTKMethod);
+    const auto lowerEdge = MFCC::InverseMelScale(leftMel, useHTKMethod);
+    return (2.0f / (upperEdge - lowerEdge));
+}

diff --git a/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp b/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp
new file mode 100644
index 0000000..9329d5e
--- /dev/null
+++ b/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp

@@ -0,0 +1,187 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "MathUtils.hpp"
+#include <cstring>
+#include <cmath>
+#include <numeric>
+#include <algorithm>
+#include <memory>
+#include "Wav2LetterPreprocessor.hpp"
+#include "Wav2LetterMFCC.hpp"
+
+/**
+ * Mean over every element of a 2D buffer, delegated to MathUtils::MeanF32.
+ *
+ * @param[in]   vec   Buffer whose elements are averaged.
+ * @return      Value returned by MathUtils::MeanF32 for the whole buffer.
+ */
+float Wav2LetterPreprocessor::GetMean(Array2d<float>& vec)
+{
+    const auto elementCount = vec.totalSize();
+    auto firstElement = vec.begin();
+    return MathUtils::MeanF32(firstElement, elementCount);
+}
+
+/**
+ * Standard deviation over every element of a 2D buffer, delegated to
+ * MathUtils::StdDevF32.
+ *
+ * @param[in]   vec    Buffer whose elements are measured.
+ * @param[in]   mean   Pre-computed mean of the same buffer.
+ * @return      Value returned by MathUtils::StdDevF32 for the whole buffer.
+ */
+float Wav2LetterPreprocessor::GetStdDev(Array2d<float>& vec, const float mean)
+{
+    const auto elementCount = vec.totalSize();
+    auto firstElement = vec.begin();
+    return MathUtils::StdDevF32(firstElement, elementCount, mean);
+}
+
+/**
+ * Normalises a buffer in place to zero mean and unit standard deviation.
+ * A buffer whose standard deviation is exactly zero is filled with zeros.
+ *
+ * @param[in,out]   vec   Buffer to normalise.
+ */
+void Wav2LetterPreprocessor::NormaliseVec(Array2d<float>& vec)
+{
+    const auto mean = Wav2LetterPreprocessor::GetMean(vec);
+    const auto stddev = Wav2LetterPreprocessor::GetStdDev(vec, mean);
+
+    // Constant input: dividing by stddev is impossible, so zero the buffer.
+    if (stddev == 0)
+    {
+        std::fill(vec.begin(), vec.end(), 0);
+        return;
+    }
+
+    // (x - mean) / stddev, rearranged as x * (1/stddev) - mean/stddev so the
+    // per-element work is one multiply and one subtract.
+    const float invStdDev = 1.f/stddev;
+    const float scaledMean = mean/stddev;
+
+    std::for_each(vec.begin(), vec.end(),
+                  [invStdDev, scaledMean](float& sample) {
+                      sample = sample * invStdDev - scaledMean;
+                  });
+}
+
+/**
+ * Normalises the MFCC buffer and both delta buffers, each independently and
+ * in that order.
+ */
+void Wav2LetterPreprocessor::Normalise()
+{
+    Array2d<float>* const featureBuffers[] = {
+        &this->m_mfccBuf,
+        &this->m_delta1Buf,
+        &this->m_delta2Buf
+    };
+
+    for (Array2d<float>* buffer : featureBuffers)
+    {
+        Wav2LetterPreprocessor::NormaliseVec(*buffer);
+    }
+}
+
+/**
+ * Quantises a single value: round(elem / quantScale + quantOffset), clamped
+ * to [minVal, maxVal].
+ *
+ * @param[in]   elem          Value to quantise.
+ * @param[in]   quantScale    Quantisation scale (divisor).
+ * @param[in]   quantOffset   Quantisation offset added after scaling.
+ * @param[in]   minVal        Lower clamp bound.
+ * @param[in]   maxVal        Upper clamp bound.
+ * @return      The rounded, clamped value (still held as a float).
+ */
+float Wav2LetterPreprocessor::GetQuantElem(
+        const float     elem,
+        const float     quantScale,
+        const int       quantOffset,
+        const float     minVal,
+        const float     maxVal)
+{
+    float quantised = std::round((elem/quantScale) + quantOffset);
+
+    // Clamp from below first, then from above, mirroring min(max(v, lo), hi).
+    if (quantised < minVal)
+    {
+        quantised = minVal;
+    }
+    if (maxVal < quantised)
+    {
+        quantised = maxVal;
+    }
+    return quantised;
+}
+
+/**
+ * Runs the full pre-processing pipeline on a block of audio: MFCC extraction
+ * per sliding window, padding, delta computation, normalisation and
+ * quantisation into the output buffer.
+ *
+ * @param[in]    audioData      Pointer to the audio samples.
+ * @param[in]    audioDataLen   Number of samples available at audioData.
+ * @param[out]   output         Destination for the quantised features; its
+ *                              storage is handed to Quantise via data().
+ *                              NOTE(review): the caller presumably has to
+ *                              size it for all three feature buffers - confirm
+ *                              against Quantise.
+ * @param[in]    quantOffset    Quantisation offset.
+ * @param[in]    quantScale     Quantisation scale.
+ * @return       false if the audio did not yield a single MFCC vector,
+ *               otherwise the result of Quantise.
+ */
+bool Wav2LetterPreprocessor::Invoke(const float*  audioData, const uint32_t  audioDataLen, std::vector<int8_t>& output,
+                                     int quantOffset, float quantScale)
+{
+    this->m_window = SlidingWindow<const float>(
+            audioData, audioDataLen,
+            this->m_windowLen, this->m_windowStride);
+
+    const auto numMfccVectors =
+            static_cast<uint32_t>(this->m_mfcc->m_params.m_numMfccVectors);
+    uint32_t mfccBufIdx = 0;
+
+    // Init buffers with 0
+    std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f);
+    std::fill(m_delta1Buf.begin(), m_delta1Buf.end(), 0.f);
+    std::fill(m_delta2Buf.begin(), m_delta2Buf.end(), 0.f);
+
+    // While we can slide over the window. Stop once the buffer is full so
+    // that surplus audio cannot write past the last MFCC column.
+    while (mfccBufIdx < numMfccVectors && this->m_window.HasNext())
+    {
+        const float* mfccWindow = this->m_window.Next();
+        auto mfccAudioData = std::vector<float>(
+                mfccWindow,
+                mfccWindow + this->m_windowLen);
+
+        auto mfcc = this->m_mfcc->MfccCompute(mfccAudioData);
+        for (size_t i = 0; i < this->m_mfccBuf.size(0); ++i)
+        {
+            this->m_mfccBuf(i, mfccBufIdx) = mfcc[i];
+        }
+        ++mfccBufIdx;
+    }
+
+    // Padding repeats the previous vector, so at least one must exist;
+    // otherwise `mfccBufIdx - 1` would wrap around and read out of bounds.
+    if (mfccBufIdx == 0 && numMfccVectors != 0)
+    {
+        return false;
+    }
+
+    // Pad MFCC if needed by repeating last feature vector
+    while (mfccBufIdx < numMfccVectors)
+    {
+        memcpy(&this->m_mfccBuf(0, mfccBufIdx),
+               &this->m_mfccBuf(0, mfccBufIdx - 1), sizeof(float) * this->m_mfcc->m_params.m_numMfccFeatures);
+        ++mfccBufIdx;
+    }
+
+    // Compute first and second order deltas from MFCCs
+    Wav2LetterPreprocessor::ComputeDeltas(this->m_mfccBuf,
+                        this->m_delta1Buf,
+                        this->m_delta2Buf);
+
+    // Normalise
+    this->Normalise();
+
+    return this->Quantise<int8_t>(output.data(), quantOffset, quantScale);
+}
+
+/**
+ * Computes first and second order time derivatives of the MFCC features by
+ * convolving each feature row with a fixed 9-tap kernel ("valid" padding).
+ * Positions within fMidIdx of either end of the time axis are left untouched
+ * in delta1/delta2.
+ *
+ * @param[in]    mfcc     MFCC buffer indexed as (feature, time).
+ * @param[out]   delta1   Receives the first order deltas.
+ * @param[out]   delta2   Receives the second order deltas.
+ * @return       false if any buffer is empty or the two delta buffers differ
+ *               in their first dimension, true otherwise.
+ */
+bool Wav2LetterPreprocessor::ComputeDeltas(Array2d<float>& mfcc,
+                                           Array2d<float>& delta1,
+                                           Array2d<float>& delta2)
+{
+    const std::vector <float> delta1Coeffs =
+            {6.66666667e-02,  5.00000000e-02,  3.33333333e-02,
+             1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
+             -3.33333333e-02, -5.00000000e-02, -6.66666667e-02};
+
+    const std::vector <float> delta2Coeffs =
+            {0.06060606,      0.01515152,     -0.01731602,
+             -0.03679654,     -0.04329004,     -0.03679654,
+             -0.01731602,      0.01515152,      0.06060606};
+
+    if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) ||
+        mfcc.size(0) == 0 || mfcc.size(1) == 0)
+    {
+        return false;
+    }
+
+    // Get the middle index; coeff vec len should always be odd
+    const size_t coeffLen = delta1Coeffs.size();
+    const size_t fMidIdx = (coeffLen - 1)/2;
+    const size_t numFeatures = mfcc.size(0);
+    const size_t numFeatVectors = mfcc.size(1);
+
+    // With fewer time samples than kernel taps a "valid" convolution has no
+    // output positions. Bail out here: for numFeatVectors < fMidIdx the
+    // unsigned subtraction below would otherwise wrap and run out of bounds.
+    if (numFeatVectors < coeffLen)
+    {
+        return true;
+    }
+    const size_t lastValidIdx = numFeatVectors - fMidIdx;
+
+    // iterate through features in MFCC vector
+    for (size_t i = 0; i < numFeatures; ++i)
+    {
+        /* for each feature, iterate through time (t) samples representing feature evolution and
+        * calculate d/dt and d^2/dt^2, using 1d convolution with differential kernels.
+        * Convolution padding = valid, result size is `time length - kernel length + 1`.
+        * The result is padded with 0 from both sides to match the size of initial time samples data.
+        *
+        * For the small filter, conv1d implementation as a simple loop is efficient enough.
+        * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32.
+        */
+
+        for (size_t j = fMidIdx; j < lastValidIdx; ++j)
+        {
+            float d1 = 0;
+            float d2 = 0;
+            const size_t mfccStIdx = j - fMidIdx;
+
+            // k walks the samples forwards while m walks the kernel backwards.
+            for (size_t k = 0, m = coeffLen - 1; k < coeffLen; ++k, --m)
+            {
+
+                d1 +=  mfcc(i,mfccStIdx + k) * delta1Coeffs[m];
+                d2 +=  mfcc(i,mfccStIdx + k) * delta2Coeffs[m];
+            }
+
+            delta1(i,j) = d1;
+            delta2(i,j) = d2;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Takes ownership of the MFCC instance and sizes the MFCC and delta buffers
+ * as (number of MFCC features) x (number of MFCC vectors).
+ *
+ * NOTE(review): the buffer initialisers dereference m_mfcc, so this relies on
+ * m_mfcc being declared before the buffers in the class and on mfccInst being
+ * non-null - confirm against the header.
+ *
+ * @param[in]   windowLen      Length of one sliding window.
+ * @param[in]   windowStride   Stride between consecutive windows.
+ * @param[in]   mfccInst       MFCC feature extractor to take ownership of.
+ */
+Wav2LetterPreprocessor::Wav2LetterPreprocessor(const uint32_t  windowLen,
+                                               const uint32_t  windowStride,
+                                               std::unique_ptr<Wav2LetterMFCC> mfccInst):
+    m_mfcc(std::move(mfccInst)),
+    m_mfccBuf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors),
+    m_delta1Buf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors),
+    m_delta2Buf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors),
+    m_windowLen(windowLen),
+    m_windowStride(windowStride)
+{
+    // Only initialise the MFCC instance when there is something to compute.
+    const bool hasFeatures = m_mfcc->m_params.m_numMfccFeatures > 0;
+    const bool hasWindow = windowLen > 0;
+    if (hasFeatures && hasWindow)
+    {
+        m_mfcc->Init();
+    }
+
+    // Start from a zeroed MFCC buffer.
+    std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f);
+}
\ No newline at end of file

diff --git a/samples/SpeechRecognition/test/MFCCTest.cpp b/samples/SpeechRecognition/test/MFCCTest.cpp
index 2a55264..62a92fd 100644
--- a/samples/SpeechRecognition/test/MFCCTest.cpp
+++ b/samples/SpeechRecognition/test/MFCCTest.cpp

@@ -6,9 +6,10 @@
 #include <catch.hpp>
 #include <limits>
 
-#include "MFCC.hpp"
+#include "Wav2LetterMFCC.hpp"
 
-const std::vector<float> testWav = std::vector<float>{
+const std::vector<float> testWav = std::vector<float>
+{
     -3.0f, 0.0f, 1.0f, -1.0f, 2.0f, 3.0f, -2.0f, 2.0f,
             1.0f, -2.0f, 0.0f, 3.0f, -1.0f, 8.0f, 3.0f, 2.0f,
             -1.0f, -1.0f, 2.0f, 7.0f, 3.0f, 5.0f, 6.0f, 6.0f,
@@ -84,15 +85,16 @@
 
     std::vector<float> fullAudioData;
 
-        for (auto f : testWav)
-            {
-                fullAudioData.emplace_back( f / (1<<15));
-            }
+    for (auto f : testWav)
+    {
+        fullAudioData.emplace_back( f / (1<<15));
+    }
 
+    MfccParams mfccParams(sampFreq, 128, 0, 8000, numMfccFeats,
+                          frameLenSamples, false, 1);
 
-    MfccParams mfccParams(sampFreq, 128, 0, 8000, numMfccFeats, frameLenSamples, false, 1);
-
-    MFCC mfccInst = MFCC(mfccParams);
+    Wav2LetterMFCC mfccInst = Wav2LetterMFCC(mfccParams);
+    mfccInst.Init();
     auto mfccOutput = mfccInst.MfccCompute(fullAudioData);
 
     std::vector<float> expected = { -834.96564f, 21.02699f, 18.62856f, 7.3412f, 18.90791f, -5.36034f, 6.52351f,

diff --git a/samples/SpeechRecognition/test/PreprocessTest.cpp b/samples/SpeechRecognition/test/PreprocessTest.cpp
index 2b98831..f112747 100644
--- a/samples/SpeechRecognition/test/PreprocessTest.cpp
+++ b/samples/SpeechRecognition/test/PreprocessTest.cpp

@@ -6,8 +6,8 @@
 #include <catch.hpp>
 #include <limits>
 
-#include "Preprocess.hpp"
 #include "DataStructures.hpp"
+#include "Wav2LetterPreprocessor.hpp"
 
 void PopulateTestWavVector(std::vector<int16_t>& vec)
 {
@@ -51,9 +51,10 @@
     /* Populate with dummy input */
     PopulateTestWavVector(testWav1);
 
-    MfccParams mfccParams(sampFreq, 128, 0, 8000, numMfccFeats, frameLenSamples, false, numMfccVectors);
+    MfccParams mfccParams(sampFreq, 128, 0, 8000, numMfccFeats,
+                          frameLenSamples, false, numMfccVectors);
 
-    MFCC mfccInst = MFCC(mfccParams);
+    std::unique_ptr<Wav2LetterMFCC> mfccInst = std::make_unique<Wav2LetterMFCC>(mfccParams);
 
     std::vector<float> fullAudioData;
 
@@ -65,7 +66,7 @@
         }
     }
 
-    Preprocess prep(frameLenSamples, windowStride, mfccInst);
+    Wav2LetterPreprocessor prep(frameLenSamples, windowStride, std::move(mfccInst));
 
     std::vector<int8_t> outputBuffer(outputBufferSize);
 

diff --git a/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp b/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp
index 96cc1d0..9f1ef54 100644
--- a/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp
+++ b/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp

@@ -72,6 +72,10 @@
 
     int GetQuantizationOffset();
 
+    float GetOutputQuantizationScale(int tensorIndex);
+
+    int GetOutputQuantizationOffset(int tensorIndex);
+
     /**
     * @brief Runs inference on the provided input data, and stores the results in the provided InferenceResults object.
     *
@@ -203,6 +207,20 @@
 }
 
 template <class Tout>
+float ArmnnNetworkExecutor<Tout>::GetOutputQuantizationScale(int tensorIndex)
+{
+    // Returns the quantization scale of the output tensor at tensorIndex.
+    // The index is only validated by this assert, which is compiled out
+    // when NDEBUG is defined.
+    assert(this->m_outputLayerNamesList.size() > tensorIndex);
+    const auto& outputBinding = this->m_outputBindingInfo[tensorIndex];
+    return outputBinding.second.GetQuantizationScale();
+}
+
+template <class Tout>
+int ArmnnNetworkExecutor<Tout>::GetOutputQuantizationOffset(int tensorIndex)
+{
+    // Returns the quantization offset of the output tensor at tensorIndex.
+    // The index is only validated by this assert, which is compiled out
+    // when NDEBUG is defined.
+    assert(this->m_outputLayerNamesList.size() > tensorIndex);
+    const auto& outputBinding = this->m_outputBindingInfo[tensorIndex];
+    return outputBinding.second.GetQuantizationOffset();
+}
+
+template <class Tout>
 Size ArmnnNetworkExecutor<Tout>::GetImageAspectRatio()
 {
     const auto shape = m_inputBindingInfo.second.GetShape();

diff --git a/samples/SpeechRecognition/include/AudioCapture.hpp b/samples/common/include/Audio/AudioCapture.hpp
similarity index 81%
rename from samples/SpeechRecognition/include/AudioCapture.hpp
rename to samples/common/include/Audio/AudioCapture.hpp
index 90c2ecc..898bf91 100644
--- a/samples/SpeechRecognition/include/AudioCapture.hpp
+++ b/samples/common/include/Audio/AudioCapture.hpp

@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -7,16 +7,13 @@
 
 #include <string>
 #include <iostream>
-
-#include <math.h>
-
+#include <cmath>
 #include <vector>
-
 #include <exception>
 
 #include "SlidingWindow.hpp"
 
-namespace asr
+namespace audio
 {
 
 /**
@@ -29,19 +26,17 @@
     public:
 
         SlidingWindow<const float> m_window;
-        int lastReadIdx= 0;
 
         /**
         * @brief Default constructor
         */
-        AudioCapture()
-        {};
+        AudioCapture() = default;
 
         /**
         * @brief Function to load the audio data captured from the
          * input file to memory.
         */
-        std::vector<float> LoadAudioFile(std::string filePath);
+        static std::vector<float> LoadAudioFile(std::string filePath);
 
         /**
         * @brief Function to initialize the sliding window. This will set its position in memory, its
@@ -59,4 +54,4 @@
         */
         std::vector<float> Next();
     };
-} // namespace asr
\ No newline at end of file
+} // namespace audio
\ No newline at end of file

diff --git a/samples/SpeechRecognition/include/DataStructures.hpp b/samples/common/include/Audio/DataStructures.hpp
similarity index 100%
rename from samples/SpeechRecognition/include/DataStructures.hpp
rename to samples/common/include/Audio/DataStructures.hpp


diff --git a/samples/common/include/Audio/MFCC.hpp b/samples/common/include/Audio/MFCC.hpp
new file mode 100644
index 0000000..468bf92
--- /dev/null
+++ b/samples/common/include/Audio/MFCC.hpp

@@ -0,0 +1,234 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+
+#include <vector>
+#include <cstdint>
+#include <cmath>
+#include <limits>
+#include <string>
+
+/*
+ * MFCC's consolidated parameters.
+ * Plain bundle of the configuration values read by the MFCC classes and the
+ * preprocessor. All members are public and set once by the constructor.
+ */
+class MfccParams
+{
+public:
+    float       m_samplingFreq;     /* Sampling frequency (presumably Hz - confirm with callers). */
+    int         m_numFbankBins;     /* Number of Mel filter banks. */
+    float       m_melLoFreq;        /* Mel frequency limit (low). */
+    float       m_melHiFreq;        /* Mel frequency limit (high). */
+    int         m_numMfccFeatures;  /* Number of MFCC features per vector. */
+    int         m_frameLen;         /* Frame length (presumably in samples - confirm with callers). */
+    int         m_frameLenPadded;   /* Frame length padded up to a power of two. */
+    bool        m_useHtkMethod;     /* Whether the HTK method is used for the Mel scale. */
+    int         m_numMfccVectors;   /* Number of MFCC vectors (time steps) per inference. */
+    /**
+     * @brief       Constructor. m_frameLenPadded is derived from frameLen;
+     *              every other member is copied from its argument.
+     * @param[in]   samplingFreq     Sampling frequency.
+     * @param[in]   numFbankBins     Number of Mel filter banks.
+     * @param[in]   melLoFreq        Mel frequency limit (low).
+     * @param[in]   melHiFreq        Mel frequency limit (high).
+     * @param[in]   numMfccFeats     Number of MFCC features.
+     * @param[in]   frameLen         Frame length.
+     * @param[in]   useHtkMethod     Whether to use the HTK Mel scale.
+     * @param[in]   numMfccVectors   Number of MFCC vectors.
+     */
+    MfccParams(const float samplingFreq, const int numFbankBins,
+               const float melLoFreq, const float melHiFreq,
+               const int numMfccFeats, const int frameLen,
+               const bool useHtkMethod, const int numMfccVectors);
+    /* Delete the default constructor */
+    MfccParams()  = delete;
+    /* Default destructor */
+    ~MfccParams() = default;
+    /** @brief  String representation of parameters */
+    std::string Str();
+};
+
+/**
+ * @brief   Class for MFCC feature extraction.
+ *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
+ *          This class is designed to be generic and self-sufficient but
+ *          certain calculation routines can be overridden to accommodate
+ *          use-case specific requirements.
+ */
+class MFCC {
+public:
+    /**
+     * @brief       Constructor
+     * @param[in]   params   MFCC parameters
+    */
+    explicit MFCC(const MfccParams& params);
+
+    MFCC() = delete;
+
+    ~MFCC() = default;
+
+    /**
+    * @brief        Extract MFCC  features for one single small frame of
+    *               audio data e.g. 640 samples.
+    * @param[in]    audioData   Vector of audio samples to calculate
+    *                           features for.
+    * @return       Vector of extracted MFCC features.
+    **/
+    std::vector<float> MfccCompute(const std::vector<float>& audioData);
+
+    /** @brief  Initialise. */
+    void Init();
+
+   /**
+    * @brief        Extract MFCC features and quantise for one single small
+    *               frame of audio data e.g. 640 samples.
+    * @tparam       T             Arithmetic type of the quantised output;
+    *                             results are clamped to its representable
+    *                             range.
+    * @param[in]    audioData     Vector of audio samples to calculate
+    *                             features for.
+    * @param[in]    quantScale    Quantisation scale.
+    * @param[in]    quantOffset   Quantisation offset.
+    * @return       Vector of extracted quantised MFCC features.
+    **/
+    template<typename T>
+    std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
+                                    const float quantScale,
+                                    const int quantOffset)
+    {
+        this->MfccComputePreFeature(audioData);
+        /* lowest() rather than min(): for floating point T, min() is the
+         * smallest positive value and would clamp every negative result.
+         * For integral T the two are identical. */
+        float minVal = std::numeric_limits<T>::lowest();
+        float maxVal = std::numeric_limits<T>::max();
+
+        std::vector<T> mfccOut(this->m_params.m_numMfccFeatures);
+        const size_t numFbankBins = this->m_params.m_numFbankBins;
+
+        /* Take DCT. Uses matrix mul. j is the flat offset of DCT row i. */
+        for (size_t i = 0, j = 0; i < mfccOut.size(); ++i, j += numFbankBins)
+        {
+            float sum = 0;
+            for (size_t k = 0; k < numFbankBins; ++k)
+            {
+                sum += this->m_dctMatrix[j + k] * this->m_melEnergies[k];
+            }
+            /* Quantize to T. */
+            sum = std::round((sum / quantScale) + quantOffset);
+            mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
+        }
+
+        return mfccOut;
+    }
+
+    MfccParams m_params;
+
+    /* Constants */
+    static constexpr float ms_logStep = /*logf(6.4)*/ 1.8562979903656 / 27.0;
+    static constexpr float ms_freqStep = 200.0 / 3;
+    static constexpr float ms_minLogHz = 1000.0;
+    static constexpr float ms_minLogMel = ms_minLogHz / ms_freqStep;
+
+protected:
+    /**
+     * @brief       Project input frequency to Mel Scale.
+     * @param[in]   freq           Input frequency in floating point.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Mel transformed frequency in floating point.
+     **/
+    static float MelScale(float freq,
+                          bool  useHTKMethod = true);
+
+    /**
+     * @brief       Inverse Mel transform - convert MEL warped frequency
+     *              back to normal frequency.
+     * @param[in]   melFreq        Mel frequency in floating point.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Real world frequency in floating point.
+     **/
+    static float InverseMelScale(float melFreq,
+                                 bool  useHTKMethod = true);
+
+    /**
+     * @brief       Populates MEL energies after applying the MEL filter
+     *              bank weights and adding them up to be placed into
+     *              bins, according to the filter bank's first and last
+     *              indices (pre-computed for each filter bank element
+     *              by CreateMelFilterBank function).
+     * @param[in]   fftVec                  Vector populated with FFT magnitudes.
+     * @param[in]   melFilterBank           2D Vector with filter bank weights.
+     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
+     *                                      to be used for each bin.
+     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
+     *                                      to be used for each bin.
+     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
+     *                                      populated.
+     * @return      true if successful, false otherwise.
+     */
+    virtual bool ApplyMelFilterBank(
+        std::vector<float>&                 fftVec,
+        std::vector<std::vector<float>>&    melFilterBank,
+        std::vector<uint32_t>&              filterBankFilterFirst,
+        std::vector<uint32_t>&              filterBankFilterLast,
+        std::vector<float>&                 melEnergies);
+
+    /**
+     * @brief           Converts the Mel energies for logarithmic scale.
+     * @param[in,out]   melEnergies   1D vector of Mel energies.
+     **/
+    virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
+
+    /**
+     * @brief       Create a matrix used to calculate Discrete Cosine
+     *              Transform.
+     * @param[in]   inputLength        Input length of the buffer on which
+     *                                 DCT will be performed.
+     * @param[in]   coefficientCount   Total coefficients per input length.
+     * @return      1D vector with inputLength x coefficientCount elements
+     *              populated with DCT coefficients.
+     */
+    virtual std::vector<float> CreateDCTMatrix(
+                                int32_t inputLength,
+                                int32_t coefficientCount);
+
+    /**
+     * @brief       Given the low and high Mel values, get the normaliser
+     *              for weights to be applied when populating the filter
+     *              bank.
+     * @param[in]   leftMel        Low Mel frequency value.
+     * @param[in]   rightMel       High Mel frequency value.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Value to use for normalizing.
+     */
+    virtual float GetMelFilterBankNormaliser(
+                    const float&   leftMel,
+                    const float&   rightMel,
+                    bool     useHTKMethod);
+
+private:
+
+    std::vector<float>              m_frame;
+    std::vector<float>              m_buffer;
+    std::vector<float>              m_melEnergies;
+    std::vector<float>              m_windowFunc;
+    std::vector<std::vector<float>> m_melFilterBank;
+    std::vector<float>              m_dctMatrix;
+    std::vector<uint32_t>           m_filterBankFilterFirst;
+    std::vector<uint32_t>           m_filterBankFilterLast;
+    bool                            m_filterBankInitialised;
+
+    /**
+     * @brief       Initialises the filter banks and the DCT matrix. **/
+    void InitMelFilterBank();
+
+    /**
+     * @brief       Signals whether the instance of MFCC has had its
+     *              required buffers initialised.
+     * @return      true if initialised, false otherwise.
+     **/
+    bool IsMelFilterBankInited() const;
+
+    /**
+     * @brief       Create mel filter banks for MFCC calculation.
+     * @return      2D vector of floats.
+     **/
+    std::vector<std::vector<float>> CreateMelFilterBank();
+
+    /**
+     * @brief       Computes and populates internal member buffers used
+     *              in MFCC feature calculation
+     * @param[in]   audioData   1D vector of floating point audio samples.
+     */
+    void MfccComputePreFeature(const std::vector<float>& audioData);
+
+    /** @brief       Computes the magnitude from an interleaved complex array. */
+    void ConvertToPowerSpectrum();
+
+};

diff --git a/samples/SpeechRecognition/include/MathUtils.hpp b/samples/common/include/Audio/MathUtils.hpp
similarity index 85%
rename from samples/SpeechRecognition/include/MathUtils.hpp
rename to samples/common/include/Audio/MathUtils.hpp
index 5f81fb6..1d8b0d3 100644
--- a/samples/SpeechRecognition/include/MathUtils.hpp
+++ b/samples/common/include/Audio/MathUtils.hpp

@@ -35,8 +35,8 @@
      * @param[in]   srcLen      Number of elements in the array/vector
      * @return      dot product
      */
-    static float DotProductF32(float* srcPtrA, float* srcPtrB,
-                               const int srcLen);
+    static float DotProductF32(const float* srcPtrA, float* srcPtrB,
+                               int srcLen);
 
     /**
      * @brief       Computes the squared magnitude of floating point
@@ -48,10 +48,10 @@
      * @param[in]   dstLen      output buffer len (for sanity check only)
      * @return      true if successful, false otherwise
      */
-    static bool ComplexMagnitudeSquaredF32(float* ptrSrc,
-                                           const int srcLen,
+    static bool ComplexMagnitudeSquaredF32(const float* ptrSrc,
+                                           int srcLen,
                                            float* ptrDst,
-                                           const int dstLen);
+                                           int dstLen);
 
     /**
          * @brief       Computes the natural logarithms of input floating point
@@ -70,7 +70,7 @@
          * @param[in]   srcLen  Number of elements in the array/vector
          * @return      average value
          */
-    static float MeanF32(float* ptrSrc, const uint32_t srcLen);
+    static float MeanF32(const float* ptrSrc, uint32_t srcLen);
 
     /**
      * @brief       Gets the standard deviation of a floating point array
@@ -80,6 +80,6 @@
      * @param[in]   mean    pre-computed mean value
      * @return      standard deviation value
      */
-    static float StdDevF32(float* ptrSrc, const uint32_t srcLen,
-                           const float mean);
+    static float StdDevF32(const float* ptrSrc, uint32_t srcLen,
+                           float mean);
 };

diff --git a/samples/SpeechRecognition/include/SlidingWindow.hpp b/samples/common/include/Audio/SlidingWindow.hpp
similarity index 97%
rename from samples/SpeechRecognition/include/SlidingWindow.hpp
rename to samples/common/include/Audio/SlidingWindow.hpp
index 791a0b7..77498c6 100644
--- a/samples/SpeechRecognition/include/SlidingWindow.hpp
+++ b/samples/common/include/Audio/SlidingWindow.hpp

@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 

diff --git a/samples/SpeechRecognition/src/AudioCapture.cpp b/samples/common/src/Audio/AudioCapture.cpp
similarity index 71%
rename from samples/SpeechRecognition/src/AudioCapture.cpp
rename to samples/common/src/Audio/AudioCapture.cpp
index f3b9092..920d7a5 100644
--- a/samples/SpeechRecognition/src/AudioCapture.cpp
+++ b/samples/common/src/Audio/AudioCapture.cpp

@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -8,12 +8,12 @@
 #include <sndfile.h>
 #include <samplerate.h>
 
-namespace asr
+namespace audio
 {
     std::vector<float> AudioCapture::LoadAudioFile(std::string filePath)
     {
         SF_INFO inputSoundFileInfo;
-        SNDFILE* infile = NULL;
+        SNDFILE* infile = nullptr;
         infile = sf_open(filePath.c_str(), SFM_READ, &inputSoundFileInfo);
 
         float audioIn[inputSoundFileInfo.channels * inputSoundFileInfo.frames];
@@ -21,14 +21,12 @@
 
         float sampleRate = 16000.0f;
         float srcRatio = sampleRate / (float)inputSoundFileInfo.samplerate;
-        int outputFrames = ceil(inputSoundFileInfo.frames * srcRatio);
-        float dataOut[outputFrames];
+        int outputFrames = ceilf(inputSoundFileInfo.frames * srcRatio);
 
         // Convert to mono
-        float monoData[inputSoundFileInfo.frames];
+        std::vector<float> monoData(inputSoundFileInfo.frames);
         for(int i = 0; i < inputSoundFileInfo.frames; i++)
         {
-            float val = 0.0f;
             for(int j = 0; j < inputSoundFileInfo.channels; j++)
                 monoData[i] += audioIn[i * inputSoundFileInfo.channels + j];
             monoData[i] /= inputSoundFileInfo.channels;
@@ -36,25 +34,20 @@
 
         // Resample
         SRC_DATA srcData;
-        srcData.data_in = monoData;
+        srcData.data_in = monoData.data();
         srcData.input_frames = inputSoundFileInfo.frames;
-        srcData.data_out = dataOut;
+
+        std::vector<float> dataOut(outputFrames);
+        srcData.data_out = dataOut.data();
+
         srcData.output_frames = outputFrames;
         srcData.src_ratio = srcRatio;
 
         src_simple(&srcData, SRC_SINC_BEST_QUALITY, 1);
 
-        // Convert to Vector
-        std::vector<float> processedInput;
-
-        for(int i = 0; i < srcData.output_frames_gen; ++i)
-        {
-            processedInput.push_back(srcData.data_out[i]);
-        }
-
         sf_close(infile);
 
-        return processedInput;
+        return dataOut;
     }
 
     void AudioCapture::InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride)
@@ -78,21 +71,21 @@
 
             if(remainingData < windowSize)
             {
-                std::vector<float> mfccAudioData(windowSize, 0.0f);
+                std::vector<float> audioData(windowSize, 0.0f);
                 for(int i = 0; i < remainingData; ++i)
                 {
-                    mfccAudioData[i] = *windowData;
+                    audioData[i] = *windowData;
                     if(i < remainingData - 1)
                     {
                         ++windowData;
                     }
                 }
-                return mfccAudioData;
+                return audioData;
             }
             else
             {
-                std::vector<float> mfccAudioData(windowData,  windowData + windowSize);
-                return mfccAudioData;
+                std::vector<float> audioData(windowData, windowData + windowSize);
+                return audioData;
             }
         }
         else
@@ -100,5 +93,4 @@
             throw std::out_of_range("Error, end of audio data reached.");
         }
     }
-} //namespace asr
-
+} // namespace audio
\ No newline at end of file

diff --git a/samples/common/src/Audio/MFCC.cpp b/samples/common/src/Audio/MFCC.cpp
new file mode 100644
index 0000000..911c32b
--- /dev/null
+++ b/samples/common/src/Audio/MFCC.cpp

@@ -0,0 +1,354 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "MFCC.hpp"
+#include "MathUtils.hpp"
+
+#include <cfloat>
+#include <cinttypes>
+#include <cstring>
+
+namespace
+{
+    /* Returns the smallest power of two that is >= value, using integer
+     * arithmetic only so the result does not depend on floating point
+     * rounding of log()/pow(). Non-positive input yields 0. The result is
+     * capped at 2^30 so that doubling cannot overflow (assumes int is at
+     * least 32 bits wide). */
+    int SmallestPowerOfTwoAtLeast(const int value)
+    {
+        if (value <= 0)
+        {
+            return 0;
+        }
+
+        int result = 1;
+        while (result < value && result < (1 << 30))
+        {
+            result *= 2;
+        }
+        return result;
+    }
+} /* anonymous namespace */
+
+/**
+ * Stores the MFCC configuration and derives the padded frame length.
+ *
+ * @param samplingFreq   sampling frequency of the audio
+ * @param numFbankBins   number of mel filter bank bins
+ * @param melLoFreq      lower frequency limit of the mel filter bank
+ * @param melHiFreq      upper frequency limit of the mel filter bank
+ * @param numMfccFeats   number of MFCC features
+ * @param frameLen       frame length in samples
+ * @param useHtkMethod   selects the HTK formula for the mel scale
+ * @param numMfccVectors number of MFCC vectors
+ */
+MfccParams::MfccParams(
+        const float samplingFreq,
+        const int numFbankBins,
+        const float melLoFreq,
+        const float melHiFreq,
+        const int numMfccFeats,
+        const int frameLen,
+        const bool useHtkMethod,
+        const int numMfccVectors):
+        m_samplingFreq(samplingFreq),
+        m_numFbankBins(numFbankBins),
+        m_melLoFreq(melLoFreq),
+        m_melHiFreq(melHiFreq),
+        m_numMfccFeatures(numMfccFeats),
+        m_frameLen(frameLen),
+        m_numMfccVectors(numMfccVectors),
+        /* Smallest power of 2 >= frame length. */
+        m_frameLenPadded(SmallestPowerOfTwoAtLeast(frameLen)),
+        m_useHtkMethod(useHtkMethod)
+{}
+
+/**
+ * Formats the stored parameters as a multi-line, human readable string.
+ *
+ * @return text listing every parameter except the number of MFCC vectors
+ */
+std::string MfccParams::Str()
+{
+    char text[1024];
+    const char* const htkLabel = m_useHtkMethod ? "yes" : "no";
+
+    /* The continuation lines below are part of the format literal. */
+    snprintf(text, sizeof(text) - 1, "\n   \
+            \n\t Sampling frequency:         %f\
+            \n\t Number of filter banks:     %u\
+            \n\t Mel frequency limit (low):  %f\
+            \n\t Mel frequency limit (high): %f\
+            \n\t Number of MFCC features:    %u\
+            \n\t Frame length:               %u\
+            \n\t Padded frame length:        %u\
+            \n\t Using HTK for Mel scale:    %s\n",
+             m_samplingFreq, m_numFbankBins, m_melLoFreq,
+             m_melHiFreq, m_numMfccFeatures, m_frameLen,
+             m_frameLenPadded, htkLabel);
+
+    return std::string(text);
+}
+
+/**
+ * Copies the parameters, allocates zero-filled working buffers and
+ * precomputes the window function. The mel filter bank is not built here.
+ *
+ * @param params MFCC configuration to use
+ */
+MFCC::MFCC(const MfccParams& params):
+    m_params(params),
+    m_filterBankInitialised(false)
+{
+    const auto paddedLen = m_params.m_frameLenPadded;
+    const auto frameLen  = m_params.m_frameLen;
+
+    /* Working storage, all zero to begin with. */
+    m_buffer      = std::vector<float>(paddedLen, 0.0);
+    m_frame       = std::vector<float>(paddedLen, 0.0);
+    m_melEnergies = std::vector<float>(m_params.m_numFbankBins, 0.0);
+
+    /* Window coefficients: w[i] = 0.5 - 0.5 * cos(2 * pi * i / frameLen). */
+    m_windowFunc = std::vector<float>(frameLen);
+    const auto step = static_cast<float>(2 * M_PI / frameLen);
+
+    size_t index = 0;
+    for (float& coefficient : m_windowFunc)
+    {
+        coefficient = (0.5 - (0.5 * cosf(static_cast<float>(index) * step)));
+        ++index;
+    }
+}
+
+/* Prepares the object for use by building the mel filter bank. */
+void MFCC::Init()
+{
+    InitMelFilterBank();
+}
+
+/**
+ * Converts a frequency to the mel scale.
+ *
+ * @param freq         frequency to convert
+ * @param useHTKMethod true for the HTK formula, false for the Slaney formula
+ * @return the corresponding mel value
+ */
+float MFCC::MelScale(const float freq, const bool useHTKMethod)
+{
+    if (useHTKMethod)
+    {
+        return 1127.0f * logf (1.0f + freq / 700.0f);
+    }
+
+    /* Slaney formula: logarithmic from ms_minLogHz upwards, linear below. */
+    if (freq >= ms_minLogHz)
+    {
+        return ms_minLogMel + logf(freq / ms_minLogHz) / ms_logStep;
+    }
+    return freq / ms_freqStep;
+}
+
+/**
+ * Converts a mel value back to a frequency; the inverse of MelScale().
+ *
+ * @param melFreq      mel value to convert
+ * @param useHTKMethod true for the HTK formula, false for the Slaney formula
+ * @return the corresponding frequency
+ */
+float MFCC::InverseMelScale(const float melFreq, const bool useHTKMethod)
+{
+    if (useHTKMethod)
+    {
+        return 700.0f * (expf (melFreq / 1127.0f) - 1.0f);
+    }
+
+    /* Slaney formula: exponential from ms_minLogMel upwards, linear below. */
+    if (melFreq >= ms_minLogMel)
+    {
+        return ms_minLogHz * expf(ms_logStep * (melFreq - ms_minLogMel));
+    }
+    return ms_freqStep * melFreq;
+}
+
+
+/**
+ * Computes one energy value per filter bank bin from a power spectrum.
+ *
+ * For every bin the square roots of the spectrum values in
+ * [filterBankFilterFirst[bin], filterBankFilterLast[bin]] are weighted by
+ * the matching filter coefficients and summed.
+ *
+ * @param fftVec                power spectrum to filter
+ * @param melFilterBank         filter weights, one vector per bin
+ * @param filterBankFilterFirst first spectrum index covered by each filter
+ * @param filterBankFilterLast  last spectrum index covered by each filter
+ * @param melEnergies           output; its size defines the number of bins
+ * @return false if the inputs are inconsistent, true otherwise
+ */
+bool MFCC::ApplyMelFilterBank(
+        std::vector<float>&                 fftVec,
+        std::vector<std::vector<float>>&    melFilterBank,
+        std::vector<uint32_t>&              filterBankFilterFirst,
+        std::vector<uint32_t>&              filterBankFilterLast,
+        std::vector<float>&                 melEnergies)
+{
+    const size_t numBanks = melEnergies.size();
+
+    /* melFilterBank is indexed by bin below, so it needs at least numBanks
+     * entries as well. */
+    if (numBanks != filterBankFilterFirst.size() ||
+        numBanks != filterBankFilterLast.size() ||
+        numBanks > melFilterBank.size())
+    {
+        printf("unexpected filter bank lengths\n");
+        return false;
+    }
+
+    /* With an empty spectrum "size() - 1" would wrap around and the index
+     * clamp below would no longer protect the reads from fftVec. */
+    if (numBanks != 0 && fftVec.empty())
+    {
+        printf("empty FFT vector\n");
+        return false;
+    }
+
+    for (size_t bin = 0; bin < numBanks; ++bin)
+    {
+        auto filterBankIter = melFilterBank[bin].begin();
+        auto end = melFilterBank[bin].end();
+        float melEnergy = FLT_MIN;  /* Avoid log of zero at later stages */
+        const uint32_t firstIndex = filterBankFilterFirst[bin];
+        const uint32_t lastIndex = std::min<uint32_t>(filterBankFilterLast[bin], fftVec.size() - 1);
+
+        /* Stop at whichever runs out first: spectrum range or filter taps. */
+        for (uint32_t i = firstIndex; i <= lastIndex && filterBankIter != end; i++)
+        {
+            float energyRep = sqrt(fftVec[i]);
+            melEnergy += (*filterBankIter++ * energyRep);
+        }
+
+        melEnergies[bin] = melEnergy;
+    }
+
+    return true;
+}
+
+/**
+ * Replaces every element with its natural logarithm, in place.
+ *
+ * @param melEnergies values to convert
+ */
+void MFCC::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
+{
+    for (size_t idx = 0; idx < melEnergies.size(); ++idx)
+    {
+        melEnergies[idx] = logf(melEnergies[idx]);
+    }
+}
+
+/**
+ * Converts the FFT output held in m_buffer to squared magnitudes, in place.
+ *
+ * The first two buffer elements are squared individually and written to
+ * indices 0 and size/2; every other output comes from
+ * MathUtils::ComplexMagnitudeSquaredF32 treating the buffer as
+ * (real, imaginary) pairs.
+ * NOTE(review): presumably elements 0 and 1 hold the two purely real FFT
+ * terms; confirm against the layout produced by MathUtils::FftF32.
+ */
+void MFCC::ConvertToPowerSpectrum()
+{
+    const uint32_t bufferLen = this->m_buffer.size();
+
+    /* Elements 0 and 1 are read below, so a shorter buffer cannot be
+     * converted; leave it untouched instead of reading out of bounds. */
+    if (bufferLen < 2)
+    {
+        return;
+    }
+
+    const uint32_t halfDim = bufferLen / 2;
+
+    /* Handle this special case. */
+    float firstEnergy = this->m_buffer[0] * this->m_buffer[0];
+    float lastEnergy = this->m_buffer[1] * this->m_buffer[1];
+
+    /* Source and destination are the same buffer. */
+    MathUtils::ComplexMagnitudeSquaredF32(
+            this->m_buffer.data(),
+            bufferLen,
+            this->m_buffer.data(),
+            halfDim);
+
+    this->m_buffer[0] = firstEnergy;
+    this->m_buffer[halfDim] = lastEnergy;
+}
+
+/**
+ * Builds a DCT matrix stored row by row in a flat vector.
+ *
+ * Row k holds normalizer * cos((n + 0.5) * angle_k) for n in
+ * [0, inputLength), where angle_k is accumulated in steps of
+ * pi / inputLength starting from 0.
+ *
+ * @param inputLength      number of columns (length of the input vector)
+ * @param coefficientCount number of rows (coefficients to produce)
+ * @return matrix with inputLength * coefficientCount elements
+ */
+std::vector<float> MFCC::CreateDCTMatrix(
+                            const int32_t inputLength,
+                            const int32_t coefficientCount)
+{
+    std::vector<float> dctMatrix(inputLength * coefficientCount);
+
+    const float scale = sqrtf(2.0f/inputLength);
+    const float angleStep = M_PI/inputLength;
+
+    /* Accumulated rather than recomputed for each row. */
+    float rowAngle = 0;
+
+    for (int32_t row = 0; row < coefficientCount; row++)
+    {
+        const int32_t rowStart = row * inputLength;
+        for (int32_t col = 0; col < inputLength; col++)
+        {
+            dctMatrix[rowStart + col] = scale * cosf((col + 0.5f) * rowAngle);
+        }
+        rowAngle += angleStep;
+    }
+
+    return dctMatrix;
+}
+
+/**
+ * Returns the factor applied to every weight of one mel filter.
+ *
+ * This implementation ignores its arguments and always returns 1.
+ *
+ * @param leftMel      lower mel edge of the filter (unused)
+ * @param rightMel     upper mel edge of the filter (unused)
+ * @param useHTKMethod mel scale selection (unused)
+ * @return 1.0f
+ */
+float MFCC::GetMelFilterBankNormaliser(
+                const float&    leftMel,
+                const float&    rightMel,
+                const bool      useHTKMethod)
+{
+    /* Arguments are deliberately not used here. */
+    static_cast<void>(leftMel);
+    static_cast<void>(rightMel);
+    static_cast<void>(useHTKMethod);
+
+    /* By default, no normalisation => return 1 */
+    const float noNormalisation = 1.f;
+    return noNormalisation;
+}
+
+/**
+ * Builds the mel filter bank and the DCT matrix the first time it is
+ * called; later calls return immediately.
+ */
+void MFCC::InitMelFilterBank()
+{
+    if (IsMelFilterBankInited())
+    {
+        return;
+    }
+
+    m_melFilterBank = CreateMelFilterBank();
+    m_dctMatrix = CreateDCTMatrix(m_params.m_numFbankBins,
+                                  m_params.m_numMfccFeatures);
+    m_filterBankInitialised = true;
+}
+
+/* Reports whether InitMelFilterBank() has already built the filter bank. */
+bool MFCC::IsMelFilterBankInited() const
+{
+    return m_filterBankInitialised;
+}
+
+/**
+ * Runs every MFCC stage up to, but not including, the DCT.
+ *
+ * The input is copied into m_frame, windowed, transformed with the FFT,
+ * converted to a power spectrum, passed through the mel filter bank and
+ * finally converted to a logarithmic scale. The result is left in
+ * m_melEnergies.
+ *
+ * @param audioData input samples; at most m_frameLen of them are used and
+ *                  shorter input is treated as zero padded
+ */
+void MFCC::MfccComputePreFeature(const std::vector<float>& audioData)
+{
+    this->InitMelFilterBank();
+
+    const size_t numSamples =
+            std::min(std::min(this->m_frame.size(), audioData.size()),
+                     static_cast<size_t>(this->m_params.m_frameLen));
+
+    /* Skip the copy for empty input; data() may then be a null pointer. */
+    if (numSamples > 0)
+    {
+        std::memcpy(this->m_frame.data(), audioData.data(), numSamples * sizeof(float));
+    }
+
+    /* m_frame is reused between calls. Zero everything after the copied
+     * samples so that neither the padding nor the tail of a short input
+     * carries over windowed samples from the previous call. */
+    std::fill(this->m_frame.begin() + numSamples, this->m_frame.end(), 0);
+
+    /* Apply window function to input frame. The zeroed tail needs no
+     * windowing. */
+    for(size_t i = 0; i < numSamples; i++)
+    {
+        this->m_frame[i] *= this->m_windowFunc[i];
+    }
+
+    /* Compute FFT. */
+    MathUtils::FftF32(this->m_frame, this->m_buffer);
+
+    /* Convert to power spectrum. */
+    this->ConvertToPowerSpectrum();
+
+    /* Apply mel filterbanks. A failure is reported but, as before, does not
+     * stop processing. */
+    if (!this->ApplyMelFilterBank(this->m_buffer,
+                                  this->m_melFilterBank,
+                                  this->m_filterBankFilterFirst,
+                                  this->m_filterBankFilterLast,
+                                  this->m_melEnergies))
+    {
+        printf("Failed to apply MEL filter banks\n");
+    }
+
+    /* Convert to logarithmic scale. */
+    this->ConvertToLogarithmicScale(this->m_melEnergies);
+}
+
+/**
+ * Computes the MFCC feature vector for one frame of audio.
+ *
+ * @param audioData input samples for the frame
+ * @return vector of m_numMfccFeatures coefficients
+ */
+std::vector<float> MFCC::MfccCompute(const std::vector<float>& audioData)
+{
+    MfccComputePreFeature(audioData);
+
+    const auto numBins = m_params.m_numFbankBins;
+    std::vector<float> mfccOut(m_params.m_numMfccFeatures);
+
+    /* Take DCT: each output is the dot product of one DCT matrix row with
+     * the log mel energies. */
+    for (size_t row = 0; row < mfccOut.size(); ++row)
+    {
+        mfccOut[row] = MathUtils::DotProductF32(
+                m_dctMatrix.data() + row * numBins,
+                m_melEnergies.data(),
+                numBins);
+    }
+    return mfccOut;
+}
+
+/**
+ * Builds the mel filter bank.
+ *
+ * The mel range [melLoFreq, melHiFreq] is divided into numFbankBins + 1
+ * equal steps. Filter number "bin" rises linearly (in mel) from the left
+ * edge to the centre and falls linearly to the right edge. Only the FFT
+ * bins strictly inside the filter are stored. The first and last FFT bin
+ * index of every filter are recorded in m_filterBankFilterFirst and
+ * m_filterBankFilterLast.
+ *
+ * @return one vector of weights per filter
+ */
+std::vector<std::vector<float>> MFCC::CreateMelFilterBank()
+{
+    const auto& cfg = this->m_params;
+    const auto useHtk = cfg.m_useHtkMethod;
+
+    const size_t numFftBins = cfg.m_frameLenPadded / 2;
+    const float fftBinWidth = static_cast<float>(cfg.m_samplingFreq) / cfg.m_frameLenPadded;
+
+    const float melLowFreq = MFCC::MelScale(cfg.m_melLoFreq, useHtk);
+    const float melHighFreq = MFCC::MelScale(cfg.m_melHiFreq, useHtk);
+    const float melFreqDelta = (melHighFreq - melLowFreq) / (cfg.m_numFbankBins + 1);
+
+    /* Scratch row covering every FFT bin; rewritten for each filter. */
+    std::vector<float> weights = std::vector<float>(numFftBins);
+    std::vector<std::vector<float>> melFilterBank(cfg.m_numFbankBins);
+    this->m_filterBankFilterFirst = std::vector<uint32_t>(cfg.m_numFbankBins);
+    this->m_filterBankFilterLast = std::vector<uint32_t>(cfg.m_numFbankBins);
+
+    for (size_t bin = 0; bin < cfg.m_numFbankBins; bin++)
+    {
+        const float leftMel = melLowFreq + bin * melFreqDelta;
+        const float centerMel = melLowFreq + (bin + 1) * melFreqDelta;
+        const float rightMel = melLowFreq + (bin + 2) * melFreqDelta;
+
+        const float normaliser = this->GetMelFilterBankNormaliser(leftMel, rightMel, useHtk);
+
+        /* Both stay 0 when no FFT bin falls inside the filter. */
+        uint32_t firstIndex = 0;
+        uint32_t lastIndex = 0;
+        bool seenFirst = false;
+
+        for (size_t i = 0; i < numFftBins; i++)
+        {
+            const float freq = (fftBinWidth * i);  /* Center freq of this fft bin. */
+            const float mel = MFCC::MelScale(freq, useHtk);
+
+            const bool insideFilter = (mel > leftMel && mel < rightMel);
+            if (!insideFilter)
+            {
+                weights[i] = 0.0;
+                continue;
+            }
+
+            /* Rising slope up to the centre, falling slope after it. */
+            const float weight = (mel <= centerMel)
+                    ? (mel - leftMel) / (centerMel - leftMel)
+                    : (rightMel - mel) / (rightMel - centerMel);
+            weights[i] = weight * normaliser;
+
+            if (!seenFirst)
+            {
+                firstIndex = i;
+                seenFirst = true;
+            }
+            lastIndex = i;
+        }
+
+        this->m_filterBankFilterFirst[bin] = firstIndex;
+        this->m_filterBankFilterLast[bin] = lastIndex;
+
+        /* Copy the part we care about. */
+        std::vector<float>& filter = melFilterBank[bin];
+        for (uint32_t i = firstIndex; i <= lastIndex; i++)
+        {
+            filter.push_back(weights[i]);
+        }
+    }
+
+    return melFilterBank;
+}

diff --git a/samples/SpeechRecognition/src/MathUtils.cpp b/samples/common/src/Audio/MathUtils.cpp
similarity index 78%
rename from samples/SpeechRecognition/src/MathUtils.cpp
rename to samples/common/src/Audio/MathUtils.cpp
index bf99083..d91b509 100644
--- a/samples/SpeechRecognition/src/MathUtils.cpp
+++ b/samples/common/src/Audio/MathUtils.cpp

@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -41,7 +41,7 @@
     }
 }
 
-float MathUtils::DotProductF32(float* srcPtrA, float* srcPtrB,
+float MathUtils::DotProductF32(const float* srcPtrA, float* srcPtrB,
                                const int srcLen)
 {
     float output = 0.f;
@@ -53,10 +53,10 @@
     return output;
 }
 
-bool MathUtils::ComplexMagnitudeSquaredF32(float* ptrSrc,
-                                           const int srcLen,
+bool MathUtils::ComplexMagnitudeSquaredF32(const float* ptrSrc,
+                                           int srcLen,
                                            float* ptrDst,
-                                           const int dstLen)
+                                           int dstLen)
 {
     if (dstLen < srcLen/2)
     {
@@ -64,7 +64,7 @@
         return false;
     }
 
-    for (int j = 0; j < srcLen; ++j)
+    for (int j = 0; j < dstLen; ++j)
     {
         const float real = *ptrSrc++;
         const float im = *ptrSrc++;
@@ -83,7 +83,7 @@
     }
 }
 
-float MathUtils::MeanF32(float* ptrSrc, const uint32_t srcLen)
+float MathUtils::MeanF32(const float* ptrSrc, const uint32_t srcLen)
 {
     if (!srcLen)
     {
@@ -94,14 +94,13 @@
     return acc/srcLen;
 }
 
-float MathUtils::StdDevF32(float* ptrSrc, const uint32_t srcLen,
-                           const float mean)
+float MathUtils::StdDevF32(const float* ptrSrc, uint32_t srcLen, float mean)
 {
     if (!srcLen)
     {
         return 0.f;
     }
-    auto VarianceFunction = [=](float acc, const float value) {
+    auto VarianceFunction = [mean, srcLen](float acc, const float value) {
         return acc + (((value - mean) * (value - mean))/ srcLen);
     };
 

diff --git a/samples/SpeechRecognition/test/AudioCaptureTest.cpp b/samples/common/test/Audio/AudioCaptureTest.cpp
similarity index 93%
rename from samples/SpeechRecognition/test/AudioCaptureTest.cpp
rename to samples/common/test/Audio/AudioCaptureTest.cpp
index 94b4e7c..b8ea7b2 100644
--- a/samples/SpeechRecognition/test/AudioCaptureTest.cpp
+++ b/samples/common/test/Audio/AudioCaptureTest.cpp

@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -14,7 +14,7 @@
     std::string testResources = TEST_RESOURCE_DIR;
     REQUIRE(testResources != "");
     std::string file =  testResources + "/" + "myVoiceIsMyPassportVerifyMe04.wav";
-    asr::AudioCapture capture;
+    audio::AudioCapture capture;
     std::vector<float> audioData = capture.LoadAudioFile(file);
     capture.InitSlidingWindow(audioData.data(), audioData.size(), 47712, 16000);
 
@@ -49,7 +49,7 @@
     std::string testResources = TEST_RESOURCE_DIR;
     REQUIRE(testResources != "");
     std::string file =  testResources + "/" + "myVoiceIsMyPassportVerifyMe04.wav";
-    asr::AudioCapture capture;
+    audio::AudioCapture capture;
     std::vector<float> audioData = capture.LoadAudioFile(file);
     capture.InitSlidingWindow(audioData.data(), audioData.size(), 47712, 16000);
     capture.Next();

diff --git a/samples/common/test/Audio/MathUtilsTest.cpp b/samples/common/test/Audio/MathUtilsTest.cpp
new file mode 100644
index 0000000..d7a435d
--- /dev/null
+++ b/samples/common/test/Audio/MathUtilsTest.cpp

@@ -0,0 +1,112 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <catch.hpp>
+#include <limits>
+
+#include "MathUtils.hpp"
+#include <iostream>
+#include <numeric>
+
+TEST_CASE("Test DotProductF32")
+{
+    // The non-zero entries of the two inputs never overlap, so every
+    // product term, and therefore the dot product, is zero.
+    float lhs[] = { 1, 1, 1, 0, 0, 0 };
+    float rhs[] = { 0, 0, 0, 1, 1, 1 };
+    const int numElements = 6;
+    const float expectedResult = 0;
+
+    const float result = MathUtils::DotProductF32(lhs, rhs, numElements);
+
+    CHECK(result == expectedResult);
+}
+
+TEST_CASE("Test FFT32")
+{
+    // An all-zero input is expected to produce an all-zero output.
+    std::vector<float> input(32, 0);
+    std::vector<float> output(32);
+    std::vector<float> expectedResult(32, 0);
+
+    MathUtils::FftF32(input, output);
+
+    // To avoid common failed assertions due to rounding of near-zero values a
+    // small offset is added to both sides of the comparison. Adding it inline
+    // avoids std::bind2nd, which is deprecated since C++11 and removed in C++17.
+    const float offset = 0.1f;
+
+    for (size_t i = 0; i < output.size(); i++)
+    {
+        CHECK ((expectedResult[i] + offset) == Approx(output[i] + offset));
+    }
+}
+
+TEST_CASE("Test ComplexMagnitudeSquaredF32")
+{
+    // Input is read as (real, imaginary) pairs: (0, 0), (0.5, 0.5), (1, 1).
+    float input[] = { 0.0, 0.0, 0.5, 0.5,1,1 };
+    float expectedResult[] = { 0.0, 0.5, 2 };
+
+    // Compile-time constants, so that "output" is an ordinary array rather
+    // than a variable-length array (which is not standard C++).
+    const int inputLen = (sizeof(input)/sizeof(*input));
+    const int outputLen = inputLen/2;
+    float output[outputLen];
+
+    MathUtils::ComplexMagnitudeSquaredF32(input, inputLen, output, outputLen);
+
+    for (int i = 0; i < outputLen; i++)
+    {
+        CHECK (expectedResult[i] == Approx(output[i]));
+    }
+}
+
+TEST_CASE("Test VecLogarithmF32")
+{
+    // ln(1) = 0 and ln(1e-11) is approximately -25.328436.
+    std::vector<float> input = { 1, 0.1e-10 };
+    std::vector<float> expectedResult = { 0, -25.328436 };
+    std::vector<float> output(input.size());
+
+    MathUtils::VecLogarithmF32(input,output);
+
+    int index = 0;
+    while (index < input.size())
+    {
+        CHECK (expectedResult[index] == Approx(output[index]));
+        ++index;
+    }
+}
+
+TEST_CASE("Test MeanF32")
+{
+    // Nine zeros and a single one.
+    float samples[] = { 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 1.000 };
+    uint32_t numSamples = (sizeof(samples)/sizeof(*samples));
+
+    // Manually calculated mean of above array
+    float expectedResult = 0.100;
+
+    float mean = MathUtils::MeanF32(samples, numSamples);
+
+    CHECK (expectedResult == Approx(mean));
+}
+
+TEST_CASE("Test StdDevF32")
+{
+    // Nine zeros and a single one.
+    float samples[] = { 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 1.000 };
+    uint32_t numSamples = (sizeof(samples)/sizeof(*samples));
+
+    // Manually calculated standard deviation of above array
+    float expectedResult = 0.300;
+
+    // Calculate mean using std library to avoid dependency on MathUtils::MeanF32
+    float sum = std::accumulate(samples, samples + numSamples, 0.0f);
+    float mean = sum/float(numSamples);
+
+    float stdDev = MathUtils::StdDevF32(samples, numSamples, mean);
+
+    CHECK (expectedResult == Approx(stdDev));
+}
+
commit	23c26277086c78704a17f0dae86da947816320c0	[log] [tgz]
author	George Gekov <george.gekov@arm.com>	Mon Aug 16 11:32:10 2021 +0100
committer	Jim Flynn <jim.flynn@arm.com>	Sat Feb 05 19:49:06 2022 +0000
tree	88b02fd1fae3130256d059251788a7ef68d2831f
parent	922b912fd2d462bac0809bac5669310ad1506310 [diff]