MLECO-2492 Add CPP OD example with TFLITE-ArmnnDelegate

Signed-off-by: Dvir Markovich <dvir.markovich@arm.com>
Change-Id: If412c15ba49abe8370a570260b0a8ed8de305b7c
diff --git a/samples/ObjectDetection/CMakeLists.txt b/samples/ObjectDetection/CMakeLists.txt
index dbcd55f..953c4ed 100644
--- a/samples/ObjectDetection/CMakeLists.txt
+++ b/samples/ObjectDetection/CMakeLists.txt
@@ -2,9 +2,12 @@
 # SPDX-License-Identifier: MIT
 
 cmake_minimum_required(VERSION 3.0.2)
+project (object_detection_example)
 
 set(CMAKE_C_STANDARD                99)
 set(CMAKE_CXX_STANDARD              14)
+# Location of FindTfLite.cmake and FindTfLiteSrc.cmake
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/../../delegate/cmake/Modules/")
 
 # Make the standard a requirement => prevent fallback to previous
 # supported standard
@@ -15,14 +18,15 @@
 set(CMAKE_C_EXTENSIONS              OFF)
 set(CMAKE_CXX_EXTENSIONS            OFF)
 
-project (object_detection_example)
-
 set(CMAKE_C_FLAGS_DEBUG         "-DDEBUG -O0 -g -fPIC")
 set(CMAKE_C_FLAGS_RELEASE       "-DNDEBUG -O3 -fPIC")
 
 set(CMAKE_CXX_FLAGS_DEBUG       "-DDEBUG -O0 -g -fPIC")
 set(CMAKE_CXX_FLAGS_RELEASE     "-DNDEBUG -O3 -fPIC")
 
+SET(USE_ARMNN_DELEGATE False CACHE BOOL "Use delegate file")
+message("USE_ARMNN_DELEGATE=${USE_ARMNN_DELEGATE}")
+
 include(ExternalProject)
 
 # Build in release mode by default
@@ -40,9 +44,23 @@
 
 include(../common/cmake/find_opencv.cmake)
 include(../common/cmake/find_armnn.cmake)
+if( USE_ARMNN_DELEGATE )
+    ## Add TfLite dependency
+    find_package(TfLiteSrc REQUIRED MODULE)
+    find_package(TfLite REQUIRED MODULE)
+    ## Add Flatbuffers dependency
+    find_package(Flatbuffers REQUIRED MODULE)
+
+    add_definitions(-DUSE_TF_LITE_DELEGATE)
+endif()
 
 include_directories(include)
-include_directories(../common/include/ArmnnUtils)
+## choose the correct instance of ArmnnNetworkExecutor.hpp
+if( USE_ARMNN_DELEGATE )
+    include_directories(include/delegate)
+else()
+    include_directories(../common/include/ArmnnUtils)
+endif()
 include_directories(../common/include/Utils)
 include_directories(../common/include/CVUtils)
 
@@ -50,7 +68,22 @@
 file(GLOB CVUTILS_SOURCES "../common/src/CVUtils**/*.cpp")
 file(GLOB UTILS_SOURCES "../common/src/Utils**/*.cpp")
 list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/Main.cpp)
-file(GLOB TEST_SOURCES "test/*.cpp")
+if( USE_ARMNN_DELEGATE )
+    file(GLOB TEST_SOURCES "test/delegate/*.cpp" "test/*.cpp")
+
+    # Various tflite header files are not warning clean
+    # We can't change compilation flags on header files directly, so we need to add them to an interface library first
+    add_library(tflite_headers INTERFACE)
+    target_include_directories(tflite_headers INTERFACE $<BUILD_INTERFACE:${TfLite_INCLUDE_DIR}>
+                                                    $<INSTALL_INTERFACE:include/tflite_headers>)
+
+    target_compile_options(tflite_headers INTERFACE -Wno-conversion
+                                                    -Wno-sign-conversion
+                                                    -Wno-unused-parameter
+                                                    -Wno-unused-function)
+else()
+    file(GLOB TEST_SOURCES "test/*.cpp")
+endif()
 file(GLOB APP_MAIN "src/Main.cpp")
 
 if(BUILD_UNIT_TESTS)
@@ -62,6 +95,15 @@
 
 add_executable("${APP_TARGET_NAME}" ${SOURCES} ${CVUTILS_SOURCES} ${UTILS_SOURCES} ${APP_MAIN})
 
+if( USE_ARMNN_DELEGATE )
+    set(CMAKE_CXX_FLAGS " -ldl -lrt -Wl,--copy-dt-needed-entries")
+    target_link_libraries("${APP_TARGET_NAME}" PUBLIC ${TfLite_LIB})
+
+    target_link_libraries("${APP_TARGET_NAME}" PUBLIC tflite_headers)
+    target_include_directories("${APP_TARGET_NAME}" PUBLIC ${Flatbuffers_INCLUDE_DIR})
+    target_link_libraries("${APP_TARGET_NAME}" PUBLIC ${Flatbuffers_LIB})
+endif()
+
 if (NOT OPENCV_LIBS_FOUND)
     message("Building OpenCV libs")
     add_dependencies("${APP_TARGET_NAME}" "${OPENCV_LIB}")
diff --git a/samples/ObjectDetection/Readme.md b/samples/ObjectDetection/Readme.md
index 04ac011..194a3e9 100644
--- a/samples/ObjectDetection/Readme.md
+++ b/samples/ObjectDetection/Readme.md
@@ -1,7 +1,11 @@
 # Object Detection Example
 
 ## Introduction
-This is a sample code showing object detection using Arm NN public C++ API. The compiled application can take
+This sample application demonstrates object detection using Arm NN in two different modes:
+1. Utilizing the public Arm NN C++ API.
+2. Utilizing the TensorFlow Lite delegate mechanism together with the Arm NN delegate file.
+
+The compiled application can take
 
  * a video file
 
@@ -13,8 +17,22 @@
 
 ## Dependencies
 
-This example utilises OpenCV functions to capture and output video data. Top level inference API is provided by Arm NN
-library.
+This example utilizes OpenCV functions to capture and output video data.
+1. The public Arm NN C++ API is provided by the Arm NN library.
+2. The delegate file mode has the following additional dependencies:
+    * TensorFlow version 2.5.0
+    * Flatbuffers version 1.12.0
+    * Arm NN delegate library
+
+### System
+
+This example was created on Ubuntu 20.04 with gcc and g++ version 9.
+If you encounter any compiler errors when building with a different compiler version, you can install version 9 with:
+```commandline
+sudo apt install gcc-9 g++-9
+```
+and add the following compiler flags to every cmake command:
+`-DCMAKE_C_COMPILER=gcc-9 -DCMAKE_CXX_COMPILER=g++-9`
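+For example, a configuration step run from a build directory would then look like this (combine with any other cmake options you need):
+```commandline
+cmake -DCMAKE_C_COMPILER=gcc-9 -DCMAKE_CXX_COMPILER=g++-9 ..
+```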
 
 ### Arm NN
 
@@ -22,7 +40,13 @@
 please ensure that Arm NN libraries and header files are available on your build platform.
 The application executable binary dynamically links with the following Arm NN libraries:
 * libarmnn.so
+For Arm NN public C++ API mode:
 * libarmnnTfLiteParser.so
+For Delegate file mode:
+* libarmnnDelegate.so
+
+Pre-compiled Arm NN libraries can be downloaded from
+https://github.com/ARM-software/armnn/releases/download/v21.11/ArmNN-linux-aarch64.tar.gz
+The "lib" and "include" directories from this archive should be used together.
 
 The build script searches for available Arm NN libraries in the following order:
 1. Inside custom user directory specified by ARMNN_LIB_DIR cmake option.
@@ -37,8 +61,11 @@
 ### OpenCV
 
 This application uses [OpenCV (Open Source Computer Vision Library)](https://opencv.org/) for video stream processing.
-Your host platform may have OpenCV available through linux package manager. If this is the case, please install it using
-standard way. If not, our build system has a script to download and cross-compile required OpenCV modules
+Your host platform may have OpenCV available through the Linux package manager. If this is the case, please install it in the standard way:
+```commandline
+sudo apt install libopencv-dev
+```
+If not, our build system has a script to download and cross-compile required OpenCV modules
 as well as [FFMPEG](https://ffmpeg.org/) and [x264 encoder](https://www.videolan.org/developers/x264.html) libraries.
 The latter will build limited OpenCV functionality and application will support only video file input and video file output
 way of working. Displaying video frames in a window requires building OpenCV with GTK and OpenGL support.
@@ -68,6 +95,49 @@
 
 Please see [find_opencv.cmake](./cmake/find_opencv.cmake) for implementation details.
 
+### Tensorflow Lite (Needed only in delegate file mode)
+
+This application uses [TensorFlow Lite](https://www.tensorflow.org/) version 2.5.0 to demonstrate the use of 'armnnDelegate'.
+armnnDelegate is a library for accelerating certain TensorFlow Lite operators on Arm hardware by providing
+the TensorFlow Lite interpreter with an alternative implementation of the operators via its delegation mechanism.
+You may clone and build TensorFlow Lite and provide the path to its root and output library directories through the cmake
+flags TENSORFLOW_ROOT and TFLITE_LIB_ROOT respectively.
+For implementation details see the scripts FindTfLite.cmake and FindTfLiteSrc.cmake.
+
+The application links with the TensorFlow Lite library libtensorflow-lite.a.
+
+#### Download and build TensorFlow Lite version 2.5.0
+Example of a native TensorFlow Lite build:
+```commandline
+sudo apt install build-essential
+git clone https://github.com/tensorflow/tensorflow.git
+cd tensorflow/tensorflow
+git checkout tags/v2.5.0
+mkdir build && cd build
+cmake ../lite -DTFLITE_ENABLE_XNNPACK=OFF
+make
+```
+
+### Flatbuffers (needed only in delegate file mode)
+
+This application uses [Flatbuffers](https://google.github.io/flatbuffers/) version 1.12.0 for serialization.
+You may clone and build Flatbuffers and provide the path to its root directory through the cmake
+flag FLATBUFFERS_ROOT.
+Please see FindFlatbuffers.cmake for implementation details.
+
+The application links with the Flatbuffers library libflatbuffers.a.
+
+#### Download and build Flatbuffers version 1.12.0
+Example of a native Flatbuffers build:
+```commandline
+wget -O flatbuffers-1.12.0.zip https://github.com/google/flatbuffers/archive/v1.12.0.zip
+unzip -d . flatbuffers-1.12.0.zip
+cd flatbuffers-1.12.0
+mkdir install && cd install
+cmake .. -DCMAKE_INSTALL_PREFIX:PATH=`pwd`
+make install
+```
+
 ## Building
 There are two flows for building this application:
 * native build on a host platform,
@@ -83,6 +153,12 @@
 * BUILD_UNIT_TESTS -  set to `1` to build tests. Additionally to the main application, `object_detection_example-tests`
 unit tests executable will be created.
 
+For the Delegate file mode the following options are also available:
+* USE_ARMNN_DELEGATE - set to `True` to build the application in TfLite delegate file mode. Defaults to `False`.
+* TFLITE_LIB_ROOT - points to the custom location of the TfLite library
+* TENSORFLOW_ROOT - points to the custom location of the TensorFlow root directory
+* FLATBUFFERS_ROOT - points to the custom location of the Flatbuffers root directory
+
+An example cmake invocation using these options is shown in the Native Build section below.
+
 ### Native Build
 To build this application on a host platform, firstly ensure that required dependencies are installed:
 For example, for raspberry PI:
@@ -90,7 +166,7 @@
 sudo apt-get update
 sudo apt-get -yq install pkg-config
 sudo apt-get -yq install libgtk2.0-dev zlib1g-dev libjpeg-dev libpng-dev libxvidcore-dev libx264-dev
-sudo apt-get -yq install libavcodec-dev libavformat-dev libswscale-dev
+sudo apt-get -yq install libavcodec-dev libavformat-dev libswscale-dev ocl-icd-opencl-dev
 ```
 
 To build demo application, create a build directory:
@@ -114,6 +190,15 @@
 make
 ```
 
+If you build in the Delegate file mode and have custom Arm NN, TfLite, and Flatbuffers locations,
+use the `USE_ARMNN_DELEGATE` flag together with the `TFLITE_LIB_ROOT`, `TENSORFLOW_ROOT`, `FLATBUFFERS_ROOT` and
+`ARMNN_LIB_DIR` options:
+```commandline
+cmake -DARMNN_LIB_DIR=/path/to/armnn/build/lib/ -DUSE_ARMNN_DELEGATE=True -DTFLITE_LIB_ROOT=/path/to/tensorflow/
+ -DTENSORFLOW_ROOT=/path/to/tensorflow/ -DFLATBUFFERS_ROOT=/path/to/flatbuffers/ ..
+make
+```
+
 ### Cross-compilation
 
 This section will explain how to cross-compile the application and dependencies on a Linux x86 machine
@@ -170,8 +255,18 @@
 libarmnn.so
 libarmnn.so.29
 libarmnn.so.29.0
+For Arm NN public C++ API mode:
 libarmnnTfLiteParser.so
 libarmnnTfLiteParser.so.24.4
+For Delegate file mode:
+libarmnnDelegate.so
+libarmnnDelegate.so.25
+libarmnnDelegate.so.25.0
+libtensorflow-lite.a
+libflatbuffers.a
+
 libavcodec.so
 libavcodec.so.58
 libavcodec.so.58.54.100
@@ -230,6 +325,9 @@
 * --preferred-backends: Takes the preferred backends in preference order, separated by comma.
                         For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc].
                         Defaults to CpuRef **[OPTIONAL]**
+* --profiling_enabled: Enabling this option will print timing information, in microseconds, for important
+                       ML-related milestones. By default, this option is disabled.
+                       Accepted options are true/false **[OPTIONAL]**
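+
+For example, assuming the application binary is `object_detection_example` (the CMake project name) and the remaining
+required options are set as described above, profiling output can be enabled by appending the option to the run command
+(a sketch, with the other options elided):
+```commandline
+./object_detection_example <required options> --profiling_enabled true
+```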
 
 ### Object Detection on a supplied video file
 
@@ -258,10 +356,15 @@
 1. Initialisation
     1. Reading from Video Source
     2. Preparing Labels and Model Specific Functions
-2. Creating a Network
-    1. Creating Parser and Importing Graph
-    3. Optimizing Graph for Compute Device
-    4. Creating Input and Output Binding Information
+2. Creating a Network (two modes are available)
+    1. Arm NN C++ API mode:
+        1. Creating Parser and Importing Graph
+        2. Optimizing Graph for Compute Device
+        3. Creating Input and Output Binding Information
+    2. TfLite with Arm NN delegate file mode:
+        1. Building a Model and creating Interpreter
+        2. Creating Arm NN delegate file
+        3. Registering the Arm NN delegate file to the Interpreter
 3. Object detection pipeline
     1. Pre-processing the Captured Frame
     2. Making Input and Output Tensors
@@ -298,10 +401,14 @@
 Depending on the model being used, `CreatePipeline`  function returns specific implementation of the object detection
 pipeline.
 
-### Creating a Network
 
-All operations with Arm NN and networks are encapsulated in [`ArmnnNetworkExecutor`](./include/ArmnnNetworkExecutor.hpp)
-class.
+### Creating a Network
+
+There are two ways of creating the network: using the Arm NN C++ API, or using TfLite with the Arm NN delegate file.
+
+#### Creating a Network using the Arm NN C++ API
+
+All operations with Arm NN and networks are encapsulated in the
+[`ArmnnNetworkExecutor`](./common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp) class.
 
 ##### Creating Parser and Importing Graph
 The first step with Arm NN SDK is to import a graph from file by using the appropriate parser.
@@ -374,9 +481,67 @@
 Similarly, we can get the output binding information for an output layer by using the parser to retrieve output
 tensor names and calling `GetNetworkOutputBindingInfo()`.
 
+#### Creating a Network using Tflite and Arm NN delegate file
+
+All operations with TfLite and networks are encapsulated in the
+[`ArmnnNetworkExecutor`](./include/delegate/ArmnnNetworkExecutor.hpp) class.
+
+##### Building a Model and creating Interpreter
+The first step with TfLite is to build a model from file using the FlatBufferModel class.
+With that model we create the TfLite Interpreter.
+```c++
+#include <tensorflow/lite/interpreter.h>
+#include <tensorflow/lite/kernels/register.h>
+#include <tensorflow/lite/model.h>
+
+m_model = tflite::FlatBufferModel::BuildFromFile(modelPath.c_str());
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*m_model, resolver)(&m_interpreter);
+```
+After the Interpreter is created, we allocate tensors using the Interpreter's AllocateTensors() function:
+```c++
+m_interpreter->AllocateTensors();
+```
+
+##### Creating Arm NN Delegate file
+The Arm NN delegate file is created using the ArmnnDelegate constructor.
+The constructor accepts a DelegateOptions object that is created from the
+list of preferred backends that we want to use, and an optional OptimizerOptions object.
+In this example we enable the fast math optimization and the reduction of all float32 operators to float16.
+These optimizations can sometimes improve performance but can also cause degradation,
+depending on the model and the backends involved, so you should try them out and
+decide whether to use them or not.
+
+
+```c++
+#include <armnn_delegate.hpp>
+#include <DelegateOptions.hpp>
+#include <DelegateUtils.hpp>
+
+armnn::OptimizerOptions optimizerOptions;
+
+/* enable fast math optimization */
+armnn::BackendOptions modelOptionGpu("GpuAcc", {{"FastMathEnabled", true}});
+optimizerOptions.m_ModelOptions.push_back(modelOptionGpu);
+
+armnn::BackendOptions modelOptionCpu("CpuAcc", {{"FastMathEnabled", true}});
+optimizerOptions.m_ModelOptions.push_back(modelOptionCpu);
+/* enable reduce float32 to float16 optimization */
+optimizerOptions.m_ReduceFp32ToFp16 = true;
+
+armnnDelegate::DelegateOptions delegateOptions(preferredBackends, optimizerOptions);
+/* create delegate object */
+std::unique_ptr<TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
+            theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions),
+                             armnnDelegate::TfLiteArmnnDelegateDelete);
+```
+##### Registering the Arm NN delegate file to the Interpreter
+Registering the Arm NN delegate file provides the TensorFlow Lite interpreter with an alternative implementation
+of the operators that can be accelerated by Arm hardware.
+For example:
+```c++
+    /* Register the delegate file */
+    m_interpreter->ModifyGraphWithDelegate(std::move(theArmnnDelegate));
+```
 ### Object detection pipeline
 
-Generic object detection pipeline has 3 steps to perform data pre-processing, run inference and decode inference results
+A generic object detection pipeline has three steps: perform data pre-processing, run inference and decode inference results
 in the post-processing step.
 
 See [`ObjDetectionPipeline`](include/ObjectDetectionPipeline.hpp) and implementations for [`MobileNetSSDv1`](include/ObjectDetectionPipeline.hpp)
@@ -406,6 +571,13 @@
 objectDetectionPipeline->Inference(processed, results);
 ```
 Inference step will call `ArmnnNetworkExecutor::Run` method that will prepare input tensors and execute inference.
+There are two separate implementations of the `ArmnnNetworkExecutor` class and its functions, including `ArmnnNetworkExecutor::Run`.
+The first implementation, [`ArmnnNetworkExecutor`](./common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp), uses the
+Arm NN C++ API,
+while the second implementation, [`ArmnnNetworkExecutor`](./include/delegate/ArmnnNetworkExecutor.hpp), uses
+TensorFlow Lite and its delegate file mechanism.
+
+##### Executing Inference utilizing the Arm NN C++ API
 A compute device performs inference for the loaded network using the `EnqueueWorkload()` function of the runtime context.
 For example:
 ```c++
@@ -416,8 +588,22 @@
 runtime->EnqueueWorkload(0, inputTensors, outputTensors);
 ```
 We allocate memory for output data once and map it to output tensor objects. After successful inference, we read data
-from the pre-allocated output data buffer. See [`ArmnnNetworkExecutor::ArmnnNetworkExecutor`](./src/ArmnnNetworkExecutor.cpp)
-and [`ArmnnNetworkExecutor::Run`](./src/ArmnnNetworkExecutor.cpp) for more details.
+from the pre-allocated output data buffer.
+See [`ArmnnNetworkExecutor::ArmnnNetworkExecutor`](./common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp)
+and [`ArmnnNetworkExecutor::Run`](./common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp) for more details.
+
+##### Executing Inference utilizing TensorFlow Lite and the Arm NN delegate file
+Inside the `PrepareTensors(..)` function, the input frame is copied to the TfLite Interpreter input tensor,
+then the TfLite Interpreter performs inference for the loaded network using the `Invoke()` function.
+For example:
+```c++
+PrepareTensors(inputData, dataBytes);
+
+if (m_interpreter->Invoke() == kTfLiteOk)
+```
+After successful inference, we read data from the Tflite Interpreter output tensor and copy
+it to the outResults vector.
+See [`ArmnnNetworkExecutor::Run`](./include/delegate/ArmnnNetworkExecutor.hpp) for more details.
 
 #### Postprocessing
 
@@ -430,7 +616,7 @@
 See [`SSDResultDecoder`](./include/SSDResultDecoder.hpp) for more details.
 
 For YOLO V3 Tiny models, we decode the output and perform non-maximum suppression to filter out any weak detections
-below a confidence threshold and any redudant bounding boxes above an intersection-over-union threshold.
+below a confidence threshold and any redundant bounding boxes above an intersection-over-union threshold.
 See [`YoloResultDecoder`](./include/YoloResultDecoder.hpp) for more details.
 
 It is encouraged to experiment with threshold values for confidence and intersection-over-union (IoU)
@@ -450,4 +636,4 @@
             AddInferenceOutputToFrame(detects, *frame, labels);
         });
 ```
-The processed frames are written to a file or displayed in a separate window.
\ No newline at end of file
+The processed frames are written to a file or displayed in a separate window.
diff --git a/samples/ObjectDetection/cmake/unit_tests.cmake b/samples/ObjectDetection/cmake/unit_tests.cmake
index dd3de70..6b2a9bb 100644
--- a/samples/ObjectDetection/cmake/unit_tests.cmake
+++ b/samples/ObjectDetection/cmake/unit_tests.cmake
@@ -6,8 +6,6 @@
 add_definitions (-DTEST_RESOURCE_DIR="${TEST_RESOURCES_DIR}")
 set(TEST_TARGET_NAME "${CMAKE_PROJECT_NAME}-tests")
 
-file(GLOB TEST_SOURCES "test/*")
-
 include(../common/cmake/find_catch.cmake)
 
 ExternalProject_Add(basketball-image
@@ -42,6 +40,14 @@
         INSTALL_COMMAND ""
         )
 
+ExternalProject_Add(yolo_v3
+        URL https://github.com/ARM-software/ML-zoo/raw/master/models/object_detection/yolo_v3_tiny/tflite_fp32/yolo_v3_tiny_darknet_fp32.tflite
+        DOWNLOAD_NO_EXTRACT 1
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ${CMAKE_COMMAND} -E copy <DOWNLOAD_DIR>/yolo_v3_tiny_darknet_fp32.tflite ${CMAKE_CURRENT_SOURCE_DIR}/test/resources
+        INSTALL_COMMAND ""
+        )
+
 add_executable("${TEST_TARGET_NAME}" ${SOURCES} ${TEST_SOURCES} ${CVUTILS_SOURCES} ${UTILS_SOURCES})
 
 add_dependencies(
@@ -61,4 +67,11 @@
     ${ARMNN_INCLUDE_DIR}
     ${OPENCV_INCLUDE_DIR} ${DEPENDENCIES_DIR} ${TEST_RESOURCES_DIR} ${COMMON_INCLUDE_DIR})
 
-target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} ${OPENCV_LIBS} ${FFMPEG_LIBS})
\ No newline at end of file
+target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} ${OPENCV_LIBS} ${FFMPEG_LIBS})
+if( USE_ARMNN_DELEGATE )
+    set(CMAKE_CXX_FLAGS " -ldl -lrt -Wl,--copy-dt-needed-entries")
+    target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${TfLite_LIB})
+    target_link_libraries("${TEST_TARGET_NAME}" PUBLIC tflite_headers)
+    target_include_directories("${TEST_TARGET_NAME}" PUBLIC ${Flatbuffers_INCLUDE_DIR})
+    target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${Flatbuffers_LIB})
+endif()
\ No newline at end of file
diff --git a/samples/ObjectDetection/include/delegate/ArmnnNetworkExecutor.hpp b/samples/ObjectDetection/include/delegate/ArmnnNetworkExecutor.hpp
new file mode 100644
index 0000000..c8875a2
--- /dev/null
+++ b/samples/ObjectDetection/include/delegate/ArmnnNetworkExecutor.hpp
@@ -0,0 +1,253 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Types.hpp"
+
+#include "armnn/ArmNN.hpp"
+#include <armnn/Logging.hpp>
+#include <armnn_delegate.hpp>
+#include <DelegateOptions.hpp>
+#include <DelegateUtils.hpp>
+#include <Profiling.hpp>
+#include <tensorflow/lite/builtin_ops.h>
+#include <tensorflow/lite/c/builtin_op_data.h>
+#include <tensorflow/lite/c/common.h>
+#include <tensorflow/lite/optional_debug_tools.h>
+#include <tensorflow/lite/kernels/builtin_op_kernels.h>
+#include <tensorflow/lite/interpreter.h>
+#include <tensorflow/lite/kernels/register.h>
+
+#include <string>
+#include <vector>
+
+namespace common
+{
+/**
+* @brief Used to load in a network through Tflite Interpreter,
+*        register Armnn Delegate file to it, and run inference
+*        on it against a given backend.
+*        Currently it is assumed that the input data will be
+*        a cv::Mat (frame); the assumption is implemented in
+*        the PrepareTensors method and can be generalized later.
+*
+*/
+template <typename Tout>
+class ArmnnNetworkExecutor
+{
+private:
+    std::unique_ptr<tflite::Interpreter> m_interpreter;
+    std::unique_ptr<tflite::FlatBufferModel> m_model;
+    Profiling m_profiling;
+
+    void PrepareTensors(const void* inputData, const size_t dataBytes);
+
+    template <typename Enumeration>
+    auto log_as_int(Enumeration value)
+    -> typename std::underlying_type<Enumeration>::type
+    {
+        return static_cast<typename std::underlying_type<Enumeration>::type>(value);
+    }
+
+public:
+    ArmnnNetworkExecutor() = delete;
+
+    /**
+    * @brief Initializes the network with the given input data.
+    *
+    *
+    *       * @param[in] modelPath - Relative path to the model file
+    *       * @param[in] backends - The list of preferred backends to run inference on
+    */
+    ArmnnNetworkExecutor(std::string& modelPath,
+                         std::vector<armnn::BackendId>& backends,
+                         bool isProfilingEnabled = false);
+
+    /**
+    * @brief Returns the aspect ratio of the associated model in the order of width, height.
+    */
+    Size GetImageAspectRatio();
+
+    /**
+    * @brief Returns the data type of the associated model.
+    */
+    armnn::DataType GetInputDataType() const;
+
+    float GetQuantizationScale();
+
+    int GetQuantizationOffset();
+
+    float GetOutputQuantizationScale(int tensorIndex);
+
+    int GetOutputQuantizationOffset(int tensorIndex);
+
+
+    /**
+    * @brief Runs inference on the provided input data, and stores the results
+    * in the provided InferenceResults object.
+    *
+    * @param[in] inputData - input frame data
+    * @param[in] dataBytes - input data size in bytes
+    * @param[out] outResults - Vector of DetectionResult objects used to store the output result.
+    */
+    bool Run(const void *inputData, const size_t dataBytes,
+             InferenceResults<Tout> &outResults);
+};
+
+template <typename Tout>
+ArmnnNetworkExecutor<Tout>::ArmnnNetworkExecutor(std::string& modelPath,
+                                           std::vector<armnn::BackendId>& preferredBackends,
+                                           bool isProfilingEnabled):
+                                           m_profiling(isProfilingEnabled)
+{
+    m_profiling.ProfilingStart();
+    armnn::OptimizerOptions optimizerOptions;
+    m_model = tflite::FlatBufferModel::BuildFromFile(modelPath.c_str());
+    if (m_model == nullptr)
+    {
+        const std::string errorMessage{"ArmnnNetworkExecutor: Failed to build the model"};
+        ARMNN_LOG(error) << errorMessage;
+        throw armnn::Exception(errorMessage);
+    }
+    m_profiling.ProfilingStopAndPrintUs("Loading the model took");
+
+    m_profiling.ProfilingStart();
+    tflite::ops::builtin::BuiltinOpResolver resolver;
+    tflite::InterpreterBuilder(*m_model, resolver)(&m_interpreter);
+    if (m_interpreter->AllocateTensors() != kTfLiteOk)
+    {
+        const std::string errorMessage{"ArmnnNetworkExecutor: Failed to alloc tensors"};
+        ARMNN_LOG(error) << errorMessage;
+        throw armnn::Exception(errorMessage);
+    }
+    m_profiling.ProfilingStopAndPrintUs("Create the tflite interpreter");
+
+    /* create delegate options */
+    m_profiling.ProfilingStart();
+
+    /* enable fast math optimization */
+    armnn::BackendOptions modelOptionGpu("GpuAcc", {{"FastMathEnabled", true}});
+    optimizerOptions.m_ModelOptions.push_back(modelOptionGpu);
+
+    armnn::BackendOptions modelOptionCpu("CpuAcc", {{"FastMathEnabled", true}});
+    optimizerOptions.m_ModelOptions.push_back(modelOptionCpu);
+    /* enable reduce float32 to float16 optimization */
+    optimizerOptions.m_ReduceFp32ToFp16 = true;
+
+    armnnDelegate::DelegateOptions delegateOptions(preferredBackends, optimizerOptions);
+
+    /* create delegate object */
+    std::unique_ptr<TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
+                theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions),
+                                 armnnDelegate::TfLiteArmnnDelegateDelete);
+
+    /* Register the delegate file */
+    m_interpreter->ModifyGraphWithDelegate(std::move(theArmnnDelegate));
+    m_profiling.ProfilingStopAndPrintUs("Create and load ArmNN Delegate");
+
+}
+
+template<typename Tout>
+void ArmnnNetworkExecutor<Tout>::PrepareTensors(const void *inputData, const size_t dataBytes)
+{
+    size_t inputTensorSize = m_interpreter->input_tensor(0)->bytes;
+    auto * inputTensorPtr = m_interpreter->input_tensor(0)->data.raw;
+    assert(inputTensorSize >= dataBytes);
+    if (inputTensorPtr != nullptr)
+    {
+       memcpy(inputTensorPtr, inputData, inputTensorSize);
+    }
+    else
+    {
+        const std::string errorMessage{"ArmnnNetworkExecutor: input tensor is null"};
+        ARMNN_LOG(error) << errorMessage;
+        throw armnn::Exception(errorMessage);
+    }
+
+}
+
+template <typename Tout>
+bool ArmnnNetworkExecutor<Tout>::Run(const void *inputData, const size_t dataBytes,
+                                             InferenceResults<Tout>& outResults)
+{
+    bool ret = false;
+    m_profiling.ProfilingStart();
+    PrepareTensors(inputData, dataBytes);
+
+    if (m_interpreter->Invoke() == kTfLiteOk)
+    {
+        ret = true;
+        // Extract the output tensor data.
+        outResults.clear();
+        outResults.reserve(m_interpreter->outputs().size());
+        for (int index = 0; index < m_interpreter->outputs().size(); index++)
+        {
+            size_t size = m_interpreter->output_tensor(index)->bytes / sizeof(Tout);
+            const Tout *p_Output = m_interpreter->typed_output_tensor<Tout>(index);
+            if (p_Output != nullptr)
+            {
+                InferenceResult<Tout> outRes(p_Output, p_Output + size);
+                outResults.emplace_back(outRes);
+            }
+            else
+            {
+                const std::string errorMessage{"ArmnnNetworkExecutor: p_Output tensor is null"};
+                ARMNN_LOG(error) << errorMessage;
+                ret = false;
+            }
+        }
+    }
+    else
+    {
+        const std::string errorMessage{"ArmnnNetworkExecutor: Invoke has failed"};
+        ARMNN_LOG(error) << errorMessage;
+    }
+    m_profiling.ProfilingStopAndPrintUs("Perform inference");
+    return ret;
+}
+
+template <typename Tout>
+Size ArmnnNetworkExecutor<Tout>::GetImageAspectRatio()
+{
+    assert(m_interpreter->tensor(m_interpreter->inputs()[0])->dims->size == 4);
+    return Size(m_interpreter->tensor(m_interpreter->inputs()[0])->dims->data[2],
+                m_interpreter->tensor(m_interpreter->inputs()[0])->dims->data[1]);
+}
+
+template <typename Tout>
+armnn::DataType ArmnnNetworkExecutor<Tout>::GetInputDataType() const
+{
+    return GetDataType(*(m_interpreter->tensor(m_interpreter->inputs()[0])));
+}
+
+template <typename Tout>
+float ArmnnNetworkExecutor<Tout>::GetQuantizationScale()
+{
+    return m_interpreter->tensor(m_interpreter->inputs()[0])->params.scale;
+}
+
+template <typename Tout>
+int ArmnnNetworkExecutor<Tout>::GetQuantizationOffset()
+{
+    return m_interpreter->tensor(m_interpreter->inputs()[0])->params.zero_point;
+}
+
+template <typename Tout>
+float ArmnnNetworkExecutor<Tout>::GetOutputQuantizationScale(int tensorIndex)
+{
+    assert(m_interpreter->outputs().size() > tensorIndex);
+    return m_interpreter->tensor(m_interpreter->outputs()[tensorIndex])->params.scale;
+}
+
+template <typename Tout>
+int ArmnnNetworkExecutor<Tout>::GetOutputQuantizationOffset(int tensorIndex)
+{
+    assert(m_interpreter->outputs().size() > tensorIndex);
+    return m_interpreter->tensor(m_interpreter->outputs()[tensorIndex])->params.zero_point;
+}
+
+}// namespace common
\ No newline at end of file
diff --git a/samples/ObjectDetection/src/Main.cpp b/samples/ObjectDetection/src/Main.cpp
index e057981..8bc2f0d 100644
--- a/samples/ObjectDetection/src/Main.cpp
+++ b/samples/ObjectDetection/src/Main.cpp
@@ -20,6 +20,7 @@
 const std::string OUTPUT_VIDEO_FILE_PATH = "--output-video-file-path";
 const std::string LABEL_PATH = "--label-path";
 const std::string PREFERRED_BACKENDS = "--preferred-backends";
+const std::string PROFILING_ENABLED = "--profiling_enabled";
 const std::string HELP = "--help";
 
 /*
@@ -29,13 +30,16 @@
         {VIDEO_FILE_PATH, "[REQUIRED] Path to the video file to run object detection on"},
         {MODEL_FILE_PATH, "[REQUIRED] Path to the Object Detection model to use"},
         {LABEL_PATH, "[REQUIRED] Path to the label set for the provided model file. "
-                     "Label file is should just be an ordered list, seperated by new line."},
+                     "Label file should be an ordered list, separated by new lines."},
         {MODEL_NAME, "[REQUIRED] The name of the model being used. Accepted options: YOLO_V3_TINY, SSD_MOBILE"},
         {OUTPUT_VIDEO_FILE_PATH, "[OPTIONAL] Path to the output video file with detections added in. "
                                  "If specified will save file to disk, else displays the output to screen"},
         {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
                              " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
-                             " Defaults to CpuAcc,CpuRef"}
+                             " Defaults to CpuAcc,CpuRef"},
+        {PROFILING_ENABLED, "[OPTIONAL] Enabling this option will print important ML related milestones timing "
+                            "information in micro-seconds. By default, this option is disabled. "
+                            "Accepted options are true/false."}
 };
 
 /*
@@ -137,6 +141,10 @@
     pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
     pipelineOptions.m_ModelName = GetSpecifiedOption(options, MODEL_NAME);
 
+    if (CheckOptionSpecified(options, PROFILING_ENABLED))
+    {
+        pipelineOptions.m_ProfilingEnabled = GetSpecifiedOption(options, PROFILING_ENABLED) == "true";
+    }
     if(CheckOptionSpecified(options, PREFERRED_BACKENDS))
     {
         pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS)));
@@ -148,6 +156,8 @@
 
     auto labels = AssignColourToLabel(GetSpecifiedOption(options, LABEL_PATH));
 
+    common::Profiling profiling(pipelineOptions.m_ProfilingEnabled);
+    profiling.ProfilingStart();
     od::IPipelinePtr objectDetectionPipeline = od::CreatePipeline(pipelineOptions);
 
     auto inputAndOutput = GetFrameSourceAndSink(options);
@@ -180,5 +190,6 @@
         frame = reader->ReadFrame();
     }
     sink->Close();
+    profiling.ProfilingStopAndPrintUs("Overall compute time");
     return 0;
 }
diff --git a/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp b/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp
index 077caa4..2c4a76d 100644
--- a/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp
+++ b/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp
@@ -11,8 +11,8 @@
 
 ObjDetectionPipeline::ObjDetectionPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<float>> executor,
                                            std::unique_ptr<IDetectionResultDecoder> decoder) :
-        m_executor(std::move(executor)),
-        m_decoder(std::move(decoder)){}
+    m_executor(std::move(executor)),
+    m_decoder(std::move(decoder)){}
 
 void od::ObjDetectionPipeline::Inference(const cv::Mat& processed, common::InferenceResults<float>& result)
 {
@@ -39,8 +39,8 @@
 
 MobileNetSSDv1::MobileNetSSDv1(std::unique_ptr<common::ArmnnNetworkExecutor<float>> executor,
                                float objectThreshold) :
-        ObjDetectionPipeline(std::move(executor),
-                             std::make_unique<SSDResultDecoder>(objectThreshold))
+    ObjDetectionPipeline(std::move(executor),
+                         std::make_unique<SSDResultDecoder>(objectThreshold))
 {}
 
 void MobileNetSSDv1::PreProcessing(const cv::Mat& frame, cv::Mat& processed)
@@ -52,13 +52,12 @@
         processed.convertTo(processed, CV_32FC3, 1 / 127.5, -1);
     }
 }
-
 YoloV3Tiny::YoloV3Tiny(std::unique_ptr<common::ArmnnNetworkExecutor<float>> executor,
                        float NMSThreshold, float ClsThreshold, float ObjectThreshold) :
-        ObjDetectionPipeline(std::move(executor),
-                             std::move(std::make_unique<YoloResultDecoder>(NMSThreshold,
-                                                                           ClsThreshold,
-                                                                           ObjectThreshold)))
+    ObjDetectionPipeline(std::move(executor),
+                         std::move(std::make_unique<YoloResultDecoder>(NMSThreshold,
+                                                                       ClsThreshold,
+                                                                       ObjectThreshold)))
 {}
 
 void YoloV3Tiny::PreProcessing(const cv::Mat& frame, cv::Mat& processed)
@@ -72,11 +71,12 @@
 
 IPipelinePtr CreatePipeline(common::PipelineOptions& config)
 {
-    auto executor = std::make_unique<common::ArmnnNetworkExecutor<float>>(config.m_ModelFilePath, config.m_backends);
-
+    auto executor = std::make_unique<common::ArmnnNetworkExecutor<float>>(config.m_ModelFilePath,
+                                                                          config.m_backends,
+                                                                          config.m_ProfilingEnabled);
     if (config.m_ModelName == "SSD_MOBILE")
     {
-        float detectionThreshold = 0.6;
+        float detectionThreshold = 0.5;
 
         return std::make_unique<od::MobileNetSSDv1>(std::move(executor),
                                                     detectionThreshold
@@ -99,4 +99,4 @@
     }
 
 }
-}// namespace od
\ No newline at end of file
+}// namespace od
diff --git a/samples/ObjectDetection/test/PipelineTest.cpp b/samples/ObjectDetection/test/PipelineTest.cpp
index 7af0900..48ac32c 100644
--- a/samples/ObjectDetection/test/PipelineTest.cpp
+++ b/samples/ObjectDetection/test/PipelineTest.cpp
@@ -35,7 +35,7 @@
     common::PipelineOptions options;
     options.m_ModelFilePath = GetResourceFilePath("ssd_mobilenet_v1.tflite");
     options.m_ModelName = "SSD_MOBILE";
-    options.m_backends = {"CpuRef"};
+    options.m_backends = {"CpuAcc", "CpuRef"};
 
     od::IPipelinePtr objectDetectionPipeline = od::CreatePipeline(options);
 
diff --git a/samples/ObjectDetection/test/delegate/ArmnnDelegateNetworkExecutorTest.cpp b/samples/ObjectDetection/test/delegate/ArmnnDelegateNetworkExecutorTest.cpp
new file mode 100644
index 0000000..4700660
--- /dev/null
+++ b/samples/ObjectDetection/test/delegate/ArmnnDelegateNetworkExecutorTest.cpp
@@ -0,0 +1,129 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include <catch.hpp>
+#include <opencv2/opencv.hpp>
+#include "ArmnnNetworkExecutor.hpp"
+#include "Types.hpp"
+#include "ImageUtils.hpp"
+#include "SSDResultDecoder.hpp"
+#include "YoloResultDecoder.hpp"
+
+using namespace std;
+
+static string GetResourceFilePath(const string& filename)
+{
+    string testResources = TEST_RESOURCE_DIR;
+
+    if(testResources.back() != '/')
+    {
+        return testResources + "/" + filename;
+    }
+    else
+    {
+        return testResources + filename;
+    }
+}
+
+TEST_CASE("Test Delegate Execution SSD_MOBILE")
+{
+    string testResources = TEST_RESOURCE_DIR;
+    REQUIRE(testResources != "");
+
+    vector<armnn::BackendId> m_backends = {"CpuRef"};
+    string file_path = GetResourceFilePath("ssd_mobilenet_v1.tflite");
+    common::InferenceResults<float> results;
+    cv::Mat processed;
+    cv::Mat cache;
+    float detectionThreshold = 0.6;
+    common::Profiling profiling(true);
+
+    profiling.ProfilingStart();
+    auto executor = make_unique<common::ArmnnNetworkExecutor<float>>(file_path, m_backends, true);
+    int width = executor->GetImageAspectRatio().m_Width;
+    int height = executor->GetImageAspectRatio().m_Height;
+    od::SSDResultDecoder ssdResult(detectionThreshold);
+
+    /* check GetInputDataType */
+    CHECK(executor->GetInputDataType() == armnn::DataType::QAsymmU8);
+    /* check GetImageAspectRatio */
+    CHECK(width == 300);
+    CHECK(height == 300);
+
+    cv::Mat inputFrame = cv::imread(GetResourceFilePath("basketball1.png"), cv::IMREAD_COLOR);
+    cv::cvtColor(inputFrame, inputFrame, cv::COLOR_BGR2RGB);
+    ResizeWithPad(inputFrame, processed, cache, common::Size(width,height));
+    CHECK(executor->Run(processed.data, processed.total() * processed.elemSize(), results) == true);
+    od::DetectedObjects detections = ssdResult.Decode(results,
+                      common::Size(inputFrame.size().width, inputFrame.size().height),
+                      common::Size(width, height), {});
+
+    /* Make sure we've found 2 persons in the image */
+    CHECK(detections.size() == 2 );
+    CHECK(detections[0].GetLabel() == "0");
+    CHECK(detections[1].GetLabel() == "0");
+    /* check GetQuantizationScale */
+    CHECK(to_string(executor->GetQuantizationScale()) == string("0.007812"));
+    /* check GetQuantizationOffset */
+    CHECK(executor->GetQuantizationOffset() == 128);
+    /* check GetOutputQuantizationScale */
+    CHECK(executor->GetOutputQuantizationScale(0) == 0.0f);
+    /* check GetOutputQuantizationOffset */
+    CHECK(executor->GetOutputQuantizationOffset(0) == 0);
+    profiling.ProfilingStopAndPrintUs("Overall test");
+}
+
+TEST_CASE("Test Delegate Execution YOLO_V3")
+{
+    string testResources = TEST_RESOURCE_DIR;
+    REQUIRE(testResources != "");
+
+    vector<armnn::BackendId> m_backends = {"CpuRef"};
+    string file_path = GetResourceFilePath("yolo_v3_tiny_darknet_fp32.tflite");
+    common::InferenceResults<float> results;
+    cv::Mat processed;
+    cv::Mat cache;
+    float NMSThreshold = 0.3f;
+    float ClsThreshold = 0.3f;
+    float ObjectThreshold = 0.3f;
+
+
+    auto executor = make_unique<common::ArmnnNetworkExecutor<float>>(file_path, m_backends);
+    int width = executor->GetImageAspectRatio().m_Width;
+    int height = executor->GetImageAspectRatio().m_Height;
+    od::YoloResultDecoder yoloResult(NMSThreshold, ClsThreshold, ObjectThreshold);
+
+    /* check GetInputDataType */
+    CHECK(executor->GetInputDataType() == armnn::DataType::Float32);
+    /* check GetImageAspectRatio */
+    CHECK(width == 416);
+    CHECK(height == 416);
+
+    /* read the image */
+    cv::Mat inputFrame = cv::imread(GetResourceFilePath("basketball1.png"), cv::IMREAD_COLOR);
+    /* resize it according to the input tensor requirements */
+    ResizeWithPad(inputFrame, processed, cache, common::Size(width,height));
+    /* converting to 3 channel matrix of 32 bits floats */
+    processed.convertTo(processed, CV_32FC3);
+    /* run the inference */
+    CHECK(executor->Run(processed.data, processed.total() * processed.elemSize(), results) == true);
+    /* decode the results */
+    od::DetectedObjects detections = yoloResult.Decode(results,
+                      common::Size(inputFrame.size().width, inputFrame.size().height),
+                      common::Size(width, height), {});
+
+    /* Make sure we've found 2 persons in the image */
+    CHECK(detections.size() == 2 );
+    CHECK(detections[0].GetLabel() == "0");
+    CHECK(detections[1].GetLabel() == "0");
+    /* check GetQuantizationScale */
+    CHECK(to_string(executor->GetQuantizationScale()) == string("0.000000"));
+    /* check GetQuantizationOffset */
+    CHECK(executor->GetQuantizationOffset() == 0);
+    /* check GetOutputQuantizationScale */
+    CHECK(executor->GetOutputQuantizationScale(0) == 0.0f);
+    /* check GetOutputQuantizationOffset */
+    CHECK(executor->GetOutputQuantizationOffset(0) == 0);
+
+}
diff --git a/samples/common/cmake/find_armnn.cmake b/samples/common/cmake/find_armnn.cmake
index 289e912..35f87eb 100644
--- a/samples/common/cmake/find_armnn.cmake
+++ b/samples/common/cmake/find_armnn.cmake
@@ -2,8 +2,13 @@
 # SPDX-License-Identifier: MIT
 # Search for ArmNN built libraries in user-provided path first, then current repository, then system
 
-set(ARMNN_LIB_NAMES "libarmnn.so"
-    "libarmnnTfLiteParser.so")
+if( USE_ARMNN_DELEGATE )
+    set(ARMNN_LIB_NAMES "libarmnn.so"
+        "libarmnnDelegate.so")
+else()
+    set(ARMNN_LIB_NAMES "libarmnn.so"
+        "libarmnnTfLiteParser.so")
+endif()
 
 set(ARMNN_LIBS "")
 
@@ -26,7 +31,13 @@
         list(APPEND ARMNN_LIBS ${ARMNN_${armnn_lib}})
         get_filename_component(LIB_DIR ${ARMNN_${armnn_lib}} DIRECTORY)
         get_filename_component(LIB_PARENT_DIR ${LIB_DIR} DIRECTORY)
-        set(ARMNN_INCLUDE_DIR ${LIB_PARENT_DIR}/include)
+        if( USE_ARMNN_DELEGATE )
+            set(ARMNN_INCLUDE_DIR ${LIB_PARENT_DIR}/include
+                ${PARENT_DIR}/../delegate/include
+                ${PARENT_DIR}/../delegate/src)
+        else()
+            set(ARMNN_INCLUDE_DIR ${LIB_PARENT_DIR}/include)
+        endif()
     endif()
 endforeach()
 
diff --git a/samples/common/cmake/find_catch.cmake b/samples/common/cmake/find_catch.cmake
index 584b807..f55654e 100644
--- a/samples/common/cmake/find_catch.cmake
+++ b/samples/common/cmake/find_catch.cmake
@@ -8,9 +8,10 @@
 file(MAKE_DIRECTORY ${TEST_TPIP_INCLUDE})
 
 ExternalProject_Add(catch2-headers
-    URL https://github.com/catchorg/Catch2/releases/download/v2.11.1/catch.hpp
+    URL https://github.com/catchorg/Catch2/releases/download/v2.13.5/catch.hpp
+    URL_HASH MD5=b43c586fe617aefdee3e480e9fa8f370
     DOWNLOAD_NO_EXTRACT 1
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ${CMAKE_COMMAND} -E copy <DOWNLOAD_DIR>/catch.hpp ${TEST_TPIP_INCLUDE}
     INSTALL_COMMAND ""
-    )
\ No newline at end of file
+    )
diff --git a/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp b/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp
index 9f1ef54..80558d8 100644
--- a/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp
+++ b/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp
@@ -11,6 +11,7 @@
 #include "armnnTfLiteParser/ITfLiteParser.hpp"
 #include "armnnUtils/DataLayoutIndexed.hpp"
 #include <armnn/Logging.hpp>
+#include "Profiling.hpp"
 
 #include <string>
 #include <vector>
@@ -21,7 +22,7 @@
 * @brief Used to load in a network through ArmNN and run inference on it against a given backend.
 *
 */
-template <class Tout>
+template <typename Tout>
 class ArmnnNetworkExecutor
 {
 private:
@@ -31,7 +32,7 @@
     armnn::InputTensors     m_InputTensors;
     armnn::OutputTensors    m_OutputTensors;
     std::vector<armnnTfLiteParser::BindingPointInfo> m_outputBindingInfo;
-
+    Profiling m_profiling;
     std::vector<std::string> m_outputLayerNamesList;
 
     armnnTfLiteParser::BindingPointInfo m_inputBindingInfo;
@@ -59,7 +60,8 @@
     *       * @param[in] backends - The list of preferred backends to run inference on
     */
     ArmnnNetworkExecutor(std::string& modelPath,
-                         std::vector<armnn::BackendId>& backends);
+                         std::vector<armnn::BackendId>& backends,
+                         bool isProfilingEnabled = false);
 
     /**
     * @brief Returns the aspect ratio of the associated model in the order of width, height.
@@ -87,12 +89,15 @@
 
 };
 
-template <class Tout>
+template <typename Tout>
 ArmnnNetworkExecutor<Tout>::ArmnnNetworkExecutor(std::string& modelPath,
-                                           std::vector<armnn::BackendId>& preferredBackends)
-        : m_Runtime(armnn::IRuntime::Create(armnn::IRuntime::CreationOptions()))
+                                           std::vector<armnn::BackendId>& preferredBackends,
+                                           bool isProfilingEnabled):
+        m_profiling(isProfilingEnabled),
+        m_Runtime(armnn::IRuntime::Create(armnn::IRuntime::CreationOptions()))
 {
     // Import the TensorFlow lite model.
+    m_profiling.ProfilingStart();
     armnnTfLiteParser::ITfLiteParserPtr parser = armnnTfLiteParser::ITfLiteParser::Create();
     armnn::INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath.c_str());
 
@@ -151,16 +156,16 @@
             ));
         }
     }
-
+    m_profiling.ProfilingStopAndPrintUs("ArmnnNetworkExecutor time");
 }
 
-template <class Tout>
+template <typename Tout>
 armnn::DataType ArmnnNetworkExecutor<Tout>::GetInputDataType() const
 {
     return m_inputBindingInfo.second.GetDataType();
 }
 
-template <class Tout>
+template <typename Tout>
 void ArmnnNetworkExecutor<Tout>::PrepareTensors(const void* inputData, const size_t dataBytes)
 {
     assert(m_inputBindingInfo.second.GetNumBytes() >= dataBytes);
@@ -168,9 +173,10 @@
     m_InputTensors = {{ m_inputBindingInfo.first, armnn::ConstTensor(m_inputBindingInfo.second, inputData)}};
 }
 
-template <class Tout>
+template <typename Tout>
 bool ArmnnNetworkExecutor<Tout>::Run(const void* inputData, const size_t dataBytes, InferenceResults<Tout>& outResults)
 {
+    m_profiling.ProfilingStart();
     /* Prepare tensors if they are not ready */
     ARMNN_LOG(debug) << "Preparing tensors...";
     this->PrepareTensors(inputData, dataBytes);
@@ -190,37 +196,37 @@
 
     outResults.reserve(m_outputLayerNamesList.size());
     outResults = m_OutputBuffer;
-
+    m_profiling.ProfilingStopAndPrintUs("Total inference time");
     return (armnn::Status::Success == ret);
 }
 
-template <class Tout>
+template <typename Tout>
 float ArmnnNetworkExecutor<Tout>::GetQuantizationScale()
 {
     return this->m_inputBindingInfo.second.GetQuantizationScale();
 }
 
-template <class Tout>
+template <typename Tout>
 int ArmnnNetworkExecutor<Tout>::GetQuantizationOffset()
 {
     return this->m_inputBindingInfo.second.GetQuantizationOffset();
 }
 
-template <class Tout>
+template <typename Tout>
 float ArmnnNetworkExecutor<Tout>::GetOutputQuantizationScale(int tensorIndex)
 {
     assert(this->m_outputLayerNamesList.size() > tensorIndex);
     return this->m_outputBindingInfo[tensorIndex].second.GetQuantizationScale();
 }
 
-template <class Tout>
+template <typename Tout>
 int ArmnnNetworkExecutor<Tout>::GetOutputQuantizationOffset(int tensorIndex)
 {
     assert(this->m_outputLayerNamesList.size() > tensorIndex);
     return this->m_outputBindingInfo[tensorIndex].second.GetQuantizationOffset();
 }
 
-template <class Tout>
+template <typename Tout>
 Size ArmnnNetworkExecutor<Tout>::GetImageAspectRatio()
 {
     const auto shape = m_inputBindingInfo.second.GetShape();
diff --git a/samples/common/include/Utils/Profiling.hpp b/samples/common/include/Utils/Profiling.hpp
new file mode 100644
index 0000000..cca5632
--- /dev/null
+++ b/samples/common/include/Utils/Profiling.hpp
@@ -0,0 +1,90 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+#include <chrono>
+#include <iostream>
+#include <string>
+
+using namespace std::chrono;
+
+namespace common
+{
+/**
+* @brief Used for measuring performance of specific actions in the code.
+ * Profiling should be enabled with a parameter passed to the constructor and
+ * it's disabled by default.
+ * In order to measure timing, wrap the desired code section with
+ * ProfilingStart() and ProfilingStopAndPrintUs(title)
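+ *
+ * A minimal usage sketch (names here are illustrative):
+ *     common::Profiling profiling(true);
+ *     profiling.ProfilingStart();
+ *     // ... code section to measure ...
+ *     profiling.ProfilingStopAndPrintUs("Code section");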
+*/
+class Profiling {
+private:
+
+    struct group_thousands : std::numpunct<char>
+    {
+        std::string do_grouping() const override { return "\3"; }
+    };
+
+    bool mProfilingEnabled{};
+    steady_clock::time_point mStart{};
+    steady_clock::time_point mStop{};
+public:
+    Profiling() : mProfilingEnabled(false) {};
+
+    /**
+    * @brief Initializes the profiling object.
+    *
+    *       * @param[in] isEnabled - Enables the profiling computation and prints.
+    */
+    explicit Profiling(bool isEnabled) : mProfilingEnabled(isEnabled) {};
+
+/**
+* @brief Starts the profiling measurement.
+*
+*/
+
+    void ProfilingStart()
+    {
+        if (mProfilingEnabled)
+        {
+            mStart = steady_clock::now();
+        }
+    }
+
+/**
+* @brief Stops the profiling measurement, without printing the results.
+*
+*/
+    auto ProfilingStop()
+    {
+        if (mProfilingEnabled)
+        {
+            mStop = steady_clock::now();
+        }
+    }
+
+/**
+* @brief Get the measurement result in micro-seconds.
+*
+*/
+    auto ProfilingGetUs()
+    {
+        return mProfilingEnabled ? duration_cast<microseconds>(mStop - mStart).count() : 0;
+    }
+
+/**
+* @brief Stop the profiling measurement and print the result in micro-seconds.
+*
+*/
+    void ProfilingStopAndPrintUs(const std::string &title)
+    {
+        ProfilingStop();
+        if (mProfilingEnabled) {
+            std::cout.imbue(std::locale(std::cout.getloc(), new group_thousands));
+            std::cout << "Profiling: " << title << ": " << ProfilingGetUs() << " uSeconds" << std::endl;
+        }
+    }
+};
+}// namespace common
\ No newline at end of file
diff --git a/samples/common/include/Utils/Types.hpp b/samples/common/include/Utils/Types.hpp
index 4d1f708..184e02a 100644
--- a/samples/common/include/Utils/Types.hpp
+++ b/samples/common/include/Utils/Types.hpp
@@ -44,6 +44,7 @@
     std::string m_ModelName;
     std::string m_ModelFilePath;
     std::vector<armnn::BackendId> m_backends;
+    bool m_ProfilingEnabled = false;
 };
 
 template<typename T>