COMPMID-534: Add MemoryManager support in OpenCL functions

Adds support for:
-CLConvolution
-CLGEMM
-CLGEMMLowp
-CLHOGDescriptor
-CLHOGGradient
-CLHOGMultiDetection
-CLL2Normalize
-CLLocallyConnectedLayer
-CLOpticalFlow
-CLReductionOperation

Change-Id: Ib13354d274ccf32ae933f3fbbad3ac3896cfd3bd
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87938
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
index f526f6f..bc05cb2 100644
--- a/arm_compute/runtime/CL/functions/CLConvolution.h
+++ b/arm_compute/runtime/CL/functions/CLConvolution.h
@@ -27,11 +27,14 @@
 #include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -70,7 +73,7 @@
 {
 public:
     /** Default constructor */
-    CLConvolutionSquare();
+    CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -86,6 +89,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                                 _memory_group;   /**< Function's memory group */
     CLTensor                                      _tmp;            /**< temporary buffer for output of horizontal pass */
     bool                                          _is_separable;   /**< true if the convolution can be separated */
     CLSeparableConvolutionHorKernel<matrix_size>  _kernel_hor;     /**< kernel for horizontal pass of separated convolution */
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 9b88730..2765b77 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -29,8 +29,12 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +52,7 @@
 {
 public:
     /** Default constructor. */
-    CLGEMM();
+    CLGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the kernel's inputs and output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -70,6 +74,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup              _memory_group;
     CLGEMMInterleave4x4Kernel  _interleave_kernel;
     CLGEMMTranspose1xWKernel   _transpose_kernel;
     CLGEMMMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
index da8883c..613fcaa 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
@@ -25,12 +25,15 @@
 #define __ARM_COMPUTE_CLGEMMLOWP_H__
 
 #include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -47,7 +50,7 @@
 {
 public:
     /** Constructor */
-    CLGEMMLowp();
+    CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the kernel's inputs, output
     *
     * @note GEMM_LOWP:  low precision matrix multiply kernel
@@ -75,6 +78,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                  _memory_group;
     CLGEMMInterleave4x4Kernel      _interleave_kernel;
     CLGEMMTranspose1xWKernel       _transpose_kernel;
     CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
index cdb23bf..00d64f1 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -26,9 +26,13 @@
 
 #include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -44,7 +48,7 @@
 {
 public:
     /** Default constructor */
-    CLHOGDescriptor();
+    CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destination, HOG data-object and border mode
      *
      * @param[in, out] input                 Input tensor. Data type supported: U8
@@ -60,6 +64,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                 _memory_group;
     CLHOGGradient                 _gradient;
     CLHOGOrientationBinningKernel _orient_bin;
     CLHOGBlockNormalizationKernel _block_norm;
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
index e74a684..051e586 100644
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -28,11 +28,14 @@
 
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -46,7 +49,7 @@
 {
 public:
     /** Default constructor */
-    CLHOGGradient();
+    CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destinations, phase type and border mode
      *
      * @param[in, out] input                 Input tensor. Data type supported: U8.
@@ -63,6 +66,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup          _memory_group;
     CLDerivative           _derivative;
     CLMagnitudePhaseKernel _mag_phase;
     CLTensor               _gx;
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
index 3fe0fa9..1ff9865 100644
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -28,10 +28,14 @@
 #include "arm_compute/core/CL/ICLMultiHOG.h"
 #include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
 #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -53,7 +57,7 @@
 {
 public:
     /** Default constructor */
-    CLHOGMultiDetection();
+    CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -85,6 +89,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                                                 _memory_group;
     CLHOGGradient                                                 _gradient_kernel;
     std::unique_ptr<CLHOGOrientationBinningKernel[]>              _orient_bin_kernel;
     std::unique_ptr<CLHOGBlockNormalizationKernel[]>              _block_norm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLL2Normalize.h b/arm_compute/runtime/CL/functions/CLL2Normalize.h
index 52c562c..20af54e 100644
--- a/arm_compute/runtime/CL/functions/CLL2Normalize.h
+++ b/arm_compute/runtime/CL/functions/CLL2Normalize.h
@@ -26,11 +26,14 @@
 
 #include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -42,7 +45,7 @@
 {
 public:
     /** Constructor */
-    CLL2Normalize();
+    CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
 
     /** Set the input and output tensors.
      *
@@ -57,6 +60,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup        _memory_group;
     CLReductionOperation _reduce_func;
     CLL2NormalizeKernel  _normalize_kernel;
     CLTensor             _sumsq;
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index 5f4f1ba..f56039f 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -31,7 +31,11 @@
 #include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +52,7 @@
 {
 public:
     /** Default constructor */
-    CLLocallyConnectedLayer();
+    CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -66,6 +70,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                          _memory_group;
     CLIm2ColKernel                         _input_im2col_kernel;
     CLWeightsReshapeKernel                 _weights_reshape_kernel;
     CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
index ca3f861..94dda18 100644
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -29,9 +29,11 @@
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -57,7 +59,7 @@
 {
 public:
     /** Default constructor */
-    CLOpticalFlow();
+    CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLOpticalFlow(const CLOpticalFlow &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -91,6 +93,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                              _memory_group;
     std::unique_ptr<CLLKTrackerInitKernel[]>   _tracker_init_kernel;
     std::unique_ptr<CLLKTrackerStage0Kernel[]> _tracker_stage0_kernel;
     std::unique_ptr<CLLKTrackerStage1Kernel[]> _tracker_stage1_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 89fdad2..09beaba 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -27,8 +27,10 @@
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
 #include <memory>
@@ -44,7 +46,7 @@
 {
 public:
     /* Constructor */
-    CLReductionOperation();
+    CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
 
     /** Set the input and output tensors.
      *
@@ -59,6 +61,7 @@
     void run() override;
 
 private:
+    CLMemoryGroup                                 _memory_group;
     std::vector<CLTensor *>                       _sums_vector{ nullptr };
     std::unique_ptr<CLReductionOperationKernel[]> _reduction_kernels_vector{ nullptr };
     std::unique_ptr<CLFillBorderKernel[]>         _border_handlers_vector{ nullptr };
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 6410444..a9b0867 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -47,8 +47,8 @@
 }
 
 template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
-    : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
 }
 
@@ -66,6 +66,9 @@
         std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp);
+
         if(scale == 0)
         {
             scale = calculate_matrix_scale(conv, matrix_size);
@@ -92,8 +95,12 @@
 
     if(_is_separable)
     {
+        _memory_group.acquire();
+
         CLScheduler::get().enqueue(_kernel_hor, false);
         CLScheduler::get().enqueue(_kernel_vert);
+
+        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 9867229..a81d113 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,8 +38,8 @@
 
 using namespace arm_compute;
 
-CLGEMM::CLGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
 {
 }
 
@@ -86,6 +86,10 @@
         TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
         _tmp_b.allocator()->init(info_b);
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
         // Configure interleave kernel
         _interleave_kernel.configure(a, &_tmp_a);
 
@@ -115,6 +119,8 @@
 
 void CLGEMM::run()
 {
+    _memory_group.acquire();
+
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
@@ -132,4 +138,6 @@
     {
         CLScheduler::get().enqueue(_ma_kernel);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d..db6d11c 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-CLGEMMLowp::CLGEMMLowp()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
 {
 }
 
@@ -62,6 +62,10 @@
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     // Configure kernels
     _interleave_kernel.configure(a, &_tmp_a);
     _transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@
 
 void CLGEMMLowp::run()
 {
+    _memory_group.acquire();
+
     /* Run interleave kernel */
     CLScheduler::get().enqueue(_interleave_kernel, false);
 
@@ -82,4 +88,6 @@
 
     /* Run matrix multiply kernel */
     CLScheduler::get().enqueue(_mm_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03..1470d5c 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
 
 using namespace arm_compute;
 
-CLHOGDescriptor::CLHOGDescriptor()
-    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
 {
 }
 
@@ -71,9 +71,16 @@
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
     _hog_space.allocator()->init(info_space);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_hog_space);
+
     // Initialise orientation binning kernel
     _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
 
@@ -88,6 +95,8 @@
 
 void CLHOGDescriptor::run()
 {
+    _memory_group.acquire();
+
     // Run gradient
     _gradient.run();
 
@@ -96,4 +105,6 @@
 
     // Run block normalization
     CLScheduler::get().enqueue(_block_norm);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474..51aeaed 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
 
 using namespace arm_compute;
 
-CLHOGGradient::CLHOGGradient()
-    : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
 {
 }
 
@@ -47,6 +47,10 @@
     _gx.allocator()->init(info);
     _gy.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
 
@@ -67,9 +71,13 @@
 
 void CLHOGGradient::run()
 {
+    _memory_group.acquire();
+
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     CLScheduler::get().enqueue(_mag_phase);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 9eed355..8012c2f 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -34,8 +34,9 @@
 
 using namespace arm_compute;
 
-CLHOGMultiDetection::CLHOGMultiDetection() // NOLINT
-    : _gradient_kernel(),
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _gradient_kernel(),
       _orient_bin_kernel(),
       _block_norm_kernel(),
       _hog_detect_kernel(),
@@ -141,6 +142,10 @@
     TensorInfo info_phase(shape_img, Format::U8);
     _phase.allocator()->init(info_phase);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
 
@@ -166,10 +171,17 @@
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
         _hog_space[i].allocator()->init(info_space);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_space.get() + i);
+
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
     // Configure CLTensor for the normalized HOG space and block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
@@ -180,10 +192,19 @@
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
         _hog_norm_space[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_norm_space.get() + i);
+
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
     detection_window_strides->map(CLScheduler::get().queue(), true);
 
     // Configure HOG detector kernel
@@ -200,14 +221,6 @@
     _non_maxima_kernel->configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
         _hog_norm_space[i].allocator()->allocate();
@@ -218,6 +231,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Reset detection window
     _detection_windows->clear();
 
@@ -250,4 +265,6 @@
         Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
index 18d05be..99be8ca 100644
--- a/src/runtime/CL/functions/CLL2Normalize.cpp
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -34,13 +34,16 @@
 
 using namespace arm_compute;
 
-CLL2Normalize::CLL2Normalize()
-    : _reduce_func(), _normalize_kernel(), _sumsq()
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
 void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
 {
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
     // Configure kernels
     _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
     _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
@@ -51,6 +54,10 @@
 
 void CLL2Normalize::run()
 {
+    _memory_group.acquire();
+
     _reduce_func.run();
     CLScheduler::get().enqueue(_normalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index ef6fb50..a89a45a 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
 
 using namespace arm_compute;
 
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false)
 {
 }
 
@@ -99,6 +100,10 @@
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_im2col_reshaped);
+    _memory_group.manage(&_gemm_output);
+
     // Configure kernels
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
@@ -120,6 +125,8 @@
         CLScheduler::get().enqueue(_weights_reshape_kernel);
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
 
@@ -128,4 +135,6 @@
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index 07ca2f9..d00b1b5 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -37,8 +37,9 @@
 
 using namespace arm_compute;
 
-CLOpticalFlow::CLOpticalFlow() // NOLINT
-    : _tracker_init_kernel(),
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _tracker_init_kernel(),
       _tracker_stage0_kernel(),
       _tracker_stage1_kernel(),
       _tracker_finalize_kernel(),
@@ -116,6 +117,10 @@
         _scharr_gx[i].allocator()->init(tensor_info);
         _scharr_gy[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_scharr_gx.get() + i);
+        _memory_group.manage(_scharr_gy.get() + i);
+
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
@@ -144,6 +149,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
+    _memory_group.acquire();
+
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
@@ -160,4 +167,6 @@
     }
 
     CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 5bb3320..6643c9b 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -35,8 +35,8 @@
 
 using namespace arm_compute;
 
-CLReductionOperation::CLReductionOperation()
-    : _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
 {
 }
 
@@ -59,6 +59,7 @@
         shape.set(0, ceil(shape.x() / 128.f));
         auto *tensor = new CLTensor;
         tensor->allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _memory_group.manage(tensor);
         _sums_vector.push_back(tensor);
     }
 
@@ -76,9 +77,13 @@
 
 void CLReductionOperation::run()
 {
+    _memory_group.acquire();
+
     for(unsigned int i = 0; i < _num_of_stages; ++i)
     {
         CLScheduler::get().enqueue(_border_handlers_vector[i], false);
         CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
     }
+
+    _memory_group.release();
 }
\ No newline at end of file