COMPMID-959: Sets memory manager to CLWinograd

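- Passes the memory manager to CLWinogradConvolutionLayer when it is created by CLConvolutionLayer
- Marks the original CLGEMM B matrix as unused when B is reshaped only on the first run
- Frees the reshaped weights in CLGEMMConvolutionLayer and CLWinogradConvolutionLayer after the first run once CLGEMM has marked them as unused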

Change-Id: I425a3f864c756e0e2b4da895e1730b8822149ba8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/128891
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 643e24d..97ef895 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -53,7 +53,7 @@
     {
         case ConvolutionMethod::WINOGRAD:
         {
-            auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>();
+            auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
             f->configure(input, weights, biases, output, conv_info);
             _function = std::move(f);
             break;
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index cf41ecc..bff5781 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -92,8 +92,8 @@
 } // namespace
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
-      _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+      _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
 
@@ -104,6 +104,9 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
 
+    // Store original b matrix
+    _original_b = b;
+
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
 
@@ -192,7 +195,11 @@
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
 
-            _is_first_run = false;
+            // Mark original b matrix as unused
+            if(_reshape_b_only_on_first_run)
+            {
+                _original_b->mark_as_unused();
+            }
         }
         else if(!_reshape_b_only_on_first_run)
         {
@@ -211,4 +218,6 @@
     }
 
     _memory_group.release();
+
+    _is_first_run = false;
 }
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 084c4df..87c4a30 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -374,7 +374,6 @@
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
         _reshape_weights.run();
-        _is_first_run = false;
 
         // Mark original weights tensor as unused
         _original_weights->mark_as_unused();
@@ -398,6 +397,13 @@
     {
         // Run gemm
         _mm_gemm.run();
+
+        // Release reshaped weights if marked unused by CLGEMM
+        if(_is_first_run && !_weights_reshaped.is_used())
+        {
+            CLScheduler::get().queue().finish();
+            _weights_reshaped.allocator()->free();
+        }
     }
 
     // Reshape output matrix
@@ -410,4 +416,6 @@
     }
 
     _memory_group.release();
+
+    _is_first_run = false;
 }
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 86ccdda..65747cf 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -151,8 +151,6 @@
     {
         // Run filter transform
         CLScheduler::get().enqueue(_filter_transform, false);
-
-        _is_first_run = false;
     }
 
     _memory_group.acquire();
@@ -163,6 +161,13 @@
     // Run batched matrix multiplication
     _batched_mm.run();
 
+    // Release reshaped weights if marked unused by CLGEMM
+    if(_is_first_run && !_input1.is_used())
+    {
+        CLScheduler::get().queue().finish();
+        _input1.allocator()->free();
+    }
+
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
 
@@ -172,4 +177,6 @@
     }
 
     _memory_group.release();
+
+    _is_first_run = false;
 }