COMPMID-920: Introduce prepare() stage
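
Introduce a prepare() method on CLConvolutionLayer,
CLFullyConnectedLayer, CLGEMM, CLGEMMConvolutionLayer and
CLWinogradConvolutionLayer. prepare() performs the one-off work that
previously ran lazily inside the first run() (weights reshaping,
B-matrix transposition, Winograd filter transform), marks the original
weights as unused and frees intermediate tensors that are no longer
needed. run() calls prepare() itself, so existing callers are
unaffected; callers that want to front-load this cost and release the
original weights early can invoke prepare() explicitly.

A minimal usage sketch, assuming already configured, allocated and
filled CLTensors (the tensor names below are illustrative only):

    // Illustrative sketch: input/weights/biases/output are assumed to
    // be CLTensor objects that were already configured, allocated and
    // filled; setup code is elided.
    CLFullyConnectedLayer fc;
    fc.configure(&input, &weights, &biases, &output);

    fc.prepare();                // one-off: reshapes the weights and marks
                                 // the originals unused; later calls are no-ops
    weights.allocator()->free(); // originals can now be released early

    fc.run();                    // steady-state; prepare() is a no-op from here on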

Change-Id: I08ddb7f6e061178e7566518b48e4e18f8f078596
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129825
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 83281e1..3d4fb11 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -135,5 +135,11 @@
 
 void CLConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void CLConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 9b3bf48..151fa1b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -220,13 +220,6 @@
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
     }
-
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
 }
 
 Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -311,17 +304,7 @@
 
 void CLFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -356,3 +339,31 @@
 
     _memory_group.release();
 }
+
+void CLFullyConnectedLayer::prepare()
+{
+    // Reshape the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run the reshape weights kernel and mark the original weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release the reshaped weights if marked unused by CLGEMM
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_reshape_weights_output.is_used())
+            {
+                _reshape_weights_output.allocator()->free();
+            }
+        }
+
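+        // Block until the enqueued reshape work completes, as the original weights may be freed after prepare()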
+        CLScheduler::get().queue().finish();
+        _are_weights_reshaped = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 37fa0c5..e735adb 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -98,7 +98,7 @@
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -114,6 +114,7 @@
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -169,7 +170,11 @@
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
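+        // When B is reshaped only once, defer the allocation of _tmp_b to prepare()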
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -188,6 +192,8 @@
 
 void CLGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
@@ -195,18 +201,7 @@
         // Run interleave kernel
         CLScheduler::get().enqueue(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
-
-            // Mark original b matrix as unused
-            if(_reshape_b_only_on_first_run)
-            {
-                _original_b->mark_as_unused();
-            }
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -223,6 +218,21 @@
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_transpose_kernel, false);
+            _original_b->mark_as_unused();
+        }
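+        // Ensure any enqueued reshape work has completed before returning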
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index cf8a6a8..610eec4 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,7 +91,7 @@
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -165,7 +165,7 @@
                                                                 dilation,
                                                                 act_info));
 
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
@@ -258,9 +258,6 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
-
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -305,7 +302,7 @@
     unsigned int mat_weights_cols = weights->dimension(3);
     unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized? nullptr:biases, nullptr));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
 
     // Create tensor info for im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -369,16 +366,7 @@
 
 void CLGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -398,13 +386,6 @@
     {
         // Run gemm
         _mm_gemm.run();
-
-        // Release reshaped weights if marked unused by CLGEMM
-        if(_is_first_run && !_weights_reshaped.is_used())
-        {
-            CLScheduler::get().queue().finish();
-            _weights_reshaped.allocator()->free();
-        }
     }
 
     // Reshape output matrix
@@ -417,6 +398,30 @@
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark the original weights as unused
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Run GEMM prepare
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_weights_reshaped.is_used())
+            {
+                _weights_reshaped.allocator()->free();
+            }
+        }
+
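+        // Wait for the reshape and GEMM preparation kernels to complete before returning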
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 5ff4fbc..025a16b 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -69,7 +69,7 @@
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
 {
 }
 
@@ -97,6 +97,9 @@
                                                     conv_info,
                                                     input->info()->data_layout());
 
+    _is_prepared      = false;
+    _original_weights = weights;
+
     // Manage intermediate tensors
     _memory_group.manage(&_input0);
     _memory_group.manage(&_batched_mm_output);
@@ -124,7 +127,7 @@
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
-    _input1.allocator()->allocate();
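+    // _input1 (the transformed weights) is allocated in prepare()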
     _batched_mm_output.allocator()->allocate();
 }
 
@@ -182,11 +184,7 @@
 
 void CLWinogradConvolutionLayer::run()
 {
-    if(_is_first_run)
-    {
-        // Run filter transform
-        CLScheduler::get().enqueue(_filter_transform, false);
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -196,13 +194,6 @@
     // Run batched matrix multiplication
     _batched_mm.run();
 
-    // Release reshaped weights if marked unused by CLGEMM
-    if(_is_first_run && !_input1.is_used())
-    {
-        CLScheduler::get().queue().finish();
-        _input1.allocator()->free();
-    }
-
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
 
@@ -212,6 +203,26 @@
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
+
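+        // Wait for the filter transform and GEMM preparation to complete before returning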
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }