COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES)

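This change replaces the ad-hoc first-run flags (_is_first_run,
_reshaped_kernel, _are_weights_reshaped) with an explicit prepare()
stage: one-off work such as weights reshaping and B-matrix
transposition now lives in prepare(), which run() invokes on first use
and which callers may also invoke ahead of time so that the original
weight tensors can be marked unused and their memory reclaimed before
inference.

A minimal usage sketch of the new lifecycle (shapes, values and the
choice of CLGEMMConvolutionLayer are illustrative only, not part of
this patch):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

    using namespace arm_compute;

    void example()
    {
        CLScheduler::get().default_init();

        CLTensor input, weights, biases, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(30U, 30U, 8U), 1, DataType::F32));

        CLGEMMConvolutionLayer conv;
        conv.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 0, 0));

        input.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        output.allocator()->allocate();

        // Optional: run the one-off weights reshaping now. This marks
        // `weights` as unused so its backing memory can be reclaimed.
        conv.prepare();

        // run() still calls prepare() internally; it is a no-op once
        // _is_prepared is set.
        conv.run();
    }
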
Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 6c54b18..4c1ea5b 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@
     : _memory_group(std::move(memory_manager)),
       _scale_f(),
       _conv_f(),
-      _scaled_output()
+      _scaled_output(),
+      _is_prepared(false)
 {
 }
 
@@ -104,6 +105,8 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
 
+    _is_prepared = false;
+
     _memory_group.manage(&_scaled_output);
 
     // configure scale function
@@ -126,8 +129,21 @@
 
 void CLDeconvolutionLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
+
     _scale_f.run();
     _conv_f.run();
+
     _memory_group.release();
 }
+
+void CLDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _conv_f.prepare();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index c2b24e3..1815361 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -91,7 +91,7 @@
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -104,7 +104,7 @@
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
@@ -182,7 +182,6 @@
 
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
-    _weights_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
 }
 
@@ -235,18 +234,7 @@
 
 void CLDepthwiseConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-        _is_first_run = false;
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     CLScheduler::get().enqueue(_im2col_kernel);
     CLScheduler::get().enqueue(_v2mm_input_fill_border);
@@ -257,3 +245,20 @@
         CLScheduler::get().enqueue(_output_stage_kernel);
     }
 }
+
+void CLDepthwiseConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+        _original_weights->mark_as_unused();
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
index af2c6f0..fa2c3af 100644
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,14 @@
 
 void CLDepthwiseSeparableConvolutionLayer::run()
 {
+    prepare();
+
     _depthwise_conv.run();
     _pointwise_conv.run();
+}
+
+void CLDepthwiseSeparableConvolutionLayer::prepare()
+{
+    _depthwise_conv.prepare();
+    _pointwise_conv.prepare();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index f9713bb..bb76872 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -84,12 +84,10 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
 
-    // Store original b matrix
-    _original_b = b;
-
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _is_prepared                 = false;
+    _original_b                  = b;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -262,7 +260,7 @@
     {
         if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
         {
-            // Run transpose kernel
+            // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
             CLScheduler::get().enqueue(_transpose_kernel, false);
             _original_b->mark_as_unused();
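With _original_b now stored, CLGEMM::prepare() transposes B exactly once when GEMMInfo::reshape_b_only_on_first_run() is set and then drops the original matrix. A hedged sketch of the intended calling pattern (tensor setup elided; the GEMMInfo argument order (is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run) is assumed):

    // a, b and dst are previously initialised and allocated CLTensors.
    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo(false, false, true));

    gemm.prepare(); // transposes B into _tmp_b and marks `b` unused
    for(int i = 0; i < 10; ++i)
    {
        // ... refresh the contents of `a` ...
        gemm.run(); // B is not transposed again on subsequent runs
    }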
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 27bed44..82710b6 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,8 +91,7 @@
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false),
-      _retain_internal_weights(false)
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -166,10 +165,9 @@
                                                                 dilation,
                                                                 act_info));
 
-    _is_prepared             = false;
-    _original_weights        = weights;
-    _is_quantized            = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _retain_internal_weights = weights_info.retain_internal_weights();
+    _is_prepared      = weights_info.retain_internal_weights();
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const DataType dt = input->info()->data_type();
 
@@ -408,23 +406,18 @@
 {
     if(!_is_prepared)
     {
-        if(!_retain_internal_weights)
-        {
-            // Run weights reshaping and mark as unused
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-            _weights_reshaped.allocator()->allocate();
-            _reshape_weights.run();
-            _original_weights->mark_as_unused();
-        }
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        // Run GEMM prepare
-        if(!_is_quantized)
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM
+        _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
+        if(!_weights_reshaped.is_used())
         {
-            _mm_gemm.prepare();
-            if(!_weights_reshaped.is_used() && !_retain_internal_weights)
-            {
-                _weights_reshaped.allocator()->free();
-            }
+            _weights_reshaped.allocator()->free();
         }
 
         CLScheduler::get().queue().finish();
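The _retain_internal_weights member is gone: configure() now seeds _is_prepared directly from weights_info.retain_internal_weights(), so a function that keeps its previously reshaped weights skips the whole prepare() body. A hedged sketch (the WeightsInfo parameter order (are_reshaped, kernel_width, kernel_height, num_kernels, retain_internal_weights) is assumed):

    // Reuse the internally retained reshaped weights: prepare() is a no-op.
    WeightsInfo winfo(false, 3U, 3U, 8U, true /* retain_internal_weights */);
    conv.configure(&input, &weights, &biases, &output, conv_info, winfo);
    conv.run(); // no reshape and no mark_as_unused: _is_prepared starts true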
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 711b006..94dc0e0 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -59,8 +59,23 @@
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
-      _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)),
+      _mm_kernel(),
+      _mtx_a_reshape_kernel(),
+      _mtx_b_reshape_kernel(),
+      _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(),
+      _vector_sum_col(),
+      _vector_sum_row(),
+      _tmp_a(),
+      _tmp_b(),
+      _original_b(nullptr),
+      _a_offset(0),
+      _b_offset(0),
+      _is_interleaved_transposed(true),
+      _reshape_b_only_on_first_run(false),
+      _is_prepared(false)
 {
 }
 
@@ -70,6 +85,8 @@
     ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
+    _is_prepared                 = false;
+    _original_b                  = b;
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->info()->quantization_info().offset;
     _b_offset                    = b->info()->quantization_info().offset;
@@ -149,10 +166,13 @@
     if(_is_interleaved_transposed)
     {
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
-    if(_a_offset != 0)
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
         _vector_sum_col.allocator()->allocate();
     }
@@ -234,6 +254,8 @@
 
 void CLGEMMLowpMatrixMultiplyCore::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
@@ -241,21 +263,17 @@
         // Run reshape matrix A
         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
 
-        if(_is_first_run || !_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
             CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
         }
     }
 
-    // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
-    if(_is_first_run || !_reshape_b_only_on_first_run)
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0; if B is reshaped only on the first run, the reduction is done in prepare()
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
-        // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0)
-        {
-            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
-        }
+        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
     }
 
     // Run matrix multiply
@@ -271,6 +289,30 @@
     CLScheduler::get().enqueue(_offset_contribution_kernel, true);
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMMLowpMatrixMultiplyCore::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            // Run reshape kernel and mark original weights tensor as unused
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+            _original_b->mark_as_unused();
+        }
+
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0 and B is reshaped only on the first run
+        if(_a_offset != 0 && _reshape_b_only_on_first_run)
+        {
+            _vector_sum_col.allocator()->allocate();
+            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
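The B-side reduction (_vector_sum_col) depends only on B, so when B is reshaped once it can also be computed once in prepare(); it is needed only when the A offset is non-zero. A hedged sketch of a quantized setup that takes this path (shapes and quantization parameters are illustrative):

    // QASYMM8 GEMM with a non-zero A offset: prepare() runs the B reduction once.
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::S32));

    CLGEMMLowpMatrixMultiplyCore mm;
    mm.configure(&a, &b, &dst, GEMMInfo(false, false, true /* reshape_b_only_on_first_run */));

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    mm.prepare(); // reshapes B and computes _vector_sum_col once
    mm.run();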
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 31d5cd5..d15e5df 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@
 
 CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false), _original_weights(nullptr)
+      _is_prepared(false), _original_weights(nullptr)
 {
 }
 
@@ -128,7 +128,7 @@
 
     bool _has_bias    = (biases != nullptr);
     _original_weights = weights;
-    _is_first_run     = true;
+    _is_prepared      = false;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -160,7 +160,6 @@
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
 
     // Allocate intermediate tensors
-    _weights_reshaped.allocator()->allocate();
     _input_im2col_reshaped.allocator()->allocate();
     _gemm_output.allocator()->allocate();
 
@@ -169,17 +168,7 @@
 
 void CLLocallyConnectedLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _is_first_run = false;
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -194,3 +183,19 @@
 
     _memory_group.release();
 }
+
+void CLLocallyConnectedLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        _original_weights->mark_as_unused();
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 4843ba6..0e1b9d5 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -36,7 +36,8 @@
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+      _is_prepared(false)
 {
 }
 
@@ -74,6 +75,8 @@
     const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     TensorShape shape      = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
 
+    _is_prepared = false;
+
     _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
 
@@ -100,7 +103,10 @@
 
 void CLRNNLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
+
     _fully_connected_kernel.run();
     _gemm_state_f.run();
     CLScheduler::get().enqueue(_add_kernel);
@@ -108,5 +114,17 @@
 
     // copy hidden out to output
     CLScheduler::get().enqueue(_copy_kernel);
+
     _memory_group.release();
+}
+
+void CLRNNLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _fully_connected_kernel.prepare();
+        _gemm_state_f.prepare();
+
+        _is_prepared = true;
+    }
 }
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index d1ef87d..67b2ae9 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -37,7 +37,7 @@
 using namespace arm_compute;
 
 GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel(), _weights_reshaped()
+    : _weights_reshape_kernel()
 {
 }
 
@@ -68,7 +68,7 @@
 
 GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
-      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false)
+      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -97,7 +97,7 @@
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
 
     if(biases != nullptr)
@@ -184,9 +184,6 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
-
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -200,17 +197,7 @@
 
 void GCConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _reshape_weights.run();
-        _is_first_run = false;
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -221,17 +208,34 @@
 
     // Run gemm on reshaped matrices
     _mm_gemm.run();
-
     GCScheduler::get().memory_barrier();
+
     // Reshape output matrix
     GCScheduler::get().dispatch(_output_col2im_kernel, false);
+    GCScheduler::get().memory_barrier();
 
     _memory_group.release();
 
-    GCScheduler::get().memory_barrier();
     // Run Activation Layer
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.run();
     }
 }
+
+void GCConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index a300033..ab2c6c2 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -40,7 +40,7 @@
 
 GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
-      _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
+      _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
 {
 }
 
@@ -86,6 +86,7 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
 
+    _original_weights     = weights;
     _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
     _is_fc_after_conv     = true;
     _accumulate_biases    = false;
@@ -141,25 +142,13 @@
         configure_fc_fc(input, weights_to_use, output);
     }
 
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped && !retain_internal_weights)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
-
     ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
     _are_weights_reshaped = _are_weights_reshaped || retain_internal_weights;
 }
 
 void GCFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -187,3 +176,21 @@
 
     _memory_group.release();
 }
+
+void GCFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+
+        _are_weights_reshaped = true;
+    }
+}
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 79f8f71..8ae91ee 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -73,8 +73,8 @@
 } // namespace
 
 GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
-      _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -87,6 +87,8 @@
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
+    _original_b                  = b;
 
     const IGCTensor *matrix_a = a;
     const IGCTensor *matrix_b = b;
@@ -136,7 +138,10 @@
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -155,23 +160,21 @@
 
 void GCGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
+
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
         GCScheduler::get().dispatch(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            GCScheduler::get().dispatch(_transpose_kernel, false);
-            _is_first_run = false;
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             GCScheduler::get().dispatch(_transpose_kernel, false);
         }
+
         GCScheduler::get().memory_barrier();
     }
 
@@ -184,5 +187,27 @@
         GCScheduler::get().memory_barrier();
         GCScheduler::get().dispatch(_ma_kernel);
     }
+
     _memory_group.release();
 }
+
+void GCGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            GCScheduler::get().dispatch(_transpose_kernel, false);
+            GCScheduler::get().memory_barrier();
+
+            // Mark original weights tensor as unused
+            _original_b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 96ac95f..4018407 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -155,6 +155,12 @@
 
 void NEConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void NEConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
 } // namespace arm_compute
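NEConvolutionLayer simply forwards prepare() to whichever concrete function it instantiated at configure() time. User-side composites can follow the same delegation pattern; a minimal sketch (MyPipeline is illustrative, not part of the library):

    // Composite function: forward prepare() to the functions it owns.
    class MyPipeline : public IFunction
    {
    public:
        void run() override
        {
            prepare(); // cheap: each leaf guards its own one-off work
            _conv.run();
            _fc.run();
        }
        void prepare() override
        {
            _conv.prepare();
            _fc.prepare();
        }

    private:
        NEConvolutionLayer    _conv{};
        NEFullyConnectedLayer _fc{};
    };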
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 40ada8f..8051d6d 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@
       _scaled_output(),
       _input(nullptr),
       _info(),
-      _inner_border()
+      _inner_border(),
+      _is_prepared(false)
 {
 }
 
@@ -104,6 +105,7 @@
     _input        = input;
     _info         = info;
     _inner_border = std::make_pair(inner_border_right, inner_border_top);
+    _is_prepared  = false;
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
@@ -132,13 +134,21 @@
 
 void NEDeconvolutionLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
-    // Run upsample kernel
     _upsample_f.run();
-
-    // Run convolution layer
     _conv_f.run();
 
     _memory_group.release();
+}
+
+void NEDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _conv_f.prepare();
+        _is_prepared = true;
+    }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 0a977ad..83c3e21 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -172,7 +172,7 @@
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -187,7 +187,7 @@
     const size_t weights_z = weights->info()->dimension(2);
 
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
 
     // Should bias be appended ?
@@ -260,24 +260,12 @@
 
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
-    _weights_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
 }
 
 void NEDepthwiseConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
-        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
-        _is_first_run = false;
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
@@ -288,3 +276,19 @@
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
     }
 }
+
+void NEDepthwiseConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape and mark original weights as unused
+        _weights_reshaped.allocator()->allocate();
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
index d70a668..da2e49c 100644
--- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,14 @@
 
 void NEDepthwiseSeparableConvolutionLayer::run()
 {
+    prepare();
+
     _depthwise_conv.run();
     _pointwise_conv.run();
+}
+
+void NEDepthwiseSeparableConvolutionLayer::prepare()
+{
+    _depthwise_conv.prepare();
+    _pointwise_conv.prepare();
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 958d081..5b9f182 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -131,8 +131,8 @@
 }
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
-      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
+    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_function(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(),
+      _interleave4x4_output(), _reshape_weights_output(), _original_weights(nullptr), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _is_prepared(false)
 {
 }
 
@@ -163,16 +163,16 @@
     const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
-    _original_weights     = weights;
-    _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
-    _are_weights_reshaped = are_weights_reshaped;
-    _accumulate_biases    = biases != nullptr;
-    _is_batched_fc_layer  = num_batch_dimensions > 0;
+    _original_weights    = weights;
+    _linearize_input     = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
+    _accumulate_biases   = biases != nullptr;
+    _is_batched_fc_layer = num_batch_dimensions > 0;
+    _is_prepared         = are_weights_reshaped || (!transpose_weights && !_is_batched_fc_layer);
 
     const size_t   interleave_width = 16 / input->info()->element_size();
     const ITensor *weights_to_use   = weights;
 
-    if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
+    if(!_is_prepared)
     {
         weights_to_use = &_reshape_weights_output;
 
@@ -181,7 +181,7 @@
                                                   _is_batched_fc_layer, interleave_width)));
 
         // Reshape the weights
-        _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+        _reshape_weights_function.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
     }
 
     const ITensor *multiply_input = input;
@@ -220,13 +220,6 @@
         _accumulate_biases_kernel.configure(output, biases);
     }
 
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
-
     if(_linearize_input)
     {
         _im2col_output.allocator()->allocate();
@@ -322,17 +315,7 @@
 
 void NEFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -359,3 +342,20 @@
 
     _memory_group.release();
 }
+
+void NEFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshape, clean internal tensors and mark original weights tensor as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_function.run();
+        _reshape_weights_function = NEFullyConnectedLayerReshapeWeights();
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 9168ed4..a98309d 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -40,7 +40,7 @@
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
-      _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -63,8 +63,11 @@
     }
 
     // Check if we need to reshape the matrix B only on the first run
+    _is_prepared                      = false;
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+    _original_b                       = b;
+    _asm_glue._optimised_kernel       = nullptr;
 
     const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
                                && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
@@ -128,7 +131,10 @@
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();
-            _tmp_b.allocator()->allocate();
+            if(!_reshape_b_only_on_first_run)
+            {
+                _tmp_b.allocator()->allocate();
+            }
 
             // Configure matrix addition kernel
             if(beta != 0 && c != nullptr)
@@ -142,28 +148,24 @@
 
 void NEGEMM::run()
 {
-    _memory_group.acquire();
+    prepare();
 
     if(_asm_glue._optimised_kernel != nullptr)
     {
+        _memory_group.acquire();
         _asm_glue.run();
         _memory_group.release();
     }
     else
     {
+        _memory_group.acquire();
+
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
             NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
 
-            if(_is_first_run)
-            {
-                // Run transpose kernel
-                NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
-                _is_first_run = false;
-            }
-            else if(!_reshape_b_only_on_first_run)
+            if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
                 NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
@@ -181,4 +183,28 @@
         }
     }
 }
+
+void NEGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_asm_glue._optimised_kernel)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            _asm_glue.prepare();
+            _original_b->mark_as_unused();
+        }
+        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            _tmp_b.allocator()->allocate();
+            NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+            _original_b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }
+}
 } // namespace arm_compute
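Because NEGEMM::prepare() marks the original B matrix as unused on both the assembly (pretranspose) and the NEON transpose paths, callers can reclaim that buffer before steady-state inference. A hedged sketch (tensor setup elided):

    NEGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo(false, false, true));

    gemm.prepare(); // transposes/pretransposes B and marks `b` unused
    if(!b.is_used())
    {
        b.allocator()->free(); // reclaim the original B buffer
    }
    gemm.run();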
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 303691a..d4400b8 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -231,7 +231,7 @@
     : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
       _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
       _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
-      _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
+      _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false), _is_prepared(false)
 {
 }
 
@@ -287,6 +287,7 @@
 
     ARM_COMPUTE_ERROR_THROW_ON(status);
 
+    _is_prepared                            = false;
     _original_weights                       = weights;
     const unsigned int fixed_point_position = input->info()->fixed_point_position();
     const ITensor     *biases_to_use        = (_append_bias) ? biases : nullptr;
@@ -442,12 +443,6 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    if(!_are_weights_reshaped)
-    {
-        _weights_reshaped.allocator()->allocate();
-    }
-
     //Configure Activation Layer
     if(_is_activationlayer_enabled)
     {
@@ -585,17 +580,7 @@
 
 void NEGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -610,11 +595,6 @@
     if(_asm_glue._optimised_kernel != nullptr)
     {
         _asm_glue.run();
-        // Release weights in case buffer is pretransposed
-        if(!_weights_reshaped.is_used())
-        {
-            _weights_reshaped.allocator()->free();
-        }
     }
     else
     {
@@ -659,4 +639,43 @@
 
     _memory_group.release();
 }
+
+void NEGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping (Runs once for every configure)
+        if(!_are_weights_reshaped)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+            _weights_reshaped.allocator()->allocate();
+            _reshape_weights.run();
+            _reshape_weights = NEConvolutionLayerReshapeWeights();
+            _original_weights->mark_as_unused();
+            _are_weights_reshaped = true;
+        }
+
+        // Run GEMM prepare stage
+        if(_asm_glue._optimised_kernel)
+        {
+            _asm_glue.prepare();
+        }
+        else
+        {
+            if(_is_quantized)
+            {
+                _mm_gemmlowp.prepare();
+            }
+        }
+
+        // Release weights in case buffer is pretransposed
+        if(!_weights_reshaped.is_used())
+        {
+            _weights_reshaped.allocator()->free();
+        }
+
+        _is_prepared = true;
+    }
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 2e06fa2..a92ffa7 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
-      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
-      _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _original_b(nullptr), _a_offset(0), _b_offset(0),
+      _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -52,23 +52,32 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
+    // Clear state
+    _mtx_a_reshape_kernel                = nullptr;
+    _mtx_b_reshape_kernel                = nullptr;
+    _asm_glue_signed._optimised_kernel   = nullptr;
+    _asm_glue_unsigned._optimised_kernel = nullptr;
+
+    // Set internal variables
     _a_offset                         = a->info()->quantization_info().offset;
     _b_offset                         = b->info()->quantization_info().offset;
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                      = false;
+    _original_b                       = b;
 
 #ifdef __aarch64__
     switch(a->info()->data_type())
     {
         case DataType::S8:
         {
-            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
             break;
         }
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
             break;
         }
         default:
@@ -160,10 +169,13 @@
     if(!_dot_product_path && !_run_vector_matrix_multiplication)
     {
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
-    if(_a_offset != 0)
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
         _vector_sum_col.allocator()->allocate();
     }
@@ -248,22 +260,21 @@
 
 void NEGEMMLowpMatrixMultiplyCore::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
-    // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction
-    if(!_run_vector_matrix_multiplication && !_dot_product_path)
+    // Reshape inputs
+    if(_mtx_a_reshape_kernel)
     {
-        if(_mtx_a_reshape_kernel)
-        {
-            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
-        }
-
-        if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
-        {
-            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
-        }
+        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+    }
+    if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+    {
+        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
     }
 
+    // Run GEMM
     if(_asm_glue_unsigned._optimised_kernel != nullptr)
     {
         _asm_glue_unsigned.run();
@@ -284,7 +295,7 @@
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
         NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
     }
@@ -293,6 +304,45 @@
     NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void NEGEMMLowpMatrixMultiplyCore::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run assembly reshape
+        if((_asm_glue_unsigned._optimised_kernel || _asm_glue_signed._optimised_kernel) && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            if(_asm_glue_unsigned._optimised_kernel != nullptr)
+            {
+                _asm_glue_unsigned.prepare();
+            }
+            else if(_asm_glue_signed._optimised_kernel != nullptr)
+            {
+                _asm_glue_signed.prepare();
+            }
+            _original_b->mark_as_unused();
+        }
+        // Run non-assembly reshape
+        else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            // Run reshape kernel and mark original weights tensor as unused
+            _tmp_b.allocator()->allocate();
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+            _original_b->mark_as_unused();
+        }
+
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0 and B is reshaped only on the first run
+        if(_a_offset != 0 && _reshape_b_only_on_first_run)
+        {
+            _vector_sum_col.allocator()->allocate();
+            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+        }
+
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 913acf8..0737bd2 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@
 
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false), _original_weights(nullptr)
+      _is_prepared(false), _original_weights(nullptr)
 {
 }
 
@@ -127,7 +127,7 @@
     ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
     bool _has_bias    = (biases != nullptr);
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
@@ -160,24 +160,13 @@
     _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
-    _weights_reshaped.allocator()->allocate();
     _input_im2col_reshaped.allocator()->allocate();
     _gemm_output.allocator()->allocate();
 }
 
 void NELocallyConnectedLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _is_first_run = false;
-        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -192,3 +181,18 @@
 
     _memory_group.release();
 }
+
+void NELocallyConnectedLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index d6bc5cf..d9f6c0e 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -113,7 +113,7 @@
 NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
       _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
-      _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+      _workspace(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
 {
 } /* arm_compute */
 
@@ -139,9 +139,10 @@
         ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
     }
 
-    _weights = weights;
-    _input   = input;
-    _output  = output;
+    _weights     = weights;
+    _input       = input;
+    _output      = output;
+    _is_prepared = false;
 
     std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
     std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
@@ -198,10 +199,13 @@
     const Tensor4DShape in_shape(internal_get_input_shape(input));
     const size_t        data_type_size = input->info()->element_size();
     // Get the memory required to instantiate a new Winograd operator.
-    constexpr size_t storage_alignment   = 64;
-    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+    constexpr size_t storage_alignment = 64;
+
+    // Kernel storage
+    const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
     _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
     _kernel_storage.allocator()->allocate();
+
     // Input storage
     const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
     _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
@@ -331,13 +335,9 @@
 {
     const DataLayout data_layout = _input->info()->data_layout();
 
+    prepare();
+
     _memory_group.acquire();
-    if(!_reshaped_kernel)
-    {
-        _reshaped_kernel = true;
-        _permute_weights.run();
-        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
-    }
 
     if(data_layout == DataLayout::NCHW)
     {
@@ -491,4 +491,20 @@
     return Status{};
 }
 
+void NEWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute weights
+        _permute_weights.run();
+        _weights->mark_as_unused();
+
+        // Transform weights
+        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+        _weights_hwio.allocator()->free();
+
+        _is_prepared = true;
+    }
+}
+
 } // namespace arm_compute