COMPMID-1451: Fix allocation of weights in Deconvolution

Keep a pointer to the original weights, mark it as unused once the
weights have been flipped, and free the flipped copy after the inner
convolution has been prepared if nothing still references it. Stop
managing _weights_flipped through the memory group, since it must
persist across calls to prepare().

Change-Id: If3ca0b034a7448df1e5349b51a2b124f1b4e99c1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/153956
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 39cbe0c..7a58c5a 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -123,7 +123,7 @@
     CLConvolutionLayer           _conv_f;
     CPPFlipWeightsKernel         _flip_weights;
     CLTensor                     _scaled_output;
-    ICLTensor                   *_weights;
+    ICLTensor                   *_original_weights;
     CLTensor                     _weights_flipped;
     bool                         _is_prepared;
 };
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 7387009..277945d 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -125,6 +125,7 @@
     CPPFlipWeightsKernel _flip_weights;
     Tensor               _scaled_output;
     Tensor               _weights_flipped;
+    const ITensor       *_original_weights;
     ITensor             *_input;
     PadStrideInfo        _info;
     std::pair<unsigned int, unsigned int> _inner_border;
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 951d1ec..bbf4e66 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -42,7 +42,7 @@
       _conv_f(),
       _flip_weights(),
       _scaled_output(),
-      _weights(),
+      _original_weights(nullptr),
       _weights_flipped(),
       _is_prepared(false)
 {
@@ -120,7 +120,7 @@
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
-    _weights = weights;
+    _original_weights = weights;
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(weights, &_weights_flipped);
 
@@ -138,7 +138,6 @@
     _is_prepared = false;
 
     _memory_group.manage(&_scaled_output);
-    _memory_group.manage(&_weights_flipped);
 
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
     unsigned int      padx            = 0;
@@ -175,14 +174,25 @@
 {
     if(!_is_prepared)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights flipping and mark original weights tensor as unused
         _weights_flipped.allocator()->allocate();
         _weights_flipped.map(true);
-        _weights->map(CLScheduler::get().queue(), true);
+        _original_weights->map(CLScheduler::get().queue(), true);
         CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
         _weights_flipped.unmap();
-        _weights->unmap(CLScheduler::get().queue());
+        _original_weights->unmap(CLScheduler::get().queue());
+        _original_weights->mark_as_unused();
+
+        // Prepare convolution
         _conv_f.prepare();
 
+        if(!_weights_flipped.is_used())
+        {
+            _weights_flipped.allocator()->free();
+        }
+
         _is_prepared = true;
     }
 }
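
For context, here is a minimal sketch of how the CL function is driven end to end. The shapes, the nullptr bias and the trailing inner-border arguments are illustrative assumptions based on the headers touched above, not a verbatim test from the repository. The first run() invokes prepare(), which flips the weights once, marks the original weights tensor as unused and frees the flipped copy if the underlying convolution no longer references it:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative shapes: 8x8 input with 2 channels, 3x3 kernels, 4 output maps.
    CLTensor input, weights, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32));
    // Output side: (in - 1) * stride + kernel = (8 - 1) * 2 + 3 = 17
    output.allocator()->init(TensorInfo(TensorShape(17U, 17U, 4U), 1, DataType::F32));

    CLDeconvolutionLayer deconv;
    // Stride 2, no padding, no inner border; bias omitted for brevity.
    deconv.configure(&input, &weights, nullptr, &output, PadStrideInfo(2, 2, 0, 0), 0U, 0U);

    input.allocator()->allocate();
    weights.allocator()->allocate();
    output.allocator()->allocate();

    // First run triggers prepare(): the weights are flipped, the original
    // weights tensor is marked unused, and the flipped copy is released if
    // the underlying convolution keeps its own transformed weights.
    deconv.run();
    CLScheduler::get().sync();

    return 0;
}

Keeping _weights_flipped out of the memory group matches this lifetime: group-managed tensors are only backed while the group is acquired inside run(), whereas the flipped weights must survive from prepare() until the inner convolution has consumed them.
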
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index cbe7c51..23def59 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -39,6 +39,7 @@
       _flip_weights(),
       _scaled_output(),
       _weights_flipped(),
+      _original_weights(nullptr),
       _input(nullptr),
       _info(),
       _inner_border(),
@@ -104,10 +105,11 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    _input        = input;
-    _info         = info;
-    _inner_border = std::make_pair(inner_border_right, inner_border_top);
-    _is_prepared  = false;
+    _input            = input;
+    _original_weights = weights;
+    _info             = info;
+    _inner_border     = std::make_pair(inner_border_right, inner_border_top);
+    _is_prepared      = false;
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
@@ -160,9 +162,21 @@
 {
     if(!_is_prepared)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights flipping and mark original weights tensor as unused
         _weights_flipped.allocator()->allocate();
         CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+        _original_weights->mark_as_unused();
+
+        // Prepare convolution
         _conv_f.prepare();
+
+        if(!_weights_flipped.is_used())
+        {
+            _weights_flipped.allocator()->free();
+        }
+
         _is_prepared = true;
     }
 }
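
Both the CL and NEON fixes hinge on the same ITensor usage contract that the prepare() hooks above consult. The following standalone sketch (tensor name and shape are illustrative) shows that idiom in isolation:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor weights;
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U), 1, DataType::F32));
    weights.allocator()->allocate();

    // Tensors start out as "used"; prepare() asserts this before flipping.
    bool before = weights.is_used(); // true

    // Once the contents have been folded into the flipped copy, the original
    // is marked unused so its memory may be reclaimed by its owner.
    weights.mark_as_unused();
    bool after = weights.is_used(); // false

    return (before && !after) ? 0 : 1;
}

This is what allows the caller, or a graph-level memory manager, to reclaim the original weights buffer once the first run() has completed.
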