Skip upsampling for deconvolution when not needed

If the input tensor's stride is 1 and the kernel size is 1x1,
skip the upsampling step and pass the input tensor pointer
directly to the convolution.

Partially resolve: [ONCPUML-1137]

Change-Id: I9de9444ff99cf35d44a51ccbe0fa6facc1035d27
Signed-off-by: Annop Wongwathanarat <annop.wongwathanarat@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8994
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 1a75c14..c30870d 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -77,7 +77,8 @@
       _original_weights(nullptr),
       _input(nullptr),
       _info(),
-      _is_prepared(false)
+      _is_prepared(false),
+      _do_upsampling(true)
 {
 }
 
@@ -176,11 +177,13 @@
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
 
+    // Do not perform upsampling when input is unit stride and weight shape is 1x1
+    _do_upsampling = stride_x != 1 || stride_y != 1 || weights->info()->dimension(width_idx) != 1 || weights->info()->dimension(height_idx) != 1;
+
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
 
     _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
-    _memory_group.manage(&_scaled_output);
 
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
@@ -190,27 +193,36 @@
     uint32_t            deconv_pad_x = 0;
     uint32_t            deconv_pad_y = 0;
 
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
-                                                                              stride_x, stride_y,
-                                                                              out_dims, deconv_pad_x, deconv_pad_y);
-
-    const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
-
-    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
-    scale_out_info.set_data_layout(data_layout);
-    _scaled_output.allocator()->init(scale_out_info);
-
-    _upsample_f.configure(input, &_scaled_output, upsample_info);
-
-    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
-
     // Setup flip axis data
     _flip_axis.allocator()->allocate();
     auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
     axis_data[0]   = static_cast<uint32_t>(width_idx);
     axis_data[1]   = static_cast<uint32_t>(height_idx);
 
-    _scaled_output.allocator()->allocate();
+    // Setup convolution and upsampling, if needed
+    if (_do_upsampling)
+    {
+        _memory_group.manage(&_scaled_output);
+        const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
+                                                                                  stride_x, stride_y,
+                                                                                  out_dims, deconv_pad_x, deconv_pad_y);
+
+        const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
+
+        TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+        scale_out_info.set_data_layout(data_layout);
+        _scaled_output.allocator()->init(scale_out_info);
+
+        _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+        _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+
+        _scaled_output.allocator()->allocate();
+    }
+    else
+    {
+        _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+    }
 }
 
 void NEDeconvolutionLayer::run()
@@ -219,7 +231,10 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    _upsample_f.run();
+    if(_do_upsampling)
+    {
+        _upsample_f.run();
+    }
     _conv_f.run();
 }