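Fix performance regression in Transposed Convolution

In CLDeconvolutionLayer's method selection, when the weights' width or
height differs from the corresponding stride, DeconvolutionMethod::DIRECT
is now returned for NHWC input only if the weights' batch dimension (ofm)
is at most 16. Previously it was returned for every NHWC input in that
branch.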

Resolves: COMPMID-5849

Change-Id: I86f8bbc1f3a7c12c66d5ad8fcd74dd9e69629aa0
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9102
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Dynamic-Fusion: Jakub Sujak <jakub.sujak@arm.com>
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 56e9dae..5c25cba 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -150,10 +150,12 @@
 
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    const size_t ofm   = weights->tensor_shape()[idx_n];
 
     if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
     {
-        if(input->data_layout() == DataLayout::NHWC)
+        if(input->data_layout() == DataLayout::NHWC && ofm <= 16)
         {
             return DeconvolutionMethod::DIRECT;
         }