COMPMID-1188: Static tuning of CLScale

Change-Id: Icf1cc00d9861fdb8766d0b8fd33ca90833863927
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/144830
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 7ef55f9..4ff9763 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -34,9 +35,13 @@
 void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
+    k->set_target(CLScheduler::get().target());
     k->configure(input, output, policy, border_mode, sampling_policy);
     _kernel = std::move(k);
 
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
+
     // In the case of NHWC we can't have undefined border mode as this would require to access elements outside z dimension,
     // so we treat it like border constant.
     if(border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC)
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index fa67710..2d52f33 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -249,6 +249,34 @@
 
     k.set_lws_hint(lws_hint);
 }
+
+void tune_scale_kernel(CLScaleKernel &k)
+{
+    cl::NDRange               lws_hint      = k.lws_hint(); // Starting value; written back as-is if no width below matches.
+    const GPUTarget           gpu_target    = k.get_target();
+    const DataType            dt            = k.input()->info()->data_type();
+    const InterpolationPolicy interpolation = k._interpolationPolicy;
+
+    // Static local work size (LWS) tuning for the scale kernel.
+    // Only applied for Bifrost G71/G72 targets with F32 data and bilinear interpolation; otherwise the kernel is left untouched.
+    // The values were obtained via exhaustive autotuning.
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR))
+    {
+        auto dim_0 = k.output()->info()->dimension(0); // The tuned hints are keyed on output dimension 0 only.
+        if(dim_0 == 480)
+        {
+            lws_hint = cl::NDRange(2, 1);
+        }
+        else if(dim_0 == 3120)
+        {
+            lws_hint = cl::NDRange(2, 8);
+        }
+        else if(dim_0 == 4160)
+        {
+            lws_hint = cl::NDRange(4, 8);
+        }
+        k.set_lws_hint(lws_hint); // Called for every dim_0 on a matching configuration, including sizes with no tuned entry.
+    }
+}
 } // namespace
 
 void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
@@ -281,6 +309,10 @@
     {
         tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel));
     }
+    else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr)
+    {
+        tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel));
+    }
 }
 
 void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)