Improve start-up time for ClScale

- Add macro guard for different kernels in scale.cl
- Rework TENSOR4D to the new format
- Pass scale_x and scale_y at runtime

Resolves COMPMID-4886

Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: Ib904a703d511fb8260618057ac92e5ea9efeee2b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6619
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 9ba17d0..eb750cb 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -116,6 +116,33 @@
     ARM_COMPUTE_UNUSED(idx_start);
 }
 
+void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+    const ITensorInfo *info = tensor->info();
+    ARM_COMPUTE_ERROR_ON(info == nullptr);
+    const Strides &strides = info->strides_in_bytes();
+
+    // Tensor pointer (the tensor's CL buffer); idx is advanced past every argument set below
+    _kernel.setArg(idx++, tensor->cl_buffer());
+
+    // Add stride_y, stride_z and stride_w in bytes (strides[1..3]); the stride of dimension 0 is not passed
+    _kernel.setArg<cl_uint>(idx++, strides[1]);
+    _kernel.setArg<cl_uint>(idx++, strides[2]);
+    _kernel.setArg<cl_uint>(idx++, strides[3]);
+
+    // Tensor dimensions 0 to 3, each passed as cl_uint
+    _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, info->dimension(3));
+
+    // Offset of first element, in bytes
+    unsigned int offset_first_element = info->offset_first_element_in_bytes();
+    _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
+
 #ifndef DOXYGEN_SKIP_THIS
 template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
 template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);