Improve start-up time for ClScale

- Add macro guard for different kernels in scale.cl
- Rework TENSOR4D to the new format
- Pass scale_x and scale_y at runtime

Resolves COMPMID-4886

Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: Ib904a703d511fb8260618057ac92e5ea9efeee2b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6619
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 9ba17d0..eb750cb 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -116,6 +116,33 @@
     ARM_COMPUTE_UNUSED(idx_start);
 }
 
+void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+    const ITensorInfo *info = tensor->info();
+    ARM_COMPUTE_ERROR_ON(info == nullptr);
+    const Strides &strides = info->strides_in_bytes();
+
+    // Tensor pointer
+    _kernel.setArg(idx++, tensor->cl_buffer());
+
+    // Add stride_y, stride_z and stride_w
+    _kernel.setArg<cl_uint>(idx++, strides[1]);
+    _kernel.setArg<cl_uint>(idx++, strides[2]);
+    _kernel.setArg<cl_uint>(idx++, strides[3]);
+
+    // Tensor dimensions
+    _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, info->dimension(3));
+
+    // Offset of first element
+    unsigned int offset_first_element = info->offset_first_element_in_bytes();
+    _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
+
 #ifndef DOXYGEN_SKIP_THIS
 template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
 template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 3b3217d..a7c979e 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -225,6 +225,24 @@
     {
         add_tensor_argument<4>(idx, tensor, window);
     }
+
+    /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     */
+    void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor);
+
+    /** Returns the number of arguments enqueued per NHWC 4D Tensor object.
+     *
+     * @return The number of arguments enqueued per NHWC 4D Tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_4d_tensor_nhwc()
+    {
+        constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u;
+        return no_args_per_4d_tensor_nhwc;
+    }
+
     /** Returns the number of arguments enqueued per 1D array object.
      *
      * @return The number of arguments enqueues per 1D array object.
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 21579ae..bccfd65 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -24,12 +24,11 @@
 #include "helpers.h"
 #include "tile_helpers.h"
 
+#if defined(SCALE_NEAREST_NEIGHBOUR)
 //! @cond Doxygen_Suppress
 /** Performs scale on a tensor by interpolating with the NEAREAST NEIGHBOUR method. (NHWC)
  *
  * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
  * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
  * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
  * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
@@ -37,61 +36,52 @@
  * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
  * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
  * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
- * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
- * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
  * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c                             The size of the channels dimension of the source tensor
+ * @param[in] src_w                             The size of the width dimension of the source tensor
+ * @param[in] src_h                             The size of the height dimension of the source tensor
+ * @param[in] src_n                             The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c                             The size of the channels dimension of the destination tensor
+ * @param[in] dst_w                             The size of the width dimension of the destination tensor
+ * @param[in] dst_h                             The size of the height dimension of the destination tensor
+ * @param[in] dst_n                             The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x                           The scale value to apply on the source width
+ * @param[in] scale_y                           The scale value to apply on the source height
  */
- //! @endcond
+//! @endcond
 __kernel void scale_nearest_neighbour_nhwc(
-    TENSOR4D(src, SRC_TENSOR_TYPE),
-    TENSOR4D(dst, DST_TENSOR_TYPE))
+    TENSOR4D_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_T(dst, DST_TENSOR_TYPE),
+    const float scale_x,
+    const float scale_y)
 {
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _ISCALE_X SCALE_X
-#define _ISCALE_Y SCALE_Y
-
     const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
     const int xo   = GET_SPATIAL_IDX(1, 1, 0);           // WIDTH
 #if defined(BATCHED_EXECUTION)
-    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
-    const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else                                                         // defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+    const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else                                                  // defined(BATCHED_EXECUTION)
     const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
-    const int bout = 0; // BATCH SIZE IDX
-#endif                                                        // defined(BATCHED_EXECUTION)
+    const int bout = 0;                        // BATCH SIZE IDX
+#endif                                                 // defined(BATCHED_EXECUTION)
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    float xi_f = (xo * (float)SCALE_X);
-    float yi_f = (yo * (float)SCALE_Y);
+    float xi_f = (xo * scale_x);
+    float yi_f = (yo * scale_y);
 #elif SAMPLING_POLICY_CENTER
-    float     xi_f = ((xo + 0.5f) * (float)SCALE_X);
-    float     yi_f = ((yo + 0.5f) * (float)SCALE_Y);
+    float     xi_f = ((xo + 0.5f) * scale_x);
+    float     yi_f = ((yo + 0.5f) * scale_y);
 #else // SAMPLING_POLICY
 #error("Unsupported sampling policy");
 #endif // SAMPLING_POLICY
@@ -101,30 +91,30 @@
     yi_f = round(yi_f);
 #endif // ALIGN_CORNERS
 
-    const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1);
-    const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1);
+    const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
+    const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
 
     TILE(SRC_DATA_TYPE, 1, N0, in00);
 
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
 
     TILE(uint, 1, 1, dst_indirect_y);
 
     // Calculate the destination indirect Y
-    dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+    dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
 
     bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
 
     T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
 }
+#endif /* SCALE_NEAREST_NEIGHBOUR */
 
+#if defined(SCALE_BILINEAR)
 //! @cond Doxygen_Suppress
 /** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
  *
  * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
  * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
  * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
  * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
  * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
@@ -132,65 +122,56 @@
  * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
  * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
  * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
- * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
- * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
  * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
  *
  * @note In case of QASYMM8, the following extra information must be passed at compile time:
  * - The source offset e.g. -DOFFSET=4
  * - The source scale e.g. -DSCALE=4
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c                             The size of the channels dimension of the source tensor
+ * @param[in] src_w                             The size of the width dimension of the source tensor
+ * @param[in] src_h                             The size of the height dimension of the source tensor
+ * @param[in] src_n                             The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c                             The size of the channels dimension of the destination tensor
+ * @param[in] dst_w                             The size of the width dimension of the destination tensor
+ * @param[in] dst_h                             The size of the height dimension of the destination tensor
+ * @param[in] dst_n                             The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x                           The scale value to apply on the source width
+ * @param[in] scale_y                           The scale value to apply on the source height
  */
- //! @endcond
+//! @endcond
 __kernel void scale_bilinear_nhwc(
-    TENSOR4D(src, SRC_TENSOR_TYPE),
-    TENSOR4D(dst, DST_TENSOR_TYPE))
+    TENSOR4D_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_T(dst, DST_TENSOR_TYPE),
+    const float scale_x,
+    const float scale_y)
 {
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _ISCALE_X SCALE_X
-#define _ISCALE_Y SCALE_Y
-
     const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
     const int xo   = GET_SPATIAL_IDX(1, 1, 0);           // WIDTH
 #if defined(BATCHED_EXECUTION)
-    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
-    const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else                                                         // defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+    const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else                                                  // defined(BATCHED_EXECUTION)
     const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
     const int bout = 0;                        // BATCH SIZE IDX
-#endif                                                        // defined(BATCHED_EXECUTION)
+#endif                                                 // defined(BATCHED_EXECUTION)
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    float xi_f = (xo * (float)SCALE_X);
-    float yi_f = (yo * (float)SCALE_Y);
+    float xi_f = (xo * scale_x);
+    float yi_f = (yo * scale_y);
 #elif SAMPLING_POLICY_CENTER
-    float     xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f);
-    float     yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f);
+    float     xi_f = ((xo + 0.5f) * scale_x - 0.5f);
+    float     yi_f = ((yo + 0.5f) * scale_y - 0.5f);
 #else // SAMPLING_POLICY
 #error("Unsupported sampling policy");
 #endif // SAMPLING_POLICY
@@ -210,20 +191,20 @@
     in11[0].v = CONSTANT_VALUE;
 
 #ifndef BORDER_MODE_REPLICATE
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00);
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01);
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10);
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
 #else  // BORDER_MODE_REPLICATE
-    const int xi0  = clamp(xi, 0, _ISRC_WIDTH - 1);
-    const int yi0  = clamp(yi, 0, _ISRC_HEIGHT - 1);
-    const int xi1  = clamp(xi + 1, 0, _ISRC_WIDTH - 1);
-    const int yi1  = clamp(yi + 1, 0, _ISRC_HEIGHT - 1);
+    const int xi0  = clamp(xi, 0, (int)src_w - 1);
+    const int yi0  = clamp(yi, 0, (int)src_h - 1);
+    const int xi1  = clamp(xi + 1, 0, (int)src_w - 1);
+    const int yi1  = clamp(yi + 1, 0, (int)src_h - 1);
 
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01);
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10);
-    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
 #endif // BORDER_MODE_REPLICATE
 
     TILE(DST_DATA_TYPE, 1, N0, out);
@@ -270,9 +251,10 @@
     TILE(uint, 1, 1, dst_indirect_y);
 
     // Calculate the destination indirect Y
-    dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+    dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
 
     bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
 
     T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
-}
\ No newline at end of file
+}
+#endif /* SCALE_BILINEAR */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index f36f273..cc20616 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -104,6 +104,32 @@
 #define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
 #define TENSOR4D(name, type) TENSOR4D_STR(name, type)
 
+#define TENSOR4D_T_IMAGE(name)          \
+    __read_only image2d_t name##_img, \
+    __global uchar *name##_ptr,       \
+    uint        name##_stride_y, \
+    uint        name##_stride_z, \
+    uint        name##_stride_w, \
+    uint        name##_c,   \
+    uint        name##_w,   \
+    uint        name##_h,   \
+    uint        name##_n,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_BUFFER(name)    \
+    __global uchar *name##_ptr,  \
+    uint        name##_stride_y, \
+    uint        name##_stride_z, \
+    uint        name##_stride_w, \
+    uint        name##_c,   \
+    uint        name##_w,   \
+    uint        name##_h,   \
+    uint        name##_n,   \
+    uint        name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
+
 #if !defined(UNROLL_WITH_PRAGMA)
 #define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
 
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index d63c0e1..6f16adc 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -117,9 +117,7 @@
     const int          idx_channel       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const unsigned int src_width         = src->dimension(idx_width);
     const unsigned int src_height        = src->dimension(idx_height);
-    const unsigned int src_channel       = src->dimension(idx_channel);
     const unsigned int dst_width         = dst->dimension(idx_width);
-    const unsigned int dst_height        = dst->dimension(idx_height);
     const unsigned int dst_channels      = dst->dimension(idx_channel);
     unsigned int       vec_size          = 0;
     unsigned int       vec_size_leftover = 0;
@@ -130,20 +128,13 @@
         vec_size          = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
         vec_size_leftover = dst_channels % vec_size;
         build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
-        build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
-        build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
-        build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel));
         build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
         build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
-        build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
-        build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
-        build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
         build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
         build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
-        build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
-        build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
         build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
         build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+        build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
         build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
         build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
         build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
@@ -203,6 +194,13 @@
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 
+    // Pass scale kernel arguments
+    if(is_nhwc)
+    {
+        unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
+        _kernel.setArg<cl_float>(idx++, scale_x);
+        _kernel.setArg<cl_float>(idx++, scale_y);
+    }
     // Set config_id for enabling LWS tuning
     _config_id = "scale_";
     _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
@@ -248,8 +246,8 @@
             Window slice     = collapsed.first_slice_window_4D();
 
             unsigned int idx = 0;
-            add_4D_tensor_argument(idx, src, slice);
-            add_4D_tensor_argument(idx, dst, slice);
+            add_4d_tensor_nhwc_argument(idx, src);
+            add_4d_tensor_nhwc_argument(idx, dst);
             enqueue(queue, *this, slice, lws_hint());
             break;
         }