Extend cl image support to input and output tensors

- Add support for texture image to input and output of direct
convolution
- Extend T_LOAD2D_INDIRECT macro to read values from cl image storages

Resolves COMPMID-5715

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Change-Id: Idb0410f53f6d0763cd9e39895a7cbf9bc826d33a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8904
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 8f39c2d..84cf88e 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_compute
 {
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType type)
 {
     cl_channel_type cl_data_type;
 
@@ -61,7 +61,17 @@
     desc.image_width     = shape2d[0];
     desc.image_height    = shape2d[1];
 
-    cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+    switch(type)
+    {
+        case CLImage2DType::ReadOnly:
+            cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+            break;
+        case CLImage2DType::WriteOnly:
+            cl_image = clCreateImage(ctx(), CL_MEM_WRITE_ONLY, &format, &desc, nullptr, &err);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported CLImage2DType");
+    }
 
     ARM_COMPUTE_UNUSED(err);
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer");
@@ -176,4 +186,4 @@
 }
 } // namespace experimental
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index d133e4f..b31944c 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,13 @@
 class CLBuildOptions;
 class ITensorInfo;
 
+/** OpenCL Image2D types */
+enum class CLImage2DType
+{
+    ReadOnly,
+    WriteOnly
+};
+
 /** Create a cl::Image2D object from an OpenCL buffer
  *
  * @note The following conditions are required to create a OpenCL image object from OpenCL buffer,
@@ -49,10 +56,11 @@
  * @param[in] shape2d         2D tensor shape
  * @param[in] data_type       DataType to use. Only supported: F32,F16
  * @param[in] image_row_pitch Image row pitch (a.k.a. stride Y) to be used in the image2d object
+ * @param[in] image_type      Image 2D type (@ref CLImage2DType)
  *
  * @return cl::Image2D object
  */
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch);
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
 
 namespace experimental
 {
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 298edc2..c5d94cc 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -438,6 +438,16 @@
 #define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 
+#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
 /** Utility macro to read a 2D OpenCL image object.
  *
  * @note Coordinates are not normalized
@@ -454,6 +464,22 @@
 #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
 #define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
 
+/** Utility macro to write a 2D OpenCL image object.
+ *
+ * @note Coordinates are not normalized
+ *
+ * @param[in] data_type Data type
+ * @param[in] n0        Number of pixel to write. Only 1,2 and 4 is supported
+ * @param[in] img       OpenCL image object
+ * @param[in] x_coord   The x coordinate for the top-left pixel
+ * @param[in] y_coord   The y coordinate for the top-left pixel
+ * @param[in] values    Values to write
+ *
+ * @{
+ */
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+
 #define VSTORE_STR(size) vstore##size
 #define VSTORE(size) VSTORE_STR(size)
 
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 4693a1f..81ceeb8 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,36 +66,36 @@
  * - The weights offset e.g. -DWEI_OFFSET=4
  * - The quantized zero value e.g. -DZERO_VALUE=4
  *
+ * @param[in]  src_img                           (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_c                             The size of the channels dimension of the source tensor
+ * @param[in]  src_w                             The size of the width dimension of the source tensor
+ * @param[in]  src_h                             The size of the height dimension of the source tensor
+ * @param[in]  src_n                             The size of the batches dimension of the source tensor
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_c                             The size of the channels dimension of the destination tensor
+ * @param[in]  dst_w                             The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                             The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                             The size of the batches dimension of the destination tensor
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  wei_img                           (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
  * @param[in]  wei_ptr                           Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in]  wei_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  wei_step_x                        wei_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  wei_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  wei_step_y                        wei_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  wei_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  wei_step_z                        wei_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  wei_stride_w                      Stride of the weights tensor in W dimension (in bytes)
- * @param[in]  wei_step_w                        wei_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  wei_offset_first_element_in_bytes The offset of the first element in the bias matrix
+ * @param[in]  wei_c                             The size of the channels dimension of the weights tensor
+ * @param[in]  wei_w                             The size of the width dimension of the weights tensor
+ * @param[in]  wei_h                             The size of the height dimension of the weights tensor
+ * @param[in]  wei_n                             The size of the batches dimension of the weights tensor
+ * @param[in]  wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
  * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
  * @param[in]  bia_stride_x                      (Optional) Stride of the bias tensor in X dimension (in bytes)
  * @param[in]  bia_step_x                        (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
@@ -103,9 +103,9 @@
  */
 //! @endcond
 __kernel void direct_convolution_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
-    TENSOR4D_T(wei, WEI_TENSOR_TYPE)
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bia)
@@ -292,4 +292,4 @@
 #undef _IDST_HEIGHT
 #undef _IDST_CHANNELS
 #undef _IY_MULTIPLIER
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 3454690..dcbae22 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,6 +54,7 @@
  * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
  * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
  *
+ * @param[in]  src_img                           (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: F16/F32
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
@@ -63,6 +64,7 @@
  * @param[in]  src_h                             The size of the height dimension of the source tensor
  * @param[in]  src_n                             The size of the batches dimension of the source tensor
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p src_ptr
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
@@ -72,16 +74,16 @@
  * @param[in]  dst_h                             The size of the height dimension of the destination tensor
  * @param[in]  dst_n                             The size of the batches dimension of the destination tensor
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  wei_img                           (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
  * @param[in]  wei_ptr                           Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in]  wei_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  wei_step_x                        wei_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  wei_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  wei_step_y                        wei_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  wei_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  wei_step_z                        wei_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  wei_stride_w                      Stride of the weights tensor in W dimension (in bytes)
- * @param[in]  wei_step_w                        wei_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  wei_offset_first_element_in_bytes The offset of the first element in the bias matrix
+ * @param[in]  wei_c                             The size of the channels dimension of the weights tensor
+ * @param[in]  wei_w                             The size of the width dimension of the weights tensor
+ * @param[in]  wei_h                             The size of the height dimension of the weights tensor
+ * @param[in]  wei_n                             The size of the batches dimension of the weights tensor
+ * @param[in]  wei_offset_first_element_in_bytes The offset of the first element in the weigts matrix
  * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr
  * @param[in]  bia_stride_x                      (Optional) Stride of the bias tensor in X dimension (in bytes)
  * @param[in]  bia_step_x                        (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
@@ -89,9 +91,9 @@
  */
 //! @endcond
 __kernel void dwc_native_fp_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
-    TENSOR4D(wei, WEI_TENSOR_TYPE)
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bia)
@@ -206,4 +208,4 @@
 }
 #endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
 // *INDENT-ON*
-// clang-format on
\ No newline at end of file
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index e2ffd44..2d255e5 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,7 @@
  * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
  * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
  *
+ * @param[in]  src_img                                       (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
  * @param[in]  src_ptr                                       Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in]  src_stride_y                                  Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_stride_z                                  Stride of the source tensor in Z dimension (in bytes)
@@ -86,6 +87,7 @@
  * @param[in]  src_h                                         The size of the height dimension of the source tensor
  * @param[in]  src_n                                         The size of the batches dimension of the source tensor
  * @param[in]  src_offset_first_element_in_bytes             The offset of the first element in the source tensor
+ * @param[out] dst_img                                       (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
  * @param[out] dst_ptr                                       Pointer to the destination tensor. Supported data type: same as @p src_ptr
  * @param[in]  dst_stride_y                                  Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_stride_z                                  Stride of the destination tensor in Z dimension (in bytes)
@@ -95,14 +97,15 @@
  * @param[in]  dst_h                                         The size of the height dimension of the destination tensor
  * @param[in]  dst_n                                         The size of the batches dimension of the destination tensor
  * @param[in]  dst_offset_first_element_in_bytes             The offset of the first element in the destination tensor
+ * @param[in]  wei_img                                       (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
  * @param[in]  wei_ptr                                       Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in]  wei_stride_x                                  Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  wei_step_x                                    wei_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  wei_stride_y                                  Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  wei_step_y                                    wei_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  wei_stride_z                                  Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  wei_step_z                                    wei_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  wei_stride_w                                  Stride of the weights tensor in W dimension (in bytes)
+ * @param[in]  wei_c                                         The size of the channels dimension of the weights tensor
+ * @param[in]  wei_w                                         The size of the width dimension of the weights tensor
+ * @param[in]  wei_h                                         The size of the height dimension of the weights tensor
+ * @param[in]  wei_n                                         The size of the batches dimension of the weights tensor
  * @param[in]  wei_step_w                                    wei_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  wei_offset_first_element_in_bytes             The offset of the first element in the weights tensor
  * @param[in]  dst_multipliers_ptr                           Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32
@@ -120,9 +123,9 @@
  */
 //! @endcond
 __kernel void dwc_native_quantized_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
-    TENSOR4D(wei, WEI_TENSOR_TYPE),
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE),
     VECTOR_DECLARATION(dst_multipliers),
     VECTOR_DECLARATION(dst_shifts)
 #if defined(HAS_BIAS)
@@ -269,4 +272,4 @@
 }
 #endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
 // *INDENT-ON*
-// clang-format on
\ No newline at end of file
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
index c88f003..aa719bf 100644
--- a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
  * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
  *  - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
  *
- * @param[out] dst_img                           (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only)
+ * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: INT32
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
@@ -54,7 +54,7 @@
  */
 //! @endcond
 __kernel void indirect_convolution_address_precalculation(
-    TENSOR4D_T(dst, DST_TENSOR_TYPE))
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE))
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -123,7 +123,7 @@
  *  - N0 = 2, 3, 4, 8, 16
  *  - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
  *
- * @param[in]  src_img                           (Not supported) CLImage object to the source tensor (SRC_TENSOR_TYPE=IMAGE only)
+ * @param[in]  src_img                           (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: F16/F32
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
@@ -133,7 +133,7 @@
  * @param[in]  src_h                             The size of the height dimension of the source tensor
  * @param[in]  src_n                             The size of the batches dimension of the source tensor
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  off_img                           (Not supported) CLImage object to the indirect buffer tensor (OFF_TENSOR_TYPE=IMAGE only)
+ * @param[in]  off_img                           (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE
  * @param[in]  off_ptr                           Pointer to the indirect buffer tensor. Supported data type: INT32
  * @param[in]  off_stride_y                      Stride of the indirect buffer tensor in Y dimension (in bytes)
  * @param[in]  off_stride_z                      Stride of the indirect buffer tensor in Z dimension (in bytes)
@@ -143,8 +143,8 @@
  * @param[in]  off_h                             The size of the height dimension of the indirect buffer tensor
  * @param[in]  off_n                             The size of the batches dimension of the indirect buffer tensor
  * @param[in]  off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor
- * @param[out] dst_img                           (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only)
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as the input tensor
+ * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p src_ptr
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
@@ -153,8 +153,8 @@
  * @param[in]  dst_h                             The size of the height dimension of the destination tensor
  * @param[in]  dst_n                             The size of the batches dimension of the destination tensor
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] wei_img                           (Optional) CLImage object to the destination tensor (WEI_TENSOR_TYPE=IMAGE only)
- * @param[out] wei_ptr                           Pointer to the weights tensor. Supported data type: same as the input tensor
+ * @param[out] wei_img                           (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[out] wei_ptr                           Pointer to the weights tensor. Supported data type: same as @p src_ptr
  * @param[in]  wei_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
  * @param[in]  wei_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
  * @param[in]  wei_stride_w                      Stride of the weights tensor in W dimension (in bytes)
@@ -163,23 +163,17 @@
  * @param[in]  wei_h                             The size of the height dimension of the weights tensor
  * @param[in]  wei_n                             The size of the batches dimension of the weights tensor
  * @param[in]  wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[out] bia_img                           (Not supported) CLImage object to the destination tensor (BIA_TENSOR_TYPE=IMAGE only)
- * @param[out] bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as the input tensor
- * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in]  bia_stride_w                      (Optional) Stride of the bias tensor in W dimension (in bytes)
- * @param[in]  bia_c                             (Optional) The size of the channels dimension of the bias tensor
- * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
- * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
- * @param[in]  bia_n                             (Optional) The size of the batches dimension of the bias tensor
- * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr
+ * @param[in]  bia_stride_x                      (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  bia_step_x                        (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
  */
 //! @endcond
 __kernel void indirect_convolution_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(off, OFF_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
-    TENSOR4D_T(wei, WEI_TENSOR_TYPE)
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_RO_T(off, OFF_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bia)
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index f6a3e09..e071b0f 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,8 +61,8 @@
  */
 //! @endcond
 __kernel void scale_nearest_neighbour_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
     const float scale_x,
     const float scale_y)
 {
@@ -128,31 +128,33 @@
  * - The source offset e.g. -DOFFSET=4
  * - The source scale e.g. -DSCALE=4
  *
- * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_c                             The size of the channels dimension of the source tensor
- * @param[in] src_w                             The size of the width dimension of the source tensor
- * @param[in] src_h                             The size of the height dimension of the source tensor
- * @param[in] src_n                             The size of the batches dimension of the source tensor
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_c                             The size of the channels dimension of the destination tensor
- * @param[in] dst_w                             The size of the width dimension of the destination tensor
- * @param[in] dst_h                             The size of the height dimension of the destination tensor
- * @param[in] dst_n                             The size of the batches dimension of the destination tensor
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale_x                           The scale value to apply on the source width
- * @param[in] scale_y                           The scale value to apply on the source height
+ * @param[in]  src_img                           (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_c                             The size of the channels dimension of the source tensor
+ * @param[in]  src_w                             The size of the width dimension of the source tensor
+ * @param[in]  src_h                             The size of the height dimension of the source tensor
+ * @param[in]  src_n                             The size of the batches dimension of the source tensor
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_c                             The size of the channels dimension of the destination tensor
+ * @param[in]  dst_w                             The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                             The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                             The size of the batches dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  scale_x                           The scale value to apply on the source width
+ * @param[in]  scale_y                           The scale value to apply on the source height
  */
 //! @endcond
 __kernel void scale_bilinear_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
     const float scale_x,
     const float scale_y)
 {
@@ -224,11 +226,11 @@
     const float a1 = (yi_f - (float)yi);
     const float b1 = (1.f - a1);
 
-    out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + 
-                   (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + 
-                   (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + 
-                   (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), 
-                VEC_DATA_TYPE(DST_DATA_TYPE, N0));
+    out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
+                           (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
+                           (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
+                           (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
+                           VEC_DATA_TYPE(DST_DATA_TYPE, N0));
 #endif // defined(IS_FLOATING_POINT)
 
     TILE(uint, 1, 1, dst_indirect_y);
@@ -240,4 +242,4 @@
 
     T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
 }
-#endif /* SCALE_BILINEAR */
\ No newline at end of file
+#endif /* SCALE_BILINEAR */
diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
index fe6182f..1393537 100644
--- a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,7 +64,7 @@
  * - The weights offset e.g. -DWEI_OFFSET=4
  * - The quantized zero value e.g. -DZERO_VALUE=4
  *
- *
+ * @param[in]  src_img                           (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: F16/F32
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
@@ -73,6 +73,7 @@
  * @param[in]  src_w                             The size of the width dimension of the source tensor
  * @param[in]  src_h                             The size of the height dimension of the source tensor
  * @param[in]  src_n                             The size of the batches dimension of the source tensor
+ * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p src_ptr
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
@@ -82,6 +83,7 @@
  * @param[in]  dst_h                             The size of the height dimension of the destination tensor
  * @param[in]  dst_n                             The size of the batches dimension of the destination tensor
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  wei_img                           (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
  * @param[in]  wei_ptr                           Pointer to the weights tensor. Supported data type: same as @p src_ptr
  * @param[in]  wei_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
  * @param[in]  wei_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
@@ -98,9 +100,9 @@
  */
 //! @endcond
 __kernel void transposed_convolution_nhwc(
-    TENSOR4D_T(src, SRC_TENSOR_TYPE),
-    TENSOR4D_T(dst, DST_TENSOR_TYPE),
-    TENSOR4D_T(wei, WEI_TENSOR_TYPE)
+    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+    TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bia)
@@ -292,4 +294,4 @@
 #undef _IDST_HEIGHT
 #undef _IDST_CHANNELS
 #undef _IY_MULTIPLIER
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index acc174d..507e172 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -130,8 +130,44 @@
     uint        name##_offset_first_element_in_bytes
 
 #define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+
+/** Legacy tensor 4D arguments
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
 #define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
 
+#define TENSOR4D_RO_T_IMAGE(name)          \
+    __read_only image2d_t name##_img, \
+    TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
+
+/** Read-Only (RO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
+
+#define TENSOR4D_WO_T_IMAGE(name)          \
+    __write_only image2d_t name##_img, \
+    TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
+
+/** Write-Only (WO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
+
 #define TENSOR3D_T_IMAGE(name)          \
     __read_only image2d_t name##_img, \
     __global uchar *name##_ptr,       \
@@ -457,6 +493,25 @@
     (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
 #define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
 
+/** Store a vector in global memory (tensor)
+ *
+ * @param[in] DATA_TYPE   Data type
+ * @param[in] WIDTH       Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ *                        In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR      Tensor basename
+ * @param[in] X           Starting X position
+ * @param[in] Y           Starting Y position
+ * @param[in] STRIDE_Y    Stride Y (in bytes)
+ * @param[in] VALUES      Values to store in memory
+ */
+#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
+    VSTORE(WIDTH)                                                \
+    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
+#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
+
 /** Load a tile from global memory (tensor)
  *
  * @param[in]  DATA_TYPE     Data type
@@ -658,7 +713,8 @@
  * @param[in]  DATA_TYPE     Data type
  * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
  * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
- * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
+ * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ *                           When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped
  *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
  * @param[in]  TENSOR        Tensor basename
  * @param[in]  C             Starting C index
@@ -667,15 +723,25 @@
  *                           16 is the maximum indirect buffer size.
  * @param[out] dst           Output tile
  */
-#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)                \
-    ({                                                                                                                                                                \
-        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
-        {                                                                                                                                                             \
-            if(yi[0].s[_i] >= 0)                                                                                                                                     \
-            {                                                                                                                                                         \
-                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y);                                                               \
-            }                                                                                                                                                         \
-        })                                                                                                                                                            \
+#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+    ({ \
+        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+        { \
+            if(yi[0].s[_i] >= 0) \
+            { \
+                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+            } \
+        }) \
+    })
+
+#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+    ({ \
+        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+        { \
+            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+        }) \
     })
 
 /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index cded319..2d21a6e 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -377,7 +377,7 @@
             const size_t      image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
             const TensorShape shape2d(image_w, image_h);
             const size_t      image_row_pitch = _input->info()->strides_in_bytes()[1];
-            input_cl_image                    = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch);
+            input_cl_image                    = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
         }
 
         if(_export_weights_to_cl_image)
@@ -386,7 +386,8 @@
             const size_t      image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
             const TensorShape shape2d(image_w, image_h);
             const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
-            weights_cl_image                  = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch);
+            weights_cl_image                  = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch,
+                                                                           CLImage2DType::ReadOnly);
         }
     }
 
@@ -401,7 +402,7 @@
     {
         _kernel.setArg(idx++, weights_cl_image);
     }
-    add_4D_tensor_argument(idx, _weights, slice);
+    add_4d_tensor_nhwc_argument(idx, _weights);
     if(_is_quantized)
     {
         add_1D_tensor_argument(idx, _output_multipliers, slice);