COMPMID-1332: Implement Slice for CL

Change-Id: I0dbc4fd7f640d31daa1970eb3da0e941cb771f2b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/146145
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 29fd672..0cc6e32 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -361,7 +361,7 @@
     { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
-    { "strided_slice", "strided_slice.cl" },
+    { "strided_slice", "slice_ops.cl" },
     { "suppress_non_maximum", "canny.cl" },
     { "tablelookup_U8", "tablelookup.cl" },
     { "tablelookup_S16", "tablelookup.cl" },
@@ -742,8 +742,8 @@
 #include "./cl_kernels/softmax_layer_quantized.clembed"
     },
     {
-        "strided_slice.cl",
-#include "./cl_kernels/strided_slice.clembed"
+        "slice_ops.cl",
+#include "./cl_kernels/slice_ops.clembed"
     },
     {
         "tablelookup.cl",
diff --git a/src/core/CL/cl_kernels/strided_slice.cl b/src/core/CL/cl_kernels/slice_ops.cl
similarity index 76%
rename from src/core/CL/cl_kernels/strided_slice.cl
rename to src/core/CL/cl_kernels/slice_ops.cl
index 7c68fb9..bc3df47 100644
--- a/src/core/CL/cl_kernels/strided_slice.cl
+++ b/src/core/CL/cl_kernels/slice_ops.cl
@@ -61,25 +61,47 @@
     Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
     Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
 
-    int offset_0 = 0;
-    int offset_1 = 0;
-    int offset_2 = 0;
-    int offset_3 = 0;
+    int offset = 0;
 
-    // Calculate offset
-#if defined(START_0) && defined(STRIDE_0)
-    offset_0 = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+    // Offset X
+#if defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    offset       = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
+    input.ptr += offset * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+#elif defined(START_0) && defined(STRIDE_0)
+    offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+    input.ptr += offset * input_stride_x;
 #endif // defined(START_0) && defined(STRIDE_0)
+
+    // Offset Y
 #if defined(START_1) && defined(STRIDE_1)
-    offset_1 = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+    offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+    input.ptr += offset * input_stride_y;
 #endif // defined(START_1) && defined(STRIDE_1)
+
+    // Offset Z
 #if defined(START_2) && defined(STRIDE_2)
-    offset_2 = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+    offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+    input.ptr += offset * input_stride_z;
 #endif // defined(START_2) && defined(STRIDE_2)
+
+    // Offset depth
 #if defined(START_3) && defined(STRIDE_3)
-    offset_3 = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
-#endif // defined(START_2) && defined(STRIDE_2)
+    offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+    input.ptr += offset * input_stride_w;
+#endif // defined(START_3) && defined(STRIDE_3)
 
     // Store result
-    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)tensor4D_offset(&input, offset_0, offset_1, offset_2, offset_3));
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
+
+    VSTORE(VEC_SIZE)
+    (val, 0, (__global DATA_TYPE *)(output.ptr));
+#else  // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index f07436a..2d2ba10 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -55,10 +55,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
-    for(unsigned int i = 0; i < strides.num_dimensions(); ++i)
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(strides[i] == 0);
-    }
+        return i == 0;
+    }));
 
     // Get expected output shape
     const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
@@ -120,6 +120,19 @@
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    // Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
+    const int  vec_size_x     = 16 / input->info()->element_size();
+    const int  output_width_x = output->info()->tensor_shape().x();
+    const bool multi_access_x = (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
+
+    // Update window if needed
+    if(multi_access_x)
+    {
+        Window &updated_window = std::get<1>(win_config);
+        updated_window.set(Window::DimX,
+                           Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+    }
     ICLKernel::configure_internal(win_config.second);
 
     // Create build options
@@ -130,6 +143,8 @@
         build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
         build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
     }
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
                                   "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
                                   "-DSRC_DEPTH=1");