COMPMID-1390: OCLGrind and benchmark tests fail for QASYMM8
COMPMID-1392: OCLGrind failures in im2col1x1_stridex1_dchw
COMPMID-1395: OCLGrind failures in output_stage_quantized

Change-Id: I35504bd1f701316df122be52d458c71bbd7e7909
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/139722
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
index ae87420..83da767 100644
--- a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
@@ -248,6 +248,12 @@
 }
 #endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
 
+#if defined(VEC_SIZE)
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_SAT_UCHAR_STR(x, size) (convert_uchar##size##_sat((x)))
+#define CONVERT_SAT_UCHAR(x, size) CONVERT_SAT_UCHAR_STR(x, size)
+
 /** This function computes the output stage of a depthwise convolution.
  *
  * @param[in] src_ptr                            Pointer to the source image. Supported data types: QASYMM8
@@ -274,7 +280,6 @@
  * @param[in] output_multiplier                  Output scale multiplier
  * @param[in] output_shift                       Output scale divisor exponent
  */
-
 __kernel void output_stage_quantized(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
@@ -292,22 +297,29 @@
 #endif //defined(HAS_BIAS)
 
     // Load input
-    int16 vals = vload16(0, (__global int *)(src.ptr));
+    VEC_INT vals = VLOAD(VEC_SIZE)(0, (__global int *)(src.ptr));
 
 #if defined(HAS_BIAS)
     // Load and add bias
 #if defined(NCHW)
     int bias_value = *((__global int *)(vector_offset(&bias, get_global_id(2))));
 #else  // defined(NCHW)
-    int16 bias_value = vload16(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * 16))));
+    VEC_INT bias_value = VLOAD(VEC_SIZE)(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * VEC_SIZE))));
 #endif // defined(NCHW)
 
-    vals += (int16)(bias_value);
+    vals += (VEC_INT)(bias_value);
 #endif //defined(HAS_BIAS)
 
-    vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, 16);
+    vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, VEC_SIZE);
     vals = vals + output_offset;
 
     // Store result in dst
-    vstore16(convert_uchar16_sat(vals), 0, (__global uchar *)dst.ptr);
+    VSTORE(VEC_SIZE)
+    (CONVERT_SAT_UCHAR(vals, VEC_SIZE), 0, (__global uchar *)dst.ptr);
 }
+
+#undef VEC_INT
+#undef CONVERT_SAT_UCHAR_STR
+#undef CONVERT_SAT_UCHAR
+
+#endif // defined(VEC_SIZE)
diff --git a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
index 3d9d520..4e2352c 100644
--- a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
@@ -90,44 +90,29 @@
     bool         window_changed                    = false;
     unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
 
-    // Update processed elements when input is S32 (comes from quantization input)
-    if(input->data_type() == DataType::S32)
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    // Input window
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Bias window
+    if(bias != nullptr)
     {
-        num_elems_processed_per_iteration = 16;
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->dimension(1));
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
     }
 
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
+    // Output window
     if(output != nullptr && (output->total_size() != 0))
     {
         AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-        if(bias == nullptr)
-        {
-            window_changed = update_window_and_padding(win, input_access, output_access);
-        }
-        else
-        {
-            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-            window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
-        }
-
+        window_changed = window_changed || update_window_and_padding(win, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
     else
     {
-        if(bias == nullptr)
-        {
-            window_changed = update_window_and_padding(win, input_access);
-        }
-        else
-        {
-            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-            window_changed = update_window_and_padding(win, input_access, bias_access);
-        }
-
         input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
     }
 
@@ -165,10 +150,13 @@
     _result_shift                 = result_shift;
     _result_offset_after_shift    = result_offset_after_shift;
 
+    const unsigned int num_elems_accessed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
+
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option_if(bias != nullptr, "-DHAS_BIAS");
     build_opts.add_option("-D" + string_from_data_layout(input->info()->data_layout()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("output_stage_quantized", build_opts.options()));
 
     // Set static kernel arguments
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 06ca005..b1290b8 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -262,7 +262,7 @@
             AccessWindowStatic input_access(input->info(),
                                             -border.left,
                                             -border.top,
-                                            ceil_to_multiple(input_width + border.right, kernel_dims.width),
+                                            ceil_to_multiple(input_width + border.right, kernel_dims.width * _num_elems_processed_per_iteration),
                                             input_height + border.bottom);
             update_window_and_padding(win, input_access);
         }
diff --git a/tests/benchmark/fixtures/ConvolutionLayerFixture.h b/tests/benchmark/fixtures/ConvolutionLayerFixture.h
index 338a021..b23c345 100644
--- a/tests/benchmark/fixtures/ConvolutionLayerFixture.h
+++ b/tests/benchmark/fixtures/ConvolutionLayerFixture.h
@@ -46,16 +46,16 @@
                int batches)
     {
         // Set batched in source and destination shapes
-
         src_shape.set(3 /* batch */, batches);
         dst_shape.set(3 /* batch */, batches);
-        DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+        DataType               bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+        const QuantizationInfo qinfo(2.f / 255.f, 127);
 
         // Create tensors
-        src     = create_tensor<TensorType>(src_shape, data_type, 1);
-        weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+        src     = create_tensor<TensorType>(src_shape, data_type, 1, qinfo);
+        weights = create_tensor<TensorType>(weights_shape, data_type, 1, qinfo);
         biases  = create_tensor<TensorType>(biases_shape, bias_data_type, 1);
-        dst     = create_tensor<TensorType>(dst_shape, data_type, 1);
+        dst     = create_tensor<TensorType>(dst_shape, data_type, 1, qinfo);
 
         // Create and configure function
         conv_layer.configure(&src, &weights, &biases, &dst, info, WeightsInfo(), dilation, act_info);