COMPMID-1394: Fixed OCLGrind failures in ColorConvert

Change-Id: Ibf688e68205eac2dc07ce88c96498cc66c153cee
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140440
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
index 72c7493..02a0c8e 100644
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -149,9 +149,9 @@
     f_g += lumav;
     f_b += lumav;
 
-    uchar8 r_0 = convert_uchar8_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_rtz(f_b);
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
 
     uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
                               r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
@@ -207,9 +207,9 @@
     f_g += lumav;
     f_b += lumav;
 
-    uchar8 r_0 = convert_uchar8_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_rtz(f_b);
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
 
     uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                                r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -266,9 +266,9 @@
     f_g += lumav;
     f_b += lumav;
 
-    uchar8 r_0 = convert_uchar8_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_rtz(f_b);
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
 
     uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
                               r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
@@ -324,9 +324,9 @@
     f_g += lumav;
     f_b += lumav;
 
-    uchar8 r_0 = convert_uchar8_rtz(f_r);
-    uchar8 g_0 = convert_uchar8_rtz(f_g);
-    uchar8 b_0 = convert_uchar8_rtz(f_b);
+    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
+    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
+    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
 
     uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                                r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -471,9 +471,9 @@
     float4 f_g = convert_float4(luma_0) + temp1;
     float4 f_b = convert_float4(luma_0) + temp2;
 
-    uchar4 r_0 = convert_uchar4_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_rtz(f_b);
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
 
     uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
     uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
@@ -484,9 +484,9 @@
     f_g = convert_float4(luma_1) + temp1;
     f_b = convert_float4(luma_1) + temp2;
 
-    r_0 = convert_uchar4_rtz(f_r);
-    g_0 = convert_uchar4_rtz(f_g);
-    b_0 = convert_uchar4_rtz(f_b);
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
 
     rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
     rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
@@ -927,9 +927,9 @@
     float4 f_g = convert_float4(luma_0) + temp1;
     float4 f_b = convert_float4(luma_0) + temp2;
 
-    uchar4 r_0 = convert_uchar4_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_rtz(f_b);
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
 
     uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
     uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -940,9 +940,9 @@
     f_g = convert_float4(luma_1) + temp1;
     f_b = convert_float4(luma_1) + temp2;
 
-    r_0 = convert_uchar4_rtz(f_r);
-    g_0 = convert_uchar4_rtz(f_g);
-    b_0 = convert_uchar4_rtz(f_b);
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
 
     rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
     rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -1126,9 +1126,9 @@
     float4 f_g = convert_float4(luma_0) + temp1;
     float4 f_b = convert_float4(luma_0) + temp2;
 
-    uchar4 r_0 = convert_uchar4_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_rtz(f_b);
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
 
     uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
     uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
@@ -1139,9 +1139,9 @@
     f_g = convert_float4(luma_1) + temp1;
     f_b = convert_float4(luma_1) + temp2;
 
-    r_0 = convert_uchar4_rtz(f_r);
-    g_0 = convert_uchar4_rtz(f_g);
-    b_0 = convert_uchar4_rtz(f_b);
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
 
     rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
     rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
@@ -1197,9 +1197,9 @@
     float4 f_g = convert_float4(luma_0) + temp1;
     float4 f_b = convert_float4(luma_0) + temp2;
 
-    uchar4 r_0 = convert_uchar4_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_rtz(f_b);
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
 
     uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
     uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -1210,9 +1210,9 @@
     f_g = convert_float4(luma_1) + temp1;
     f_b = convert_float4(luma_1) + temp2;
 
-    r_0 = convert_uchar4_rtz(f_r);
-    g_0 = convert_uchar4_rtz(f_g);
-    b_0 = convert_uchar4_rtz(f_b);
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
 
     rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
     rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -1525,9 +1525,9 @@
     float4 f_g = convert_float4(luma_0) + temp1;
     float4 f_b = convert_float4(luma_0) + temp2;
 
-    uchar4 r_0 = convert_uchar4_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_rtz(f_b);
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
 
     uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
     uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
@@ -1538,9 +1538,9 @@
     f_g = convert_float4(luma_1) + temp1;
     f_b = convert_float4(luma_1) + temp2;
 
-    r_0 = convert_uchar4_rtz(f_r);
-    g_0 = convert_uchar4_rtz(f_g);
-    b_0 = convert_uchar4_rtz(f_b);
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
 
     rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
     rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
@@ -1604,9 +1604,9 @@
     float4 f_g = convert_float4(luma_0) + temp1;
     float4 f_b = convert_float4(luma_0) + temp2;
 
-    uchar4 r_0 = convert_uchar4_rtz(f_r);
-    uchar4 g_0 = convert_uchar4_rtz(f_g);
-    uchar4 b_0 = convert_uchar4_rtz(f_b);
+    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
+    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
+    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
 
     uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
     uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
@@ -1617,9 +1617,9 @@
     f_g = convert_float4(luma_1) + temp1;
     f_b = convert_float4(luma_1) + temp2;
 
-    r_0 = convert_uchar4_rtz(f_r);
-    g_0 = convert_uchar4_rtz(f_g);
-    b_0 = convert_uchar4_rtz(f_b);
+    r_0 = convert_uchar4_sat_rtz(f_r);
+    g_0 = convert_uchar4_sat_rtz(f_g);
+    b_0 = convert_uchar4_sat_rtz(f_b);
 
     rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
     rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
index 52d9be9..2b89498 100644
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -198,6 +198,7 @@
     ARM_COMPUTE_ERROR_ON(output == nullptr);
 
     unsigned int num_elems_processed_per_iteration = 0;
+    unsigned int num_elems_read_per_iteration_x    = 0;
 
     bool  has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
     float sub_sampling   = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
@@ -212,9 +213,11 @@
                 case Format::NV12:
                 case Format::IYUV:
                     num_elems_processed_per_iteration = 2;
+                    num_elems_read_per_iteration_x    = 8;
                     break;
                 case Format::YUV444:
                     num_elems_processed_per_iteration = 4;
+                    num_elems_read_per_iteration_x    = 16;
                     break;
                 default:
                     break;
@@ -229,6 +232,7 @@
                 case Format::NV12:
                 case Format::IYUV:
                     num_elems_processed_per_iteration = 8;
+                    num_elems_read_per_iteration_x    = 8;
                     break;
                 default:
                     break;
@@ -238,6 +242,7 @@
         default:
             break;
     }
+
     ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
                              string_from_format(input->info()->format()).c_str(),
                              string_from_format(output->info()->format()).c_str());
@@ -248,7 +253,6 @@
     kernel_name << "_to_";
     kernel_name << string_from_format(output->info()->format());
     kernel_name << "_bt709";
-
     _input        = input;
     _multi_output = output;
 
@@ -267,8 +271,10 @@
     AccessWindowRectangle  output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
                                                 num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
 
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_read_per_iteration_x);
+
     update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                              input_access,
                               output_plane0_access,
                               output_plane1_access,
                               output_plane2_access);