COMPMID-516 Increase tolerance rate of Scale, Conv, fully connected and GEMM

This patch also fix the scale kernel issue where it was calcuated the
scale factor inside the gpu but now in the CPU. The GPU and CPU gave
different result for simple float division operation

Change-Id: Ib6709cb6c41dcf4fc0fa4eb79e481430695bf40e
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87266
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index b3398bd..0106ce0 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -70,20 +70,20 @@
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
  * @param[in]  input_width                       Input image width
  * @param[in]  input_height                      Input image height
- * @param[in]  output_width                      Output image width
- * @param[in]  output_height                     Output image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_nearest_neighbour(
     IMAGE_DECLARATION(in),
     IMAGE_DECLARATION(out),
     const float input_width,
     const float input_height,
-    const float output_width,
-    const float output_height)
+    const float scale_x,
+    const float scale_y)
 {
     Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(input_width / output_width, input_height / output_height);
+    const float2 r   = (float2)(scale_x, scale_y);
     const float8 tc  = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
     vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
 }
@@ -104,20 +104,20 @@
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
  * @param[in]  input_width                       Input image width
  * @param[in]  input_height                      Input image height
- * @param[in]  output_width                      Output image width
- * @param[in]  output_height                     Output image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_bilinear(
     IMAGE_DECLARATION(in),
     IMAGE_DECLARATION(out),
     const float input_width,
     const float input_height,
-    const float output_width,
-    const float output_height)
+    const float scale_x,
+    const float scale_y)
 {
     Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(input_width / output_width, input_height / output_height);
+    const float2 r   = (float2)(scale_x, scale_y);
     const float8 tc  = transform_bilinear(get_current_coords(), r);
     vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
 }
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 66afc3d..82ebe64 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -98,9 +98,12 @@
     ICLKernel::configure(win);
 
     // Set static kernel arguments
+    const float scale_x = static_cast<float>(input->info()->dimension(0)) / output->info()->dimension(0);
+    const float scale_y = static_cast<float>(input->info()->dimension(1)) / output->info()->dimension(1);
+
     unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
     _kernel.setArg<float>(idx++, input->info()->dimension(0));
     _kernel.setArg<float>(idx++, input->info()->dimension(1));
-    _kernel.setArg<float>(idx++, output->info()->dimension(0));
-    _kernel.setArg<float>(idx++, output->info()->dimension(1));
+    _kernel.setArg<float>(idx++, scale_x);
+    _kernel.setArg<float>(idx++, scale_y);
 }
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index aa653ac..6b3b5c7 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -114,6 +114,27 @@
     }
 };
 
+/** Data set containing medium tensor shapes. */
+class MediumShapes final : public ShapeDataset
+{
+public:
+    MediumShapes()
+        : ShapeDataset("Shape",
+    {
+        // Batch size 1
+        TensorShape{ 37U, 37U },
+                     TensorShape{ 27U, 33U, 2U },
+                     TensorShape{ 128U, 64U, 1U, 3U },
+                     // Batch size 4
+                     TensorShape{ 37U, 37U, 3U, 4U },
+                     TensorShape{ 27U, 33U, 2U, 4U },
+                     // Arbitrary batch size
+                     TensorShape{ 37U, 37U, 3U, 5U }
+    })
+    {
+    }
+};
+
 /** Data set containing large tensor shapes. */
 class LargeShapes final : public ShapeDataset
 {
@@ -185,17 +206,13 @@
         : ShapeDataset("InputShape",
     {
         // Batch size 1
-        TensorShape{ 5U, 5U, 3U },
+        TensorShape{ 35U, 35U, 3U },
                      TensorShape{ 32U, 37U, 3U },
-                     TensorShape{ 13U, 15U, 8U },
                      // Batch size 4
-                     TensorShape{ 5U, 5U, 3U, 4U },
                      TensorShape{ 32U, 37U, 3U, 4U },
-                     TensorShape{ 13U, 15U, 8U, 4U },
                      // Batch size 8
-                     TensorShape{ 5U, 5U, 3U, 8U },
                      TensorShape{ 32U, 37U, 3U, 8U },
-                     TensorShape{ 13U, 15U, 8U, 8U },
+                     TensorShape{ 33U, 35U, 8U, 8U },
                      // Arbitrary batch size
                      TensorShape{ 32U, 37U, 3U, 8U }
     })
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index b2fd22e..a6e0724 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -43,10 +43,10 @@
 {
 namespace
 {
-RelativeTolerance<float>           tolerance_f32(0.001f);    /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-RelativeTolerance<half>            tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-constexpr AbsoluteTolerance<float> tolerance_q(1.0f);        /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
-constexpr float                    tolerance_num = 0.07f;    /**< Tolerance number */
+RelativeTolerance<float>            tolerance_f32(0.05f);                 /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float>  tolerance_q(1.0f);                    /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index 22f27e5..35b9d29 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -43,9 +43,9 @@
 namespace
 {
 /** Tolerance for float operations */
-RelativeTolerance<float> tolerance_f32(0.001f);
-RelativeTolerance<half>  tolerance_f16(half(0.2));
-constexpr float          tolerance_num = 0.07f; /**< Tolerance number */
+RelativeTolerance<float>            tolerance_f32(0.05f);
+RelativeTolerance<half_float::half> tolerance_f16(half(0.2));
+constexpr float                     tolerance_num = 0.07f; /**< Tolerance number */
 
 /** Tolerance for fixed point operations */
 constexpr AbsoluteTolerance<float> tolerance_fixed_point(1.f);
diff --git a/tests/validation/CL/GEMM.cpp b/tests/validation/CL/GEMM.cpp
index 8545519..62671e3 100644
--- a/tests/validation/CL/GEMM.cpp
+++ b/tests/validation/CL/GEMM.cpp
@@ -43,9 +43,10 @@
 {
 namespace
 {
-RelativeTolerance<float>           tolerance_f32(0.001f);    /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
-RelativeTolerance<half>            tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
-constexpr AbsoluteTolerance<float> tolerance_q(1.0f);        /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+RelativeTolerance<float>            tolerance_f32(0.001f);    /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+RelativeTolerance<half_float::half> tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+constexpr AbsoluteTolerance<float>  tolerance_q(1.0f);        /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+constexpr float                     tolerance_num = 0.02f;    /**< Tolerance number */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -92,13 +93,13 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), framework::dataset::make("DataType",
                                                                                                DataType::F16)))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END()
 
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index 6757bd5..1a458b7 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -57,14 +57,14 @@
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-RelativeTolerance<float>             tolerance_f32(0.01);
+RelativeTolerance<float>             tolerance_f32(0.05);
 RelativeTolerance<half>              tolerance_f16(half(0.1));
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(Scale)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), ScaleDataTypes),
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::MediumShapes(), datasets::LargeShapes()), ScaleDataTypes),
                                                                            framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                    datasets::BorderModes()),
                shape, data_type, policy, border_mode)
diff --git a/tests/validation/CPP/Scale.cpp b/tests/validation/CPP/Scale.cpp
index ba34553..74489aa 100644
--- a/tests/validation/CPP/Scale.cpp
+++ b/tests/validation/CPP/Scale.cpp
@@ -166,4 +166,4 @@
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index e461633..7d4ce57 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -269,7 +269,7 @@
             return true;
         }
 
-        const U epsilon = (std::is_same<half, typename std::remove_cv<U>::type>::value || (this->_reference == 0)) ? static_cast<U>(0.01) : std::numeric_limits<U>::epsilon();
+        const U epsilon = (std::is_same<half, typename std::remove_cv<U>::type>::value || (this->_reference == 0)) ? static_cast<U>(0.01) : static_cast<U>(1e-06);
 
         if(std::abs(static_cast<double>(this->_reference) - static_cast<double>(this->_target)) <= epsilon)
         {
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index ba252fb..6fa810a 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -124,4 +124,4 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SCALE_FIXTURE */
\ No newline at end of file
+#endif /* ARM_COMPUTE_TEST_SCALE_FIXTURE */