Disable unsafe FP optimizations in Winograd Output Transform

The unsafe floating-point optimizations enabled by -cl-fast-relaxed-math cause accuracy issues on Bifrost GPUs other than the G71 when a bounded activation (BOUNDED_RELU or LU_BOUNDED_RELU) is applied to F16 or F32 data. For that combination, build the Winograd output transform kernel with -cl-unsafe-math-optimizations only, so that -cl-finite-math-only is not applied.
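
For context: -cl-fast-relaxed-math implies -cl-finite-math-only, which lets the compiler assume no NaN or Inf values ever occur. A bounded ReLU is a clamp built from fmax/fmin, whose IEEE-754 behaviour for non-finite inputs is well defined, so that assumption is a plausible source of the mismatch. A minimal illustration (plain C++, not ComputeLibrary code):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    // IEEE-754 fmax/fmin return the non-NaN operand, so bounded_relu(NaN, 6)
    // yields 0 and bounded_relu(+Inf, 6) yields 6. A -cl-finite-math-only
    // build may assume x is finite and fold this clamp differently.
    static float bounded_relu(float x, float upper)
    {
        return std::fmin(std::fmax(x, 0.0f), upper);
    }

    int main()
    {
        const float inf = std::numeric_limits<float>::infinity();
        std::printf("%f\n", bounded_relu(inf, 6.0f));           // 6.000000
        std::printf("%f\n", bounded_relu(std::nanf(""), 6.0f)); // 0.000000
        return 0;
    }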

Resolves: COMPMID-5375
Change-Id: I1b0b06549b8c108617962d006a20dd263d5e3c21
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8061
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index 632fd26..9eb249a 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -178,14 +178,31 @@
 
     _num_tiles_x = num_tiles.width;
 
+    // The conditions under which -cl-fast-relaxed-math causes accuracy issues are tracked in COMPMID-5324
+    const GPUTarget gpu_target    = get_target();
+    const auto      act_function  = act_info.activation();
+    const auto      src_data_type = src->data_type();
+
+    if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+       && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+       && (src_data_type == DataType::F32 || src_data_type == DataType::F16))
+    {
+        // -cl-fast-relaxed-math implies both -cl-finite-math-only and -cl-unsafe-math-optimizations.
+        // To drop only -cl-finite-math-only, we add -cl-unsafe-math-optimizations by itself.
+        build_opts.add_option("-cl-unsafe-math-optimizations");
+    }
+    else
+    {
+        build_opts.add_option("-cl-fast-relaxed-math");
+    }
+
     if(_is_nhwc)
     {
         build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
-        build_opts.add_option("-cl-fast-relaxed-math");
         build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
         build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
         build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
         build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
         build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
         build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
@@ -194,12 +211,11 @@
     else
     {
         build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
-        build_opts.add_option("-cl-fast-relaxed-math");
         build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
         build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
         build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
         build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1)));
         build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width)));
         build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
@@ -209,9 +225,9 @@
     }
 
     // Storing tensor dimensions to be sent later as kernel arguments
-    _src_height  = src->dimension(1);
-    _dst_width   = dst->dimension(idx_width);
-    _dst_height  = dst->dimension(idx_height);
+    _src_height = src->dimension(1);
+    _dst_width  = dst->dimension(idx_width);
+    _dst_height = dst->dimension(idx_height);
 
     // Create kernel
     std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
@@ -223,7 +239,7 @@
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
     _config_id += "_";
-    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += lower_string(string_from_data_type(src_data_type));
     _config_id += "_";
     _config_id += support::cpp11::to_string(src->dimension(0));
     _config_id += "_";
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
index ffa1eff..b4163a5 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -214,6 +214,7 @@
                                                                                                                   (src->data_type() == DataType::F16)));
 
     // Configure output transform
+    _output_transform->set_target(CLScheduler::get().target());
     _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
 
     _aux_mem                             = _batched_mm.workspace();
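
Note on the ClWinogradConv2d hunk: the kernel-side check calls get_target(), which is only meaningful if the operator first propagates the scheduler's GPU target, hence the added set_target() call before configure(). A condensed sketch of that hand-off (simplified; member names and the default value are assumptions, not the actual ICLKernel layout):

    enum class GPUTarget { UNKNOWN, BIFROST, G71 /* ... */ };

    class KernelBase
    {
    public:
        void      set_target(GPUTarget target) { _target = target; }
        GPUTarget get_target() const           { return _target; }

    private:
        // Without an explicit set_target() from the operator, configure()
        // would see this default and the Bifrost workaround would never fire.
        GPUTarget _target{ GPUTarget::UNKNOWN };
    };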