COMPMID-3501 Modify heuristics for f16+fastmath NEON Winograd Conv

* Disable winograd on certain layers of squeezenet v1.1
* Fix winograd validate_kernel_3x3

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I380c6e4a0f8338056839df3c8810f726227f210f
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3348
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 4a77991..62eabb2 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -181,6 +181,39 @@
         {
             return ConvolutionMethod::GEMM;
         }
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        // This heuristic only applies to the F16 data type on A55r1
+        if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+        {
+            // Exclude known bad winograd configs (and defaults to GEMM)
+            const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
+            {
+                // Squeezenet_V1_1 fire2 and fire3
+                ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                // Squeezenet_V1_1 fire6 and fire7
+                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                // Squeezenet_V1_1 fire8 and fire9
+                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+            };
+            const auto find_conv_config = [&](ConvolutionConfiguration c)
+            {
+                const PadStrideInfo info = std::get<3>(c);
+
+                return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+                       && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+                       && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+            };
+
+            bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
+                                          find_conv_config)
+                             != known_bad_winograd_f16_with_fastmath_configs.end();
+            if(found_bad)
+            {
+                return ConvolutionMethod::GEMM;
+            }
+        }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
     }
 }
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index a74e710..88d8a75 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -62,7 +62,7 @@
         }
     }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    else if(input->data_type() == DataType::F32)
+    else if(input->data_type() == DataType::F16)
     {
         ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
         ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));