COMPMID-1266: Support for FP16 in CLWinogradConvolutionLayer

Added FP16 support to CLWinogradConvolutionLayer for both 3x3 and 5x5 kernels (COMPMID-937).
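
For reference, a minimal usage sketch of the FP16 path (not part of this patch;
the shapes, pad/stride values and the fast-math flag are illustrative
assumptions, chosen to mirror the fast-math fixtures used in the tests below):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Illustrative shapes: 56x56x64 input, 64 filters of size 3x3.
        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F16));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F16));
        biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F16));
        dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F16));

        // Stride 1 with padding 1 preserves the spatial size for a 3x3 kernel.
        const PadStrideInfo conv_info(1, 1, 1, 1);

        CLWinogradConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst, conv_info,
                       ActivationLayerInfo(), /* enable_fast_math */ true);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        // Fill src/weights/biases here before running.
        conv.run();
        CLScheduler::get().sync();
        return 0;
    }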

Change-Id: I0f394cbdc978dd04176416e9f612aca3986b09e6
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145537
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index bb83f5a..3762e39 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -55,7 +55,9 @@
 // *INDENT-OFF*
 // clang-format off
 constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
+const AbsoluteTolerance<half> tolerance_f16(half(0.5f));
 constexpr AbsoluteTolerance<float> tolerance_convolution_layer_f32(0.1f);
+const AbsoluteTolerance<half> tolerance_convolution_layer_f16(half(0.4f));
 
 // Input transform
 const auto SmallWinogradInputTransformDatasetNCHW =
@@ -176,13 +178,14 @@
     ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
 }
 
-using CLWinogradInputTransformFixture = WinogradInputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradInputTransform, float>;
+using CLWinogradInputTransformFixtureFP32 = WinogradInputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradInputTransform, float>;
+using CLWinogradInputTransformFixtureFP16 = WinogradInputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradInputTransform, half>;
 
 TEST_SUITE(NCHW)
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(SmallWinogradInputTransformDatasetNCHW,
                                                                                                       LargeWinogradInputTransformDatasetNCHW),
                                                                            framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                           framework::dataset::make("DataType", { DataType::F32 })),
+                                                                           framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
                shape_in, winograd_info, data_layout, data_type)
 {
     TensorInfo  tensor_info_in(shape_in, 1, data_type);
@@ -204,26 +207,44 @@
     winograd_input_transform.configure(&in, &out, winograd_info);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixture, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNCHW,
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNCHW,
                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                      framework::dataset::make("DataType", { DataType::F32 })))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixture, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNCHW,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNCHW,
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                    framework::dataset::make("DataType", { DataType::F32 })))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNCHW,
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                     framework::dataset::make("DataType", { DataType::F16 })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNCHW,
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                   framework::dataset::make("DataType", { DataType::F16 })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
 TEST_SUITE_END() // NCHW
 
 TEST_SUITE(NHWC)
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(SmallWinogradInputTransformDatasetNHWC,
                                                                                                       LargeWinogradInputTransformDatasetNHWC),
                                                                            framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                           framework::dataset::make("DataType", { DataType::F32 })),
+                                                                           framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
                shape_in, winograd_info, data_layout, data_type)
 {
     TensorShape shape_in_nhwc(shape_in);
@@ -251,26 +272,43 @@
     winograd_input_transform.configure(&in, &out, winograd_info);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixture, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNHWC,
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNHWC,
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                     framework::dataset::make("DataType", { DataType::F16 })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNHWC,
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("DataType", { DataType::F16 })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNHWC,
                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                                                                                                      framework::dataset::make("DataType", { DataType::F32 })))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixture, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNHWC,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNHWC,
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                                                                                                    framework::dataset::make("DataType", { DataType::F32 })))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE_END() // FP32
 TEST_SUITE_END() // NHWC
 TEST_SUITE_END() // InputTransform
 
 TEST_SUITE(FilterTransform)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                 framework::dataset::make("InputInfo",{
-                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F16),     // F16 not supported
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F16),     // F16 supported
                                                                                         TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
                                                                                         TensorInfo(TensorShape(5U, 5U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
                                                                                         TensorInfo(TensorShape(3U, 3U), 1, DataType::F32),             // Output tile not supported
@@ -296,21 +334,22 @@
                                                                                           WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
                                                                                           WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ )
                                                                                          })),
-                                                framework::dataset::make("Expected", { false, false, false, false, true, true, true })),
+                                                framework::dataset::make("Expected", { true, false, false, false, true, true, true })),
                                             input_info, output_info, winograd_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLWinogradFilterTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
 }
 
 using CLWinogradFilterTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradFilterTransformKernel, 0>;
-using CLWinogradFilterTransformFixture = WinogradFilterTransformValidationFixture<CLTensor, CLAccessor, CLWinogradFilterTransform, float>;
+using CLWinogradFilterTransformFixtureFP32 = WinogradFilterTransformValidationFixture<CLTensor, CLAccessor, CLWinogradFilterTransform, float>;
+using CLWinogradFilterTransformFixtureFP16 = WinogradFilterTransformValidationFixture<CLTensor, CLAccessor, CLWinogradFilterTransform, half>;
 
 TEST_SUITE(NCHW)
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL,
                combine(combine(framework::dataset::concat(SmallWinogradFilterTransformDatasetNCHW,
                                                           LargeWinogradFilterTransformDatasetNCHW),
                                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                          framework::dataset::make("DataType", { DataType::F32 })),
+                                                          framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
                shape_a, output_tile, data_layout, data_type)
 {
     WinogradInfo winograd_info(output_tile, Size2D(shape_a[0], shape_a[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, data_layout /* Not needed */);
@@ -329,7 +368,8 @@
     winograd_filter_transform.configure(&a, &b, winograd_info);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::PRECOMMIT,
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::PRECOMMIT,
                        combine(combine(SmallWinogradFilterTransformDatasetNCHW,
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                        framework::dataset::make("DataType", { DataType::F32 })))
@@ -338,7 +378,7 @@
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
                        combine(combine(LargeWinogradFilterTransformDatasetNCHW,
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                        framework::dataset::make("DataType", { DataType::F32 })))
@@ -346,6 +386,26 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE_END() // FP32
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(SmallWinogradFilterTransformDatasetNCHW,
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                       framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(LargeWinogradFilterTransformDatasetNCHW,
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                       framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
 TEST_SUITE_END() // NCHW
 
 TEST_SUITE(NHWC)
@@ -353,7 +413,7 @@
                combine(combine(framework::dataset::concat(SmallWinogradFilterTransformDatasetNHWC,
                                                           LargeWinogradFilterTransformDatasetNHWC),
                                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                          framework::dataset::make("DataType", { DataType::F32 })),
+                                                          framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
                shape_in, output_tile, data_layout, data_type)
 {
     TensorShape shape_in_nhwc(shape_in);
@@ -381,7 +441,27 @@
     winograd_filter_transform.configure(&a, &b, winograd_info);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::PRECOMMIT,
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(SmallWinogradFilterTransformDatasetNHWC,
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                       framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(LargeWinogradFilterTransformDatasetNHWC,
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                       framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::PRECOMMIT,
                        combine(combine(SmallWinogradFilterTransformDatasetNHWC,
                                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                        framework::dataset::make("DataType", { DataType::F32 })))
@@ -390,7 +470,7 @@
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
                        combine(combine(LargeWinogradFilterTransformDatasetNHWC,
                                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                        framework::dataset::make("DataType", { DataType::F32 })))
@@ -398,13 +478,14 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE_END() // FP32
 TEST_SUITE_END() // NHWC
 TEST_SUITE_END() // FilterTransform
 
 TEST_SUITE(OutputTransform)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                 framework::dataset::make("InputInfo",{
-                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F16),      // F16 not supported
+                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F16),      // F16 supported
                                                                                         TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::QASYMM8),  // QASYMM8 not supported
                                                                                         TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32),      // Kernel size not supported
                                                                                         TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32),      // Valid
@@ -447,19 +528,20 @@
                                                                                         WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
                                                                                         WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)
                                                                                     })),
-                                                framework::dataset::make("Expected", { false, false, false, true, false, true, false, true, false })),
+                                                framework::dataset::make("Expected", { true, false, false, true, false, true, false, true, false })),
                                             input_info, bias_info, output_info, winograd_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
 }
 
 using CLWinogradOutputTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>;
-using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>;
+using CLWinogradOutputTransformFixtureFP32 = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>;
+using CLWinogradOutputTransformFixtureFP16 = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, half>;
 
 TEST_SUITE(NCHW)
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(SmallWinogradOutputTransformDatasetNCHW,
                                                                                               LargeWinogradOutputTransformDatasetNCHW),
-                                                                                              framework::dataset::make("DataType", { DataType::F32 })),
+                                                                                              framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
                shape_a, winograd_info, data_type)
 {
     TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info);
@@ -475,8 +557,25 @@
     CLWinogradOutputTransform winograd_output_transform;
     winograd_output_transform.configure(&a, nullptr, &b, winograd_info);
 }
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::ALL,
+                       combine(SmallWinogradOutputTransformDatasetNCHW,
+                               framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
+                       combine(LargeWinogradOutputTransformDatasetNCHW,
+                               framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::ALL,
                        combine(SmallWinogradOutputTransformDatasetNCHW,
                                framework::dataset::make("DataType", { DataType::F32 })))
 {
@@ -484,19 +583,20 @@
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
                        combine(LargeWinogradOutputTransformDatasetNCHW,
                                framework::dataset::make("DataType", { DataType::F32 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE_END() // FP32
 TEST_SUITE_END() // NCHW
 
 TEST_SUITE(NHWC)
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(SmallWinogradOutputTransformDatasetNHWC,
                                                                                               LargeWinogradOutputTransformDatasetNHWC),
-                                                                                              framework::dataset::make("DataType", { DataType::F32 })),
+                                                                                              framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
                shape_a, winograd_info, data_type)
 {
     TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info);
@@ -513,7 +613,25 @@
     winograd_output_transform.configure(&a, nullptr, &b, winograd_info);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL,
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::ALL,
+                       combine(SmallWinogradOutputTransformDatasetNHWC,
+                               framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
+                       combine(LargeWinogradOutputTransformDatasetNHWC,
+                               framework::dataset::make("DataType", { DataType::F16 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::ALL,
                        combine(SmallWinogradOutputTransformDatasetNHWC,
                                framework::dataset::make("DataType", { DataType::F32 })))
 {
@@ -521,41 +639,42 @@
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
                        combine(LargeWinogradOutputTransformDatasetNHWC,
                                framework::dataset::make("DataType", { DataType::F32 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE_END() // FP32
 TEST_SUITE_END() // NHWC
 TEST_SUITE_END() // OutputTransform
 
 TEST_SUITE(ConvolutionLayer)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                 framework::dataset::make("InputInfo", {
-                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // FP16 not supported
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // Insufficient padding
                                                                                         TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),     // Datatype mismatch
                                                                                         TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
                                                                                         TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32),     // Padding needed
                                                                                         TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)  // Kernel size not supported
                                                                                       }),
                                                 framework::dataset::make("WeightsInfo", {
-                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F16),
                                                                                         TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
                                                                                         TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
                                                                                         })),
                                                 framework::dataset::make("BiasesInfo", {
-                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F16),
                                                                                         TensorInfo(TensorShape(19U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(21U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(16U), 1, DataType::F32)
                                                                                        })),
                                                 framework::dataset::make("OutputInfo", {
-                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F16),
                                                                                         TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
@@ -574,6 +693,7 @@
     ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS);
 }
 
+TEST_SUITE(FP32)
 using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>;
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
@@ -712,6 +832,151 @@
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 TEST_SUITE_END() // Conv1x5
+TEST_SUITE_END() // FP32
+
+
+TEST_SUITE(FP16)
+
+using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, half>;
+TEST_SUITE(Conv3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+TEST_SUITE_END() // Conv3x3
+
+TEST_SUITE(Conv3x1)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F16 })),
+                                       framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F16 })),
+                                       framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+TEST_SUITE_END() // Conv3x1
+
+TEST_SUITE(Conv1x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F16 })),
+                                       framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F16 })),
+                                       framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+TEST_SUITE_END() // Conv1x3
+
+TEST_SUITE(Conv5x5)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+TEST_SUITE_END() // Conv5x5
+
+TEST_SUITE(Conv5x1)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+TEST_SUITE_END() // Conv5x1
+
+TEST_SUITE(Conv1x5)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
+                                               framework::dataset::make("DataType", { DataType::F16 })),
+                                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16);
+}
+TEST_SUITE_END() // Conv1x5
+
+TEST_SUITE_END() // FP16
 
 TEST_SUITE_END() // ConvolutionLayer
 TEST_SUITE_END() // Winograd
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 1044e32..fd034b6 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -169,7 +169,8 @@
     return dst;
 }
 
-void matrix_multiply(const SimpleTensor<float> &a, const SimpleTensor<float> &b, SimpleTensor<float> &out)
+template <typename T>
+void matrix_multiply(const SimpleTensor<T> &a, const SimpleTensor<T> &b, SimpleTensor<T> &out)
 {
     ARM_COMPUTE_ERROR_ON(a.shape()[0] != b.shape()[1]);
     ARM_COMPUTE_ERROR_ON(a.shape()[1] != out.shape()[1]);
@@ -194,7 +195,8 @@
     }
 }
 
-void transpose_matrix(const SimpleTensor<float> &in, SimpleTensor<float> &out)
+template <typename T>
+void transpose_matrix(const SimpleTensor<T> &in, SimpleTensor<T> &out)
 {
     ARM_COMPUTE_ERROR_ON((in.shape()[0] != out.shape()[1]) || (in.shape()[1] != out.shape()[0]));
 
@@ -301,7 +303,14 @@
 }
 
 template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
+template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord);
 template void zeros(SimpleTensor<float> &in, const Coordinates &anchor, const TensorShape &shape);
+template void zeros(SimpleTensor<half> &in, const Coordinates &anchor, const TensorShape &shape);
+template void transpose_matrix(const SimpleTensor<float> &in, SimpleTensor<float> &out);
+template void transpose_matrix(const SimpleTensor<half> &in, SimpleTensor<half> &out);
+template void matrix_multiply(const SimpleTensor<float> &a, const SimpleTensor<float> &b, SimpleTensor<float> &out);
+template void matrix_multiply(const SimpleTensor<half> &a, const SimpleTensor<half> &b, SimpleTensor<half> &out);
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
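
Note on the Helpers.cpp change above: matrix_multiply() and transpose_matrix()
are now templates whose definitions live in the .cpp file, so each element type
the tests use must be explicitly instantiated at the end of that file (as done
above for float and half). A generic sketch of the pattern, using a
hypothetical scale() helper and float/double for self-containment:

    // scale.h -- declaration only; the definition stays out of the header.
    template <typename T>
    void scale(T *data, int n, T factor);

    // scale.cpp -- definition plus explicit instantiations.
    template <typename T>
    void scale(T *data, int n, T factor)
    {
        for(int i = 0; i < n; ++i)
        {
            data[i] *= factor;
        }
    }

    // Without these instantiations, callers in other translation units would
    // hit undefined-reference link errors for the types they use.
    template void scale(float *data, int n, float factor);
    template void scale(double *data, int n, double factor);
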
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index e5ba148..8b99494 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -237,7 +237,8 @@
  * @param[out] out Output tensor
  *
  */
-void matrix_multiply(const SimpleTensor<float> &a, const SimpleTensor<float> &b, SimpleTensor<float> &out);
+template <typename T>
+void matrix_multiply(const SimpleTensor<T> &a, const SimpleTensor<T> &b, SimpleTensor<T> &out);
 
 /** Transpose matrix
  *
@@ -245,7 +246,8 @@
  * @param[out] out Output tensor
  *
  */
-void transpose_matrix(const SimpleTensor<float> &in, SimpleTensor<float> &out);
+template <typename T>
+void transpose_matrix(const SimpleTensor<T> &in, SimpleTensor<T> &out);
 
 /** Get a 2D tile from a tensor
  *
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
index aba3eff..41f16d3 100644
--- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -70,6 +70,7 @@
     {
         switch(tensor.data_type())
         {
+            case DataType::F16:
             case DataType::F32:
             {
                 std::uniform_real_distribution<> distribution(min, max);
@@ -175,6 +176,7 @@
     {
         switch(tensor.data_type())
         {
+            case DataType::F16:
             case DataType::F32:
             {
                 std::uniform_real_distribution<> distribution(min, max);
@@ -318,6 +320,7 @@
     {
         switch(tensor.data_type())
         {
+            case DataType::F16:
             case DataType::F32:
             {
                 std::uniform_real_distribution<> distribution(min, max);
@@ -401,6 +404,7 @@
     {
         switch(tensor.data_type())
         {
+            case DataType::F16:
             case DataType::F32:
             {
                 std::uniform_real_distribution<> distribution(min, max);
@@ -481,6 +485,7 @@
     {
         switch(tensor.data_type())
         {
+            case DataType::F16:
             case DataType::F32:
             {
                 std::uniform_real_distribution<> distribution(min, max);
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 132d252..3c2c11d 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -232,7 +232,7 @@
     initialize_matrix_transform(matrix, output_tile_size, kernel_size, WinogradTransformType::INPUT);
 
     // Transpose matrix
-    transpose_matrix(matrix, matrix_transposed);
+    transpose_matrix<T>(matrix, matrix_transposed);
 
     const int in_w        = in.shape().x();
     const int in_h        = in.shape().y();
@@ -293,14 +293,14 @@
                     int yi = y * step_y - conv_info.pad_top();
 
                     // Get the tile from the input tensor
-                    get_tile(in, src_tile, Coordinates(xi, yi, z, b));
+                    get_tile<T>(in, src_tile, Coordinates(xi, yi, z, b));
 
                     // Fill partially with zeros in case of 1D convolution
-                    zeros(src_tile, anchor_zeros, shape_zeros);
+                    zeros<T>(src_tile, anchor_zeros, shape_zeros);
 
                     // Compute the transformation
-                    matrix_multiply(matrix, src_tile, tmp_tile);
-                    matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
+                    matrix_multiply<T>(matrix, src_tile, tmp_tile);
+                    matrix_multiply<T>(tmp_tile, matrix_transposed, dst_tile);
 
                     // Store the output tile across the channels
                     for(int i = 0; i < out_d; ++i)
@@ -358,7 +358,7 @@
     initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::FILTER);
 
     // Transpose the transformation matrix
-    transpose_matrix(trans_matrix, trans_matrix_transposed);
+    transpose_matrix<T>(trans_matrix, trans_matrix_transposed);
 
     const int num_channels = in.shape()[2];
     const int num_filters  = in.shape()[3];
@@ -374,13 +374,13 @@
             for(int z = 0; z < num_channels; ++z)
             {
                 // Load the tile from the input tensor
-                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+                get_tile<T>(in, input_tile, Coordinates(0, 0, z, w, n));
 
                 // First transformation
-                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+                matrix_multiply<T>(trans_matrix, input_tile, tmp_tile);
 
                 // Second transformation
-                matrix_multiply(tmp_tile, trans_matrix_transposed, transf_tile);
+                matrix_multiply<T>(tmp_tile, trans_matrix_transposed, transf_tile);
 
                 // Store the output tile across the channels
                 const int output_offset = w + z * num_filters;
@@ -451,7 +451,7 @@
     initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::OUTPUT);
 
     // Transpose the transformation matrix
-    transpose_matrix(trans_matrix, trans_matrix_transposed);
+    transpose_matrix<T>(trans_matrix, trans_matrix_transposed);
 
     const int w_in        = in.shape()[0];
     const int h_in        = in.shape()[1];
@@ -487,7 +487,7 @@
     const int step_y_transf_tile = kernel_size.width == 1 ? 1 : output_tile.shape()[0];
 
     // Initialize with zeros the input tile
-    zeros(input_tile, Coordinates(0, 0), input_tile.shape());
+    zeros<T>(input_tile, Coordinates(0, 0), input_tile.shape());
 
     for(int n = 0; n < num_batches; ++n)
     {
@@ -502,10 +502,10 @@
                 }
 
                 // First transformation
-                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+                matrix_multiply<T>(trans_matrix, input_tile, tmp_tile);
 
                 // Second transformation
-                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+                matrix_multiply<T>(tmp_tile, trans_matrix_transposed, output_tile);
 
                 // Store the output tile
                 const int xo = (y % num_tiles_x) * out_tile_w;
@@ -538,6 +538,10 @@
 template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
 template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
 template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const SimpleTensor<float> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<half> winograd_filter_transform(const SimpleTensor<half> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<half> winograd_input_transform(const SimpleTensor<half> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<half> winograd_output_transform(const SimpleTensor<half> &in, const SimpleTensor<half> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+
 } // namespace reference
 } // namespace validation
 } // namespace test