COMPMID-1176: Add FP16 support in CLDeconvolutionLayer.

Change-Id: Ic82ca002220fa31d8618a55084ff1dfc2585bea7
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/131944
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index f31560c..d63f5d4 100644
--- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -50,16 +50,16 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input        Source tensor. Data types supported: F32.
-     * @param[out] output       Destination tensor. Data types supported: F32. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  input        Source tensor. Data types supported: F16/F32.
+     * @param[out] output       Destination tensor. Data types supported: F16/F32. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  inner_border Top and right inner border sizes. These rows and columns will be filled with zero.
      * @param[in]  info         Contains padding and stride information described in @ref PadStrideInfo.
      */
     void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, const PadStrideInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
      *
-     * @param[in] input        Source tensor info. Data types supported: F32.
-     * @param[in] output       Destination tensor info. Data types supported: F32. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] input        Source tensor info. Data types supported: F16/F32.
+     * @param[in] output       Destination tensor info. Data types supported: F16/F32. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled with zero.
      * @param[in] info         Contains padding and stride information described in @ref PadStrideInfo.
      *
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index a66fc25..f29039a 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -64,7 +64,7 @@
     CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input, weights, biases and output tensors.
      *
-     * @param[in,out] input              Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32.
+     * @param[in,out] input              Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F16/F32.
      * @param[in]     weights            The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
      * @param[in]     bias               (Optional) The biases have one dimension. Data type supported: Same as @p input.
      * @param[out]    output             Output tensor. The output has the same number of dimensions as the @p input.
@@ -77,7 +77,7 @@
                    unsigned int inner_border_right, unsigned int inner_border_top);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
      *
-     * @param[in] input              Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32.
+     * @param[in] input              Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F16/F32.
      * @param[in] weights            The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
      * @param[in] bias               (Optional) The biases have one dimension. Data type supported: Same as @p input.
      * @param[in] output             Output tensor info. The output has the same number of dimensions as the @p input.
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index 2514ddc..794f4aa 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,13 @@
 
 /** This function applies upsample on an input image.
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16/F32
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F32
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F16/F32
  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
@@ -46,5 +46,5 @@
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
     // Store result
-    *((__global float *)dst.ptr) = *((__global float *)src.ptr);
+    *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
 }
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 650c5b8..e7cdf8c 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -43,7 +43,7 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
@@ -74,7 +74,9 @@
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), inner_border, info));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("deconvolution_upsample"));
+    CLBuildOptions build_opts;
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
 
     constexpr unsigned int num_elems_processed_per_iteration = 1;
 
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index cb8dc02..4d971f3 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -46,7 +46,7 @@
                                       unsigned int inner_border_right, unsigned int inner_border_top)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp
index 58a2026..758cf36 100644
--- a/tests/validation/CL/DeconvolutionLayer.cpp
+++ b/tests/validation/CL/DeconvolutionLayer.cpp
@@ -43,7 +43,9 @@
 {
 namespace
 {
-constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */
+constexpr AbsoluteTolerance<float>  tolerance_fp32(0.001f);               /**< Tolerance for floating point tests */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's for DataType::F16 */
+constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
 
 const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3)
                      * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("ax", 0) * framework::dataset::make("ay", 0) * framework::dataset::make("NumKernels", { 1, 3 });
@@ -169,10 +171,9 @@
 using CLDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 1, 1>;
 
 TEST_SUITE(Float)
-
 TEST_SUITE(FP32)
-TEST_SUITE(W4x4)
 
+TEST_SUITE(W4x4)
 FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<float>, framework::DatasetMode::ALL, combine(data4x4, framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
@@ -181,7 +182,6 @@
 TEST_SUITE_END()
 
 TEST_SUITE(W3x3)
-
 FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(data3x3, framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
@@ -198,6 +198,34 @@
 TEST_SUITE_END()
 
 TEST_SUITE_END()
+
+TEST_SUITE(FP16)
+
+TEST_SUITE(W4x4)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<half>, framework::DatasetMode::ALL, combine(data4x4, framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(data3x3, framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(W1x1)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<half>, framework::DatasetMode::ALL, combine(data1x1, framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
+
+TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END()
diff --git a/tests/validation/fixtures/DeconvolutionLayerFixture.h b/tests/validation/fixtures/DeconvolutionLayerFixture.h
index f2455f3..137068a 100644
--- a/tests/validation/fixtures/DeconvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DeconvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,16 +58,14 @@
     template <typename U>
     void fill(U &&tensor, int i)
     {
-        switch(tensor.data_type())
+        if(is_data_type_float(tensor.data_type()))
         {
-            case DataType::F32:
-            {
-                std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            default:
-                library->fill_tensor_uniform(tensor, i);
+            std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, i);
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, i);
         }
     }
 
diff --git a/tests/validation/reference/DeconvolutionLayer.cpp b/tests/validation/reference/DeconvolutionLayer.cpp
index 617f690..3543708 100644
--- a/tests/validation/reference/DeconvolutionLayer.cpp
+++ b/tests/validation/reference/DeconvolutionLayer.cpp
@@ -91,6 +91,8 @@
 
 template SimpleTensor<float> deconvolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
                                                  const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a);
+template SimpleTensor<half> deconvolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
+                                                const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a);
 } // namespace reference
 } // namespace validation
 } // namespace test