COMPMID-429: Port CLSoftmaxLayer to QS16.

Change-Id: I3a0394364629654747439372d32f692b6ca29ee0
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80219
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index 024169a..137b60b 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -36,7 +36,7 @@
 public:
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
@@ -58,7 +58,7 @@
     CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
      * @param[in]  max    Max values tensor. Data types supported: same as @p input
      * @param[out] output Destination tensor. Data types supported: same as @p input
      * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input
@@ -91,7 +91,7 @@
     CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
      * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index a5b19bd..18f7a02 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -49,7 +49,7 @@
     CLSoftmaxLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index bf126a2..9fd3a6f 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -116,6 +116,11 @@
 MAXQ_IMPL(qs8x4)
 MAXQ_IMPL(qs8x8)
 MAXQ_IMPL(qs8x16)
+MAXQ_IMPL(qs16x1)
+MAXQ_IMPL(qs16x2)
+MAXQ_IMPL(qs16x4)
+MAXQ_IMPL(qs16x8)
+MAXQ_IMPL(qs16x16)
 
 #define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
 #define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
@@ -163,6 +168,11 @@
 SUBQ_SAT_IMPL(qs8x4)
 SUBQ_SAT_IMPL(qs8x8)
 SUBQ_SAT_IMPL(qs8x16)
+SUBQ_SAT_IMPL(qs16x1)
+SUBQ_SAT_IMPL(qs16x2)
+SUBQ_SAT_IMPL(qs16x4)
+SUBQ_SAT_IMPL(qs16x8)
+SUBQ_SAT_IMPL(qs16x16)
 
 #define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
 #define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
@@ -270,6 +280,7 @@
     }
 
 DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
+DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
 
 #define DIV_SAT_OP_EXPAND_STR(a, b, type, size, position) div_sat_##type##x##size((a), (b), (position))
 #define DIV_SAT_OP_EXPAND(a, b, type, size, position) DIV_SAT_OP_EXPAND_STR(a, b, type, size, position)
@@ -304,6 +315,7 @@
     }
 
 EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x16, 16)
 
 #define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
 #define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 113fc76..04736c4 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -64,7 +64,7 @@
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -125,7 +125,7 @@
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -210,7 +210,7 @@
  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 3bca938..ccaf745 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -41,7 +41,7 @@
 
 void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
     // Softmax across the x dimension
@@ -103,7 +103,7 @@
 
 void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
 
     // Output auto initialization if not yet initialized
@@ -187,7 +187,7 @@
 
 void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
 
     // Output auto initialization if not yet initialized
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 0f20af0..850eb2c 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -36,7 +36,7 @@
 
 void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
 
     // Create intermediate tensors shapes
     _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 43efc04..2680070 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -49,10 +49,9 @@
 
 namespace
 {
-/** Tolerance for float operations */
-const float tolerance = 0.000001f;
-/** Tolerance for fixed point operations */
-const float tolerance_fixed_point = 2.f;
+const float tolerance      = 0.000001f; /** Tolerance for float operations */
+const float tolerance_qs8  = 2.f;       /** Tolerance for QS8 fixed point operations */
+const float tolerance_qs16 = 2.f;       /** Tolerance for QS16 fixed point operations */
 
 /** Compute OpenCL softmax layer function.
  *
@@ -160,35 +159,69 @@
 BOOST_AUTO_TEST_SUITE_END()
 
 BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_AUTO_TEST_SUITE(QS8)
+// Testing for fixed point position [1,6) as reciprocal limits the maximum fixed point position to 5
 BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
-BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(1, 6),
-                     shape, dt, fixed_point_position)
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::xrange(1, 6),
+                     shape, fixed_point_position)
 {
     // Compute function
-    CLTensor dst = compute_softmax_layer(shape, dt, fixed_point_position);
+    CLTensor dst = compute_softmax_layer(shape, DataType::QS8, fixed_point_position);
 
     // Compute reference
-    RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt, fixed_point_position);
+    RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS8, fixed_point_position);
 
     // Validate output
-    validate(CLAccessor(dst), ref_dst, tolerance_fixed_point);
+    validate(CLAccessor(dst), ref_dst, tolerance_qs8);
 }
 
 BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
-BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(1, 6),
-                     shape, dt, fixed_point_position)
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::xrange(1, 6),
+                     shape, fixed_point_position)
 {
     // Compute function
-    CLTensor dst = compute_softmax_layer(shape, dt, fixed_point_position);
+    CLTensor dst = compute_softmax_layer(shape, DataType::QS8, fixed_point_position);
 
     // Compute reference
-    RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt, fixed_point_position);
+    RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS8, fixed_point_position);
 
     // Validate output
-    validate(CLAccessor(dst), ref_dst, tolerance_fixed_point);
+    validate(CLAccessor(dst), ref_dst, tolerance_qs8);
 }
 BOOST_AUTO_TEST_SUITE_END()
 
+BOOST_AUTO_TEST_SUITE(QS16)
+// Testing for fixed point position [1,14) as reciprocal limits the maximum fixed point position to 14
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::xrange(1, 14),
+                     shape, fixed_point_position)
+{
+    // Compute function
+    CLTensor dst = compute_softmax_layer(shape, DataType::QS16, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS16, fixed_point_position);
+
+    // Validate output
+    validate(CLAccessor(dst), ref_dst, tolerance_qs16);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::xrange(1, 14),
+                     shape, fixed_point_position)
+{
+    // Compute function
+    CLTensor dst = compute_softmax_layer(shape, DataType::QS16, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS16, fixed_point_position);
+
+    // Validate output
+    validate(CLAccessor(dst), ref_dst, tolerance_qs16);
+}
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+
 BOOST_AUTO_TEST_SUITE_END()
 BOOST_AUTO_TEST_SUITE_END()
 #endif /* DOXYGEN_SKIP_THIS */