IVGCVSW-619: Support for Cl u8 bounded Relu

Change-Id: I3c39ecbd36f06d5376c35ed4eb38dd73533ef97e
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/93686
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index bb3ac6e..0935152 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -190,6 +190,18 @@
      * @param[in] valid_region Valid region to set.
      */
     virtual void set_valid_region(ValidRegion valid_region) = 0;
+
+    /** Get the quantization settings (scale and offset) of the tensor.
+     *
+     * @return A QuantizationInfo containing the scale and offset.
+     */
+    virtual QuantizationInfo quantization_info() const = 0;
+
+    /** Set the quantization settings (scale and offset) of the tensor.
+     *
+     * @param[in] quantization_info QuantizationInfo containing the scale and offset.
+     */
+    virtual void set_quantization_info(QuantizationInfo quantization_info) = 0;
 };
 }
 #endif /*__ARM_COMPUTE_TENSORINFO_H__ */
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 81a2702..3a88eba 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -186,6 +186,16 @@
         }
         _valid_region = std::move(valid_region);
     }
+    QuantizationInfo quantization_info() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->quantization_info();
+    }
+    void set_quantization_info(QuantizationInfo quantization_info) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_quantization_info(quantization_info);
+    }
 
 private:
     ITensorInfo *_parent;
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 35b9ccb..5d1ee7c 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/ITensorInfo.h"
 
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
@@ -97,6 +98,16 @@
      * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
      */
     TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+
+    /** Constructor
+     *
+     * @param[in] tensor_shape      It specifies the size for each dimension of the tensor in number of elements.
+     * @param[in] num_channels      It indicates the number of channels for each tensor element
+     * @param[in] data_type         Data type to use for each tensor element
+     * @param[in] quantization_info The quantization settings for the tensor data.
+     */
+    TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info);
+
     /** Constructor
      *
      * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
@@ -147,6 +158,7 @@
      * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
      */
     void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+
     /** Initialize the metadata structure with the given parameters
      *
      * @param[in] tensor_shape                  Size for each dimension of the tensor in number of elements.
@@ -276,6 +288,14 @@
     {
         _valid_region = std::move(valid_region);
     }
+    QuantizationInfo quantization_info() const override
+    {
+        return _quantization_info;
+    }
+    void set_quantization_info(QuantizationInfo quantization_info) override
+    {
+        _quantization_info = quantization_info;
+    }
 
 private:
     /** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
@@ -284,17 +304,18 @@
      */
     std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding);
 
-    size_t      _total_size;
-    int         _fixed_point_position;
-    size_t      _offset_first_element_in_bytes;
-    Strides     _strides_in_bytes;
-    size_t      _num_channels;
-    TensorShape _tensor_shape;
-    DataType    _data_type;
-    Format      _format;
-    bool        _is_resizable;
-    ValidRegion _valid_region;
-    PaddingSize _padding;
+    size_t           _total_size;
+    int              _fixed_point_position;
+    size_t           _offset_first_element_in_bytes;
+    Strides          _strides_in_bytes;
+    size_t           _num_channels;
+    TensorShape      _tensor_shape;
+    DataType         _data_type;
+    Format           _format;
+    bool             _is_resizable;
+    ValidRegion      _valid_region;
+    PaddingSize      _padding;
+    QuantizationInfo _quantization_info;
 };
 }
 #endif /*__ARM_COMPUTE_TENSORINFO_H__ */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f52dd12..e567bac 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -67,6 +67,7 @@
     U8,
     S8,
     QS8,
+    QASYMM8,
     U16,
     S16,
     QS16,
@@ -90,6 +91,46 @@
 /* Constant value used to indicate a ORB scaled pyramid */
 constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01;
 
+/** Quantization settings (used for QASYMM8 data type) */
+struct QuantizationInfo
+{
+    QuantizationInfo()
+        : scale(0.0f), offset(0)
+    {
+    }
+
+    QuantizationInfo(float scale, int offset)
+        : scale(scale), offset(offset)
+    {
+    }
+
+    float scale;  /**< scale */
+    int   offset; /**< offset */
+
+    /** Quantizes a value using the scale/offset in this QuantizationInfo */
+    uint8_t quantize(float value) const
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::quantize: scale == 0");
+        const float scaled    = value / scale;
+        int         quantized = std::max(0, std::min(static_cast<int>(scaled + (scaled >= 0.f ? 0.5f : -0.5f)) + offset, 255));
+        return quantized;
+    }
+
+    /** Dequantizes a value using the scale/offset in this QuantizationInfo */
+    float dequantize(uint8_t value) const
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::dequantize: scale == 0");
+        float dequantized = (value - offset) * scale;
+        return dequantized;
+    }
+
+    /** Indicates whether this QuantizationInfo has valid settings or not */
+    bool empty() const
+    {
+        return scale == 0;
+    }
+};
+
 struct ValidRegion
 {
     ValidRegion()
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 7f53bec..149e404 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -92,6 +92,7 @@
         case DataType::U8:
         case DataType::S8:
         case DataType::QS8:
+        case DataType::QASYMM8:
             return 1;
         case DataType::U16:
         case DataType::S16:
@@ -166,6 +167,7 @@
         case DataType::S8:
         case DataType::U8:
         case DataType::QS8:
+        case DataType::QASYMM8:
             return 1;
         case DataType::U16:
         case DataType::S16:
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 821fb4c..09ec329 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -72,6 +72,8 @@
             return "qs8";
         case DataType::S8:
             return "char";
+        case DataType::QASYMM8:
+            return "uchar";
         case DataType::U16:
             return "ushort";
         case DataType::S16:
@@ -105,6 +107,7 @@
         case DataType::U8:
         case DataType::QS8:
         case DataType::S8:
+        case DataType::QASYMM8:
             return "8";
         case DataType::U16:
         case DataType::S16:
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6e5e802..62ef259 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -107,6 +107,7 @@
     { "accumulate_squared", "accumulate.cl" },
     { "accumulate_weighted", "accumulate.cl" },
     { "activation_layer", "activation_layer.cl" },
+    { "activation_layer_qa8", "activation_layer_qa8.cl" },
     { "arithmetic_add", "arithmetic_op.cl" },
     { "arithmetic_sub", "arithmetic_op.cl" },
     { "bitwise_or", "bitwise_op.cl" },
@@ -306,6 +307,10 @@
 #include "./cl_kernels/activation_layer.clembed"
     },
     {
+        "activation_layer_qa8.cl",
+#include "./cl_kernels/activation_layer_qa8.clembed"
+    },
+    {
         "arithmetic_op.cl",
 #include "./cl_kernels/arithmetic_op.clembed"
     },
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
new file mode 100644
index 0000000..4d9bf0e
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+    return min((TYPE)A_VAL, max(x, (TYPE)0));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+/** This performs an activation function on QASYMM8 inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer_qa8(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    // Perform activation
+    data = ACTIVATION_OP(ACT, data);
+
+    // requantize to output space
+    VEC_DATA_TYPE(float, VEC_SIZE) fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
+    fdata      = round((fdata - O1_VAL) * (S1_VAL / S2_VAL) + O2_VAL);
+    TYPE qdata = CONVERT_SAT(fdata, TYPE);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (qdata, 0, (__global DATA_TYPE *)output.ptr);
+}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 18202c1..bed407a 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -34,6 +34,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "support/ToolchainSupport.h"
 
 #include <cmath>
@@ -47,7 +50,14 @@
 
 void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32, DataType::QASYMM8);
+
+    // For QA8 only lower/upper bounded relu is supported
+    if(input->info()->data_type() == DataType::QASYMM8)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                 "For QASYMM8 only lower/upper bounded relu is supported");
+    }
 
     if(output != nullptr)
     {
@@ -74,8 +84,22 @@
     build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
     build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
     build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
-    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+
+    if(input->info()->data_type() == DataType::QASYMM8)
+    {
+        // For lower/upper bounded relu make sure that the min/max values are in the quantized input space
+        int a_const_u8 = input->info()->quantization_info().quantize(a_const);
+        int b_const_u8 = input->info()->quantization_info().quantize(b_const);
+
+        build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_u8)));
+        build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_u8)));
+    }
+    else
+    {
+        build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
+        build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+    }
+
     build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
     if(is_data_type_fixed_point(input->info()->data_type()))
     {
@@ -83,7 +107,23 @@
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+    if(input->info()->data_type() == DataType::QASYMM8)
+    {
+        const float s1 = input->info()->quantization_info().scale;
+        const int   o1 = input->info()->quantization_info().offset;
+        // If output is nullptr, assume same quantization scale/offset as input
+        const float s2 = output != nullptr ? output->info()->quantization_info().scale : s1;
+        const int   o2 = output != nullptr ? output->info()->quantization_info().offset : o1;
+        build_opts.emplace(("-DS1_VAL=" + support::cpp11::to_string(s1) + "f"));
+        build_opts.emplace(("-DS2_VAL=" + support::cpp11::to_string(s2) + "f"));
+        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer_qa8", build_opts));
+    }
+    else
+    {
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+    }
 
     // Make sure _kernel is initialized before calling the parent's configure
 
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 91a3531..f3cd776 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -26,13 +26,14 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/HOGInfo.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
 using namespace arm_compute;
 
 TensorInfo::TensorInfo()
     : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
-      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info()
 {
 }
 
@@ -80,6 +81,13 @@
     init(tensor_shape, num_channels, data_type, fixed_point_position);
 }
 
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+    : TensorInfo()
+{
+    init(tensor_shape, num_channels, data_type, 0);
+    _quantization_info = quantization_info;
+}
+
 TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
     : TensorInfo()
 {
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index b64739a..fbb90d9 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;