COMPMID-2247: Extend support of CLBoundingBoxTransform for QUANT16_ASYMM

Change-Id: I8af7a382c0bccf55cf7f4a64f46ce9e6cd965afe
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1833
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index bb3cf7f..d051810 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -45,6 +45,7 @@
         case DataType::QSYMM8_PER_CHANNEL:
             return "char";
         case DataType::U16:
+        case DataType::QASYMM16:
             return "ushort";
         case DataType::S16:
         case DataType::QSYMM16:
@@ -80,6 +81,7 @@
         case DataType::QSYMM8_PER_CHANNEL:
             return "char";
         case DataType::U16:
+        case DataType::QASYMM16:
             return "ushort";
         case DataType::F16:
         case DataType::S16:
@@ -114,6 +116,7 @@
         case DataType::U16:
         case DataType::S16:
         case DataType::QSYMM16:
+        case DataType::QASYMM16:
         case DataType::F16:
             return "16";
         case DataType::U32:
@@ -258,6 +261,7 @@
         case DataType::U16:
         case DataType::S16:
         case DataType::QSYMM16:
+        case DataType::QASYMM16:
             return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT>();
         case DataType::U32:
         case DataType::S32:
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 1f530a2..4f017b7 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -161,6 +161,7 @@
     { "bitwise_xor", "bitwise_op.cl" },
     { "bitwise_not", "bitwise_op.cl" },
     { "bounding_box_transform", "bounding_box_transform.cl" },
+    { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" },
     { "channel_combine_NV", "channel_combine.cl" },
     { "channel_combine_RGB888", "channel_combine.cl" },
     { "channel_combine_RGBA8888", "channel_combine.cl" },
@@ -595,6 +596,10 @@
 #include "./cl_kernels/bounding_box_transform.clembed"
     },
     {
+        "bounding_box_transform_quantized.cl",
+#include "./cl_kernels/bounding_box_transform_quantized.clembed"
+    },
+    {
         "canny.cl",
 #include "./cl_kernels/canny.clembed"
     },
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
new file mode 100644
index 0000000..bebad62
--- /dev/null
+++ b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants
+
+/** Apply bounding-box regression deltas to a set of boxes for quantized data types: boxes and deltas are dequantized, transformed in floating point, clamped to the image and requantized
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Data type of boxes and pred_boxes. Supported data types: QASYMM16. -DDATA_TYPE_DELTAS= Data type of deltas. Supported data types: QASYMM8
+ * -# -DWEIGHT_{X,Y,W,H}= Weights [wx, wy, ww, wh] the deltas are divided by
+ * -# -DIMG_WIDTH=, -DIMG_HEIGHT= Original image width and height, used to clamp the predicted boxes to [0, IMG_WIDTH - 1] x [0, IMG_HEIGHT - 1]
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes. -DSCALE_BEFORE= Divisor applied to the dequantized boxes. -DBBOX_XFORM_CLIP= Upper bound applied to the weighted width/height deltas
+ * -# -D{OFFSET,SCALE}_{BOXES,DELTAS,PRED_BOXES}= Quantization offset and scale of each tensor. Optional: -DOFFSET (subtract 1 from the last two coordinates), -DSCALE_AFTER= (factor multiplying the clamped boxes)
+ *
+ * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: QASYMM16
+ * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  boxes_stride_z                           Stride of the boxes tensor in Z dimension (in bytes)
+ * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p boxes_ptr
+ * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: QASYMM8
+ * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  deltas_stride_z                          Stride of the deltas tensor in Z dimension (in bytes)
+ * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform_quantized(
+    VECTOR_DECLARATION(boxes),
+    IMAGE_DECLARATION(pred_boxes),
+    IMAGE_DECLARATION(deltas))
+{
+    // Build the tensor accessors from the kernel arguments (boxes is addressed explicitly below through vector_offset)
+    Vector boxes      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+    Image  pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+    Image  deltas     = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+    // Constants, then dequantize four deltas and the four fields of box py to float; the box is also divided by SCALE_BEFORE
+    const float one     = 1.f;
+    const float halfone = 0.5f;
+
+    const int py           = get_global_id(1); // index of the box
+    float4    scale_before = (float4)SCALE_BEFORE;
+    float4 delta           = DEQUANTIZE(vload4(0, (__global DATA_TYPE_DELTAS *)deltas.ptr), OFFSET_DELTAS, SCALE_DELTAS, DATA_TYPE_DELTAS, 4);
+    float4 box             = DEQUANTIZE(vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)), OFFSET_BOXES, SCALE_BOXES, DATA_TYPE, 4) / scale_before;
+
+    // Dimensions and center of the old box; the deltas are divided by the weights and their last two terms capped at BBOX_XFORM_CLIP
+    float2 dims    = box.s23 - box.s01 + one;
+    float2 ctr     = box.s01 + halfone * dims;
+    float4 weights = (float4)(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+    delta /= weights;
+    delta.s23 = min(delta.s23, (float)BBOX_XFORM_CLIP);
+
+    // Center and dimensions of the new box (translation + exponential scaling of the dimensions)
+    float2 pred_ctr  = delta.s01 * dims + ctr;
+    float2 pred_dims = exp(delta.s23) * dims;
+
+    // Vector constants: clamp bounds and the -/+ signs used to go from center to corners
+    float4 max_values = (float4)(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+    float4 sign       = (float4)(-1, -1, 1, 1);
+    float4 min_values = 0;
+
+    // Corners of the new box: center -/+ half of the predicted dimensions
+    float4 pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly subtract one from the last two coordinates
+    pred_box.s23 -= one;
+#endif // Possibly subtract one from the last two coordinates
+    pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the clamped predicted box
+    pred_box *= (float4)SCALE_AFTER;
+#endif // Possibly scale the clamped predicted box
+
+    // Requantize the predicted box and store it into the output
+    vstore4(QUANTIZE(pred_box, OFFSET_PRED_BOXES, SCALE_PRED_BOXES, DATA_TYPE, 4), 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index c314d17..ad06451 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,67 @@
 
 #include "helpers.h"
 
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value: input / scale + offset, converted to int with convert_int_rte and then to uchar with CONVERT_SAT
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+    float out_f32 = input / scale + offset;
+    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+    return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return dequantized value: (input - offset) * scale
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+    return ((float)input - offset) * scale;
+}
+
+/** Define quantize_<type><size>(input, offset, scale), which quantizes a vector of floating-point values
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values: input / scale + offset, converted with round-to-nearest-even to int and then to @p type with CONVERT_SAT
+ */
+#define QUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+    {                                                                                                                   \
+        VEC_DATA_TYPE(float, size)                                                                                      \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
+        VEC_DATA_TYPE(type, size)                                                                                       \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
+        return res;                                                                                                     \
+    }
+
+/** Define dequantize_<type><size>(input, offset, scale), which dequantizes a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector. The float vector used for the conversion has this same size.
+ *
+ * @return dequantized values in floating point: (input - offset) * scale
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                                                       \
+    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                                                     \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+    }
+
 /** Correctly-rounded-to-nearest division by a power-of-two.
  *
  * @param[in] size Size of vector.
@@ -292,6 +353,11 @@
         return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
     }
 
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+
 #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
 #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
@@ -307,6 +373,12 @@
 #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
 #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
 
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
@@ -367,4 +439,4 @@
 ASYMM_RESCALE_IMPL(8)
 ASYMM_RESCALE_IMPL(16)
 
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
index f9360e9..030731b 100644
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "helpers.h"
+#include "helpers_asymm.h"
 
 // This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
 // It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
@@ -29,26 +29,6 @@
 
 #if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
 
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-inline float dequantize_qasymm8(uchar input, float offset, float scale)
-{
-    return ((float)input - offset) * scale;
-}
-
-inline uchar quantize_qasymm8(float input, float offset, float scale)
-{
-    float out_f32 = input / scale + offset;
-    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN(out_f32, int), uchar);
-    return res_u8;
-}
-
-inline float4 dequantize_qasymm16(ushort4 input, float offset, float scale)
-{
-    float4 in_f32 = (CONVERT(input, float4) - (float4)(offset)) * (float4)(scale);
-    return in_f32;
-}
-
 /** Performs a roi align on a single output pixel.
  *
  * @param[in] input          Pointer to input Tensor3D struct.
@@ -178,7 +158,7 @@
     // Load roi parameters
     // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
     const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
-    float4 roi             = dequantize_qasymm16(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS);
+    float4 roi             = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
     float2 roi_anchor      = roi.s01 * convert_float(SPATIAL_SCALE);
     float2 roi_dims        = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
 
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 045bd02..08e5cc6 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -43,21 +43,37 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(boxes, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(boxes, DataType::QASYMM16, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::QASYMM8, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[1] != boxes->tensor_shape()[1]);
     ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[0] % 4 != 0);
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
     ARM_COMPUTE_RETURN_ERROR_ON(deltas->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
 
+    const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
+    if(is_qasymm16)
+    {
+        const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
+        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
+        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.offset != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::QASYMM8);
+    }
+
     if(pred_boxes->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
         ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
+        if(is_qasymm16)
+        {
+            const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
+            ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
+            ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.offset != 0);
+        }
     }
     ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
+
     return Status{};
 }
 } // namespace
@@ -70,7 +86,7 @@
 void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
-    auto_init_if_empty(*pred_boxes->info(), *deltas->info());
+    auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
 
@@ -83,9 +99,11 @@
     const int img_h = floor(info.img_height() / info.scale() + 0.5f);
     const int img_w = floor(info.img_width() / info.scale() + 0.5f);
 
+    const bool is_quantized = is_data_type_quantized(boxes->info()->data_type());
+
     // Set build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(deltas->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(boxes->info()->data_type()));
     build_opts.add_option("-DWEIGHT_X=" + float_to_string_with_full_precision(info.weights()[0]));
     build_opts.add_option("-DWEIGHT_Y=" + float_to_string_with_full_precision(info.weights()[1]));
     build_opts.add_option("-DWEIGHT_W=" + float_to_string_with_full_precision(info.weights()[2]));
@@ -98,8 +116,23 @@
     build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
     build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
 
+    if(is_quantized)
+    {
+        build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
+        const UniformQuantizationInfo boxes_qinfo      = boxes->info()->quantization_info().uniform();
+        const UniformQuantizationInfo deltas_qinfo     = deltas->info()->quantization_info().uniform();
+        const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->info()->quantization_info().uniform();
+        build_opts.add_option("-DOFFSET_BOXES=" + float_to_string_with_full_precision(boxes_qinfo.offset));
+        build_opts.add_option("-DSCALE_BOXES=" + float_to_string_with_full_precision(boxes_qinfo.scale));
+        build_opts.add_option("-DOFFSET_DELTAS=" + float_to_string_with_full_precision(deltas_qinfo.offset));
+        build_opts.add_option("-DSCALE_DELTAS=" + float_to_string_with_full_precision(deltas_qinfo.scale));
+        build_opts.add_option("-DOFFSET_PRED_BOXES=" + float_to_string_with_full_precision(pred_boxes_qinfo.offset));
+        build_opts.add_option("-DSCALE_PRED_BOXES=" + float_to_string_with_full_precision(pred_boxes_qinfo.scale));
+    }
+
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bounding_box_transform", build_opts.options()));
+    const std::string kernel_name = (is_quantized) ? "bounding_box_transform_quantized" : "bounding_box_transform";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
     const unsigned int num_elems_processed_per_iteration = 4;
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index d11788a..7ce94e2 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -286,6 +286,7 @@
     {
         case DataType::U8:
         case DataType::QASYMM8:
+        case DataType::QASYMM8_PER_CHANNEL:
             // Needs conversion to 32 bit, otherwise interpreted as ASCII values
             ss << uint32_t(value.get<uint8_t>());
             converted_string = ss.str();
@@ -296,6 +297,7 @@
             converted_string = ss.str();
             break;
         case DataType::U16:
+        case DataType::QASYMM16:
             ss << value.get<uint16_t>();
             converted_string = ss.str();
             break;
@@ -429,14 +431,16 @@
 {
     switch(dt)
     {
-        case DataType::QASYMM8:
         case DataType::U8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_PER_CHANNEL:
             print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
             break;
         case DataType::S8:
             print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
             break;
         case DataType::U16:
+        case DataType::QASYMM16:
             print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
             break;
         case DataType::S16:
@@ -464,12 +468,14 @@
 {
     switch(dt)
     {
-        case DataType::QASYMM8:
         case DataType::U8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_PER_CHANNEL:
             return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
         case DataType::S8:
             return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
         case DataType::U16:
+        case DataType::QASYMM16:
             return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
         case DataType::S16:
         case DataType::QSYMM16: