Add CLPool3d Int8 Support

- Adds Qasymm8 and Qasymm8_signed support to the 3d pool operator

Resolves: COMPMID-4669
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I36038c2b7c4f36baf67f7aae801356890e104538
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/410496
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7391
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 691e46e..c19d0d4 100644
--- a/Android.bp
+++ b/Android.bp
@@ -110,6 +110,7 @@
         "src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl",
         "src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl",
         "src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl",
+        "src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl",
         "src/core/CL/cl_kernels/nhwc/pooling_layer.cl",
         "src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl",
         "src/core/CL/cl_kernels/nhwc/reorg_layer.cl",
diff --git a/SConscript b/SConscript
index 3e5fa2f..620e811 100644
--- a/SConscript
+++ b/SConscript
@@ -446,6 +446,7 @@
                     'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl',
                     'src/core/CL/cl_kernels/nhwc/pooling_layer.cl',
                     'src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl',
                     'src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl',
                     'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
                     'src/core/CL/cl_kernels/nhwc/scale.cl',
diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
index 8bad449..91c4677 100644
--- a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
@@ -62,13 +62,15 @@
      * |:--------------|:--------------|
      * |F16            |F16            |
      * |F32            |F32            |
+     * |QASYMM8        |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |
      *
      * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
      *       Cases where pooling region is completely outside input tensor are not supported
      *
      * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
      *
-     * @param[in,out] input     Source tensor. Data types supported: F16/F32.
+     * @param[in,out] input     Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
      * @param[out]    output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]     pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
      */
@@ -76,14 +78,14 @@
     /** Set the input and output tensors.
      *
      * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] input           Source tensor. Data types supported: F16/F32.
+     * @param[in,out] input           Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
      * @param[out]    output          Destination tensor. Data types supported: Same as @p input.
      * @param[in]     pool_info       Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPooling3dLayer
      *
-     * @param[in] input     Source tensor info. Data types supported: F16/F32.
+     * @param[in] input     Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
      *
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index c0888f1..51a72bf 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -2330,6 +2330,8 @@
     <tr><th>src<th>dst
     <tr><td>F16<td>F16
     <tr><td>F32<td>F32
+    <tr><td>QASYMM8<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
     </table>
 <tr>
   <td rowspan="2">PReluLayer
@@ -3155,4 +3157,4 @@
 </table>
 
 */
-} // namespace
\ No newline at end of file
+} // namespace
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
index 7c64143..4e5481d 100644
--- a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
@@ -177,8 +177,15 @@
     res0 = SQRT_OP(res0);
 #endif // defined(POOL_L2)
 
-    // Store result
-#if defined(FP_MIXED_PRECISION)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+
+
+    // Store result
+#if defined(QUANTIZED)
+    STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#elif defined(FP_MIXED_PRECISION)
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
     STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
new file mode 100644
index 0000000..abf0db9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG)  */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res)                                                                                 \
+    {                                                                                                                                                                 \
+        const VEC_FLOAT(VEC_SIZE) in_f32  = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+        const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset));                            \
+        res                               = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                                \
+    }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+/** Performs 3d pooling layer of size equal to MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8_SIGNED, QASYMM8
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8_SIGNED, QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_stride_v                       Stride of the source tensor in V dimension (in bytes)
+ * @param[in]  input_step_v                         input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_stride_v                      Stride of the destination tensor in V dimension (in bytes)
+ * @param[in]  output_step_v                        output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc_quantized(
+    TENSOR5D_DECLARATION(input),
+    TENSOR5D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+    // Global index 2 carries height, depth and batch collapsed together; split it back into the three output coordinates
+    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+    int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+    int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
+
+    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v; // Channel and batch offsets only; the W/H/D offsets are added per element inside the loops below
+
+    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+                                           output_stride_w + idx_out_n * output_stride_v;
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+    res0 = INITIAL_VALUE; // Starting value of the reduction, supplied at compile time via -DINITIAL_VALUE
+
+    int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X; // First input coordinate covered by the window; negative when the window starts inside the padding
+    int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+    int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+    // The window start along each axis skips the leading padding (indices that would be negative in the input)
+    int pool_x_s = max((int)0, -idx_in_w);
+    // Assuming symmetric padding (left padding = right padding = PAD_X), the window end is either the pool size or what remains from the current position to (src width + right padding)
+    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+    int pool_y_s = max((int)0, -idx_in_h);
+    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+    int pool_z_s = max((int)0, -idx_in_d);
+    int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+    int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+    int filter_size = pool_z_e * pool_y_e * pool_x_e;
+#endif                                               // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+
+    // The window end along each axis must also exclude the trailing padding, so that only real input elements are read
+    pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+    pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+    pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+    for(int z = pool_z_s; z < pool_z_e; ++z)
+    {
+        int depth_offset_src = (z + idx_in_d) * input_stride_w;
+        for(int y = pool_y_s; y < pool_y_e; ++y)
+        {
+            int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+            for(int x = pool_x_s; x < pool_x_e; ++x)
+            {
+                int width_offset_src = (x + idx_in_w) * input_stride_y;
+
+                VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+                data;
+                VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+                data0;
+
+                data  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+                data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); // Widen to the accumulator type before reducing
+
+                res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+                filter_size++; // With padding excluded, the divisor is the number of elements actually read
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+            }
+        }
+    }
+
+#if defined(POOL_AVG)
+    res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; // Bias by half the divisor before dividing; with an integer ACC_DATA_TYPE this rounds to nearest for non-negative sums
+#endif // defined(POOL_AVG)
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); // Narrow the accumulator back to the storage type
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0); // Re-express the result with the output scale/offset; compiled in only when all four quantization macros are defined
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index f675dbe..1bf7f2b 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -438,6 +438,7 @@
     { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
     { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
     { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" },
+    { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" },
     { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
     { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
     { "scale_bilinear_nhwc", "nhwc/scale.cl" },
@@ -907,6 +908,10 @@
 #include "./cl_kernels/nhwc/pooling_3d_layer.clembed"
     },
     {
+        "nhwc/pooling_3d_layer_quantized.cl",
+#include "./cl_kernels/nhwc/pooling_3d_layer_quantized.clembed"
+    },
+    {
         "nhwc/pooling_layer_quantized.cl",
 #include "./cl_kernels/nhwc/pooling_layer_quantized.clembed"
     },
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp
index 929ccf7..a090ac5 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp
@@ -49,19 +49,22 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero.");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8);
+    // Non-float AVG pooling is rejected unless padding is excluded from the average (exclude_padding == true)
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (pool_info.pool_type == PoolingType::AVG) && (!pool_info.exclude_padding),
+                                    "Including padding in the average is unsupported for non-float types for Avg op; exclude_padding must be true");
 
-    const auto   data_layout       = src->data_layout();
-    const int    idx_width         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int    idx_height        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int    idx_depth         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
-    const bool   is_global_pooling = pool_info.is_global_pooling;
+    const auto         data_layout       = src->data_layout();
+    const int          idx_width         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int          idx_height        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int          idx_depth         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+    const bool         is_global_pooling = pool_info.is_global_pooling;
     const unsigned int pool_size_x       = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
     const unsigned int pool_size_y       = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
     const unsigned int pool_size_z       = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
-    int          output_width      = 0;
-    int          output_height     = 0;
-    int          output_depth      = 0;
+    int                output_width      = 0;
+    int                output_height     = 0;
+    int                output_depth      = 0;
 
     bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric.");
@@ -143,10 +146,31 @@
     build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
     build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth)));
 
+    // If datatype is quantized add relevant parameters
+    if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
     // Set the initial value for the pooling operation accordingly with the data type
     if(pool_type == PoolingType::MAX)
     {
-        build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
+        if(is_data_type_quantized(data_type))
+        {
+            PixelValue type_min{};
+            std::tie(type_min, std::ignore) = get_min_max(data_type);
+            build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
+        }
+        else
+        {
+            build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
+        }
     }
     else
     {
@@ -164,6 +188,11 @@
     {
         acc_data_type = DataType::F32;
     }
+    else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
+    {
+        acc_data_type = DataType::S32;
+    }
+
     build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
     build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
     build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
@@ -172,8 +201,10 @@
     build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
     build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
-    std::string kernel_name = "pooling_3d_layer_MxN_ndhwc";
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // if datatype is quantized use quantized kernel function
+    std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc");
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h
index 30c76ed..0085234 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.h
+++ b/src/gpu/cl/kernels/ClPool3dKernel.h
@@ -46,7 +46,7 @@
      * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
      * @param[in]  pool_info       Contains pooling operation information described in @ref Pooling3dLayerInfo.
      */
diff --git a/tests/validation/CL/Pooling3dLayer.cpp b/tests/validation/CL/Pooling3dLayer.cpp
index 5c80351..84d630e 100644
--- a/tests/validation/CL/Pooling3dLayer.cpp
+++ b/tests/validation/CL/Pooling3dLayer.cpp
@@ -59,10 +59,18 @@
                                                           framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
                                                   framework::dataset::make("ExcludePadding", { true, false }));
 
+const auto Pooling3DLayerDatasetQuantized = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                            framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                                    framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(1, 1, 2), Size3D(2, 2, 1)})),
+                                                            framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                    framework::dataset::make("ExcludePadding", { true }));
+
 using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>;
 
-constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
-constexpr AbsoluteTolerance<float> tolerance_f16(0.1f);  /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f);       /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float>   tolerance_f16(0.1f);         /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED integer datatype*/
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);        /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
 
 } // namespace
 
@@ -133,8 +141,64 @@
 template <typename T>
 using CLPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture<CLTensor, CLAccessor, CLPooling3dLayer, T>;
 
+template <typename T>
+using CLPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLPooling3dLayer, T>;
+
 // clang-format on
 // *INDENT-ON*
+TEST_SUITE(QUANTIZED)
+
+TEST_SUITE(QASYMM8)
+// Small quantized dataset
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                       combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, 10), QuantizationInfo(1.f / 127.f, 10) })),
+                                                                                                                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, 5), QuantizationInfo(1.f / 127.f, 10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+// Large Dataset Quantized Dataset
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                       combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, 10), QuantizationInfo(1.f / 127.f, 10) })),
+                                                                                                                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, 5), QuantizationInfo(1.f / 127.f, 10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+// Small quantized dataset (signed)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                      combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                      framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10), QuantizationInfo(1.f / 127.f, -10) })),
+                                                                                                                      framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -5), QuantizationInfo(1.f / 127.f, -10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+// Large Dataset Quantized pooling test
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                    combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                    framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10), QuantizationInfo(1.f / 127.f, -10) })),
+                                                                                                                    framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -5), QuantizationInfo(1.f / 127.f, -10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
@@ -152,8 +216,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
-                                                                                                            framework::dataset::make("DataType",
-                                                                                                                    DataType::F32))))
+                                                                                                          framework::dataset::make("DataType", DataType::F32))))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -219,8 +282,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::F16))))
+                                                                                                         framework::dataset::make("DataType", DataType::F16))))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/fixtures/Pooling3dLayerFixture.h b/tests/validation/fixtures/Pooling3dLayerFixture.h
index 563f1dc..9ba7bab 100644
--- a/tests/validation/fixtures/Pooling3dLayerFixture.h
+++ b/tests/validation/fixtures/Pooling3dLayerFixture.h
@@ -163,6 +163,19 @@
     }
 };
 
+/** Quantized 3D pooling fixture: bundles the pooling arguments into a Pooling3dLayerInfo and forwards everything, including both quantization infos, to the generic fixture's setup. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Pooling3dLayerValidationQuantizedFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingType pool_type, Size3D pool_size, Size3D stride, Padding3D padding, bool exclude_padding, DataType data_type, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+    {
+        const Pooling3dLayerInfo pool_info(pool_type, pool_size, stride, padding, exclude_padding);
+        Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_info, data_type, input_qinfo, output_qinfo);
+    }
+};
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute