COMPMID-414 - Port CLConvolutionLayer to support 8 bit fixed point - CLGEMMMatrixAccumulateBiasesKernel

Change-Id: Idba13b578dc564b8003ce2fa3392eea2af3ce806
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78664
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 45a247d..6c64265 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -151,8 +151,7 @@
     { "finalize", "optical_flow_pyramid_lk.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
-    { "gemm_accumulate_biases_f16", "gemm.cl" },
-    { "gemm_accumulate_biases_f32", "gemm.cl" },
+    { "gemm_accumulate_biases", "gemm.cl" },
     { "gemm_interleave4x4_8bit", "gemm.cl" },
     { "gemm_interleave4x4_16bit", "gemm.cl" },
     { "gemm_interleave4x4_32bit", "gemm.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index d80b526..9bec8d5 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -248,6 +248,8 @@
 
 /** This kernel accumulates each row with the biases vector
  *
+ * @note The data type must be passed at compile time -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
  * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
  * @param[in]      accum_stride_x                       Stride of the accmulate tensor in X dimension (in bytes)
  * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
@@ -259,48 +261,24 @@
  * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void gemm_accumulate_biases_f32(
+#if(defined DATA_TYPE)
+__kernel void gemm_accumulate_biases(
     IMAGE_DECLARATION(accum),
     VECTOR_DECLARATION(biases))
 {
     Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
     Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
 
-    float4 accum_value  = vload4(0, (__global float *)accum.ptr);
-    float4 biases_value = vload4(0, (__global float *)biases.ptr);
-    accum_value         = biases_value + accum_value;
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr);
+    accum_value  = biases_value + accum_value;
 
     // Store result in the accummulate buffer
-    vstore4(accum_value, 0, (__global float *)accum.ptr);
+    vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
 }
-
-/** This kernel accumulates each row with the biases vector
- *
- * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
- * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
- * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
- * @param[in]      biases_ptr                           Pointer to the biases vector. Same as input.
- * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemm_accumulate_biases_f16(
-    IMAGE_DECLARATION(accum),
-    VECTOR_DECLARATION(biases))
-{
-    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
-    half8 accum_value  = vload8(0, (__global half *)accum.ptr);
-    half8 biases_value = vload8(0, (__global half *)biases.ptr);
-    accum_value        = biases_value + accum_value;
-
-    // Store result in the accummulate buffer
-    vstore8(accum_value, 0, (__global half *)accum.ptr);
-}
+#endif // defined DATA_TYPE
 
 #if(defined WIDTH_MATRIX_B)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 289873c..75c1a6e 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -43,20 +43,22 @@
 
 void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
     ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
 
     _biases = biases;
     _accum  = accum;
 
+    std::set<std::string> build_opts;
+    build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())));
+
     // Create kernel
-    std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
-    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts));
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());
+    const unsigned int num_elems_processed_per_iteration = 16;
 
     Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));