COMPMID-853 Fuse CL DepthwiseConvolution with Activation for QASYMM8
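
The fused kernel re-quantizes the activated values from the input quantization
space to the output one. Below is a minimal sketch of the per-element math
behind the new PERFORM_ACTIVATION_QA8 macro, assuming O1_VAL/S1_VAL are the
input quantization offset/scale and O2_VAL/S2_VAL the output ones; the C helper
below is illustrative only and not part of the library:

    #include <math.h>
    #include <stdint.h>

    /* Re-quantize one already-activated QASYMM8 value from the input
     * quantization space (offset o1, scale s1) to the output space
     * (offset o2, scale s2), saturating to [0, 255] like CONVERT_SAT. */
    static uint8_t requantize_qasymm8(uint8_t q, float o1, float s1, float o2, float s2)
    {
        float f = roundf(((float)q - o1) * (s1 / s2) + o2);
        if(f < 0.0f)
        {
            f = 0.0f;
        }
        else if(f > 255.0f)
        {
            f = 255.0f;
        }
        return (uint8_t)f;
    }

When input and output share the same quantization info, O1_VAL, O2_VAL, S1_VAL
and S2_VAL are left undefined and the macro only applies the activation op.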

Change-Id: I287908f76af458ad4b4d865d353dc37e33877250
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120839
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 4424a66..a8ea738 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -115,6 +115,8 @@
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
 
+#if defined(ACT)
+
 /** This performs an activation function on floating point inputs.
  *
  * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -168,3 +170,5 @@
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#endif /* defined(ACT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index cb31e99..66e54ed 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -44,6 +44,26 @@
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
 
+#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+#define PERFORM_ACTIVATION_QA8(act, data)                                                         \
+    ({                                                                                            \
+        data = ACTIVATION_OP(act, data);                                                          \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
+        \
+        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));                               \
+    })
+#else /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QA8(act, data) \
+    ({                                    \
+        data = ACTIVATION_OP(act, data);  \
+    })
+#endif /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+
+#if defined(ACT)
+
 /** This performs an activation function on QASYMM8 inputs.
  *
  * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -92,19 +112,11 @@
     // Load data
     TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
 
-    // Perform activation
-    data = ACTIVATION_OP(ACT, data);
-
-#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
-    // requantize to output space
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
-
-    fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
-    data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));
-#endif // defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+    data = PERFORM_ACTIVATION_QA8(ACT, data);
 
     // Store result
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#endif /* defined(ACT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5ddeb1a..0b61b56 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,15 +44,12 @@
 
 #endif /* FIXED_POINT_POSITION */
 
-#if defined(LU_BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL)
-#elif defined(BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)0, (DATA_TYPE)A_VAL)
-#elif defined(RELU)
-#define ACTIVATION_FUNC(x) max(x, (DATA_TYPE)0)
-#else /* FUSED_ACT */
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
 #define ACTIVATION_FUNC(x) (x)
-#endif /* FUSED_ACT */
+#endif /* defined(FUSED_ACTIVATION) */
 
 /** Apply batch normalization.
  *
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 40538a1..21daee8 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -26,6 +26,15 @@
 
 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
 
+#if defined(FUSED_ACTIVATION)
+#define DATA_TYPE uchar
+#define VEC_SIZE 8
+#include "activation_layer_qa8.cl"
+#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
 #if CONV_STRIDE_X > 3
 #error "Stride X not supported"
 #endif /* CONV_STRIDE_X > 3 */
@@ -222,7 +231,7 @@
     res0        = max(res0, (uchar8)0);
     res0        = min(res0, (uchar8)255);
 
-    vstore8(res0, 0, dst.ptr);
+    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
 #if CONV_STRIDE_Y == 1
 
     values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
@@ -231,7 +240,7 @@
     res1        = max(res1, (uchar8)0);
     res1        = min(res1, (uchar8)255);
 
-    vstore8(res1, 0, dst.ptr + dst_stride_y);
+    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
 #endif /* CONV_STRIDE_Y == 1 */
 }