COMPMID-908 - Merge Activation layer with Convolution Layer (NEON, CL, GLES)

Change-Id: Iab06d0768ecf805b841e601185608aae88cf9166
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120874
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index 5d1464a..25ac02e 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -232,6 +232,14 @@
 {
 #ifdef EMBEDDED_KERNELS
     {
+        "helpers_cs.h",
+#include "./cs_shaders/helpers_cs.hembed"
+    },
+    {
+        "activation_layer_helpers_cs.h",
+#include "./cs_shaders/activation_layer_helpers_cs.hembed"
+    },
+    {
         "absdiff.cs",
 #include "./cs_shaders/absdiff.csembed"
     },
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index 7d3f4ee..9a1e233 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,97 +23,9 @@
  */
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 
+#include "activation_layer_helpers_cs.h"
 #include "helpers_cs.h"
 
-#ifdef DATA_TYPE_FP32
-precision highp float;
-#elif defined(DATA_TYPE_FP16)
-#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
-precision highp float;
-#else  /*LOGISTIC_TANH_SRELU_SQRT*/
-precision mediump float;
-#endif /*LOGISTIC_TANH_SRELU_SQRT*/
-#endif /*DATA_TYPE_FP32*/
-
-#define ABS_OP(a) abs((a))
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define MLA_OP(a, b, c) ((b) * (c) + (a))
-#define DIV_OP(a, b) ((a) / (b))
-#define EXP_OP(a) exp((a))
-#define LOG_OP(a) log((a))
-#define SQRT_OP(a) sqrt((a))
-#define CONST_ONE (1.f)
-
-// Logistic Activation
-float logistic_op(float x)
-{
-    return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
-}
-// Hyperbolic Tangent Activation
-float tanh_op(float x)
-{
-    float tmp = float(B_VAL) * x;
-    if(tmp > 10.f)
-    {
-        return MUL_OP(float(A_VAL), 1.f);
-    }
-    else if(tmp < -10.f)
-    {
-        return MUL_OP(float(A_VAL), -1.f);
-    }
-    else
-    {
-        return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
-    }
-}
-// RELU Tangent Activation
-float relu_op(float x)
-{
-    return max(0.f, x);
-}
-// Bounded RELU Activation
-float brelu_op(float x)
-{
-    return min(float(A_VAL), max(float(0.0), x));
-}
-// Lower Upper Bounded RELU Activation
-float lu_brelu_op(float x)
-{
-    return min(max(x, float(B_VAL)), float(A_VAL));
-}
-// Leaky RELU Activation
-float lrelu_op(float x)
-{
-    return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
-}
-// Soft RELU Activation
-float srelu_op(float x)
-{
-    return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
-}
-// Absolute Activation
-float abs_op(float x)
-{
-    return ABS_OP(x);
-}
-// Square Activation
-float square_op(float x)
-{
-    return MUL_OP(x, x);
-}
-// Square-root Activation
-float sqrt_op(float x)
-{
-    return SQRT_OP(x);
-}
-// Linear Activation
-float linear_op(float x)
-{
-    return MLA_OP(float(B_VAL), float(A_VAL), x);
-}
-
 /** This performs an activation function floating point inputs.
  *
  * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
new file mode 100644
index 0000000..f43a33f
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
+precision highp float;
+#else  /*LOGISTIC_TANH_SRELU_SQRT*/
+precision mediump float;
+#endif /*LOGISTIC_TANH_SRELU_SQRT*/
+#endif /*DATA_TYPE_FP32*/
+
+#define ABS_OP(a) abs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define CONST_ONE (1.f)
+
+// Logistic Activation
+float logistic_op(float x)
+{
+    return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+vec4 logistic_op(vec4 x)
+{
+    return DIV_OP(vec4(CONST_ONE), ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+float tanh_op(float x)
+{
+    float tmp = float(B_VAL) * x;
+    if(tmp > 10.f)
+    {
+        return MUL_OP(float(A_VAL), 1.f);
+    }
+    else if(tmp < -10.f)
+    {
+        return MUL_OP(float(A_VAL), -1.f);
+    }
+    else
+    {
+        return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
+    }
+}
+// RELU Activation
+float relu_op(float x)
+{
+    return max(0.f, x);
+}
+vec4 relu_op(vec4 x)
+{
+    return max(vec4(0.f), x);
+}
+// Bounded RELU Activation
+float brelu_op(float x)
+{
+    return min(float(A_VAL), max(float(0.0), x));
+}
+// Lower Upper Bounded RELU Activation
+float lu_brelu_op(float x)
+{
+    return min(max(x, float(B_VAL)), float(A_VAL));
+}
+// Leaky RELU Activation
+float lrelu_op(float x)
+{
+    return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
+}
+// Soft RELU Activation
+float srelu_op(float x)
+{
+    return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+float abs_op(float x)
+{
+    return ABS_OP(x);
+}
+// Square Activation
+float square_op(float x)
+{
+    return MUL_OP(x, x);
+}
+// Square-root Activation
+float sqrt_op(float x)
+{
+    return SQRT_OP(x);
+}
+// Linear Activation
+float linear_op(float x)
+{
+    return MLA_OP(float(B_VAL), float(A_VAL), x);
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index ea4e9c1..b42c09b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,10 @@
 
 #include "helpers_cs.h"
 
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
 #if defined(DATA_TYPE_FP16)
 precision mediump float;
 #endif // DATA_TYPE_FP16
@@ -99,6 +103,10 @@
     pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -210,6 +218,10 @@
     pixels += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 #elif defined(PROCESS_4X_2Y_1Z)
@@ -333,6 +345,11 @@
     pixels[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
 }
@@ -470,6 +487,12 @@
     pixels[2] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -609,6 +632,13 @@
     pixels1[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0]  = ACT_OP(pixels[0]);
+    pixels[1]  = ACT_OP(pixels[1]);
+    pixels1[0] = ACT_OP(pixels1[0]);
+    pixels1[1] = ACT_OP(pixels1[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels1[0]);
@@ -745,6 +775,11 @@
         pixels[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+        pixels[0] = ACT_OP(pixels[0]);
+        pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
         STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
 
@@ -868,6 +903,11 @@
     pixels[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 #elif defined(PROCESS_8X_2Y_1Z)
@@ -1001,6 +1041,13 @@
     pixels1[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0]  = ACT_OP(pixels[0]);
+    pixels[1]  = ACT_OP(pixels[1]);
+    pixels1[0] = ACT_OP(pixels1[0]);
+    pixels1[1] = ACT_OP(pixels1[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
     STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
 }
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index 855d450..e51cc37 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,10 @@
 
 #include "helpers_cs.h"
 
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
 #if defined(DATA_TYPE_FP16)
 precision mediump float;
 #endif // DATA_TYPE_FP16
@@ -114,6 +118,10 @@
     pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -238,6 +246,11 @@
     pixels[1] += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
     VSTORE2_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -335,6 +348,10 @@
     pixels += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -434,6 +451,12 @@
     pixels[2] += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels[0]);
     STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -601,6 +624,12 @@
     }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -728,6 +757,10 @@
     pixels += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 
@@ -841,6 +874,12 @@
     }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -962,6 +1001,13 @@
     }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+    pixels[3] = ACT_OP(pixels[3]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -1087,6 +1133,13 @@
         }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+        pixels[0] = ACT_OP(pixels[0]);
+        pixels[1] = ACT_OP(pixels[1]);
+        pixels[2] = ACT_OP(pixels[2]);
+        pixels[3] = ACT_OP(pixels[3]);
+#endif /* FUSED_ACTIVATION */
+
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
         STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
         STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index c919e4e..728e964 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,10 @@
 
 #include "helpers_cs.h"
 
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
 #if defined(DATA_TYPE_FP16)
 precision mediump float;
 #endif // DATA_TYPE_FP16
@@ -116,6 +120,10 @@
     pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 #elif defined(DATA_TYPE_FP16)
@@ -204,6 +212,10 @@
     res += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    res = ACT_OP(res);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
 }
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index bef30d5..67a1530 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -50,7 +50,8 @@
 }
 
 template <unsigned int kernel_size>
-void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output,
+                                                            const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
@@ -58,6 +59,7 @@
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
     ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
     ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(act_info.enabled() && act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
 
     if(bias != nullptr)
     {
@@ -108,6 +110,16 @@
     std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
     options.emplace(("#define " + dt_name));
 
+    // Activation information in case of a fused activation
+    if(act_info.enabled())
+    {
+        options.emplace("#define FUSED_ACTIVATION");
+        options.emplace(("#define " + string_from_activation_func(act_info.activation())));
+        options.emplace(("#define ACT_OP  " + lower_string(string_from_activation_func(act_info.activation())) + "_op"));
+        options.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
+        options.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
+    }
+
     unsigned int num_elems_read_per_iteration_x    = kernel_size * _conv_stride_x;
     unsigned int num_elems_read_per_iteration_y    = 1;
     unsigned int num_elems_written_per_iteration_x = 1;