COMPMID-1707: Create 3 special CLWidthConcatenate kernels to concatenate 2, 4 and 8 tensors (Part 1)

Creating special cases for concatenating 2 and 4 tensors; the 8-tensor kernel is left for Part 2.

Change-Id: I6a739a494ae45011acb65369e353f9ef96970b90
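
For reference, a minimal host-side sketch of the fast path this series enables
(a sketch only: it assumes this release's
CLWidthConcatenateLayer::configure(std::vector<ICLTensor *>, ICLTensor *)
signature and an initialised CL scheduler). With exactly two inputs the
function now dispatches to the new x2 kernel instead of looping over the
generic one:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"

    using namespace arm_compute;

    void width_concat_example()
    {
        CLScheduler::get().default_init();

        CLTensor in1, in2, out;
        in1.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
        in2.allocator()->init(TensorInfo(TensorShape(24U, 16U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(56U, 16U), 1, DataType::F32)); // 32 + 24

        CLWidthConcatenateLayer concat;
        concat.configure({ &in1, &in2 }, &out); // 2 inputs -> concatenate_width_x2 path

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();

        concat.run();
        CLScheduler::get().sync();
    }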
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 36abb7b..df76366 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -127,6 +127,8 @@
 #include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
 #include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
 #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
 #include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
 #include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
new file mode 100644
index 0000000..cc2eaa2
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H__
+#define __ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the width concatenate kernel of 2 tensors.
+ *  The input1 and input2 tensors will be concatenated into the output tensor.
+ */
+class CLWidthConcatenate2TensorsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWidthConcatenate2TensorsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default;
+    /** Default destructor */
+    ~CLWidthConcatenate2TensorsKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input1 First input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input2 Second input tensor. Data types supported: same as @p input1
+     * @param[out] output Output tensor. Data types supported: Same as @p input1.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
+     *
+     * @param[in] input1 First tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input2 Second tensor info. Data types supported: same as @p input1
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H__ */
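
A usage note for reviewers: a minimal validate-then-configure sketch for this
kernel, where in1, in2 and out are assumed to be already-initialised CL
tensors:

    const Status s = CLWidthConcatenate2TensorsKernel::validate(in1.info(), in2.info(), out.info());
    if(s.error_code() == ErrorCode::OK)
    {
        CLWidthConcatenate2TensorsKernel kernel;
        kernel.configure(&in1, &in2, &out);
        CLScheduler::get().enqueue(kernel); // as the function-level wrapper does
    }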
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
new file mode 100644
index 0000000..952fd99
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H__
+#define __ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the width concatenate kernel of 4 tensors.
+ *  All input tensors will be concatenated into the output tensor.
+ */
+class CLWidthConcatenate4TensorsKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWidthConcatenate4TensorsKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default;
+    /** Default destructor */
+    ~CLWidthConcatenate4TensorsKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input1 First input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input2 Second input tensor. Data types supported: same as @p input1
+     * @param[in]  input3 Third input tensor. Data types supported: same as @p input1
+     * @param[in]  input4 Fourth input tensor. Data types supported: same as @p input1
+     * @param[out] output Output tensor. Data types supported: Same as @p input1.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
+    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
+     *
+     * @param[in] input1 First tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input2 Second tensor info. Data types supported: same as @p input1
+     * @param[in] input3 Third tensor info. Data types supported: same as @p input1
+     * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input1;
+    const ICLTensor *_input2;
+    const ICLTensor *_input3;
+    const ICLTensor *_input4;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H__ */
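
The validate() contract mirrors the two-tensor kernel. A sketch of the width
rule it enforces, with illustrative shapes (the combined input widths must fit
in the output width; all higher dimensions must match):

    TensorInfo in1(TensorShape(8U, 16U), 1, DataType::F32);
    TensorInfo in2(TensorShape(8U, 16U), 1, DataType::F32);
    TensorInfo in3(TensorShape(4U, 16U), 1, DataType::F32);
    TensorInfo in4(TensorShape(4U, 16U), 1, DataType::F32);
    TensorInfo out(TensorShape(24U, 16U), 1, DataType::F32); // 8 + 8 + 4 + 4
    const Status s = CLWidthConcatenate4TensorsKernel::validate(&in1, &in2, &in3, &in4, &out);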
diff --git a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
index 44462b0..55b65da 100644
--- a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/IFunction.h"
 
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
 #include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
 
 #include <memory>
@@ -40,7 +42,9 @@
 
 /** Basic function to concatenate tensors along the X axis. This function calls one of the following kernels:
  *
- * -# @ref CLDepthConcatenateLayerKernel
+ * -# @ref CLWidthConcatenate2TensorsKernel (if there are exactly 2 input tensors)
+ * -# @ref CLWidthConcatenate4TensorsKernel (if there are exactly 4 input tensors)
+ * -# @ref CLWidthConcatenateLayerKernel (for any other number of input tensors)
  *
  */
 class CLWidthConcatenateLayer : public IFunction
@@ -74,6 +78,8 @@
 
 private:
     std::unique_ptr<CLWidthConcatenateLayerKernel[]> _concat_kernels_vector;
+    CLWidthConcatenate2TensorsKernel                 _concat_x2_kernel;
+    CLWidthConcatenate4TensorsKernel                 _concat_x4_kernel;
     unsigned int                                     _num_inputs;
 };
 } // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index b9b3ce9..8472369 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -182,6 +182,8 @@
     { "combine_gradients_L2", "canny.cl" },
     { "concatenate_depth", "concatenate.cl" },
     { "concatenate_width", "concatenate.cl" },
+    { "concatenate_width_x2", "concatenate.cl" },
+    { "concatenate_width_x4", "concatenate.cl" },
     { "convolution_rectangle", "convolution_rectangle.cl" },
     { "col2im", "col2im.cl" },
     { "convert_depth_down", "depth_convert.cl" },
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index a232a94..0e8805f 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -25,13 +25,218 @@
 
 #if defined(DATA_TYPE) && defined(VEC_SIZE)
 
+#if defined(DEPTH) && defined(ELEMENT_SIZE)
+
+#if defined(INPUT1_WIDTH)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+#if VEC_SIZE == 2
+#define SEQ ((int2)(0, 1))
+#elif VEC_SIZE == 4
+#define SEQ ((int4)(0, 1, 2, 3))
+#elif VEC_SIZE == 8
+#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7))
+#elif VEC_SIZE == 16
+#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
+#else // VEC_SIZE
+#error "Vector size not supported"
+#endif // VEC_SIZE
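+
+// With e.g. VEC_SIZE=8, SEQ + (int8)(x) yields the global x coordinate of every lane;
+// comparing those coordinates against the input widths produces the per-lane masks
+// consumed by select() in the kernels below. COND_DATA_TYPE mirrors ELEMENT_SIZE because
+// select() requires mask components of the same size as the data components.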
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. e.g. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. e.g. -DVEC_SIZE=16
+ * @note The element size has to be passed at compile time using -DELEMENT_SIZE. e.g. -DELEMENT_SIZE=4
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x2(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    // Calculate input indices
+    const int x  = get_global_id(0) * (int)VEC_SIZE;
+    const int y  = get_global_id(1);
+    const int z  = get_global_id(2) % (int)DEPTH;
+    const int w  = get_global_id(2) / (int)DEPTH;
+    const int x1 = min(x, (int)INPUT1_WIDTH);
+    const int x2 = max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE);
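+    // x1 is clamped to INPUT1_WIDTH and x2 to -VEC_SIZE so that both vector loads below
+    // always land inside the (padded) bounds of their tensor; lanes that fall on the
+    // wrong side of the boundary are discarded by the select() further down.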
+
+    // Calculate inputs and output addresses
+    const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+
+    const VEC_DATA_TYPE(int, VEC_SIZE) x_coords        = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values    = select(src2_values, src1_values, cond);
+
+    VSTORE(VEC_SIZE)
+    (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. e.g. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. e.g. -DVEC_SIZE=16
+ * @note The element size has to be passed at compile time using -DELEMENT_SIZE. e.g. -DELEMENT_SIZE=4
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
+ * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src3_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src3_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src3_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src3_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src3_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src3_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src3_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src3_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src3_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src4_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src4_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src4_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src4_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src4_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src4_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src4_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src4_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src4_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x4(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(src3),
+    TENSOR4D_DECLARATION(src4),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    // Calculate input indices
+    const int x = get_global_id(0) * (int)VEC_SIZE;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % (int)DEPTH;
+    const int w = get_global_id(2) / (int)DEPTH;
+
+    const int x1 = min(x, (int)INPUT1_WIDTH);
+    const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE), (int)INPUT2_WIDTH);
+    const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)VEC_SIZE), (int)INPUT3_WIDTH);
+    const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)VEC_SIZE);
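+    // Each offset is clamped so that every vector load below stays inside its tensor's
+    // padded region; the select() cascade at the end keeps only the lanes that belong
+    // to each input.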
+
+    // Calculate inputs and output addresses
+    const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+    const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+    const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+    const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values = select(src2_values, src1_values, cond_in2);
+    values = select(src3_values, values, cond_in3);
+    values = select(src4_values, values, cond_in4);
+
+    VSTORE(VEC_SIZE)
+    (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
+#endif /* defined(INPUT1_WIDTH) */
+#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
+
 #if defined(WIDTH_OFFSET) && defined(DEPTH)
 /** This kernel concatenates the input tensor into the output tensor along the first dimension
  *
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
  * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
  * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH16
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -53,7 +258,6 @@
  * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  offset                            The offset to the first valid element of the output tensor in bytes
  */
 __kernel void concatenate_width(
     TENSOR4D_DECLARATION(src),
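
To make the per-lane select logic above concrete, here is a scalar C++ model of
what concatenate_width_x2 computes for one row (illustrative only; the real
kernel processes VEC_SIZE lanes per work item with a single vector select()):

    #include <cstddef>
    #include <vector>

    // dst row = src1 row followed by src2 row.
    std::vector<float> concat_width_x2_model(const std::vector<float> &src1,
                                             const std::vector<float> &src2)
    {
        std::vector<float> dst(src1.size() + src2.size());
        for(std::size_t x = 0; x < dst.size(); ++x)
        {
            const bool take_src1 = x < src1.size(); // kernel: x_coords < INPUT1_WIDTH
            dst[x] = take_src1 ? src1[x] : src2[x - src1.size()];
        }
        return dst;
    }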
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
new file mode 100644
index 0000000..b0d27cb
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    // The window needs to be based on the output
+    Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
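+    // input2 needs one vector of padding on the left (the kernel clamps its offset to
+    // -VEC_SIZE near the boundary) and both inputs one vector on the right, hence the
+    // borders requested below.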
+    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+    AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input2->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0));
+
+    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure_internal(std::get<1>(win_config));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x2_";
+    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+}
+
+void CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input1, slice);
+        add_4D_tensor_argument(idx, _input2, slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, window, lws_hint());
+    }
+    while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000..75aef9c
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
+{
+    // The window needs to be based on the output
+    Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
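+    // Mirrors the two-tensor kernel: inputs 2-4 extend one vector to the left for the
+    // clamped negative offsets, and every input one vector to the right.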
+    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+    AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input2->dimension(1));
+    AccessWindowStatic input3_access(input3, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input3->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input3->dimension(1));
+    AccessWindowStatic input4_access(input4, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input4->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input4->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0));
+
+    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
+    : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), input3->clone().get(), input4->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info()));
+
+    _input1 = input1;
+    _input2 = input2;
+    _input3 = input3;
+    _input4 = input4;
+    _output = output;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+    build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0)));
+    build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure_internal(std::get<1>(win_config));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x4_";
+    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input3->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input3->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input4->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input4->info()->dimension(1));
+}
+
+void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input1, slice);
+        add_4D_tensor_argument(idx, _input2, slice);
+        add_4D_tensor_argument(idx, _input3, slice);
+        add_4D_tensor_argument(idx, _input4, slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, window, lws_hint());
+    }
+    while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 5233ff4..46a2d80 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -36,26 +36,46 @@
 
 CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
     : _concat_kernels_vector(),
+      _concat_x2_kernel(),
+      _concat_x4_kernel(),
       _num_inputs(0)
 {
 }
 
 Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
 {
+    const unsigned int num_inputs = inputs_vector.size();
+
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
     TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
-    unsigned int width_offset = 0;
-    for(const auto &input : inputs_vector)
+    switch(num_inputs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
-        width_offset += input->dimension(0);
+        case 2:
+            // Validate the WidthConcatenate2Tensors kernel if there are 2 inputs
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
+            break;
+        case 4:
+            // Validate the WidthConcatenate4Tensors kernel if there are 4 inputs
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
+            break;
+        default:
+            unsigned int width_offset = 0;
+            // Validate generic case of WidthConcatenate kernel
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+                ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+                width_offset += input->dimension(0);
+            }
+            break;
     }
 
     return Status{};
@@ -74,16 +94,30 @@
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
     ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
-    unsigned int width_offset = 0;
-
-    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
-
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    switch(_num_inputs)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
-        width_offset += inputs_vector.at(i)->info()->dimension(0);
+        case 2:
+            // Configure WidthConcatenate2Tensors kernel
+            _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
+            break;
+        case 4:
+            // Configure WidthConcatenate4Tensors kernel
+            _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+            break;
+        default:
+            // Configure the generic WidthConcatenate kernels for any other number of inputs
+            _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+            unsigned int width_offset = 0;
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+                width_offset += inputs_vector.at(i)->info()->dimension(0);
+            }
+            break;
     }
 }
 
@@ -91,8 +125,19 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    for(unsigned i = 0; i < _num_inputs; i++)
+    switch(_num_inputs)
     {
-        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+        case 2:
+            CLScheduler::get().enqueue(_concat_x2_kernel, true);
+            break;
+        case 4:
+            CLScheduler::get().enqueue(_concat_x4_kernel, true);
+            break;
+        default:
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+            }
+            break;
     }
 }
diff --git a/tests/validation/fixtures/WidthConcatenateLayerFixture.h b/tests/validation/fixtures/WidthConcatenateLayerFixture.h
index caad0fe..1f79210 100644
--- a/tests/validation/fixtures/WidthConcatenateLayerFixture.h
+++ b/tests/validation/fixtures/WidthConcatenateLayerFixture.h
@@ -52,7 +52,7 @@
     {
         // Create input shapes
         std::mt19937                    gen(library->seed());
-        std::uniform_int_distribution<> num_dis(2, 4);
+        std::uniform_int_distribution<> num_dis(2, 8);
         const int                       num_tensors = num_dis(gen);
 
         std::vector<TensorShape>         shapes(num_tensors, shape);