COMPMID-1691: Optimize CLDepthwiseConvolutionKernel (QASYMM8/NHWC) for 3x3 kernels (stride=1 and stride=2)

Change-Id: I7d0d2dc350feeb40d253d17f9ffd5051a8fb42ef
Reviewed-on: https://review.mlplatform.org/511
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 317c899..9fbd0ef 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1745,6 +1745,12 @@
     const bool _reinterpret_input_as_3d;
 };
 
+struct DepthwiseConvolutionReshapeInfo
+{
+    unsigned int c0{ 1 };            /**< Number of channels processed by the depth-wise convolution (defaults to 1) */
+    bool         transpose{ false }; /**< True if the MxC0 block (where M is the filter area, i.e. Kw x Kh) has to be transposed (defaults to false) */
+};
+
 /** GEMMLowp output stage type */
 enum class GEMMLowpOutputStageType
 {