COMPMID-926 Add depth multiplier support to NEON/CL/GLES depthwise convolution

Change-Id: I03f32c62350e5ea43e77bb15fc5a832d83719e3b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/126657
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
index adfc126..134cc10 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,6 +108,8 @@
     uint z_index = gl_GlobalInvocationID.z;
     TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
 
+    src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
+
     vec4 w[3];
     w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
     w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
@@ -263,6 +265,8 @@
     uint z_index = gl_GlobalInvocationID.z;
     TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
 
+    src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
+
     vec4 w[3];
     w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
     w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index 9343268..c237409 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -33,31 +33,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 using namespace arm_compute;
-
-namespace
-{
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-
-    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
-    TensorShape output_shape = input_shape;
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-
-    return output_shape;
-}
-} // namespace
+using namespace arm_compute::misc::shape_calculator;
 
 GCDepthwiseConvolutionLayer3x3Kernel::GCDepthwiseConvolutionLayer3x3Kernel()
     : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0), _lws(gles::NDRange(1U, 1U, 1U))
@@ -69,7 +48,8 @@
     return _border_size;
 }
 
-void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+                                                     unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -83,7 +63,7 @@
     }
 
     // Get convolved dimensions
-    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(),
@@ -93,6 +73,7 @@
                        input->info()->fixed_point_position());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(2) != weights->info()->dimension(2));
 
     _input         = input;
     _output        = output;
@@ -108,6 +89,7 @@
     ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
     std::set<std::string> options;
 
+    options.emplace("#define DEPTH_MULTIPLIER " + support::cpp11::to_string(depth_multiplier));
     options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
     options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
     options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));