COMPMID-1071: (3RDPARTY_UPDATE) Add depth multiplier on DepthwiseConv 3x3 NHWC

Change-Id: I316ff40dda379d4b84fac5d63f0c56efbacbc2b4
Reviewed-on: https://review.mlplatform.org/371
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index 4863101..60dddbb 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -34,8 +34,10 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
 #include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
@@ -53,7 +55,15 @@
 {
 public:
     /** Default constructor */
-    CLDepthwiseConvolutionLayer3x3();
+    CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayer3x3(const CLDepthwiseConvolutionLayer3x3 &) = delete;
+    /** Default move constructor */
+    CLDepthwiseConvolutionLayer3x3(CLDepthwiseConvolutionLayer3x3 &&) = default;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    CLDepthwiseConvolutionLayer3x3 &operator=(const CLDepthwiseConvolutionLayer3x3 &) = delete;
+    /** Default move assignment operator */
+    CLDepthwiseConvolutionLayer3x3 &operator=(CLDepthwiseConvolutionLayer3x3 &&) = default;
     /** Initialize the function's source, destination, conv and border_size.
      *
      * @param[in, out] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
@@ -86,10 +96,21 @@
                            ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD);
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
+    CLMemoryGroup                                          _memory_group;
     std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel> _kernel;
     CLFillBorderKernel                                     _border_handler;
+    CLPermute                                              _permute_input_to_nchw;
+    CLPermute                                              _permute_weights_to_nchw;
+    CLPermute                                              _permute_output_to_nhwc;
+    CLTensor                                               _permuted_input;
+    CLTensor                                               _permuted_weights;
+    CLTensor                                               _permuted_output;
+    const ITensor                                         *_original_weights;
+    bool                                                   _needs_permute;
+    bool                                                   _is_prepared;
 };
 
 /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels:
@@ -166,5 +187,5 @@
     const ICLTensor                          *_original_weights;
     std::unique_ptr<IFunction>                _optimised_function;
 };
-}
+} // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__ */