Rework OpenCL Depthwise Convolution

- Remove the dedicated NCHW kernels; NCHW inputs are now handled by
  permuting to NHWC (see the sketch below)
- Remove the specialized 3x3 NHWC kernels
- Simplify CLDepthwiseConvolutionLayer.cpp so that it calls only the
  native implementation for both floating-point and quantized data types
- Develop two parametric OpenCL kernels for the NHWC depthwise
  convolution layer (floating-point and quantized)
- Add support for exporting the weights to cl_image
- Extend the OpenCL depthwise convolution tests
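
For reference, a minimal sketch of the permute round-trip behind the
first bullet, assuming the library's existing CLPermute function and
its usual NCHW<->NHWC permutation vectors; tensor initialization,
allocation and the depthwise configuration itself are elided:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPermute.h"

    using namespace arm_compute;

    // Hypothetical helper: rotate NCHW data to NHWC, run the (only
    // remaining) NHWC depthwise kernel, then rotate the result back.
    void run_depthwise_from_nchw(CLTensor &input_nchw, CLTensor &output_nchw)
    {
        CLTensor  input_nhwc{};
        CLTensor  output_nhwc{};
        CLPermute to_nhwc{};
        CLPermute to_nchw{};

        // ACL stores NCHW dimensions as [W, H, C, N] and NHWC as [C, W, H, N],
        // hence these permutation vectors.
        to_nhwc.configure(&input_nchw, &input_nhwc, PermutationVector(2U, 0U, 1U));
        // ... configure the NHWC depthwise convolution on input_nhwc -> output_nhwc ...
        to_nchw.configure(&output_nhwc, &output_nchw, PermutationVector(1U, 2U, 0U));

        to_nhwc.run();
        // ... run the depthwise convolution ...
        to_nchw.run();
    }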

Resolves COMPMID-4417

Change-Id: Ibe533f79c2860f9cac8e921895d5a8f947753a5c
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5893
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 068131f..eeed115 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -47,23 +47,21 @@
     CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
     /** Allow instances of this class to be moved */
     CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+
     /** Initialize the function's source, destination and parameters
      *
-     * @param[in]     compile_context    The compile context to be used.
-     * @param[in,out] input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in]     weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
-     *                                   Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in]     biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                   Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out]    output             Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
-     * @param[in]     dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in]     dwc_info           Depthwise convolution layer info
-     * @param[in]     conv_info          Padding and stride information to use for the convolution.
-     * @param[in]     depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]     dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]     output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                   the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]     output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
+     * @param[in]  dwc_info           Depthwise convolution layer info
+     * @param[in]  conv_info          Convolution info (padding, stride, dilation, ...)
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
      *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
      *
      * @note: In-place is only supported when
@@ -75,17 +73,15 @@
      *          * no padding
      *          * no change of data layout after configure
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
 
-    /** Initialize the function's source, destination and parameters
+    /** Initialize the function's source, destination and parameters
      *
      * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
      */
-    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
      *
@@ -93,9 +89,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                           const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
+                           const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -108,6 +103,7 @@
     unsigned int     _depth_multiplier{ 0 };
     const ICLTensor *_output_multipliers{};
     const ICLTensor *_output_shifts{};
+    bool             _export_to_cl_image{ true };
     bool             _is_quantized{ false };
 };
 } // namespace arm_compute
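
For reference, a hedged usage sketch of the reworked configure()/validate()
entry points above. The field names on DWCComputeKernelInfo (n0, m0,
export_weights_to_cl_image) and on ConvolutionInfo (pad_stride_info,
depth_multiplier, dilation) are assumptions inferred from the parameter
names in this header, not verified against their definitions:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"

    using namespace arm_compute;

    // Hypothetical wrapper showing the new parameter objects in use.
    void configure_native_dwc(const CLCompileContext &ctx,
                              ICLTensor *input, const ICLTensor *weights,
                              const ICLTensor *biases, ICLTensor *output,
                              CLDepthwiseConvolutionLayerNativeKernel &kernel)
    {
        DWCComputeKernelInfo dwc_info{};
        dwc_info.n0                         = 4;    // output elements per thread along the channel dimension (assumed field)
        dwc_info.m0                         = 2;    // rows processed per thread (assumed field)
        dwc_info.export_weights_to_cl_image = true; // request the new cl_image weights path (assumed field)

        ConvolutionInfo conv_info{};
        conv_info.pad_stride_info  = PadStrideInfo(1, 1, 1, 1); // stride 1, pad 1 (assumed field)
        conv_info.depth_multiplier = 1;                         // assumed field
        conv_info.dilation         = Size2D(1U, 1U);            // assumed field

        // Validate first; output_multipliers/output_shifts stay nullptr for
        // floating-point data types.
        ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerNativeKernel::validate(
            input->info(), weights->info(), biases->info(), output->info(), dwc_info, conv_info));
        kernel.configure(ctx, input, weights, biases, output, dwc_info, conv_info);
    }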