Rework OpenCL Depthwise Convolution

- Remove the dedicated NCHW kernels; NCHW inputs are now handled by
  permuting to NHWC (see the sketch below)
- Remove the specialized 3x3 NHWC kernels
- Simplify CLDepthwiseConvolutionLayer.cpp so that it calls only the
  native implementation for both floating-point and quantized data types
- Develop two parametric OpenCL kernels for the NHWC depthwise
  convolution layer (floating-point and quantized)
- Add support for exporting the weights to cl_image
- Extend the OpenCL depthwise convolution tests
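
For reference, a minimal sketch of the permute round-trip behind the
first bullet, assuming the library's existing CLPermute function and
its usual NCHW<->NHWC permutation vectors; tensor initialization,
allocation and the depthwise configuration itself are elided:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPermute.h"

    using namespace arm_compute;

    // Hypothetical helper: rotate NCHW data to NHWC, run the (only
    // remaining) NHWC depthwise kernel, then rotate the result back.
    void run_depthwise_from_nchw(CLTensor &input_nchw, CLTensor &output_nchw)
    {
        CLTensor  input_nhwc{};
        CLTensor  output_nhwc{};
        CLPermute to_nhwc{};
        CLPermute to_nchw{};

        // ACL stores NCHW dimensions as [W, H, C, N] and NHWC as [C, W, H, N],
        // hence these permutation vectors.
        to_nhwc.configure(&input_nchw, &input_nhwc, PermutationVector(2U, 0U, 1U));
        // ... configure the NHWC depthwise convolution on input_nhwc -> output_nhwc ...
        to_nchw.configure(&output_nhwc, &output_nchw, PermutationVector(1U, 2U, 0U));

        to_nhwc.run();
        // ... run the depthwise convolution ...
        to_nchw.run();
    }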

Resolves COMPMID-4417

Change-Id: Ibe533f79c2860f9cac8e921895d5a8f947753a5c
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5893
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 068131f..eeed115 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -47,23 +47,21 @@
     CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
     /** Allow instances of this class to be moved */
     CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+
     /** Initialize the function's source, destination and parameters
      *
-     * @param[in]     compile_context    The compile context to be used.
-     * @param[in,out] input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in]     weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
-     *                                   Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in]     biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                   Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out]    output             Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
-     * @param[in]     dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in]     dwc_info           Depthwise convolution layer info
-     * @param[in]     conv_info          Padding and stride information to use for the convolution.
-     * @param[in]     depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]     dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]     output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                   the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]     output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output             Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
+     * @param[in]  dwc_info           Depthwise convolution layer info
+     * @param[in]  conv_info          Convolution info (padding, stride, dilation, ...)
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
      *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
      *
      * @note: In-place is only supported when
@@ -75,17 +73,15 @@
      *          * no padding
      *          * no change of data layout after configure
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
 
-    /** Initialize the function's source, destination and parameters
+    /** Initialize the function's source, destination and parameters
      *
      * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
      */
-    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
      *
@@ -93,9 +89,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                           const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
+                           const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -108,6 +103,7 @@
     unsigned int     _depth_multiplier{ 0 };
     const ICLTensor *_output_multipliers{};
     const ICLTensor *_output_shifts{};
+    bool             _export_to_cl_image{ true };
     bool             _is_quantized{ false };
 };
 } // namespace arm_compute
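
For reference, a hedged usage sketch of the reworked configure()/validate()
entry points above. The field names on DWCComputeKernelInfo (n0, m0,
export_weights_to_cl_image) and on ConvolutionInfo (pad_stride_info,
depth_multiplier, dilation) are assumptions inferred from the parameter
names in this header, not verified against their definitions:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"

    using namespace arm_compute;

    // Hypothetical wrapper showing the new parameter objects in use.
    void configure_native_dwc(const CLCompileContext &ctx,
                              ICLTensor *input, const ICLTensor *weights,
                              const ICLTensor *biases, ICLTensor *output,
                              CLDepthwiseConvolutionLayerNativeKernel &kernel)
    {
        DWCComputeKernelInfo dwc_info{};
        dwc_info.n0                         = 4;    // output elements per thread along the channel dimension (assumed field)
        dwc_info.m0                         = 2;    // rows processed per thread (assumed field)
        dwc_info.export_weights_to_cl_image = true; // request the new cl_image weights path (assumed field)

        ConvolutionInfo conv_info{};
        conv_info.pad_stride_info  = PadStrideInfo(1, 1, 1, 1); // stride 1, pad 1 (assumed field)
        conv_info.depth_multiplier = 1;                         // assumed field
        conv_info.dilation         = Size2D(1U, 1U);            // assumed field

        // Validate first; output_multipliers/output_shifts stay nullptr for
        // floating-point data types.
        ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerNativeKernel::validate(
            input->info(), weights->info(), biases->info(), output->info(), dwc_info, conv_info));
        kernel.configure(ctx, input, weights, biases, output, dwc_info, conv_info);
    }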