Fix depthwise convolution not using assembly kernel

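* Take dilation into account when checking padding: compare each pad
  against the dilated kernel extent, k + (k - 1) * (dilation - 1),
  instead of the raw weights dimension, so that padding which is valid
  for a dilated kernel is no longer rejected.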

Resolves: COMPMID-6348
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I897a13ba7f37382733c35c1701d1ec310ed55331
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10147
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
index e092c83..b503a8b 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -306,11 +306,15 @@
 
     // Assembly kernels cannot work with padding greater than the kernel.
     const auto &padding = info.pad_stride_info;
+    const auto &dilation = info.dilation;
     const auto &wei_shape = weights->tensor_shape();
 
+    const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1);
+    const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1);
+
     ARM_COMPUTE_RETURN_ERROR_ON(
-        padding.pad_top() >= wei_shape[2] || padding.pad_bottom() >= wei_shape[2] ||
-        padding.pad_left() >= wei_shape[1] || padding.pad_right() >= wei_shape[1]
+        padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
+        padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h
     );
 
     return Status{};