Add in-place calculation support for CL elementwise arithmetic kernels - Add in-place calculation support in ClArithmeticKernel, ClSaturatedArithmeticKernel and ClMulKernel - Add in-place test cases Resolves: COMPMID-4431 Signed-off-by: Sheri Zhang <sheri.zhang@arm.com> Change-Id: Id484bdb76b74478a33fedb471ae0c7f799c599f6 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5885 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>

commit: a387e271b1e02ffd5c2993702b9a21c1ed5c95fa [log] [tgz]
author: Sheri Zhang <sheri.zhang@arm.com> Tue Jun 29 17:34:06 2021 +0100
committer: Sheri Zhang <sheri.zhang@arm.com> Tue Jul 13 15:36:03 2021 +0000
tree: f53416756c70c85d962218168ad3cd3359d9f5c8
parent: 6fc7d528382716de9e417c9dcf0fddf109446e9f [diff] [blame]
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp
index 65f3bec..7c4dddc 100644
--- a/src/core/gpu/cl/kernels/ClMulKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClMulKernel.cpp

@@ -63,6 +63,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
 
+    // Check whether it is in_place calculation
+    const bool in_place      = (src1 == dst) || (src2 == dst);
+    const bool src1_in_place = in_place && (src1 == dst);
+
     const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -85,7 +89,16 @@
                                         "Dst can only be QSYMM16 if both src are QSYMM16");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32),
                                         "Dst must be S32 if source tensors are S32");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+        if(in_place)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+                                            "Wrong shape for dst, cannot do in_place calculation");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                            "Wrong shape for dst");
+        }
     }
 
     return Status{};
@@ -194,11 +207,17 @@
         }
     }
 
+    // Check whether it is in_place calculation
+    const bool in_place      = (src1 == dst) || (src2 == dst);
+    const bool src1_in_place = in_place && (src1 == dst);
+    build_opts.add_option_if(in_place, "-DIN_PLACE");
+    build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set scale argument
-    unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+    unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
 
     if(scale_int >= 0 && !is_quantized)
     {
@@ -256,6 +275,8 @@
     const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
     auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
     const TensorShape &in_shape1 = src_0->info()->tensor_shape();
     const TensorShape &in_shape2 = src_1->info()->tensor_shape();
     const TensorShape &out_shape = dst->info()->tensor_shape();
@@ -280,12 +301,17 @@
     Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
     Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
 
+    // Check whether it is in_place calculation
+    const bool in_place = (src_0 == dst) || (src_1 == dst);
     do
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, src_0, slice_input1);
         add_3D_tensor_argument(idx, src_1, slice_input2);
-        add_3D_tensor_argument(idx, dst, slice);
+        if(!in_place)
+        {
+            add_3D_tensor_argument(idx, dst, slice);
+        }
         enqueue(queue, *this, slice, lws_hint());
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
commit	a387e271b1e02ffd5c2993702b9a21c1ed5c95fa	[log] [tgz]
author	Sheri Zhang <sheri.zhang@arm.com>	Tue Jun 29 17:34:06 2021 +0100
committer	Sheri Zhang <sheri.zhang@arm.com>	Tue Jul 13 15:36:03 2021 +0000
tree	f53416756c70c85d962218168ad3cd3359d9f5c8
parent	6fc7d528382716de9e417c9dcf0fddf109446e9f [diff] [blame]