Fix mismatches in dynamically fused direct conv2d + add kernel

Resolves: COMPMID-5269

Change-Id: I4372ea4365d14ead79153e4b08b690a1e20ab0b7
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7531
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index eab06aa..ec57022 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -1030,6 +1030,26 @@
         })                                                  \
     })
 
+/** Element-wise addition between two tiles (LHS and RHS), without broadcasting
+ *
+ * @note Performs: LHS + RHS = DST, one tile row (vector of N0 elements) per unrolled iteration
+ * @note Both tiles must have the same data type; each row is passed through CONVERT to DST_DATA_TYPE before the addition
+ *
+ * @param[in]  DST_DATA_TYPE DST data type
+ * @param[in]  M0            Number of LHS rows (RHS and DST are indexed with the same row counter)
+ * @param[in]  N0            Number of LHS columns (width of the DST_DATA_TYPE vector used for each row)
+ * @param[in]  lhs           LHS tile
+ * @param[in]  rhs           RHS tile
+ * @param[out] dst           DST tile
+ */
+#define T_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+    ({                                                      \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
+        {                                                   \
+            dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));             \
+        })                                                  \
+    })
+
+
 /** Matrix multiplication
  *
  * @note Performs: LHS X RHS + DST = DST
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index 17437c2..9923b7a 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -857,15 +857,13 @@
     {
         auto       dst_info   = get_kernel_argument_info(_dst_id);
         auto       dst_w      = dst_info->dimension(0);
-        auto       dst_h      = dst_info->dimension(1);
         const auto tile_w     = std::max(1, get_execution_window().x().step());
         const auto tile_h     = std::max(1, get_execution_window().y().step());
         auto       leftover_w = dst_w % tile_w;
-        auto       leftover_h = dst_h % tile_h;
 
         std::string code = "";
         code += std::string("    int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
-        code += std::string("    int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + std::to_string(leftover_h) + ");\n";
+        code += std::string("    int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
         code += std::string("    int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n");
 
         switch(_tile_info.clipping)
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 2bbea87..47f95b5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -88,7 +88,12 @@
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile);
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile);
 
+#if defined(IS_BROADCAST)
         T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+#else // !defined(IS_BROADCAST)
+        T_ADD({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+#endif // defined(IS_BROADCAST)
+
     }
     //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
 )_";
@@ -106,7 +111,11 @@
 
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile);
 
+#if defined(IS_BROADCAST)
         T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+#else // !defined(IS_BROADCAST)
+        T_ADD({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+#endif // defined(IS_BROADCAST)
     }
     //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
 )_";
@@ -115,16 +124,17 @@
 
 CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
 {
+    const auto t_src_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); // RHS operand info; its shape is compared with the destination below
     const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
 
     CLBuildOptions build_opts{};
-    const auto     n0         = _blueprint->impl().get_execution_window().x().step();
-    const auto     m0         = _blueprint->impl().get_execution_window().y().step();
-    const auto     partial_m0 = t_dst_info->dimension(1) % m0;
+    const auto     n0           = _blueprint->impl().get_execution_window().x().step();
+    const auto     m0           = _blueprint->impl().get_execution_window().y().step();
+    const bool     is_broadcast = t_src_info->tensor_shape() != t_dst_info->tensor_shape(); // any RHS/DST shape difference counts as broadcast
 
     build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
+    build_opts.add_option_if(is_broadcast, "-DIS_BROADCAST"); // kernel template uses T_ADD_BROADCAST_X when defined, T_ADD otherwise
 
     return build_opts;
 }