[Dynamic Fusion] Implement build options generation

Resolves: COMPMID-5153

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ic34cc1f0d092fafa7c2faa4dd705cf8f68eaf87e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7317
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index b285cc2..6e1291c 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "src/core/common/Macros.h"
+#include "support/StringSupport.h"
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
 
@@ -191,7 +192,7 @@
     struct TagVal
     {
         TagVal() = default;
-        TagVal(SharedVarTable::SharedVar var)
+        TagVal(const SharedVarTable::SharedVar &var)
             : value{ var.uniq_name }
         {
         }
@@ -201,6 +202,11 @@
         {
         }
 
+        TagVal(const std::string &val)
+            : value{ val }
+        {
+        }
+
         std::string value{};
     };
     using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
@@ -217,12 +223,12 @@
     virtual std::vector<Link> get_links() const          = 0;
     virtual std::string       name() const               = 0;
 
+    // @note: some tags can be unused since they could be used only for the macros, or only for the component code
     static std::string replace_tags(const std::string &code_template, const TagLUT &tags)
     {
-        std::string                     replaced_code = "";
-        std::unordered_set<std::string> used_tags{};
-        bool                            scanning_pattern = false;
-        std::string                     pattern_found    = "";
+        std::string replaced_code    = "";
+        bool        scanning_pattern = false;
+        std::string pattern_found    = "";
         for(size_t i = 0; i < code_template.size() - 1; ++i)
         {
             if(!scanning_pattern)
@@ -247,7 +253,6 @@
                     std::string err  = "Pattern " + pattern_found + " not found in tags";
                     ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
                     replaced_code += tags.find(pattern_found)->second.value;
-                    used_tags.insert(pattern_found);
                 }
                 else
                 {
@@ -255,12 +260,7 @@
                 }
             }
         }
-        // Check for unused tags
-        for(const auto &tag : tags)
-        {
-            ARM_COMPUTE_UNUSED(tag);
-            ARM_COMPUTE_ERROR_ON_MSG(used_tags.find(tag.first) == used_tags.end(), "Warning: unused tags");
-        }
+
         return replaced_code;
     }
     ComponentID id() const
@@ -303,6 +303,11 @@
         return "";
     }
 
+    virtual CLBuildOptions generate_build_options() const
+    {
+        return CLBuildOptions{};
+    }
+
 protected:
     const ClKernelBlueprint *_blueprint;
 
@@ -445,12 +450,10 @@
     {
         std::string name = "";
 
-        auto stack = topological_sort();
-        while(!stack.empty())
+        traverse([&](std::stack<ComponentID> stack)
         {
             name += _components.find(stack.top())->second->name() + (stack.size() > 2 ? "___" : "");
-            stack.pop();
-        }
+        });
 
         return name;
     }
@@ -480,7 +483,7 @@
             headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
             if(!curr_additional_macros.empty()) // Some components might not have any
             {
-                additional_macros.insert(curr_additional_macros);
+                additional_macros.insert(IClKernelComponent::replace_tags(curr_additional_macros, var_lut));
             }
 
             stack.pop();
@@ -524,7 +527,19 @@
 
     CLBuildOptions build_options() const
     {
-        return CLBuildOptions{};
+        CLBuildOptions build_opts{};
+
+        traverse([&](std::stack<ComponentID> stack)
+        {
+            build_opts.add_options(_components.find(stack.top())->second->generate_build_options().options());
+        });
+
+        return build_opts;
+    }
+
+    TileDescriptor get_tile_info() const
+    {
+        return _tile_info;
     }
 
     Window get_execution_window() const
@@ -596,6 +611,17 @@
         return stack;
     }
 
+    void traverse(const std::function<void(std::stack<ComponentID>)> &func) const
+    {
+        std::stack<ComponentID> stack = topological_sort();
+
+        while(!stack.empty())
+        {
+            func(stack);
+            stack.pop();
+        }
+    }
+
     std::string generate_argument_declaration(const SharedVarTable::SharedVar &var) const
     {
         ARM_COMPUTE_ERROR_ON_MSG(var.group != SharedVarGroup::Argument, "An argument declaration can only be generated from a kernel argument");
@@ -672,7 +698,7 @@
                 ARM_COMPUTE_ERROR("Unsupported clipping strategy");
         }
 
-        code += "\n    REPEAT_VAR_INIT_TO_CONST(M0, uint, g_zout, 0);\n";
+        code += "\n    REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n";
         code += "    REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n";
 
         return code;
@@ -684,7 +710,7 @@
     int32_t _num_components{};
     int32_t _num_complex_components{};
 
-    ArgumentID _dst_id{ -1 };
+    ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers
 
     // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
     std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 06c29c4..bbdf8df 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -82,6 +82,22 @@
 
 )_";
 }
+
+CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
+{
+    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    auto tile_info  = _blueprint->impl().get_tile_info();
+
+    CLBuildOptions build_opts{};
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+
+    return build_opts;
+}
+
 ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const
 {
     // Determine which argument is the accumulator
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index fe5f964..c259811 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -46,6 +46,7 @@
     std::set<std::string> get_headers_list() const override;
     std::string           get_component_code() const override;
     Window                get_window() const override;
+    CLBuildOptions        generate_build_options() const override;
 
     virtual std::vector<Link> get_links() const override
     {
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
index e70e5d5..4bf0b76 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
@@ -28,6 +28,9 @@
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/helpers/WindowHelpers.h"
 
+#include "src/core/utils/helpers/float_ops.h"
+#include "support/StringSupport.h"
+
 namespace arm_compute
 {
 namespace experimental
@@ -214,6 +217,13 @@
 
 std::string ClGemmNativeKernelComponent::get_component_code() const
 {
+    auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+    auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+
+    auto has_alpha               = !(helpers::float_ops::is_one(_desc.alpha));
+    auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
+    auto dont_slide_b            = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
+
     std::string code = R"_(
     //------------------ START KERNEL {{meta_kernel_id}} ---------------------
     // IN_0(lhs)            {{lhs}}
@@ -245,34 +255,49 @@
 
         // Compute RHS matrix address
         uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE);
+    )_";
 
-#if defined(MATRIX_B_DEPTH)
-        // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-        rhs_offset += (g_z % MATRIX_B_DEPTH) * {{rhs}}_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-        rhs_offset += g_z * {{rhs}}_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
+    if(dont_slide_b)
+    {
+        code += R"_(
+            // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+            rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z;
+        )_";
+    }
+    else
+    {
+        code += R"_(
+            rhs_offset += g_z * {{rhs}}_stride_z;
+        )_";
+    }
 
+    code += R"_(
         REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+    )_";
 
-#if defined(REINTERPRET_INPUT_AS_3D)
-        // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
-        CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
+    if(reinterpret_input_as_3d)
+    {
+        code += R"_(
+            // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
+            CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
 
-        // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-        // multiply lhs_stride_z by DEPTH_GEMM3D
-        lhs_offset += g_z * {{lhs}}_stride_z * DEPTH_GEMM3D;
+            // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+            // multiply lhs_stride_z by DEPTH_GEMM3D
+            lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}};
+        )_";
+    }
+    else
+    {
+        code += R"_(
+            // Add offset for batched GEMM
+            lhs_offset += g_z * {{lhs}}_stride_z;
+        )_";
+    }
 
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-        // Add offset for batched GEMM
-        lhs_offset += g_z * {{lhs}}_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
+    code += R"_(
         int i = 0;
-#if K0 > 1
-        for(; i <= (K - K0); i += K0)
+#if {{K0}} > 1
+        for(; i <= (K - {{K0}}); i += {{K0}})
         {
             // Supported cases (M0, K0):
             // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
@@ -284,26 +309,26 @@
             // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
             // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
             // Load values from LHS matrix
-            LOAD_BLOCK(M0, K0, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
+            LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
 
             // Load values from RHS matrix
-            LOAD_BLOCK(K0, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
+            LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
 
             RHS_VFMA_M0xN0(0, a, b0, {{dst}});
             RHS_VFMA_M0xN0(1, a, b1, {{dst}});
-#if K0 > 2
+#if {{K0}} > 2
             RHS_VFMA_M0xN0(2, a, b2, {{dst}});
 #endif // K0 > 2
-#if K0 > 3
+#if {{K0}} > 3
             RHS_VFMA_M0xN0(3, a, b3, {{dst}});
 #endif // K0 > 3
-#if K0 > 4
+#if {{K0}} > 4
             RHS_VFMA_M0xN0(4, a, b4, {{dst}});
             RHS_VFMA_M0xN0(5, a, b5, {{dst}});
             RHS_VFMA_M0xN0(6, a, b6, {{dst}});
             RHS_VFMA_M0xN0(7, a, b7, {{dst}});
 #endif // K0 > 4
-#if K0 > 8
+#if {{K0}} > 8
             RHS_VFMA_M0xN0(8, a, b8, {{dst}});
             RHS_VFMA_M0xN0(9, a, b9, {{dst}});
             RHS_VFMA_M0xN0(A, a, bA, {{dst}});
@@ -314,8 +339,8 @@
             RHS_VFMA_M0xN0(F, a, bF, {{dst}});
 #endif // K0 > 8
 
-            lhs_offset += K0 * sizeof(DATA_TYPE);
-            rhs_offset += K0 * {{rhs}}_stride_y;
+            lhs_offset += {{K0}} * sizeof(DATA_TYPE);
+            rhs_offset += {{K0}} * {{rhs}}_stride_y;
         }
 #endif // K0 > 1
         // Left-over accumulations
@@ -362,44 +387,61 @@
         }
 
         // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-        SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, ALPHA);
-#endif // defined(ALPHA)
     )_";
+    if(has_alpha)
+    {
+        code += R"_(
+            SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}});
+        )_";
+    }
 
     if(!_bias.is_empty())
     {
-        code += R"_(
-        // Add beta*bias
-#if defined(BROADCAST_BIAS)
-        __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+        if(_desc.broadcast_bias)
+        {
+            code += R"_(
+                // Add beta*bias
+                __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
 
-        LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
+                LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
+            )_";
 
-#ifndef UNIT_BETA
-        SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
+            if(helpers::float_ops::is_one(_desc.beta))
+            {
+                code += R"_(
+                    SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}});
+                )_";
+            }
 
-        // c = c + bias[broadcasted]
-        ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
+            code += R"_(
+                // c = c + bias[broadcasted]
+                ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
+            )_";
+        }
+        else
+        {
+            code += R"_(
+                // Add beta*bias
+                __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
+                                            PARTIAL_STORE_M0)
+                                            * {{bias}}_stride_y)
+                                            + g_z * {{bias}}_stride_z;
 
-#else // defined(BROADCAST_BIAS)
-        __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
-                                    PARTIAL_STORE_M0)
-                                    * {{bias}}_stride_y)
-                                    + g_z * {{bias}}_stride_z;
+                LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
+            )_";
 
-        LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
+            if(helpers::float_ops::is_one(_desc.beta))
+            {
+                code += R"_(
+                    SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}});
+                )_";
+            }
 
-#ifndef UNIT_BETA
-        SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
-        // c = c + bias
-        ADD_BLOCK(M0, {{dst}}, bias);
-
-#endif // defined(BROADCAST_BIAS)
-    )_";
+            code += R"_(
+                // c = c + bias
+                ADD_BLOCK(M0, {{dst}}, bias);
+            )_";
+        }
     }
 
     code += R"_(
@@ -409,6 +451,25 @@
     return code.c_str();
 }
 
+CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const
+{
+    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    auto tile_info  = _blueprint->impl().get_tile_info();
+
+    CLBuildOptions build_opts{};
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
+    build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y()));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x()));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+
+    return build_opts;
+}
+
 ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const
 {
     TagLUT lut{};
@@ -421,6 +482,44 @@
         lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias");
     }
     lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst");
+
+    // Local build options
+    auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+    auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+    auto has_alpha                = !(helpers::float_ops::is_one(_desc.alpha));
+    auto has_beta                 = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr;
+    auto reinterpret_input_as_3d  = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
+    auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0;
+    auto dont_slide_b             = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
+
+    lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0);
+
+    if(has_alpha)
+    {
+        lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha);
+    }
+    if(has_beta)
+    {
+        lut["BETA"] = float_to_string_with_full_precision(_desc.beta);
+    }
+    if(dont_slide_b)
+    {
+        lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2));
+    }
+
+    if(reinterpret_output_as_3d)
+    {
+        lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1));
+        lut["DEPTH_GEMM3D"]  = support::cpp11::to_string(t_dst_info->dimension(2));
+    }
+    else if(reinterpret_input_as_3d)
+    {
+        lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1));
+        lut["DEPTH_GEMM3D"]  = support::cpp11::to_string(t_lhs_info->dimension(2));
+    }
+
     return lut;
 }
 } // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
index 09933a8..1a1e3e3 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
@@ -52,6 +52,7 @@
     std::string           get_component_code() const override;
     Window                get_window() const override;
     ClKernelArgList       get_args();
+    CLBuildOptions        generate_build_options() const override;
 
     virtual std::vector<Link> get_links() const override
     {
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
index 430fafb..2d7b466 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
@@ -61,6 +61,23 @@
 
 )_";
 }
+
+CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const
+{
+    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    auto tile_info  = _blueprint->impl().get_tile_info();
+
+    CLBuildOptions build_opts{};
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+
+    return build_opts;
+}
+
 ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const
 {
     return {
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index ad7a207..8d58da2 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -41,8 +41,9 @@
         : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
     {
     }
-    ComponentType get_component_type() const override;
-    std::string   get_component_code() const override;
+    ComponentType  get_component_type() const override;
+    std::string    get_component_code() const override;
+    CLBuildOptions generate_build_options() const override;
 
     virtual std::vector<Link> get_links() const override
     {