| /* |
| * Copyright (c) 2022 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) |
| |
| #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" |
| #include "arm_compute/core/TensorInfo.h" |
| #include "src/core/AccessWindowStatic.h" |
| #include "src/core/helpers/WindowHelpers.h" |
| |
| #include "src/core/utils/helpers/float_ops.h" |
| #include "support/StringSupport.h" |
| |
| namespace arm_compute |
| { |
| namespace experimental |
| { |
| namespace dynamic_fusion |
| { |
| ComponentType ClGemmNativeKernelComponent::get_component_type() const |
| { |
| return ComponentType::Complex; |
| } |
| |
| std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const |
| { |
| return std::set<std::string> { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" }; |
| } |
| |
| Window ClGemmNativeKernelComponent::get_window() const |
| { |
| ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); |
| ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); |
| ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); |
| ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); |
| |
| ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info); |
| |
| bool reinterpret_input_as_3d = _desc.reinterpret_input_as_3d; |
| bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0; |
| |
| Window win{}; |
| Window win_out{}; |
| bool window_changed = false; |
| |
| // In case both input and dst have to be reinterpreted as 3D tensors, |
| // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. |
| if(reinterpret_input_as_3d == reinterpret_output_as_3d) |
| { |
| reinterpret_output_as_3d = false; |
| } |
| |
| // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation |
| GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d, |
| _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width, |
| _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset); |
| |
| // dst tensor auto initialization if not yet initialized |
| auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info))); |
| |
| TensorInfo tmp_info(*dst_info); |
| |
| if(reinterpret_output_as_3d) |
| { |
| // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, |
| // the window needs to be constructed on the 2D collapsed version of the tensor |
| TensorShape tmp_shape(dst_info->tensor_shape()); |
| tmp_shape.collapse(2U, 1U); |
| tmp_info.set_tensor_shape(tmp_shape); |
| } |
| |
| win = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); |
| win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); |
| |
| AccessWindowStatic src0_access(lhs_info, 0, 0, |
| lhs_info->dimension(0), |
| lhs_info->dimension(1)); |
| AccessWindowStatic src1_access(rhs_info, 0, 0, |
| ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0), |
| rhs_info->dimension(1)); |
| AccessWindowStatic dst_access(dst_info, 0, 0, |
| dst_info->dimension(0), |
| dst_info->dimension(1)); |
| |
| if(bias_info != nullptr) |
| { |
| const int bias_processed_per_iteration_x = _desc.rhs_info.n0; |
| |
| AccessWindowStatic src2_access(bias_info, 0, 0, |
| ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x), |
| bias_info->dimension(1)); |
| |
| window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop |
| update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor |
| } |
| else |
| { |
| window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop |
| update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor |
| } |
| |
| // Collapse along the Z direction |
| // This collapse needs to be here in order to tune the Z dimension of LWS |
| Window collapsed = win; |
| const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u); |
| collapsed = win.collapse(win, dimension_to_collapse); |
| |
| if(window_changed == true) |
| { |
| ARM_COMPUTE_ERROR("Insufficient Padding!"); |
| } |
| |
| return collapsed; |
| } |
| |
| std::string ClGemmNativeKernelComponent::get_additional_macros() const |
| { |
| return R"_( |
| #define VFMA(a, b, c) \ |
| ({ \ |
| c = fma(a, b, c); \ |
| }) |
| |
| #if M0 == 1 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| }) |
| #elif M0 == 2 // M0 == 2 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| }) |
| #elif M0 == 3 // M0 == 3 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ |
| }) |
| #elif M0 == 4 // M0 == 4 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ |
| }) |
| #elif M0 == 5 // M0 == 5 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ |
| }) |
| #elif M0 == 6 // M0 == 6 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ |
| }) |
| #elif M0 == 7 // M0 == 7 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ |
| }) |
| #elif M0 == 8 // M0 == 8 |
| #define RHS_VFMA_M0xN0(i, a, b, c) \ |
| ({ \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ |
| VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ |
| }) |
| #else // M0 not supported |
| #error "M0 not supported" |
| #endif // M0 not supported |
| )_"; |
| } |
| |
| std::string ClGemmNativeKernelComponent::get_component_code() const |
| { |
| auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); |
| auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); |
| |
| auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); |
| auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; |
| auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); |
| |
| std::string code = R"_( |
| //------------------ START KERNEL {{meta_kernel_id}} --------------------- |
| // IN_0(lhs) {{lhs}} |
| // IN_1(rhs) {{rhs}} |
| )_"; |
| |
| if(!_bias.is_empty()) |
| { |
| code += R"_( |
| // IN_2(bias) {{bias}} |
| )_"; |
| } |
| |
| code += R"_( |
| // OUT(dst, accum) {{dst}} |
| |
| // Initialize the accumulators |
| REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), {{dst}}, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; |
| { |
| #if defined(DUMMY_WORK_ITEMS) |
| if((g_x * N0 >= N) || (g_y * M0 >= M)) |
| { |
| return; |
| } |
| #endif // defined(DUMMY_WORK_ITEMS) |
| |
| // Compute LHS matrix address |
| uint lhs_offset = {{lhs}}_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * (uint){{lhs}}_stride_y; |
| |
| // Compute RHS matrix address |
| uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE); |
| )_"; |
| |
| if(dont_slide_b) |
| { |
| code += R"_( |
| // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 |
| rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z; |
| )_"; |
| } |
| else |
| { |
| code += R"_( |
| rhs_offset += g_z * {{rhs}}_stride_z; |
| )_"; |
| } |
| |
| code += R"_( |
| REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); |
| )_"; |
| |
| if(reinterpret_input_as_3d) |
| { |
| code += R"_( |
| // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D |
| CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); |
| |
| // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we |
| // multiply lhs_stride_z by DEPTH_GEMM3D |
| lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}}; |
| )_"; |
| } |
| else |
| { |
| code += R"_( |
| // Add offset for batched GEMM |
| lhs_offset += g_z * {{lhs}}_stride_z; |
| )_"; |
| } |
| |
| code += R"_( |
| int i = 0; |
| #if {{K0}} > 1 |
| for(; i <= (K - {{K0}}); i += {{K0}}) |
| { |
| // Supported cases (M0, K0): |
| // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 |
| // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 |
| // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 |
| // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 |
| // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 |
| // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 |
| // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 |
| // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 |
| // Load values from LHS matrix |
| LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); |
| |
| // Load values from RHS matrix |
| LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); |
| |
| RHS_VFMA_M0xN0(0, a, b0, {{dst}}); |
| RHS_VFMA_M0xN0(1, a, b1, {{dst}}); |
| #if {{K0}} > 2 |
| RHS_VFMA_M0xN0(2, a, b2, {{dst}}); |
| #endif // K0 > 2 |
| #if {{K0}} > 3 |
| RHS_VFMA_M0xN0(3, a, b3, {{dst}}); |
| #endif // K0 > 3 |
| #if {{K0}} > 4 |
| RHS_VFMA_M0xN0(4, a, b4, {{dst}}); |
| RHS_VFMA_M0xN0(5, a, b5, {{dst}}); |
| RHS_VFMA_M0xN0(6, a, b6, {{dst}}); |
| RHS_VFMA_M0xN0(7, a, b7, {{dst}}); |
| #endif // K0 > 4 |
| #if {{K0}} > 8 |
| RHS_VFMA_M0xN0(8, a, b8, {{dst}}); |
| RHS_VFMA_M0xN0(9, a, b9, {{dst}}); |
| RHS_VFMA_M0xN0(A, a, bA, {{dst}}); |
| RHS_VFMA_M0xN0(B, a, bB, {{dst}}); |
| RHS_VFMA_M0xN0(C, a, bC, {{dst}}); |
| RHS_VFMA_M0xN0(D, a, bD, {{dst}}); |
| RHS_VFMA_M0xN0(E, a, bE, {{dst}}); |
| RHS_VFMA_M0xN0(F, a, bF, {{dst}}); |
| #endif // K0 > 8 |
| |
| lhs_offset += {{K0}} * sizeof(DATA_TYPE); |
| rhs_offset += {{K0}} * {{rhs}}_stride_y; |
| } |
| #endif // K0 > 1 |
| // Left-over accumulations |
| for(; i < K; ++i) |
| { |
| // Load values from LHS matrix |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a0 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 0 * {{lhs}}_stride_y + zlhs0)); |
| #if M0 > 1 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a1 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 1 * {{lhs}}_stride_y + zlhs1)); |
| #endif // M0 > 1 |
| #if M0 > 2 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a2 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 2 * {{lhs}}_stride_y + zlhs2)); |
| #endif // M0 > 2 |
| #if M0 > 3 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a3 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 3 * {{lhs}}_stride_y + zlhs3)); |
| #endif // M0 > 3 |
| #if M0 > 4 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a4 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 4 * {{lhs}}_stride_y + zlhs4)); |
| #endif // M0 > 4 |
| #if M0 > 5 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a5 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 5 * {{lhs}}_stride_y + zlhs5)); |
| #endif // M0 > 5 |
| #if M0 > 6 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a6 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 6 * {{lhs}}_stride_y + zlhs6)); |
| #endif // M0 > 6 |
| #if M0 > 7 |
| VEC_DATA_TYPE(DATA_TYPE, 2) |
| a7 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 7 * {{lhs}}_stride_y + zlhs7)); |
| #endif // M0 > 7 |
| |
| VEC_DATA_TYPE(DATA_TYPE, N0) |
| b = VLOAD(N0)(0, (__global DATA_TYPE *)({{rhs}}_ptr + rhs_offset + 0 * {{rhs}}_stride_y)); |
| RHS_VFMA_M0xN0(0, a, b, {{dst}}); |
| |
| lhs_offset += sizeof(DATA_TYPE); |
| rhs_offset += {{rhs}}_stride_y; |
| } |
| |
| // Multiply by the weight of matrix-matrix product and store the result |
| )_"; |
| if(has_alpha) |
| { |
| code += R"_( |
| SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}}); |
| )_"; |
| } |
| |
| if(!_bias.is_empty()) |
| { |
| if(_desc.broadcast_bias) |
| { |
| code += R"_( |
| // Add beta*bias |
| __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); |
| |
| LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); |
| )_"; |
| |
| if(helpers::float_ops::is_one(_desc.beta)) |
| { |
| code += R"_( |
| SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}}); |
| )_"; |
| } |
| |
| code += R"_( |
| // c = c + bias[broadcasted] |
| ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); |
| )_"; |
| } |
| else |
| { |
| code += R"_( |
| // Add beta*bias |
| __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, |
| PARTIAL_STORE_M0) |
| * {{bias}}_stride_y) |
| + g_z * {{bias}}_stride_z; |
| |
| LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); |
| )_"; |
| |
| if(helpers::float_ops::is_one(_desc.beta)) |
| { |
| code += R"_( |
| SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}}); |
| )_"; |
| } |
| |
| code += R"_( |
| // c = c + bias |
| ADD_BLOCK(M0, {{dst}}, bias); |
| )_"; |
| } |
| } |
| |
| code += R"_( |
| } |
| //------------------ END KERNEL {{meta_kernel_id}} --------------------- |
| )_"; |
| return code.c_str(); |
| } |
| |
| CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const |
| { |
| auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); |
| auto tile_info = _blueprint->impl().get_tile_info(); |
| |
| CLBuildOptions build_opts{}; |
| |
| build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); |
| build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y())); |
| build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x())); |
| build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k)); |
| build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); |
| build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); |
| build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); |
| build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); |
| |
| return build_opts; |
| } |
| |
| std::string ClGemmNativeKernelComponent::generate_config_id() const |
| { |
| auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); |
| std::string config_id{}; |
| config_id += (_bias.is_empty() ? "add_bias_" : ""); |
| config_id += (_desc.broadcast_bias ? "broadcast_bias_" : ""); |
| config_id += (_desc.reinterpret_input_as_3d ? "3di_" : ""); |
| config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : ""); |
| config_id += lower_string(string_from_data_type(t_dst_info->data_type())); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(t_dst_info->dimension(1)); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(t_dst_info->dimension(0)); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(_desc.k); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(t_dst_info->dimension(2)); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(_desc.lhs_info.m0); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(_desc.rhs_info.n0); |
| config_id += "_"; |
| config_id += support::cpp11::to_string(_desc.rhs_info.k0); |
| return config_id; |
| } |
| |
| ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const |
| { |
| TagLUT lut{}; |
| |
| lut["meta_kernel_id"] = id(); |
| lut["lhs"] = vtable.add(_lhs, ClKernelArgRuntimeDescriptor(_lhs.arg_id, TensorArgType::Image_3D), "lhs"); |
| lut["rhs"] = vtable.add(_rhs, ClKernelArgRuntimeDescriptor(_rhs.arg_id, TensorArgType::Image_3D), "rhs"); |
| if(!_bias.is_empty()) // optional bias |
| { |
| lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias"); |
| } |
| lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst"); |
| |
| // Local build options |
| auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); |
| auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); |
| auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); |
| |
| auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); |
| auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr; |
| auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; |
| auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0; |
| auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); |
| |
| lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0); |
| |
| if(has_alpha) |
| { |
| lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha); |
| } |
| if(has_beta) |
| { |
| lut["BETA"] = float_to_string_with_full_precision(_desc.beta); |
| } |
| if(dont_slide_b) |
| { |
| lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2)); |
| } |
| |
| if(reinterpret_output_as_3d) |
| { |
| lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1)); |
| lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2)); |
| } |
| else if(reinterpret_input_as_3d) |
| { |
| lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1)); |
| lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2)); |
| } |
| |
| return lut; |
| } |
| } // namespace dynamic_fusion |
| } // namespace experimental |
| } // namespace arm_compute |
| |
| #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) |