| /* |
| * Copyright (c) 2023 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h" |
| |
| #include "arm_compute/core/utils/helpers/AdjustVecSize.h" |
| |
| #include "src/core/helpers/WindowHelpers.h" |
| #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" |
| #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" |
| #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" |
| #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" |
| #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" |
| #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" |
| #include "support/StringSupport.h" |
| |
| using namespace ckw; |
| namespace arm_compute |
| { |
| namespace experimental |
| { |
| namespace dynamic_fusion |
| { |
| |
| GpuCkwMatMul::GpuCkwMatMul(ComponentId id, |
| const ArgumentPack<ITensorInfo> &tensors, |
| const Attributes &attributes, |
| const Settings &settings) |
| : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings} |
| { |
| _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); |
| _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); |
| _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); |
| ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); |
| } |
| |
/** Emit the CKW (Compute Kernel Writer) code for a native MatMul.
 *
 * Each work-item computes one (m0 x n0) destination tile: the accumulator is
 * zeroed, then A(m0 x k0) and B(n0 x k0) tiles are loaded and multiplied
 * (MatMul_Nt_T: A non-transposed times the B tile transposed, since B is held
 * as n0 rows of k0) over the K dimension in steps of k0, followed by a
 * scalar (step 1) tail loop covering the K % k0 leftover columns.
 */
void GpuCkwMatMul::write_component_code(const ComponentGroup    &comp_group,
                                        GpuCkwVariableTable     &vtable,
                                        GpuCkwScopedKernelWriter writer) const
{
    // The root component's window defines the dispatch space; its x/y steps are the n0/m0 tile sizes.
    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();

    // Declare the buffer arguments backing the three tensors.
    GpuCkwComponentArgument *lhs =
        vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
    GpuCkwComponentArgument *rhs =
        vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
    GpuCkwComponentArgument *dst =
        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");

    // Constants
    // NOTE(review): height_idx comes from the LHS data layout but indexes an RHS dimension —
    // presumably LHS and RHS share the same layout; confirm against the component validator.
    const int   height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT);
    const auto &rhs_h      = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx)));
    const int   m          = static_cast<int>(_dst->dimension(1));
    const int   n          = static_cast<int>(_dst->dimension(0));
    // K is the reduction extent: LHS height when the LHS is adjoint (transposed), else LHS width.
    const int k =
        _attributes.adj_lhs() ? static_cast<int>(_lhs->tensor_shape().y()) : static_cast<int>(_lhs->tensor_shape().x());
    const int m0               = root_window.y().step();
    const int n0               = root_window.x().step();
    const int k0               = _settings.k0();
    const int partial_store_m0 = m % m0; // leftover rows when M is not a multiple of m0
    const int partial_store_n0 = n % n0; // leftover cols when N is not a multiple of n0

    // Compile-time constant tiles used throughout the emitted kernel.
    const auto &const_1 = writer->declare_tile("1", 1);
    auto       &const_0 = writer->declare_tile("0", 0);
    auto       &k0_tile = writer->declare_tile("k0", k0);
    auto       &k_tile  = writer->declare_tile("k", k);

    // Map the 3D global work-item id to destination coordinates.
    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);

    writer->op_get_global_id(gid_0, 0);
    writer->op_get_global_id(gid_1, 1);
    writer->op_get_global_id(gid_2, 2);

    auto &x = writer->declare_tile("x", ckw::DataType::Int32); // N coordinate of the dst tile
    auto &y = writer->declare_tile("y", ckw::DataType::Int32); // M coordinate of the dst tile
    auto &z = writer->declare_tile("z", ckw::DataType::Int32); // batch coordinate

    // get_coord clamps the last tile back by partial_store_* so edge tiles stay in-bounds.
    get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0);
    get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0);
    get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0);

    // LHS sampler: m0 x k0 tiles, per-batch addressing (C_W_H).
    TensorTileSampler lhs_sampler;
    lhs_sampler.height(m0);
    lhs_sampler.width(k0);
    lhs_sampler.format(TensorSamplerFormat::C_W_H);
    lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
    lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
    lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);

    // RHS sampler: k0 x n0 tiles with W and H collapsed (C_WH_1); the batch offset
    // is folded into the y coordinate below (y_i = x + z * rhs_h).
    TensorTileSampler rhs_sampler;
    rhs_sampler.height(k0);
    rhs_sampler.width(n0);
    rhs_sampler.format(TensorSamplerFormat::C_WH_1);
    rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
    rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
    rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);

    // DST sampler: OverlappingMin on x handles the partial stores at the N edge.
    TensorTileSampler dst_sampler;
    dst_sampler.width(n0);
    dst_sampler.height(m0);
    dst_sampler.format(TensorSamplerFormat::C_W_H);
    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
    dst_sampler.x(x);
    dst_sampler.y(y);
    dst_sampler.z(z);
    dst_sampler.b(const_0);

    // If dst has not been materialised by a fused consumer, create its virtual tile now.
    if (!dst->has_tile())
    {
        auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
        dst->init_virtual_tensor(dst_tile, dst_sampler);
    }
    auto &dst_tile = dst->tile();

    // Initialize the accumulators
    writer->op_assign(dst_tile, const_0);

    // Per-batch base offset into the collapsed RHS W*H space.
    auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32);
    writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h);

    // Main-loop bound: last k0-aligned start position (loop runs while k_i <= k - k0).
    auto &k_i     = writer->declare_tile("k_i", ckw::DataType::Int32);
    auto &k_limit = writer->declare_tile("k_limit", k - k0);

    // x_i tracks the LHS column being consumed; advanced alongside k_i in both loops.
    auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32);
    writer->op_assign(x_i, const_0);

    writer->op_assign(k_i, const_0);

    // Main K loop: full k0-wide slices.
    // *INDENT-OFF*
    // clang-format off
    writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile,
    [&]()
    {
        // Initialize tiles (zeroed so out-of-bounds lanes contribute nothing)
        // lhs_tile
        auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0));
        // rhs_tile
        auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0));
        writer->op_assign(a, const_0);
        writer->op_assign(b, const_0);

        // Loading the tiles
        // LHS: tile at (column x_i, row y) of batch z
        lhs_sampler.x(x_i);
        lhs_sampler.y(y);
        lhs_sampler.z(z);
        lhs_sampler.b(const_0);
        writer->op_load(a, lhs->tensor(), lhs_sampler);

        // RHS: collapsed coordinate = output column x plus the batch offset rhs_z
        auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32);
        writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
        rhs_sampler.x(k_i);
        rhs_sampler.y(y_i);
        rhs_sampler.z(const_0);
        rhs_sampler.b(const_0);
        writer->op_load(b, rhs->tensor(), rhs_sampler);

        // Perform Matmul: dst += a * b^T (accumulating into dst_tile)
        writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
        writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile);
    });
    // *INDENT-ON*
    // clang-format on

    // Handling leftovers: K % k0 remaining columns, processed one at a time (k0 = 1).
    if (k % k0 != 0)
    {
        // *INDENT-OFF*
        // clang-format off
        writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1,
        [&]()
        {
            // Initialize tiles
            // lhs_tile (single column)
            auto &a =
                writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1));
            // rhs_tile (single column)
            auto &b =
                writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1));
            writer->op_assign(a, const_0);
            writer->op_assign(b, const_0);

            // Loading the tiles
            // LHS — NOTE(review): the samplers still carry the k0-wide tile shape from above;
            // presumably op_load honours the (m0 x 1)/(n0 x 1) tile extents — confirm in CKW.
            lhs_sampler.x(x_i);
            lhs_sampler.y(y);
            lhs_sampler.z(z);
            lhs_sampler.b(const_0);
            writer->op_load(a, lhs->tensor(), lhs_sampler);

            // RHS
            auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32);
            writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
            rhs_sampler.x(k_i);
            rhs_sampler.y(y_i);
            rhs_sampler.z(const_0);
            rhs_sampler.b(const_0);
            writer->op_load(b, rhs->tensor(), rhs_sampler);

            // Perform Matmul (rank-1 update per leftover column)
            writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
            writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1);
        });
        // *INDENT-ON*
        // clang-format on
    }
}
| |
| Window GpuCkwMatMul::get_window() const |
| { |
| ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); |
| |
| const int m = _dst->dimension(1); |
| const int n = _dst->dimension(0); |
| const bool adj_lhs = _attributes.adj_lhs(); |
| |
| int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m); |
| int n0 = adjust_vec_size(_settings.n0(), n); |
| |
| // Configure kernel window |
| Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0)); |
| win = win.collapse(win, Window::DimZ); |
| |
| return win; |
| } |
| |
| std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const |
| { |
| ARM_COMPUTE_UNUSED(comp_group); |
| |
| std::string kernel_name("mat_mul_native"); |
| |
| const int m = _dst->dimension(1); |
| const int n = _dst->dimension(0); |
| const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x(); |
| |
| kernel_name += _attributes.adj_lhs() ? "_t" : "_nt"; |
| kernel_name += _attributes.adj_rhs() ? "_t" : "_nt"; |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(m); |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(n); |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(k); |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(_dst->dimension(2)); |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(_settings.m0()); |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(_settings.n0()); |
| kernel_name += "_"; |
| kernel_name += support::cpp11::to_string(_settings.k0()); |
| |
| return kernel_name; |
| } |
| |
| } // namespace dynamic_fusion |
| } // namespace experimental |
| } // namespace arm_compute |