/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplateElementwiseBinary.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
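// Vector width, in bytes, used by get_window() below to choose how many
// elements each work-item processes along dimension 0.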
constexpr unsigned int vector_size_byte_opencl = 16;

ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId                      id,
                                                         const ArgumentPack<ITensorInfo> &tensors,
                                                         const Attributes                &attributes)
    : IGpuTemplateComponentWriter{ id, tensors },
      _lhs{},
      _rhs{},
      _dst{},
      _attributes{ attributes }
{
    _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
}

std::string ClTemplateElementwiseBinary::get_name() const
{
    return "elementwise_binary";
}

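// Generates the OpenCL code fragment for this component. Two variants are
// emitted: when this component is the root of the kernel, the fragment loads
// both operands, computes the result into the destination tile and sets up
// the indirect Y addressing used by the store; when the component is fused
// after another one, the fragment accumulates the operand into the tile
// produced by the previous component.
//
// As a rough sketch (assuming FP32, ELTWISE_OP=ADD, no broadcast, and with
// the {{...}} placeholders resolved by get_tag_lut() below), the core of the
// root fragment expands to something like:
//
//   TILE(float, M0, N0, dst);
//   {
//       TILE(float, M0, N0, lhs_tile);
//       TILE(float, M0, N0, rhs_tile);
//       T_LOAD(float, M0, N0, BUFFER, lhs, g_ind_0, g_ind_1, 1, lhs_stride_y, lhs_tile);
//       T_LOAD(float, M0, N0, BUFFER, rhs, g_ind_0, g_ind_1, 1, rhs_stride_y, rhs_tile);
//       T_ELTWISE_ADD(float, M0, N0, lhs_tile, rhs_tile, dst);
//   }
//
// where "lhs", "rhs" and "dst" stand in for the variable names bound in the
// variable table.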
std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const
{
    std::string code;
    const bool  is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape();
    const bool  is_root      = (comp_group.get_root_component()->id() == this->id());

    if(is_root)
    {
        code =
R"_(
    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_"
            // IN_0(LHS)            {{lhs}}
            // IN_1(RHS)            {{rhs}}
            // OUT(dst, accum)      {{dst}}
            // dst = lhs + rhs (mixed-precision, broadcast, boundary aware)
R"_(
    TILE({{DATA_TYPE}}, M0, N0, {{dst}});
    TILE(uint, M0, 1, g_dst_indirect_y);
    {
        TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
        TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
)_"
            // Assuming an un-collapsed window
R"_(
        {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
        {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;

        T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile);
        T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile);
)_";
        if(is_broadcast)
        {
            code +=
R"_(
        T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
)_";
        }
        else
        {
            code +=
R"_(
        T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
)_";
        }
        code +=
            // Calculate the destination indirect Y
R"_(
        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{out}}_w * {{out}}_h) - 1);
            g_dst_indirect_y[i].v += g_ind_2 * (int)({{out}}_w * {{out}}_h);
        })
    }
    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
    }
    else // non-root
    {
        code =
R"_(
    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_"
            // IN_0/Out(Accumulator)   {{acc}}
            // IN_1(Operand)           {{operand}}
            // acc = operand + acc (mixed-precision, broadcast, boundary aware)
R"_(
    {
        TILE({{DATA_TYPE}}, M0, N0, operand_tile);
        T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile);
)_";

        if(is_broadcast)
        {
            code +=
R"_(
        T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
)_";
        }
        else
        {
            code +=
R"_(
        T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
)_";
        }
        code +=
R"_(
    }
    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
    }

    return code;
}

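// Registers the lhs, rhs and dst tensors with the shared variable table so
// that the kernel writer can emit the matching kernel arguments.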
void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        _lhs,
        GpuKernelArgumentInfo(common_tensor_type),
        comp_group.is_intermediate_tensor(_lhs),
        "lhs");

    vtable.declare_variable(
        _rhs,
        GpuKernelArgumentInfo(common_tensor_type),
        comp_group.is_intermediate_tensor(_rhs),
        "rhs");

    vtable.declare_variable(
        _dst,
        GpuKernelArgumentInfo(common_tensor_type),
        comp_group.is_intermediate_tensor(_dst),
        "dst");
}

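// Builds the tag lookup table that resolves the {{...}} placeholders in the
// code fragment. In the fused (non-root) case, whichever input is an
// intermediate tensor of the group becomes the accumulator and the other
// input becomes the operand; the rhs_* tags encode the operand's tile shape
// and start indices, which differ when the operand is broadcast.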
TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    TagLUT             lut{};
    const ITensorInfo *accumulator = _lhs;
    const ITensorInfo *operand     = _rhs;

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["DATA_TYPE"]      = get_cl_type_from_data_type(_lhs->data_type());
    // Arguments and global shared variables
    const bool is_root = (comp_group.get_root_component()->id() == this->id());
    if(is_root)
    {
        lut["lhs"] = vtable.get_variable(_lhs);
        lut["rhs"] = vtable.get_variable(_rhs);
        lut["dst"] = vtable.get_variable(_dst);
        lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor());
    }
    else
    {
        // Determine which tensor is the accumulator
        if(comp_group.is_intermediate_tensor(_lhs))
        {
            accumulator = _lhs;
            operand     = _rhs;
        }
        else if(comp_group.is_intermediate_tensor(_rhs))
        {
            accumulator = _rhs;
            operand     = _lhs;
        }
        else
        {
            ARM_COMPUTE_ERROR("Invalid elementwise component linking");
        }
        lut["acc"]     = vtable.get_variable(accumulator);
        lut["operand"] = vtable.get_variable(operand);
    }
    switch(_attributes.operation())
    {
        case Attributes::ElementwiseOp::ADD:
            lut["ELTWISE_OP"] = "ADD";
            break;
        default:
            ARM_COMPUTE_ERROR("Arithmetic operation not supported");
    }
    ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0),
                             "Only the operand can be broadcast to match the accumulator's shape");
    const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape());

    // Set broadcast parameters
    // PRE: All tensors are broadcast-compatible
    if(is_broadcast)
    {
        // Note that n0 maps to input tensor dimension 0, while m0 maps to input dimensions 1 and 2 because of our collapse strategy
        if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
        {
            lut["rhs_m0"]          = "1";
            lut["rhs_n0"]          = "1";
            lut["rhs_start_ind_1"] = "0";
            lut["rhs_start_ind_0"] = "0";
        }
        else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
        {
            lut["rhs_m0"]          = "1";
            lut["rhs_n0"]          = "N0";
            lut["rhs_start_ind_1"] = "0";
            lut["rhs_start_ind_0"] = "g_ind_0";
        }
        else
        {
            ARM_COMPUTE_ERROR("Only rhs broadcasting in all of X, Y, Z, or in Y and Z only, is supported");
        }
    }
    else
    {
        lut["rhs_m0"]          = "M0";
        lut["rhs_n0"]          = "N0";
        lut["rhs_start_ind_1"] = "g_ind_1";
        lut["rhs_start_ind_0"] = "g_ind_0";
    }
    return lut;
}

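// Assembles the -D defines consumed by the code fragment above. PARTIAL_N0
// is the number of leftover columns when the destination width is not a
// multiple of N0, used for boundary handling when storing the right-most tile.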
CLBuildOptions ClTemplateElementwiseBinary::get_build_options(const ComponentGroup &comp_group) const
{
    CLBuildOptions build_opts{};
    /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future
    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0               = root_window.x().step();
    const unsigned int m0               = root_window.y().step();
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;

    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_lhs->data_type()));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplateElementwiseBinary::get_config_id() const
{
    std::string config_id{};
    config_id += lower_string(string_from_data_type(_dst->data_type()));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(0));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(1));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(_dst->data_layout()));

    return config_id;
}

std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
}

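// Computes the execution window over the destination tensor: W and H are
// collapsed into one dimension, and the X step is the vector width derived
// from vector_size_byte_opencl, clamped to the tensor's width.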
Window ClTemplateElementwiseBinary::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    TensorShape output_shape = _dst->tensor_shape();
    // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
    // This is in line with the collapsing convention used by operators like Conv2d
    output_shape.collapse(2U, 1U);
    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));

    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute