blob: 3322487910e7f82bf084ce7203f0df33bc3f665f [file] [log] [blame]
SiCong Lif44bbc52022-08-29 18:25:51 +01001/*
Ramy Elgammalf800adf2022-12-14 15:39:29 +00002 * Copyright (c) 2022-2023 Arm Limited.
SiCong Lif44bbc52022-08-29 18:25:51 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "ClTemplateDirectConv2d.h"
25
26#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
27#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
28
Matthew Bentham314d3e22023-06-23 10:53:52 +000029#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
SiCong Lif44bbc52022-08-29 18:25:51 +010030#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Matthew Bentham314d3e22023-06-23 10:53:52 +000031#include "arm_compute/core/utils/StringUtils.h"
SiCong Lif44bbc52022-08-29 18:25:51 +010032#include "src/core/helpers/WindowHelpers.h"
33
34#include "support/StringSupport.h"
35
36namespace arm_compute
37{
38namespace experimental
39{
40namespace dynamic_fusion
41{
42ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
43 const ArgumentPack<ITensorInfo> &tensors,
44 const Attributes &attributes,
45 const Settings &settings)
46 : IGpuTemplateComponentWriter{ id, tensors },
47 _src{},
48 _weight{},
49 _bias{},
50 _dst{},
51 _attributes{ attributes },
52 _settings{ settings }
53{
54 _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
55 _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
56 if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
57 {
58 _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
59 }
60 _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
61 ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
62}
63
64std::string ClTemplateDirectConv2d::get_name() const
65{
66 return "direct_conv2d";
67}
68
69std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
70{
71 ARM_COMPUTE_UNUSED(comp_group);
72
73 const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
Ramy Elgammaldf6a3b02022-11-30 16:23:10 +000074 const auto k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
SiCong Lif44bbc52022-08-29 18:25:51 +010075 const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
76
77 std::string code = R"_(
78//------------------ START KERNEL {{meta_kernel_id}} ---------------------
79// IN_0(src) {{src}}
80// IN_1(wei) {{weight}}
81)_";
82 if(_bias && _bias->has_valid_id())
83 {
84 code += R"_(
85// IN_1(bia) {{bias}}
86)_";
87 }
88 code += R"_(
89// OUT(dst, accum) {{dst}}
90
Gunes Bayir7dc02342022-11-21 21:46:50 +000091TILE(uint, M0, 1, g_dst_indirect_y);
92
SiCong Lif44bbc52022-08-29 18:25:51 +010093{
SiCong Lif44bbc52022-08-29 18:25:51 +010094#define _IWEI_WIDTH {{WEI_WIDTH}}
95#define _IWEI_HEIGHT {{WEI_HEIGHT}}
Viet-Hoa Doe2e6d742023-03-01 15:46:10 +000096#define _ISRC_WIDTH {{SRC_WIDTH}}
97#define _ISRC_HEIGHT {{SRC_HEIGHT}}
98#define _ISRC_CHANNELS {{SRC_CHANNELS}}
99#define _IDST_WIDTH {{DST_WIDTH}}
100#define _IDST_HEIGHT {{DST_HEIGHT}}
101#define _IDST_CHANNELS {{DST_CHANNELS}}
SiCong Lif44bbc52022-08-29 18:25:51 +0100102#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
103
SiCong Lif44bbc52022-08-29 18:25:51 +0100104 TILE(int, M0, 1, xi);
105 TILE(int, M0, 1, yi);
106
107 // Convert the linear index to coordinate
108 LOOP_UNROLLING(int, i, 0, 1, M0,
109 {
Gian Marco Iodice85260d82022-12-16 15:34:27 +0000110 xi[0].s[i] = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
111 yi[0].s[i] = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
112 xi[0].s[i] -= {{PAD_LEFT}};
113 yi[0].s[i] -= {{PAD_TOP}};
SiCong Lif44bbc52022-08-29 18:25:51 +0100114 })
115
116 LOOP_UNROLLING(int, i, 0, 1, M0,
117 {
118 {{dst}}[i].v = 0;
119 })
120
121 for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
122 {
SiCong Lif44bbc52022-08-29 18:25:51 +0100123 int xk = i % _IWEI_WIDTH;
124 int yk = i / _IWEI_WIDTH;
125
Gian Marco Iodice85260d82022-12-16 15:34:27 +0000126 TILE(int, 1, M0, my);
127
128 LOOP_UNROLLING(int, i, 0, 1, M0,
129 {
130 int x_s = xi[0].s[i] + xk;
131 int y_s = yi[0].s[i] + yk;
132 my[0].s[i] = x_s + y_s *_ISRC_WIDTH;
133 my[0].s[i] = my[0].s[i] + g_ind_2 * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
134 my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
135 my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
136 my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
137 my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
138 })
139
140 int ck = 0;
141 for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
SiCong Lif44bbc52022-08-29 18:25:51 +0100142 {
143 TILE({{SRC_DATA_TYPE}}, M0, K0, a);
144 TILE({{WEI_DATA_TYPE}}, N0, K0, b);
145
SiCong Lif44bbc52022-08-29 18:25:51 +0100146 LOOP_UNROLLING(int, i, 0, 1, M0,
147 {
148 a[i].v = {{ZERO_VALUE}};
149 })
150
151 LOOP_UNROLLING(int, i, 0, 1, N0,
152 {
153 b[i].v = {{ZERO_VALUE}};
154 })
155
Gian Marco Iodice85260d82022-12-16 15:34:27 +0000156 T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
SiCong Lif44bbc52022-08-29 18:25:51 +0100157
SiCong Lif44bbc52022-08-29 18:25:51 +0100158 T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
159
SiCong Lif44bbc52022-08-29 18:25:51 +0100160 T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
SiCong Lif44bbc52022-08-29 18:25:51 +0100161 }
SiCong Lif44bbc52022-08-29 18:25:51 +0100162)_";
163
164 if(leftover_loop)
165 {
166 code += R"_(
Gian Marco Iodice85260d82022-12-16 15:34:27 +0000167 for(; ck < _ISRC_CHANNELS; ++ck)
SiCong Lif44bbc52022-08-29 18:25:51 +0100168 {
169 TILE({{SRC_DATA_TYPE}}, M0, 1, a);
170 TILE({{WEI_DATA_TYPE}}, N0, 1, b);
171
SiCong Lif44bbc52022-08-29 18:25:51 +0100172 LOOP_UNROLLING(int, i, 0, 1, M0,
173 {
174 a[i].v = {{ZERO_VALUE}};
175 })
176
177 LOOP_UNROLLING(int, i, 0, 1, N0,
178 {
179 b[i].v = {{ZERO_VALUE}};
180 })
181
Gian Marco Iodice85260d82022-12-16 15:34:27 +0000182 T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
SiCong Lif44bbc52022-08-29 18:25:51 +0100183
SiCong Lif44bbc52022-08-29 18:25:51 +0100184 T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
185
SiCong Lif44bbc52022-08-29 18:25:51 +0100186 T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
SiCong Lif44bbc52022-08-29 18:25:51 +0100187 }
188 )_";
189}
190
191code += R"_(
192#undef _I_WEI_WIDTH
193#undef _I_WEI_HEIGHT
194#undef _ISRC_WIDTH
195#undef _ISRC_HEIGHT
196#undef _ISRC_CHANNELS
197#undef _IDST_WIDTH
198#undef _IDST_HEIGHT
199#undef _IDST_CHANNELS
200#undef _IY_MULTIPLIER
201
202 }
203)_";
204
205 if(_bias && _bias->has_valid_id())
206 {
207 code += R"_(
208 TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
209
210 T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
211
SiCong Lif44bbc52022-08-29 18:25:51 +0100212 T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
213 )_";
214}
215
216code += R"_(
Gunes Bayir7dc02342022-11-21 21:46:50 +0000217 LOOP_UNROLLING(int, i, 0, 1, M0,
218 {
Viet-Hoa Doe2e6d742023-03-01 15:46:10 +0000219 g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
220 g_dst_indirect_y[i].v += g_ind_2 * (int)({{DST_WIDTH}} * {{DST_HEIGHT}});
Gunes Bayir7dc02342022-11-21 21:46:50 +0000221 })
SiCong Lif44bbc52022-08-29 18:25:51 +0100222}
223//------------------ END KERNEL {{meta_kernel_id}} ---------------------
224)_";
225 return code;
226}
227
228void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
229{
230 vtable.declare_variable(
Viet-Hoa Do3558c582022-12-16 14:45:57 +0000231 comp_group,
SiCong Lif44bbc52022-08-29 18:25:51 +0100232 _src,
233 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
SiCong Lif44bbc52022-08-29 18:25:51 +0100234 "src");
235
236 const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
237 vtable.declare_variable(
Viet-Hoa Do3558c582022-12-16 14:45:57 +0000238 comp_group,
SiCong Lif44bbc52022-08-29 18:25:51 +0100239 _weight,
240 GpuKernelArgumentInfo(weight_type),
SiCong Lif44bbc52022-08-29 18:25:51 +0100241 "weight");
242
243 if(_bias && _bias->has_valid_id()) // optional bias
244 {
245 vtable.declare_variable(
Viet-Hoa Do3558c582022-12-16 14:45:57 +0000246 comp_group,
SiCong Lif44bbc52022-08-29 18:25:51 +0100247 _bias,
248 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
SiCong Lif44bbc52022-08-29 18:25:51 +0100249 "bias");
250 }
251 vtable.declare_variable(
Viet-Hoa Do3558c582022-12-16 14:45:57 +0000252 comp_group,
SiCong Lif44bbc52022-08-29 18:25:51 +0100253 _dst,
Ramy Elgammal404462a2022-11-08 02:14:46 +0000254 GpuKernelArgumentInfo(common_tensor_type),
SiCong Lif44bbc52022-08-29 18:25:51 +0100255 "dst");
256}
257
258TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
259{
260 TagLUT lut{};
261 // Arguments and global shared variables
262 lut["src"] = vtable.get_variable(_src);
263 lut["weight"] = vtable.get_variable(_weight);
264
265 if(_bias && _bias->has_valid_id()) // optional bias
266 {
267 lut["bias"] = vtable.get_variable(_bias);
268 lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
269 }
270 lut["dst"] = vtable.get_variable(_dst);
271
Viet-Hoa Do04f46202022-12-14 14:49:56 +0000272 const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
SiCong Lif44bbc52022-08-29 18:25:51 +0100273 lut["arg_dst"] = dst_argument.uniq_name;
274
275 // Local build options
276 lut["meta_kernel_id"] = id();
277 lut["ACC_DATA_TYPE"] = _src->data_type();
278 lut["SRC_DATA_TYPE"] = _src->data_type();
279 lut["WEI_DATA_TYPE"] = _weight->data_type();
280
281 lut["SRC_TENSOR_TYPE"] = "BUFFER";
282 switch(vtable.get_variable(_weight).kernel_argument_info.type)
283 {
284 case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
285 case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
286 case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
287 {
288 lut["WEI_TENSOR_TYPE"] = "IMAGE";
289 break;
290 }
291 default:
292 {
293 lut["WEI_TENSOR_TYPE"] = "BUFFER";
294 break;
295 }
296 }
297 const auto width_idx = 1;
298 const auto height_idx = 2;
Viet-Hoa Doe2e6d742023-03-01 15:46:10 +0000299 const auto channel_idx = 0;
300
301 lut["SRC_WIDTH"] = _src->dimension(width_idx);
302 lut["SRC_HEIGHT"] = _src->dimension(height_idx);
303 lut["SRC_CHANNELS"] = _src->dimension(channel_idx);
304
SiCong Lif44bbc52022-08-29 18:25:51 +0100305 lut["WEI_WIDTH"] = _weight->dimension(width_idx);
306 lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
307
Viet-Hoa Doe2e6d742023-03-01 15:46:10 +0000308 lut["DST_WIDTH"] = _dst->dimension(width_idx);
309 lut["DST_HEIGHT"] = _dst->dimension(height_idx);
310 lut["DST_CHANNELS"] = _dst->dimension(channel_idx);
311
SiCong Lif44bbc52022-08-29 18:25:51 +0100312 lut["STRIDE_X"] = _attributes.stride().x();
313 lut["STRIDE_Y"] = _attributes.stride().y();
314
315 lut["PAD_LEFT"] = _attributes.pad().left;
316 lut["PAD_TOP"] = _attributes.pad().top;
317
318 lut["ZERO_VALUE"] = 0;
319
320 return lut;
321}
322
323CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
324{
325 const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
SiCong Lif44bbc52022-08-29 18:25:51 +0100326
SiCong Lif44bbc52022-08-29 18:25:51 +0100327 const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
328 const unsigned int n0 = root_window.x().step();
329 const unsigned int m0 = root_window.y().step();
Ramy Elgammaldf6a3b02022-11-30 16:23:10 +0000330 const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
SiCong Lif44bbc52022-08-29 18:25:51 +0100331 const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
332
333 CLBuildOptions build_opts{};
334 if(_settings.fast_relaxed_math())
335 {
336 build_opts.add_option("-cl-fast-relaxed-math");
337 }
338 else
339 {
340 // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
341 // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
342 build_opts.add_option("-cl-unsafe-math-optimizations");
343 }
Jakub Sujak8ae57142022-12-02 16:09:06 +0000344
SiCong Lif44bbc52022-08-29 18:25:51 +0100345 build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
346 build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
347 build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
348 build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
349
350 return build_opts;
351}
352
353std::string ClTemplateDirectConv2d::get_config_id() const
354{
355 const DataType data_type = _src->data_type();
356 const DataLayout data_layout = _src->data_layout();
357
358 const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
359 const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
360
361 const unsigned int kernel_size = _weight->dimension(width_idx);
362
363 std::string config_id{};
364 config_id += lower_string(string_from_data_type(data_type));
365 config_id += "_";
366 config_id += support::cpp11::to_string(kernel_size);
367 config_id += "_";
368 config_id += support::cpp11::to_string(_attributes.stride().x());
369 config_id += "_";
370 config_id += support::cpp11::to_string(_attributes.stride().y());
371 config_id += "_";
372 config_id += support::cpp11::to_string(_dst->dimension(width_idx));
373 config_id += "_";
374 config_id += support::cpp11::to_string(_dst->dimension(height_idx));
375 config_id += "_";
376 config_id += lower_string(string_from_data_layout(data_layout));
377 return config_id;
378}
379
380std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
381{
382 return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
383}
384
385Window ClTemplateDirectConv2d::get_window() const
386{
387 ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
388
389 const auto output_shape = _dst->tensor_shape();
Ramy Elgammaldf6a3b02022-11-30 16:23:10 +0000390 const auto desc = _settings.direct_conv_descriptor();
SiCong Lif44bbc52022-08-29 18:25:51 +0100391
Ramy Elgammaldf6a3b02022-11-30 16:23:10 +0000392 const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
393 const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1] * output_shape[2]);
SiCong Lif44bbc52022-08-29 18:25:51 +0100394
395 // Create and configure kernel window
Ramy Elgammaldf6a3b02022-11-30 16:23:10 +0000396 Window win = calculate_max_window(output_shape, Steps(n0, m0));
SiCong Lif44bbc52022-08-29 18:25:51 +0100397
Ramy Elgammaldf6a3b02022-11-30 16:23:10 +0000398 const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], m0);
399 win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
SiCong Lif44bbc52022-08-29 18:25:51 +0100400 win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
401
402 return win;
403}
404
405} // namespace dynamic_fusion
406} // namespace experimental
407} // namespace arm_compute