/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplateDirectConv2d.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
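// For reference, the tensor slots consumed by this writer are expected to be packed by the
// owning component roughly as follows (hypothetical sketch for illustration only; the real
// packing happens in the operator layer):
//   ArgumentPack<ITensorInfo> tensors;
//   tensors.add_const_tensor(ACL_SRC_0, src);     // input activations
//   tensors.add_const_tensor(ACL_SRC_1, weights); // weights
//   tensors.add_const_tensor(ACL_SRC_2, bias);    // optional bias, may be absent
//   tensors.add_const_tensor(ACL_DST_0, dst);     // output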
ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId                      id,
                                               const ArgumentPack<ITensorInfo> &tensors,
                                               const Attributes                &attributes,
                                               const Settings                  &settings)
    : IGpuTemplateComponentWriter{ id, tensors },
      _src{},
      _weight{},
      _bias{},
      _dst{},
      _attributes{ attributes },
      _settings{ settings }
{
    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
    {
        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
    }
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
}

std::string ClTemplateDirectConv2d::get_name() const
{
    return "direct_conv2d";
}

std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    const auto channel_idx   = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
    const auto k0            = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
    const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
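
    // As an illustration (hypothetical numbers, assuming adjust_vec_size() only shrinks the
    // requested vector width): an F32 input with 20 channels gives k0 = 8, and 20 % 8 != 0,
    // so the generated kernel also contains the left-over channel loop emitted further down.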

    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src) {{src}}
// IN_1(wei) {{weight}}
)_";
    if(_bias && _bias->has_valid_id())
    {
        code += R"_(
// IN_2(bia) {{bias}}
)_";
    }
    code += R"_(
// OUT(dst, accum) {{dst}}

// Initialize the accumulators
TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
{
    // All the tensor dimensions are passed at compile time.
    // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
#define _IWEI_WIDTH {{WEI_WIDTH}}
#define _IWEI_HEIGHT {{WEI_HEIGHT}}
#define _ISRC_WIDTH {{src}}_w
#define _ISRC_HEIGHT {{src}}_h
#define _ISRC_CHANNELS {{src}}_c
#define _IDST_WIDTH {{arg_dst}}_w
#define _IDST_HEIGHT {{arg_dst}}_h
#define _IDST_CHANNELS {{arg_dst}}_c
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

    // .v    = access the whole vector (OpenCL vector)
    // .s[x] = access the vector element at position x (scalar access)
    TILE(int, M0, 1, xi);
    TILE(int, M0, 1, yi);

    // Convert the linear index to coordinate
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
        yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
        xi[i].v -= {{PAD_LEFT}};
        yi[i].v -= {{PAD_TOP}};
    })
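
    // Worked example (illustrative values, not from a specific configuration): with
    // _IDST_WIDTH = 7, {{STRIDE_X}} = {{STRIDE_Y}} = 2 and {{PAD_LEFT}} = {{PAD_TOP}} = 1,
    // the linear index g_ind_1 + i = 9 maps to output coordinate (9 % 7, 9 / 7) = (2, 1),
    // so the loaded input patch starts at xi = 2 * 2 - 1 = 3, yi = 1 * 2 - 1 = 1.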

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        {{dst}}[i].v = 0;
    })

    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
    {
        int ck = 0;
        int xk = i % _IWEI_WIDTH;
        int yk = i / _IWEI_WIDTH;

        int k = 0;
        for(; k <= (_ISRC_CHANNELS - K0); k += K0)
        {
            TILE({{SRC_DATA_TYPE}}, M0, K0, a);
            TILE({{WEI_DATA_TYPE}}, N0, K0, b);

            // Initialize tiles
            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = {{ZERO_VALUE}};
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = {{ZERO_VALUE}};
            })

            // Load tile from the src tensor
            T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);

            // Load tile from the weights tensor
            T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

            // Compute the matrix multiplication between two tiles
            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
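
            // In GEMM terms (NT, T): dst[m][n] += sum over k of a[m][k] * b[n][k], where "a" is
            // an M0 x K0 tile of input pixels and "b" an N0 x K0 tile of weights, i.e. each
            // filter position (xk, yk) contributes one rank-K0 update to the M0 x N0 accumulators.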

            ck += K0;
        }

        // The left-over channel loop below is appended only when _ISRC_CHANNELS is not a
        // multiple of K0; with dynamic tensor support this choice would have to be made
        // inside the kernel rather than at template-generation time.
)_";

    if(leftover_loop)
    {
        code += R"_(
        // Left-over accumulations
        for(; k < _ISRC_CHANNELS; ++k)
        {
            TILE({{SRC_DATA_TYPE}}, M0, 1, a);
            TILE({{WEI_DATA_TYPE}}, N0, 1, b);

            // Initialize tiles
            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = {{ZERO_VALUE}};
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = {{ZERO_VALUE}};
            })

            // Load tile from the src tensor
            T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);

            // Load tile from the weights tensor
            // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
            T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

            // Compute the matrix multiplication between two tiles
            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});

            ++ck;
        }
    )_";
    }

    code += R"_(
#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER

    }
)_";

    if(_bias && _bias->has_valid_id())
    {
        code += R"_(
    TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);

    T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);

    // c = c + bias[broadcasted]
    T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
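
    // bias0 is a single 1 x N0 row; the broadcast add applies the same per-output-channel
    // bias vector to every one of the M0 accumulator rows.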
    )_";
    }

    code += R"_(
}
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
)_";
    return code;
}

void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        _src,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        comp_group.is_intermediate_tensor(_src),
        "src");

    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
    vtable.declare_variable(
        _weight,
        GpuKernelArgumentInfo(weight_type),
        comp_group.is_intermediate_tensor(_weight),
        "weight");

    if(_bias && _bias->has_valid_id()) // optional bias
    {
        vtable.declare_variable(
            _bias,
            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
            comp_group.is_intermediate_tensor(_bias),
            "bias");
    }
    vtable.declare_variable(
        _dst,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        comp_group.is_intermediate_tensor(_dst),
        "dst");
}

TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    TagLUT lut{};
    // Arguments and global shared variables
    lut["src"]    = vtable.get_variable(_src);
    lut["weight"] = vtable.get_variable(_weight);

    if(_bias && _bias->has_valid_id()) // optional bias
    {
        lut["bias"]          = vtable.get_variable(_bias);
        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
    }
    lut["dst"] = vtable.get_variable(_dst);

    const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
    lut["arg_dst"]          = dst_argument.uniq_name;
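
    // {{arg_dst}} resolves to the unique name of the fused group's final destination argument,
    // so the _IDST_* index math in the kernel is based on the group's output shape rather than
    // on this component's own (possibly intermediate) dst tensor.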

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["ACC_DATA_TYPE"]  = _src->data_type();
    lut["SRC_DATA_TYPE"]  = _src->data_type();
    lut["WEI_DATA_TYPE"]  = _weight->data_type();

    lut["SRC_TENSOR_TYPE"] = "BUFFER";
    switch(vtable.get_variable(_weight).kernel_argument_info.type)
    {
        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
        {
            lut["WEI_TENSOR_TYPE"] = "IMAGE";
            break;
        }
        default:
        {
            lut["WEI_TENSOR_TYPE"] = "BUFFER";
            break;
        }
    }
    const auto width_idx  = 1;
    const auto height_idx = 2;
    lut["WEI_WIDTH"]      = _weight->dimension(width_idx);
    lut["WEI_HEIGHT"]     = _weight->dimension(height_idx);
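
    // Note: the fixed indices assume the NHWC-style weight layout used by this component,
    // i.e. dimension 0 holds the input channels, 1 the kernel width and 2 the kernel height.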

    lut["STRIDE_X"] = _attributes.stride().x();
    lut["STRIDE_Y"] = _attributes.stride().y();

    lut["PAD_LEFT"] = _attributes.pad().left;
    lut["PAD_TOP"]  = _attributes.pad().top;

    lut["ZERO_VALUE"] = 0;

    return lut;
}

CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
{
    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
    const DataType     data_type   = _src->data_type();

    /// NOTE: For now the tile sizes n0 and m0 are taken from the execution window, while k0 is computed here. This may change in the future
    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0               = root_window.x().step();
    const unsigned int m0               = root_window.y().step();
    const unsigned int k0               = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
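
    // Illustrative example (hypothetical values): a root window with steps (4, 2) and a
    // destination of width 18 yields n0 = 4, m0 = 2 and partial_store_n0 = 18 % 4 = 2, i.e.
    // the last store along N writes only 2 of the 4 vector lanes.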

    CLBuildOptions build_opts{};
    if(_settings.fast_relaxed_math())
    {
        build_opts.add_option("-cl-fast-relaxed-math");
    }
    else
    {
        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations.
        // To disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations.
        build_opts.add_option("-cl-unsafe-math-optimizations");
    }
    build_opts.add_option("-DIS_TILED");
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplateDirectConv2d::get_config_id() const
{
    const DataType   data_type   = _src->data_type();
    const DataLayout data_layout = _src->data_layout();

    const unsigned int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    const unsigned int kernel_size = _weight->dimension(width_idx);

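    // As an illustration (hypothetical configuration): an F32 NHWC convolution with a 3x3
    // kernel, stride (1, 1) and a 56x56 output produces the id "f32_3_1_1_56_56_nhwc".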
    std::string config_id{};
    config_id += lower_string(string_from_data_type(data_type));
    config_id += "_";
    config_id += support::cpp11::to_string(kernel_size);
    config_id += "_";
    config_id += support::cpp11::to_string(_attributes.stride().x());
    config_id += "_";
    config_id += support::cpp11::to_string(_attributes.stride().y());
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(data_layout));
    return config_id;
}

std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
}

Window ClTemplateDirectConv2d::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    const auto output_shape = _dst->tensor_shape();

    const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
    const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
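
    // Worked example (illustrative NHWC shape [C, W, H, N] = [32, 7, 7, 1]): vec_size = 4 and,
    // for F32, num_rows = 2, so the collapsed Y dimension below spans
    // ceil_to_multiple(7 * 7, 2) = 50 rows processed two at a time.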

    // Create and configure kernel window
    Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));

    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));

    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute