blob: fb86e163f204bdcce0af7a571b499bb6a64af9a0 [file] [log] [blame]
Mohammed Suhail Munshia18d85c2023-01-03 10:16:16 +00001/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "ClTemplatePool2d.h"
25
26#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
27#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
28
29#include "arm_compute/core/utils/misc/ShapeCalculator.h"
30#include "src/core/helpers/WindowHelpers.h"
31
32#include "support/StringSupport.h"
33
34namespace arm_compute
35{
36namespace experimental
37{
38namespace dynamic_fusion
39{
40namespace
41{
42// Shape indexes for NHWC Datalayout
Mohammed Suhail Munshia18d85c2023-01-03 10:16:16 +000043constexpr static int32_t height_idx = 2;
44constexpr static int32_t width_idx = 1;
45constexpr static int32_t channel_idx = 0;
Omar Al Khatib3c7c1fa2023-03-07 09:57:49 +000046} // namespace
Mohammed Suhail Munshia18d85c2023-01-03 10:16:16 +000047ClTemplatePool2d::ClTemplatePool2d(ComponentId id,
48 const ArgumentPack<ITensorInfo> &tensors,
49 const Attributes &attributes,
50 const Settings &settings)
51 : IGpuTemplateComponentWriter{ id, tensors },
52 _src{},
53 _dst{},
54 _attributes{ attributes },
55 _settings{ settings }
56{
57 _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
58 _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
59 ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
60}
61
62std::string ClTemplatePool2d::get_name() const
63{
64 return "pool2d";
65}
66
67std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
68{
69 ARM_COMPUTE_UNUSED(comp_group);
70
71 // Condition to use 2x2 optimized kernel
72 if(_attributes.pool_size() == Size2D(2, 2))
73 {
74 return get_2x2_kernel_code();
75 }
76 else
77 {
78 return get_MxN_kernel_code();
79 }
80}
81
82std::string ClTemplatePool2d::get_MxN_kernel_code() const
83{
84 const auto pool_type = _attributes.pool_type();
85 const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
86
87 // Define pool op macro.
88 std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
89
90 // Kernel start
91 // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
92 // Note: If C is less than N0, N0 should be SHRINKED to the closest smaller N0. This operation is performed on the host side
93 std::string code = R"_(
94//------------------ START KERNEL {{meta_kernel_id}} ---------------------
95// IN_0(src) {{src}}
96// OUT(dst, accum) {{dst}}
97
98{
99 const int idx_out_c = g_ind_0;
100 const int idx_out_w = g_ind_1;
101)_";
102
103 // Add macro for POOL_OP
104 code += "\n" + pool_op + "\n";
105
106 code += R"_(
107 const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
108 const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
109)_";
110
111 // Define common variables.
112 code += R"_(
113 __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
114
115 __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;
116
117 VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
118 res0 = {{INITIAL_VALUE}};
119
120 const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
121 const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
122
123 const int pool_x_s = max((int)0, -idx_in_w);
124 const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
125 const int pool_y_s = max((int)0, -idx_in_h);
126 const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
127)_";
128
129 // Determine filter size depending on if padding is excluded or not
130 if(_attributes.exclude_padding())
131 {
132 code += R"_(
133 const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
134)_";
135 }
136 else
137 {
138 code += R"_(
139 const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
140)_";
141 }
142
143 // Loop through pool size
144 // if global pooling
145 if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
146 {
147 // Begin loop
148 code += R"_(
149 // Global pooling path
150 for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
151 {
152 #pragma unroll 8
153 for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
154 {
155 VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
156 data0;
157)_";
158 }
159 else // if local pooling size
160 {
161 code += R"_(
162 for(int y = pool_y_s; y < pool_y_e; ++y)
163 {
164 #pragma unroll 8
165 for(int x = pool_x_s; x < pool_x_e; ++x)
166 {
167 VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
168 data0;
169)_";
170 } // end else
171
172 // if condition inside loop - use 32bit acc if mixed_precision.
173 // End loop through pooling section.
174 if(fp_mixed_precision)
175 {
176 // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
177 code += R"_(
178 data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
179 res0 = POOL_OP(res0, data0);
180 }
181 }
182)_";
183 }
184 else // load data, compute result and end loop
185 {
186 code += R"_(
187 data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
188 res0 = POOL_OP(res0, data0);
189 }
190 }
191)_";
192 }
193
194 // For Pool AVG ONLY, divide pool output by filter size
195 if(pool_type == PoolingType::AVG)
196 {
197 code += R"_(
198 res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
199)_";
200 }
201
202 // If mixed precision convert datatype before storing. Then end kernel.
203 if(fp_mixed_precision)
204 {
205 code += R"_(
206 VEC_DATA_TYPE({{DATA_TYPE}}, N0)
207 res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
208 STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
209)_";
210 }
211 else
212 {
213 // Store data
214 code += R"_(
215 STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
216)_";
217 }
218
219 code += R"_(
220//------------------ END KERNEL {{meta_kernel_id}} ---------------------
221}
222)_";
223
224 return code;
225}
226
227std::string ClTemplatePool2d::get_2x2_kernel_code() const
228{
229 const auto pool_type = _attributes.pool_type();
230 const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
231 std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
232
233 std::string code = R"_(
234//------------------ START KERNEL {{meta_kernel_id}} ---------------------
235// IN_0(src) {{src}}
236// OUT(dst, accum) {{dst}}
237
238#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
239
240{
241 const int idx_out_c = g_ind_0;
242 const int idx_out_w = g_ind_1;
243)_";
244
245 // Add pool op macro
246 code += "\n" + pool_op + "\n";
247
248 // If batch size != 1, the batch size dimension is collapsed over the height dimension
249 code += R"_(
250 const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
251 const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
252)_";
253
254 code += R"_(
255 const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
256 const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
257
258 __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
259 __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n *
260 {{dst}}_stride_w;
261 const int pool_x_s = max((int)0, -idx_in_w);
262 const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
263 const int pool_y_s = max((int)0, -idx_in_h);
264 const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);
265
266 const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
267 const int x0 = pool_x_s + idx_in_w;
268 const int y0 = pool_y_s + idx_in_h;
269 const int x1 = pool_x_e - 1 + idx_in_w;
270 const int y1 = pool_y_e - 1 + idx_in_h;
271
272 REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
273)_";
274
275 if(fp_mixed_precision)
276 {
277 // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
278 code += R"_(
279 data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
280 data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
281 data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
282 data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
283)_";
284 }
285 else
286 {
287 code += R"_(
288 data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z));
289 data1 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z));
290 data2 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z));
291 data3 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z));
292)_";
293 }
294
295 if(pool_type != PoolingType::MAX)
296 {
297 // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
298 code += R"_(
299 if(filter_size != 4)
300 {
301 SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
302 SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
303 SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
304 SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);
305
306 data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_s));
307 data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_s));
308 data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_e));
309 data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_e));
310 }
311)_";
312 }
313
314 code += R"_(
315 VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
316 res0 = data0;
317 res0 = POOL_OP(res0, data1);
318 res0 = POOL_OP(res0, data2);
319 res0 = POOL_OP(res0, data3);
320)_";
321
322 if(pool_type == PoolingType::AVG)
323 {
324 // If avg pooling divide result accordingly.
325 if(_attributes.exclude_padding())
326 {
327 code += R"_(
328 res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
329)_";
330 }
331 else
332 {
333 code += R"_(
334 res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
335)_";
336 }
337 }
338
339 // Store result
340 if(fp_mixed_precision)
341 {
342 code += R"_(
343 VEC_DATA_TYPE({{DATA_TYPE}}, N0)
344 res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
345 STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
346)_";
347 }
348 else
349 {
350 code += R"_(
351 STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
352)_";
353 }
354
355 code += R"_(
356 //------------------ END KERNEL {{meta_kernel_id}} ---------------------
357}
358#undef SELECT_TYPE
359)_";
360
361 return code;
362}
363
364void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
365{
366 vtable.declare_variable(
367 comp_group,
368 _src,
369 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
370 "src");
371
372 vtable.declare_variable(
373 comp_group,
374 _dst,
375 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
376 "dst");
377}
378
379TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
380{
381 ARM_COMPUTE_UNUSED(comp_group);
382
383 TagLUT lut{};
384 // Arguments and global shared variables
385 lut["src"] = vtable.get_variable(_src);
386 lut["dst"] = vtable.get_variable(_dst);
387
388 // Local build options
389 lut["meta_kernel_id"] = id();
390
391 // Retrieve relevant data
Omar Al Khatib3c7c1fa2023-03-07 09:57:49 +0000392 const auto padding = _attributes.pad();
393 const auto stride = _attributes.stride();
394 const auto pool_size = _attributes.pool_size();
395 const auto data_type = _src->data_type();
396 const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
397 const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
Mohammed Suhail Munshia18d85c2023-01-03 10:16:16 +0000398
399 // pool specific
400 lut["STRIDE_X"] = stride.x();
401 lut["STRIDE_Y"] = stride.y();
402 lut["PAD_X"] = padding.left;
403 lut["PAD_Y"] = padding.top;
404 lut["POOL_SIZE_X"] = pool_size.width;
405 lut["POOL_SIZE_Y"] = pool_size.height;
406
407 // Datatypes and variables
408 lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
409 lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
410 lut["SRC_WIDTH"] = _src->dimension(width_idx);
411 lut["SRC_HEIGHT"] = _src->dimension(height_idx);
Adnan AlSinan227db8d2023-02-14 14:24:09 +0000412 lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? max_initial_value : std::string("0");
Mohammed Suhail Munshia18d85c2023-01-03 10:16:16 +0000413
414 // Tensor specific data
415 lut["DST_HEIGHT"] = _dst->dimension(height_idx);
416
417 return lut;
418}
419
420CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
421{
422 const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
423 const unsigned int n0 = root_window.x().step();
424 const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
425
426 CLBuildOptions build_opts{};
427 build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
428 build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
429
430 return build_opts;
431}
432
433std::string ClTemplatePool2d::get_config_id() const
434{
435 const DataType data_type = _src->data_type();
436 const DataLayout data_layout = _src->data_layout();
437
438 std::string config_id{};
439 config_id += "pooling_layer_2d_";
440 config_id += lower_string(string_from_data_type(data_type));
441 config_id += "_";
442 config_id += lower_string(string_from_data_layout(data_layout));
443 config_id += "_";
444 config_id += support::cpp11::to_string(_dst->dimension(width_idx));
445 config_id += "_";
446 config_id += support::cpp11::to_string(_dst->dimension(height_idx));
447 config_id += "_";
448 config_id += support::cpp11::to_string(_dst->dimension(channel_idx));
449
450 return config_id;
451}
452
453std::set<std::string> ClTemplatePool2d::get_headers_list() const
454{
455 return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
456}
457
458Window ClTemplatePool2d::get_window() const
459{
460 ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
461 const auto output_shape = _dst->tensor_shape();
462 const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
463
464 // Create and configure kernel window
465 auto win = calculate_max_window(output_shape, Steps(vec_size));
466 win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
467 return win;
468}
469
470} // namespace dynamic_fusion
471} // namespace experimental
472} // namespace arm_compute