/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */

#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"

#include "arm_compute/runtime/CL/CLScheduler.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
ComponentType ClDirectConvolutionKernelComponent::get_component_type() const
{
    return ComponentType::Complex;
}

std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const
{
    return std::set<std::string> { "helpers.h", "tile_helpers.h" };
}

Window ClDirectConvolutionKernelComponent::get_window() const
{
    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
    auto       dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());

    // Get dst shape
    PadStrideInfo pad_stride_info
    {
        static_cast<unsigned int>(_desc.conv2d.stride.x()),
        static_cast<unsigned int>(_desc.conv2d.stride.y()),
        static_cast<unsigned int>(_desc.conv2d.pad.left),
        static_cast<unsigned int>(_desc.conv2d.pad.right),
        static_cast<unsigned int>(_desc.conv2d.pad.top),
        static_cast<unsigned int>(_desc.conv2d.pad.bottom),
        DimensionRoundingType::FLOOR /*default rounding type*/
    };
    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst_info, output_shape,
                       1,
                       src_info->data_type(),
                       src_info->quantization_info());

    const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u);
    const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U;
    // const unsigned int num_rows = 1;
    // const unsigned int vec_size = tile_info.tile_dims.x();
    // const unsigned int num_rows = tile_info.tile_dims.y();

    // Create and configure kernel window
    Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));

    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));

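    // Worked example (illustrative): for an F32 NHWC dst of shape [C=8, W=7, H=5, N=1],
    // vec_size = min(8, 4) = 4 and, since C <= 16, num_rows = 1. The window is then
    // X: [0, 8) step 4 (channels), Y: [0, 35) step 1 (W * H collapsed), Z: [0, 1) step 1 (batches).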
    return win;
}

std::string ClDirectConvolutionKernelComponent::get_additional_macros() const
{
    return R"_()_"; // no macros
}

std::string ClDirectConvolutionKernelComponent::get_component_code() const
{
    const auto src_info  = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);

    ARM_COMPUTE_ERROR_ON_MSG(src_info->data_layout() != DataLayout::NHWC, "Only NHWC data layout is supported by this component.");

    const auto channel_idx   = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
    const auto k0            = adjust_vec_size(is_data_type_quantized(src_info->data_type()) ? 16u : 8u, src_info->dimension(channel_idx));
    const bool leftover_loop = (src_info->dimension(channel_idx) % k0) != 0;

    std::string code = R"_(
    //------------------ START KERNEL {{meta_kernel_id}} ---------------------
    // IN_0(src)            {{src}}
    // IN_1(wei)            {{weight}}
    )_";
    if(bias_info != nullptr)
    {
        code += R"_(
    // IN_2(bia)            {{bias}}
    )_";
    }
    code += R"_(
    // OUT(dst, accum)      {{dst}}

    // Initialize the accumulators
    TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
    {
        // All the tensor dimensions are passed at compile time.
        // In case of dynamic tensor support, the following dimensions should be passed as function argument.
    #define _IWEI_WIDTH {{WEI_WIDTH}}
    #define _IWEI_HEIGHT {{WEI_HEIGHT}}
    #define _ISRC_WIDTH {{src}}_w
    #define _ISRC_HEIGHT {{src}}_h
    #define _ISRC_CHANNELS {{src}}_c
    #define _IDST_WIDTH {{arg_dst}}_w
    #define _IDST_HEIGHT {{arg_dst}}_h
    #define _IDST_CHANNELS {{arg_dst}}_c
    #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
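        // _IY_MULTIPLIER = number of filter positions per output channel; the weights
        // tensor is read below one y-row per (xk, yk) position: y = cout * _IY_MULTIPLIER + i.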

        // .v    = access the whole vector (OpenCL vector)
        // .s[x] = access the vector element at position x (scalar access)
        TILE(int, M0, 1, xi);
        TILE(int, M0, 1, yi);

        // Convert the linear index to coordinate
        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            xi[i].v = ((mout + i) % _IDST_WIDTH) * {{STRIDE_X}};
            yi[i].v = ((mout + i) / _IDST_WIDTH) * {{STRIDE_Y}};
            xi[i].v -= {{PAD_LEFT}};
            yi[i].v -= {{PAD_TOP}};
        })
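        // Example (illustrative): with _IDST_WIDTH = 7, stride 2 and pad 1, the linear
        // index mout + i = 9 maps to dst coordinate (2, 1), i.e. src coordinates
        // xi = 2 * 2 - 1 = 3 and yi = 1 * 2 - 1 = 1.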

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            {{dst}}[i].v = 0;
        })

        for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
        {
            int ck = 0;
            int xk = i % _IWEI_WIDTH;
            int yk = i / _IWEI_WIDTH;

            int k = 0;
            for(; k <= (_ISRC_CHANNELS - K0); k += K0)
            {
                TILE({{SRC_DATA_TYPE}}, M0, K0, a);
                TILE({{WEI_DATA_TYPE}}, N0, K0, b);

                LOOP_UNROLLING(int, i, 0, 1, M0,
                {
                    a[i].v = {{ZERO_VALUE}};
                })

                // Load tile from the src tensor
                T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);

                // Load tile from the weights tensor
                T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

                // Compute the matrix multiplication between two tiles
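                // i.e. {{dst}} (M0 x N0) += a (M0 x K0) * b^T (K0 x N0); NT/T mark
                // a as non-transposed and b as transposed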
                T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});

                ck += K0;
            }

            // The left-over loop below is emitted only when _ISRC_CHANNELS is not a
            // multiple of K0; this compile-time specialization should be removed in
            // case of dynamic tensor support.
    )_";

    if(leftover_loop)
    {
        code += R"_(
            // Left-over accumulations
            for(; k < _ISRC_CHANNELS; ++k)
            {
                TILE({{SRC_DATA_TYPE}}, M0, 1, a);
                TILE({{WEI_DATA_TYPE}}, N0, 1, b);

                LOOP_UNROLLING(int, i, 0, 1, M0,
                {
                    a[i].v = {{ZERO_VALUE}};
                })

                // Load tile from the src tensor
                T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);

                // Load tile from the weights tensor
                // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
                T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

                // Compute the matrix multiplication between two tiles
                T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});

                ++ck;
            }
        )_";
    }

    code += R"_(
    #undef _IWEI_WIDTH
    #undef _IWEI_HEIGHT
    #undef _ISRC_WIDTH
    #undef _ISRC_HEIGHT
    #undef _ISRC_CHANNELS
    #undef _IDST_WIDTH
    #undef _IDST_HEIGHT
    #undef _IDST_CHANNELS
    #undef _IY_MULTIPLIER

        }
    )_";

    if(bias_info != nullptr)
    {
        code += R"_(
        TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);

        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0);

        // c = c + bias[broadcasted]
        T_ADD_BROADCAST_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
        )_";
    }

    code += R"_(
    }
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
    )_";
    return code;
}

bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
{
    if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
    {
        return false;
    }

    // If not floating point
    if(!is_data_type_float(tensor->data_type()))
    {
        return false;
    }

    if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
    {
        return false;
    }

    // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
    if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
    {
        return false;
    }

    // Check cl image pitch alignment
    if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
    {
        return false;
    }

    const size_t image_w     = tensor->tensor_shape()[0] / 4;
    const size_t image_h     = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
    const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
    const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();

    if(image_w > max_image_w || image_h > max_image_h)
    {
        return false;
    }

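    // Example (illustrative): a floating-point weights tensor of shape [16, 3, 3, 32]
    // maps to a 4 x 288 image (image_w = 16 / 4, image_h = 3 * 3 * 32), comfortably
    // within typical CL_DEVICE_IMAGE2D_MAX_WIDTH/HEIGHT limits.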
    return true;
}

CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const
{
    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    auto       weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
    const auto dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
    // const auto tile_info = _blueprint->impl().get_tile_info();

    const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
    const DataType     data_type   = src_info->data_type();
    const GPUTarget    gpu_target  = CLScheduler::get().target();

    const unsigned int n0                 = _blueprint->impl().get_execution_window().x().step();
    const unsigned int m0                 = _blueprint->impl().get_execution_window().y().step();
    const unsigned int k0                 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx));
    const unsigned int partial_store_n0   = dst_info->dimension(0) % n0;
    const bool         export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());

    // Update the padding for the weights tensor if we can export to cl_image
    if(export_to_cl_image)
    {
        arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weight_info);
    }

    CLBuildOptions build_opts{};
    build_opts.add_option("-cl-fast-relaxed-math");
    build_opts.add_option("-DIS_TILED");
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

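    // Example (illustrative): an F32 src with 24 channels and an execution window
    // with steps (x = 4, y = 2) yields -DN0=4 -DM0=2 -DK0=8, plus e.g. -DPARTIAL_N0=2
    // when the dst channel count is 18 (18 % 4).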
    return build_opts;
}

void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
{
    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);

    vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");

    const GPUTarget             gpu_target         = CLScheduler::get().target();
    const bool                  export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
    const ClKernelTensorArgType weight_type        = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer;
    vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight");

    if(!_bias.is_empty()) // optional bias
    {
        vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias");
    }
    vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
}

ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
    TagLUT lut{};

    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
    const auto bias_info   = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);

    // Arguments and global shared variables
    lut["src"]    = vtable.get(_src);
    lut["weight"] = vtable.get(_weight);

    if(!_bias.is_empty()) // optional bias
    {
        lut["bias"]          = vtable.get(_bias);
        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
    }
    lut["dst"] = vtable.get(_dst);

    const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var();
    lut["arg_dst"]          = dst_argument.uniq_name;

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["ACC_DATA_TYPE"]  = src_info->data_type();
    lut["SRC_DATA_TYPE"]  = src_info->data_type();
    lut["WEI_DATA_TYPE"]  = weight_info->data_type();

    lut["SRC_TENSOR_TYPE"] = "BUFFER";
    switch(vtable.get(_weight).desc.tensor_arg_type)
    {
        case ClKernelTensorArgType::Image_Export_To_ClImage2D:
        case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
        case ClKernelTensorArgType::Tensor_4D_t_Image:
        {
            lut["WEI_TENSOR_TYPE"] = "IMAGE";
            break;
        }
        default:
        {
            lut["WEI_TENSOR_TYPE"] = "BUFFER";
            break;
        }
    }
    const auto width_idx  = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
    const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
    lut["WEI_WIDTH"]      = weight_info->dimension(width_idx);
    lut["WEI_HEIGHT"]     = weight_info->dimension(height_idx);

    lut["STRIDE_X"] = _desc.conv2d.stride.x();
    lut["STRIDE_Y"] = _desc.conv2d.stride.y();

    lut["PAD_LEFT"] = _desc.conv2d.pad.left;
    lut["PAD_TOP"]  = _desc.conv2d.pad.top;

    lut["ZERO_VALUE"] = 0;

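    // Example (illustrative): for a 3x3 filter with stride 2, the tags {{WEI_WIDTH}},
    // {{WEI_HEIGHT}} and {{STRIDE_X}} in the component code above expand to 3, 3 and 2.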
    return lut;
}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute