/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
29#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
30#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
31#include "arm_compute/core/Helpers.h"
32#include "arm_compute/core/IAccessWindow.h"
33#include "arm_compute/core/ITensor.h"
34#include "arm_compute/core/Types.h"
35#include "arm_compute/core/Validate.h"
36#include "support/ToolchainSupport.h"
37
38using namespace arm_compute;
39
40template <unsigned int kernel_size>
41GCDirectConvolutionLayerKernel<kernel_size>::GCDirectConvolutionLayerKernel()
42 : _input(nullptr), _bias(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0), _lws(gles::NDRange(1U, 1U, 1U))
43{
44}
45
46template <unsigned int kernel_size>
47BorderSize GCDirectConvolutionLayerKernel<kernel_size>::border_size() const
48{
49 return _border_size;
50}
51
52template <unsigned int kernel_size>
53void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
54{
55 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
56 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
57 ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
58 ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
59 ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
60 ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
61 ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
62
63 if(bias != nullptr)
64 {
65 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
66 // FIXME: Bug in framework, workaround it in tests currently.
67 //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
68 ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
69 }
70
71 _conv_stride_x = std::get<0>(conv_info.stride());
72 _conv_stride_y = std::get<1>(conv_info.stride());
73 _conv_pad_x = std::get<0>(conv_info.pad());
74 _conv_pad_y = std::get<1>(conv_info.pad());
75
76 _input = input;
77 _weights = weights;
78 _output = output;
79 _bias = bias;
80 _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
81
82 std::set<std::string> options;
83
84 options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
85 options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
86 options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
87 options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x));
88
89 std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
90 options.emplace(("#define " + dt_name));
91
92 unsigned int num_elems_read_per_iteration_x = kernel_size * _conv_stride_x;
93 unsigned int num_elems_read_per_iteration_y = 1;
94 unsigned int num_elems_written_per_iteration_x = 1;
95 unsigned int num_elems_written_per_iteration_y = 1;
96 unsigned int num_elems_written_per_iteration_z = 1;
97
98 if(kernel_size == 3)
99 {
100 if((_conv_stride_x == 1) && (_conv_stride_y == 1))
101 {
102 switch(input->info()->data_type())
103 {
104 // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
105#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
106
107 case DataType::F16:
108#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
109 options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
110 num_elems_read_per_iteration_x = 16;
111 num_elems_read_per_iteration_y = 5;
112 num_elems_written_per_iteration_x = 8;
113 num_elems_written_per_iteration_y = 3;
114#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
115 options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
116 num_elems_read_per_iteration_x = 8;
117 num_elems_read_per_iteration_y = 5;
118 num_elems_written_per_iteration_x = 4;
119 num_elems_written_per_iteration_y = 3;
120#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
121 options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
122 num_elems_read_per_iteration_x = 8;
123 num_elems_read_per_iteration_y = 6;
124 num_elems_written_per_iteration_x = 4;
125 num_elems_written_per_iteration_y = 4;
126#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
127 options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
128 num_elems_read_per_iteration_x = 8;
129 num_elems_read_per_iteration_y = 5;
130 num_elems_written_per_iteration_x = 4;
131 num_elems_written_per_iteration_y = 3;
132 num_elems_written_per_iteration_z = 2;
133#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
134 break;
135
136 case DataType::F32:
137 options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
138 num_elems_read_per_iteration_x = 8;
139 num_elems_read_per_iteration_y = 5;
140 num_elems_written_per_iteration_x = 4;
141 num_elems_written_per_iteration_y = 3;
142 break;
143
144 default:
145 ARM_COMPUTE_ERROR("Current data type is not supported");
146 break;
147 }
148 }
149 // FIXME: Just keep one in release
150 else
151 {
152 switch(input->info()->data_type())
153 {
154 case DataType::F16:
155 options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
156 num_elems_read_per_iteration_x = 8;
157 num_elems_written_per_iteration_x = 4;
158 break;
159
160 case DataType::F32:
161 // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
162#define PROCESS_4_ELEMENT
163
164#if defined(PROCESS_1_ELEMENT)
165 options.emplace("#define PROCESS_1_ELEMENT");
166 num_elems_read_per_iteration_x = 3;
167 num_elems_written_per_iteration_x = 1;
168#elif defined(PROCESS_4_ELEMENT)
169 options.emplace("#define PROCESS_4_ELEMENT");
170 num_elems_read_per_iteration_x = 8;
171 num_elems_written_per_iteration_x = 4;
172#elif defined(PROCESS_8_ELEMENT)
173 options.emplace("#define PROCESS_8_ELEMENT");
174 num_elems_read_per_iteration_x = 12;
175 num_elems_written_per_iteration_x = 8;
176#else /* PROCESS_1_ELEMENT */
177#error Have to declare how many elements to process in one thread.
178#endif /* PROCESS_1_ELEMENT */
179 break;
180
181 default:
182 ARM_COMPUTE_ERROR("Current data type is not supported");
183 break;
184 }
185 }
186 }
187 else if(kernel_size == 1)
188 {
189 switch(input->info()->data_type())
190 {
191 case DataType::F16:
192 num_elems_read_per_iteration_x = 8;
193 num_elems_written_per_iteration_x = 8;
zhenglin666635c2017-12-04 14:38:09 +0800194 if(weights->info()->dimension(2) % 2 == 0)
195 {
196 options.emplace("#define WEIGHTS_OPTIMIZATION");
197 }
Anthony Barbier7068f992017-10-26 15:23:08 +0100198 break;
199
200 case DataType::F32:
201 num_elems_read_per_iteration_x = 1;
202 num_elems_written_per_iteration_x = 1;
203 break;
204
205 default:
206 break;
207 }
208 }
209 else if(kernel_size == 5)
210 {
211 switch(input->info()->data_type())
212 {
213 case DataType::F16:
ASIAPAC\steli0123ac91b2017-11-07 16:14:44 +0800214 options.emplace("#define PROCESS_4X_1Y_1Z");
Anthony Barbier7068f992017-10-26 15:23:08 +0100215 num_elems_read_per_iteration_x = 8;
216 num_elems_written_per_iteration_x = 4;
217
218 default:
219 break;
220 }
221 }
222 else
223 {
224 }
225
226 if(_bias != nullptr)
227 {
228 options.emplace("#define BIAS");
229 }
230
231 std::stringstream kernel_name;
232 kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
233
234 _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
235
Anthony Barbier7068f992017-10-26 15:23:08 +0100236 unsigned int idx = (_bias == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor());
237
238 // Calculate output right and bottom border
239 const int output_width = output->info()->dimension(0);
240 const int output_height = output->info()->dimension(1);
241 const int output_padding_right = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
242 const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
243
244 // Calculate input right and bottom border
245 const int input_width = input->info()->dimension(0);
246 const int input_height = input->info()->dimension(1);
247 const int upper_bound_w = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width;
248 const int upper_bound_h = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height;
249 const int padding_right = std::max(upper_bound_w, _conv_pad_x);
250 const int padding_bottom = std::max(upper_bound_h, _conv_pad_y);
251
252 BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
253
254 Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
255
256 AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom);
257 AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
258 AccessWindowStatic bias_access = AccessWindowStatic(nullptr, 0, 0, 0, 1);
259
260 switch(weights->info()->data_type())
261 {
262 case DataType::F16:
zhenglin666635c2017-12-04 14:38:09 +0800263 if((weights->info()->dimension(2) % 2 != 0) || (kernel_size != 1))
264 {
265 weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size + 1, kernel_size);
266 }
Anthony Barbier7068f992017-10-26 15:23:08 +0100267 if(_bias != nullptr)
268 {
269 bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0) + 1, 1);
270 }
271 break;
272
273 case DataType::F32:
274 weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size, kernel_size);
275 if(_bias != nullptr)
276 {
277 bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0), 1);
278 }
279 break;
280
281 default:
282 ARM_COMPUTE_ERROR("Current data type is not supported");
283 break;
284 }
285
286 AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
287
288 if(_bias != nullptr)
289 {
290 update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
291 }
292 else
293 {
294 update_window_and_padding(win, input_access, weights_access, output_access);
295 }
296
297 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
298
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800299 _kernel.set_argument(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w
300 _kernel.set_argument(idx++, _weights->info()->dimension(2)); // weights_depth
Anthony Barbier7068f992017-10-26 15:23:08 +0100301
302 IGCKernel::configure(win);
303}
304
305template <unsigned int kernel_size>
306void GCDirectConvolutionLayerKernel<kernel_size>::run(const Window &window)
307{
308 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
309 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
310
311 _kernel.use();
312
313 // Get initial windows
314 Window slice = window.first_slice_window_3D();
315 Window win_in = window;
316
317 win_in.adjust(Window::DimX, -_conv_pad_x, true);
318 win_in.adjust(Window::DimY, -_conv_pad_y, true);
319 win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
320 win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
321
322 Window slice_in = win_in.first_slice_window_3D();
323
324 unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
325 add_3D_tensor_argument(idx1, _weights, BufferParam(3, 2), slice);
326
327 if(_bias != nullptr)
328 {
329 Window slice_bias;
330 slice_bias.use_tensor_dimensions(_bias->info()->tensor_shape());
331 add_1D_tensor_argument(idx1, _bias, BufferParam(4, 2), slice_bias);
332 }
333
334 do
335 {
336 unsigned int idx = 0;
337
338 switch(_input->info()->data_type())
339 {
340 case DataType::F16:
341 switch(kernel_size)
342 {
343 case 1:
344 add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
345 add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
346 break;
347
348 case 3:
349 add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
350 add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
351 break;
352
353 case 5:
354 add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
355 add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
356 break;
357
358 default:
359 ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
360 break;
361 }
362 break;
363
364 case DataType::F32:
365 switch(kernel_size)
366 {
367 case 1:
368 case 5:
369 add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice_in);
370 add_3D_tensor_argument(idx, _output, BufferParam(2, 2), slice);
371 break;
372
373 case 3:
374 add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
375 add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
376 break;
377
378 default:
379 ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
380 break;
381 }
382 break;
383
384 default:
385 ARM_COMPUTE_ERROR("Current data type is not supported");
386 break;
387 }
388
389 _kernel.update_shader_params();
390 enqueue(*this, slice, _lws);
391 }
392 while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
393}
394
395template class arm_compute::GCDirectConvolutionLayerKernel<1>;
396template class arm_compute::GCDirectConvolutionLayerKernel<3>;
397template class arm_compute::GCDirectConvolutionLayerKernel<5>;