blob: 38f0a7523c4b1cde2c96ff0eb938e0315ea92f62 [file] [log] [blame]
Michalis Spyrou04f089c2017-08-08 17:42:38 +01001/*
Manuel Bottinib412fab2018-12-10 17:40:23 +00002 * Copyright (c) 2017-2019 ARM Limited.
Michalis Spyrou04f089c2017-08-08 17:42:38 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
25
26#include "arm_compute/core/CL/ICLTensor.h"
27#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
28#include "arm_compute/core/Error.h"
29#include "arm_compute/core/PixelValue.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/runtime/CL/CLScheduler.h"
33#include "arm_compute/runtime/Tensor.h"
34#include "support/ToolchainSupport.h"
35
36using namespace arm_compute;
37
John Richardson62385bc2018-04-20 13:11:36 +010038namespace
39{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010040unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
John Richardson62385bc2018-04-20 13:11:36 +010041{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010042 // We need only 1 stage for all axis except x-axis and x-axis for QASYMM8.
43 if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
44 {
45 return 1;
46 }
John Richardson62385bc2018-04-20 13:11:36 +010047 // Calculate number of WGs. 16 elements per thread, 8 threads per WG
48 const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
49
50 // Calculate number of stages. First stage performs op and the rest reduction sum
51 // depending on the size of the input. Last stage should have only 1 WG.
52 const unsigned int num_of_stages = num_of_wg / 128 + 2;
53
54 return num_of_stages;
55}
56} // namespace
57
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +010058CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
Manuel Bottinib412fab2018-12-10 17:40:23 +000059 : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
Michalis Spyrou04f089c2017-08-08 17:42:38 +010060{
61}
62
John Richardson62385bc2018-04-20 13:11:36 +010063Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
64{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010065 const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
Manuel Bottinib412fab2018-12-10 17:40:23 +000066 bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
67 if(is_serial)
68 {
69 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
70 }
71 else
John Richardson62385bc2018-04-20 13:11:36 +010072 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010073 // Create temporary tensor infos
Michalis Spyroubcfd09a2019-05-01 13:03:59 +010074 std::vector<TensorInfo> sums_vector(num_of_stages - 1);
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010075
76 // Create intermediate tensor info
77 TensorShape shape{ input->tensor_shape() };
78
79 for(unsigned int i = 0; i < num_of_stages - 1; i++)
80 {
81 shape.set(0, ceil(shape.x() / 128.f));
82 sums_vector[i].set_data_type(input->data_type());
83 sums_vector[i].set_tensor_shape(shape);
84 sums_vector[i].set_num_channels(input->num_channels());
85 }
86
Michalis Spyroue55a0132018-10-26 10:48:56 +010087 ReductionOperation first_kernel_op;
Manuel Bottinib412fab2018-12-10 17:40:23 +000088 ReductionOperation intermediate_kernel_op;
Michalis Spyroue55a0132018-10-26 10:48:56 +010089 ReductionOperation last_kernel_op;
90 switch(op)
91 {
92 case ReductionOperation::SUM:
93 case ReductionOperation::MEAN_SUM:
Manuel Bottinib412fab2018-12-10 17:40:23 +000094 first_kernel_op = ReductionOperation::SUM;
95 intermediate_kernel_op = ReductionOperation::SUM;
96 last_kernel_op = op;
Michalis Spyroue55a0132018-10-26 10:48:56 +010097 break;
98 case ReductionOperation::SUM_SQUARE:
Manuel Bottinib412fab2018-12-10 17:40:23 +000099 first_kernel_op = ReductionOperation::SUM_SQUARE;
100 intermediate_kernel_op = ReductionOperation::SUM;
101 last_kernel_op = ReductionOperation::SUM;
102 break;
103 case ReductionOperation::PROD:
104 first_kernel_op = ReductionOperation::PROD;
105 intermediate_kernel_op = ReductionOperation::PROD;
106 last_kernel_op = ReductionOperation::PROD;
Michalis Spyroue55a0132018-10-26 10:48:56 +0100107 break;
Usama Arifb2890502019-05-21 11:48:37 +0100108 case ReductionOperation::MIN:
109 first_kernel_op = ReductionOperation::MIN;
110 intermediate_kernel_op = ReductionOperation::MIN;
111 last_kernel_op = ReductionOperation::MIN;
112 break;
Usama Arif048b0f32019-05-22 16:32:27 +0100113 case ReductionOperation::MAX:
114 first_kernel_op = ReductionOperation::MAX;
115 intermediate_kernel_op = ReductionOperation::MAX;
116 last_kernel_op = ReductionOperation::MAX;
117 break;
Michalis Spyroue55a0132018-10-26 10:48:56 +0100118 default:
119 ARM_COMPUTE_ERROR("Not supported");
120 }
121
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100122 // Validate ReductionOperation only on first kernel
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100123 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100124
125 // Validate ReductionOperation on intermediate stages
126 for(unsigned int i = 1; i < num_of_stages - 1; ++i)
127 {
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100128 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100129 }
130
131 // Validate ReductionOperation on the last stage
132 const unsigned int last_stage = num_of_stages - 1;
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100133 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
John Richardson62385bc2018-04-20 13:11:36 +0100134 }
John Richardson62385bc2018-04-20 13:11:36 +0100135
John Richardson62385bc2018-04-20 13:11:36 +0100136 return Status{};
137}
138
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100139void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
140{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100141 _num_of_stages = calculate_number_of_stages(input->info(), axis);
142 _reduction_axis = axis;
Manuel Bottinib412fab2018-12-10 17:40:23 +0000143 _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
Georgios Pinitasaec513c2017-09-15 19:36:30 +0100144
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100145 // Configure reduction operation kernels
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100146 _reduction_kernels_vector.resize(_num_of_stages);
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100147
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100148 // Create temporary tensors
Manuel Bottinib412fab2018-12-10 17:40:23 +0000149 if(_is_serial)
150 {
151 _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
152 }
153 else
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100154 {
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100155 _border_handlers_vector.resize(_num_of_stages);
156 _results_vector.resize(_num_of_stages - 1);
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100157 TensorShape shape{ input->info()->tensor_shape() };
158 for(unsigned int i = 0; i < _num_of_stages - 1; i++)
159 {
160 shape.set(0, ceil(shape.x() / 128.f));
Manuel Bottinib412fab2018-12-10 17:40:23 +0000161 _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100162 }
163
164 // Apply ReductionOperation only on first kernel
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100165 _memory_group.manage(&_results_vector[0]);
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100166
167 ReductionOperation first_kernel_op;
Manuel Bottinib412fab2018-12-10 17:40:23 +0000168 ReductionOperation intermediate_kernel_op;
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100169 ReductionOperation last_kernel_op;
Manuel Bottinib412fab2018-12-10 17:40:23 +0000170 PixelValue pixelValue;
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100171 switch(op)
172 {
173 case ReductionOperation::SUM:
174 case ReductionOperation::MEAN_SUM:
Manuel Bottinib412fab2018-12-10 17:40:23 +0000175 first_kernel_op = ReductionOperation::SUM;
176 intermediate_kernel_op = ReductionOperation::SUM;
177 last_kernel_op = op;
Manuel Bottini55e16782019-01-15 13:21:57 +0000178 pixelValue = PixelValue();
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100179 break;
180 case ReductionOperation::SUM_SQUARE:
Manuel Bottinib412fab2018-12-10 17:40:23 +0000181 first_kernel_op = ReductionOperation::SUM_SQUARE;
182 intermediate_kernel_op = ReductionOperation::SUM;
183 last_kernel_op = ReductionOperation::SUM;
Manuel Bottini55e16782019-01-15 13:21:57 +0000184 pixelValue = PixelValue();
Manuel Bottinib412fab2018-12-10 17:40:23 +0000185 break;
186 case ReductionOperation::PROD:
187 first_kernel_op = ReductionOperation::PROD;
188 intermediate_kernel_op = ReductionOperation::PROD;
189 last_kernel_op = ReductionOperation::PROD;
190 pixelValue = PixelValue(1, input->info()->data_type());
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100191 break;
Usama Arifb2890502019-05-21 11:48:37 +0100192 case ReductionOperation::MIN:
193 first_kernel_op = ReductionOperation::MIN;
194 intermediate_kernel_op = ReductionOperation::MIN;
195 last_kernel_op = ReductionOperation::MIN;
196 switch(input->info()->data_type())
197 {
198 case DataType::F32:
199 {
200 pixelValue = PixelValue(std::numeric_limits<float>::max());
201 break;
202 }
203 case DataType::F16:
204 {
205 pixelValue = PixelValue(static_cast<half>(65504.0f));
206 break;
207 }
208 case DataType::QASYMM8:
209 {
210 pixelValue = PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
211 break;
212 }
213 default:
214 {
215 ARM_COMPUTE_ERROR("Unsupported DataType");
216 }
217 }
218 break;
Usama Arif048b0f32019-05-22 16:32:27 +0100219 case ReductionOperation::MAX:
220 first_kernel_op = ReductionOperation::MAX;
221 intermediate_kernel_op = ReductionOperation::MAX;
222 last_kernel_op = ReductionOperation::MAX;
223 switch(input->info()->data_type())
224 {
225 case DataType::F32:
226 {
227 pixelValue = PixelValue(-std::numeric_limits<float>::max());
228 break;
229 }
230 case DataType::F16:
231 {
232 pixelValue = PixelValue(static_cast<half>(-65504.0f));
233 break;
234 }
235 case DataType::QASYMM8:
236 {
237 pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
238 break;
239 }
240 default:
241 {
242 ARM_COMPUTE_ERROR("Unsupported DataType");
243 }
244 }
245 break;
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100246 default:
247 ARM_COMPUTE_ERROR("Not supported");
248 }
249
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100250 _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
Manuel Bottinib412fab2018-12-10 17:40:23 +0000251 _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100252
253 // Apply ReductionOperation on intermediate stages
254 for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
255 {
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100256 _memory_group.manage(&_results_vector[i]);
257 _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
258 _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
Manuel Bottinib412fab2018-12-10 17:40:23 +0000259 _results_vector[i - 1].allocator()->allocate();
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100260 }
261
262 // Apply ReductionOperation on the last stage
263 const unsigned int last_stage = _num_of_stages - 1;
264 const unsigned int input_width = input->info()->dimension(0);
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100265 _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
266 _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
Manuel Bottinib412fab2018-12-10 17:40:23 +0000267 _results_vector[last_stage - 1].allocator()->allocate();
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100268 }
269}
270
271void CLReductionOperation::run()
272{
Georgios Pinitasda953f22019-04-02 17:27:03 +0100273 MemoryGroupResourceScope scope_mg(_memory_group);
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +0100274
Manuel Bottinib412fab2018-12-10 17:40:23 +0000275 if(_is_serial)
276 {
277 CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
278 }
279 else
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100280 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100281 for(unsigned int i = 0; i < _num_of_stages; ++i)
282 {
283 CLScheduler::get().enqueue(_border_handlers_vector[i], false);
284 CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
285 }
286 }
John Richardson62385bc2018-04-20 13:11:36 +0100287}