blob: bb285d7cc8029c2956b9ede45088efbce34bcdf2 [file] [log] [blame]
/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
25
26#include "arm_compute/core/CL/ICLTensor.h"
27#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
28#include "arm_compute/core/Error.h"
29#include "arm_compute/core/PixelValue.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/runtime/CL/CLScheduler.h"
33#include "arm_compute/runtime/Tensor.h"
34#include "support/ToolchainSupport.h"
35
36using namespace arm_compute;
37
John Richardson62385bc2018-04-20 13:11:36 +010038namespace
39{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010040unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
John Richardson62385bc2018-04-20 13:11:36 +010041{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010042 // We need only 1 stage for all axis except x-axis and x-axis for QASYMM8.
43 if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
44 {
45 return 1;
46 }
John Richardson62385bc2018-04-20 13:11:36 +010047 // Calculate number of WGs. 16 elements per thread, 8 threads per WG
48 const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
49
50 // Calculate number of stages. First stage performs op and the rest reduction sum
51 // depending on the size of the input. Last stage should have only 1 WG.
52 const unsigned int num_of_stages = num_of_wg / 128 + 2;
53
54 return num_of_stages;
55}
56} // namespace
57
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +010058CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
Manuel Bottinib412fab2018-12-10 17:40:23 +000059 : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
Michalis Spyrou04f089c2017-08-08 17:42:38 +010060{
61}
62
John Richardson62385bc2018-04-20 13:11:36 +010063Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
64{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010065 const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
Manuel Bottinib412fab2018-12-10 17:40:23 +000066 bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
67 if(is_serial)
68 {
69 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
70 }
71 else
John Richardson62385bc2018-04-20 13:11:36 +010072 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010073 // Create temporary tensor infos
74 auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
75
76 // Create intermediate tensor info
77 TensorShape shape{ input->tensor_shape() };
78
79 for(unsigned int i = 0; i < num_of_stages - 1; i++)
80 {
81 shape.set(0, ceil(shape.x() / 128.f));
82 sums_vector[i].set_data_type(input->data_type());
83 sums_vector[i].set_tensor_shape(shape);
84 sums_vector[i].set_num_channels(input->num_channels());
85 }
86
Michalis Spyroue55a0132018-10-26 10:48:56 +010087 ReductionOperation first_kernel_op;
Manuel Bottinib412fab2018-12-10 17:40:23 +000088 ReductionOperation intermediate_kernel_op;
Michalis Spyroue55a0132018-10-26 10:48:56 +010089 ReductionOperation last_kernel_op;
90 switch(op)
91 {
92 case ReductionOperation::SUM:
93 case ReductionOperation::MEAN_SUM:
Manuel Bottinib412fab2018-12-10 17:40:23 +000094 first_kernel_op = ReductionOperation::SUM;
95 intermediate_kernel_op = ReductionOperation::SUM;
96 last_kernel_op = op;
Michalis Spyroue55a0132018-10-26 10:48:56 +010097 break;
98 case ReductionOperation::SUM_SQUARE:
Manuel Bottinib412fab2018-12-10 17:40:23 +000099 first_kernel_op = ReductionOperation::SUM_SQUARE;
100 intermediate_kernel_op = ReductionOperation::SUM;
101 last_kernel_op = ReductionOperation::SUM;
102 break;
103 case ReductionOperation::PROD:
104 first_kernel_op = ReductionOperation::PROD;
105 intermediate_kernel_op = ReductionOperation::PROD;
106 last_kernel_op = ReductionOperation::PROD;
Michalis Spyroue55a0132018-10-26 10:48:56 +0100107 break;
108 default:
109 ARM_COMPUTE_ERROR("Not supported");
110 }
111
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100112 // Validate ReductionOperation only on first kernel
Michalis Spyroue55a0132018-10-26 10:48:56 +0100113 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100114
115 // Validate ReductionOperation on intermediate stages
116 for(unsigned int i = 1; i < num_of_stages - 1; ++i)
117 {
Manuel Bottinib412fab2018-12-10 17:40:23 +0000118 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100119 }
120
121 // Validate ReductionOperation on the last stage
122 const unsigned int last_stage = num_of_stages - 1;
Michalis Spyroue55a0132018-10-26 10:48:56 +0100123 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
John Richardson62385bc2018-04-20 13:11:36 +0100124 }
John Richardson62385bc2018-04-20 13:11:36 +0100125
John Richardson62385bc2018-04-20 13:11:36 +0100126 return Status{};
127}
128
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100129void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
130{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100131 _num_of_stages = calculate_number_of_stages(input->info(), axis);
132 _reduction_axis = axis;
Manuel Bottinib412fab2018-12-10 17:40:23 +0000133 _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
Georgios Pinitasaec513c2017-09-15 19:36:30 +0100134
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100135 // Configure reduction operation kernels
136 _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100137
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100138 // Create temporary tensors
Manuel Bottinib412fab2018-12-10 17:40:23 +0000139 if(_is_serial)
140 {
141 _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
142 }
143 else
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100144 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100145 _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
Manuel Bottinib412fab2018-12-10 17:40:23 +0000146 _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100147 TensorShape shape{ input->info()->tensor_shape() };
148 for(unsigned int i = 0; i < _num_of_stages - 1; i++)
149 {
150 shape.set(0, ceil(shape.x() / 128.f));
Manuel Bottinib412fab2018-12-10 17:40:23 +0000151 _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100152 }
153
154 // Apply ReductionOperation only on first kernel
Manuel Bottinib412fab2018-12-10 17:40:23 +0000155 _memory_group.manage(_results_vector.get());
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100156
157 ReductionOperation first_kernel_op;
Manuel Bottinib412fab2018-12-10 17:40:23 +0000158 ReductionOperation intermediate_kernel_op;
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100159 ReductionOperation last_kernel_op;
Manuel Bottinib412fab2018-12-10 17:40:23 +0000160 PixelValue pixelValue;
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100161 switch(op)
162 {
163 case ReductionOperation::SUM:
164 case ReductionOperation::MEAN_SUM:
Manuel Bottinib412fab2018-12-10 17:40:23 +0000165 first_kernel_op = ReductionOperation::SUM;
166 intermediate_kernel_op = ReductionOperation::SUM;
167 last_kernel_op = op;
Manuel Bottini55e16782019-01-15 13:21:57 +0000168 pixelValue = PixelValue();
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100169 break;
170 case ReductionOperation::SUM_SQUARE:
Manuel Bottinib412fab2018-12-10 17:40:23 +0000171 first_kernel_op = ReductionOperation::SUM_SQUARE;
172 intermediate_kernel_op = ReductionOperation::SUM;
173 last_kernel_op = ReductionOperation::SUM;
Manuel Bottini55e16782019-01-15 13:21:57 +0000174 pixelValue = PixelValue();
Manuel Bottinib412fab2018-12-10 17:40:23 +0000175 break;
176 case ReductionOperation::PROD:
177 first_kernel_op = ReductionOperation::PROD;
178 intermediate_kernel_op = ReductionOperation::PROD;
179 last_kernel_op = ReductionOperation::PROD;
180 pixelValue = PixelValue(1, input->info()->data_type());
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100181 break;
182 default:
183 ARM_COMPUTE_ERROR("Not supported");
184 }
185
Manuel Bottinib412fab2018-12-10 17:40:23 +0000186 _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
187 _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100188
189 // Apply ReductionOperation on intermediate stages
190 for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
191 {
Manuel Bottinib412fab2018-12-10 17:40:23 +0000192 _memory_group.manage(_results_vector.get() + i);
193 _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
194 _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
195 _results_vector[i - 1].allocator()->allocate();
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100196 }
197
198 // Apply ReductionOperation on the last stage
199 const unsigned int last_stage = _num_of_stages - 1;
200 const unsigned int input_width = input->info()->dimension(0);
Manuel Bottinib412fab2018-12-10 17:40:23 +0000201 _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
202 _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
203 _results_vector[last_stage - 1].allocator()->allocate();
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100204 }
205}
206
207void CLReductionOperation::run()
208{
Georgios Pinitasda953f22019-04-02 17:27:03 +0100209 MemoryGroupResourceScope scope_mg(_memory_group);
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +0100210
Manuel Bottinib412fab2018-12-10 17:40:23 +0000211 if(_is_serial)
212 {
213 CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
214 }
215 else
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100216 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100217 for(unsigned int i = 0; i < _num_of_stages; ++i)
218 {
219 CLScheduler::get().enqueue(_border_handlers_vector[i], false);
220 CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
221 }
222 }
John Richardson62385bc2018-04-20 13:11:36 +0100223}