blob: c5447ffd6b03c4eed38ac54966521c37cd442b12 [file] [log] [blame]
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
25
26#include "arm_compute/core/CL/ICLTensor.h"
27#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
28#include "arm_compute/core/Error.h"
29#include "arm_compute/core/PixelValue.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/runtime/CL/CLScheduler.h"
33#include "arm_compute/runtime/Tensor.h"
34#include "support/ToolchainSupport.h"
35
36using namespace arm_compute;
37
John Richardson62385bc2018-04-20 13:11:36 +010038namespace
39{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010040unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
John Richardson62385bc2018-04-20 13:11:36 +010041{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010042 // We need only 1 stage for all axis except x-axis and x-axis for QASYMM8.
43 if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
44 {
45 return 1;
46 }
John Richardson62385bc2018-04-20 13:11:36 +010047 // Calculate number of WGs. 16 elements per thread, 8 threads per WG
48 const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
49
50 // Calculate number of stages. First stage performs op and the rest reduction sum
51 // depending on the size of the input. Last stage should have only 1 WG.
52 const unsigned int num_of_stages = num_of_wg / 128 + 2;
53
54 return num_of_stages;
55}
56} // namespace
57
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +010058CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010059 : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
Michalis Spyrou04f089c2017-08-08 17:42:38 +010060{
61}
62
John Richardson62385bc2018-04-20 13:11:36 +010063Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
64{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010065 const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
John Richardson62385bc2018-04-20 13:11:36 +010066
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010067 if(axis == 0 && !is_data_type_quantized(input->data_type()))
John Richardson62385bc2018-04-20 13:11:36 +010068 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010069 // Create temporary tensor infos
70 auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
71
72 // Create intermediate tensor info
73 TensorShape shape{ input->tensor_shape() };
74
75 for(unsigned int i = 0; i < num_of_stages - 1; i++)
76 {
77 shape.set(0, ceil(shape.x() / 128.f));
78 sums_vector[i].set_data_type(input->data_type());
79 sums_vector[i].set_tensor_shape(shape);
80 sums_vector[i].set_num_channels(input->num_channels());
81 }
82
Michalis Spyroue55a0132018-10-26 10:48:56 +010083 ReductionOperation first_kernel_op;
84 ReductionOperation last_kernel_op;
85 switch(op)
86 {
87 case ReductionOperation::SUM:
88 case ReductionOperation::MEAN_SUM:
89 first_kernel_op = ReductionOperation::SUM;
90 last_kernel_op = op;
91 break;
92 case ReductionOperation::SUM_SQUARE:
93 first_kernel_op = ReductionOperation::SUM_SQUARE;
94 last_kernel_op = ReductionOperation::SUM;
95 break;
96 default:
97 ARM_COMPUTE_ERROR("Not supported");
98 }
99
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100100 // Validate ReductionOperation only on first kernel
Michalis Spyroue55a0132018-10-26 10:48:56 +0100101 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100102
103 // Validate ReductionOperation on intermediate stages
104 for(unsigned int i = 1; i < num_of_stages - 1; ++i)
105 {
Michalis Spyroue55a0132018-10-26 10:48:56 +0100106 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100107 }
108
109 // Validate ReductionOperation on the last stage
110 const unsigned int last_stage = num_of_stages - 1;
Michalis Spyroue55a0132018-10-26 10:48:56 +0100111 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
John Richardson62385bc2018-04-20 13:11:36 +0100112 }
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100113 else
John Richardson62385bc2018-04-20 13:11:36 +0100114 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100115 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
John Richardson62385bc2018-04-20 13:11:36 +0100116 }
117
John Richardson62385bc2018-04-20 13:11:36 +0100118 return Status{};
119}
120
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100121void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
122{
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100123 _num_of_stages = calculate_number_of_stages(input->info(), axis);
124 _reduction_axis = axis;
125 _is_quantized = is_data_type_quantized(input->info()->data_type());
Georgios Pinitasaec513c2017-09-15 19:36:30 +0100126
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100127 // Configure reduction operation kernels
128 _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100129
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100130 // Create temporary tensors
131 if(axis == 0 && !_is_quantized)
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100132 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100133 _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
134 _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
135 TensorShape shape{ input->info()->tensor_shape() };
136 for(unsigned int i = 0; i < _num_of_stages - 1; i++)
137 {
138 shape.set(0, ceil(shape.x() / 128.f));
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100139 _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100140 }
141
142 // Apply ReductionOperation only on first kernel
143 _memory_group.manage(_sums_vector.get());
144
145 ReductionOperation first_kernel_op;
146 ReductionOperation last_kernel_op;
147 switch(op)
148 {
149 case ReductionOperation::SUM:
150 case ReductionOperation::MEAN_SUM:
151 first_kernel_op = ReductionOperation::SUM;
152 last_kernel_op = op;
153 break;
154 case ReductionOperation::SUM_SQUARE:
155 first_kernel_op = ReductionOperation::SUM_SQUARE;
156 last_kernel_op = ReductionOperation::SUM;
157 break;
158 default:
159 ARM_COMPUTE_ERROR("Not supported");
160 }
161
162 _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
163 _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
164
165 // Apply ReductionOperation on intermediate stages
166 for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
167 {
168 _memory_group.manage(_sums_vector.get() + i);
169 _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
170 _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
171 _sums_vector[i - 1].allocator()->allocate();
172 }
173
174 // Apply ReductionOperation on the last stage
175 const unsigned int last_stage = _num_of_stages - 1;
176 const unsigned int input_width = input->info()->dimension(0);
177 _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
178 _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
179 _sums_vector[last_stage - 1].allocator()->allocate();
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100180 }
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100181 else
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100182 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100183 _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100184 }
185}
186
187void CLReductionOperation::run()
188{
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +0100189 _memory_group.acquire();
190
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100191 if(_reduction_axis == 0 && !_is_quantized)
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100192 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100193 for(unsigned int i = 0; i < _num_of_stages; ++i)
194 {
195 CLScheduler::get().enqueue(_border_handlers_vector[i], false);
196 CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
197 }
198 }
199 else
200 {
201 CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100202 }
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +0100203
204 _memory_group.release();
John Richardson62385bc2018-04-20 13:11:36 +0100205}