blob: 66a858d3ed2527a6ebcdfc09b567e7d797b0cef4 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
25
Gian Marco Iodice13edbff2017-06-26 17:20:16 +010026#include "arm_compute/core/Size2D.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Validate.h"
28#include "arm_compute/runtime/CL/CLScheduler.h"
29
30#include <algorithm>
31#include <cmath>
32
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010033namespace arm_compute
34{
// Default-construct all kernels and the intermediate tensor. Both mode flags start
// as false; configure() decides whether a transpose and/or a 1xW interleave is run.
CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
{
}
39
40void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
41{
Gian Marco Iodice7d323a62017-07-05 20:05:23 +010042 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010043 ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044 ARM_COMPUTE_ERROR_ON(output == nullptr);
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010045 ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010047 const DataType data_type = input->info()->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048 const int fixed_point_position = input->info()->fixed_point_position();
49
50 _transpose_weights = transpose_weights;
51 _is_batched_fc_layer = is_batched_fc_layer;
52
53 // Check if we need to transpose the weights
54 if(_transpose_weights)
55 {
56 if(_is_batched_fc_layer)
57 {
58 // Initialize the output tensor for transpose
59 TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010060 _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010061 _transpose_kernel.configure(input, &_transpose_output);
62
63 // Configure transpose 1xW kernel
64 _transpose1xW_kernel.configure(&_transpose_output, output);
65
66 // Allocate temporary tensor used for transposing the weights
67 _transpose_output.allocator()->allocate();
68 }
69 else
70 {
71 _transpose_kernel.configure(input, output);
72 }
73 }
74 else
75 {
76 if(_is_batched_fc_layer)
77 {
78 // Configure transpose 1xW kernel
79 _transpose1xW_kernel.configure(input, output);
80 }
81 else
82 {
83 ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
84 }
85 }
86}
87
88void CLFullyConnectedLayerReshapeWeights::run()
89{
90 if(_transpose_weights)
91 {
92 CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
93 }
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010094
Anthony Barbier6ff3b192017-09-04 18:44:23 +010095 if(_is_batched_fc_layer)
96 {
97 CLScheduler::get().enqueue(_transpose1xW_kernel);
98 }
99}
100
// Default-construct all kernels and intermediate tensors. The four flags are set by
// configure() to record which optional stages (weights reshape, im2col linearization,
// 4x4 interleave, bias accumulation) this instance runs.
CLFullyConnectedLayer::CLFullyConnectedLayer()
    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
      _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
{
}
106
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100107void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
108{
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100109 // With the Fully Connected layer we can have 4 different cases:
110 // 1) Convolution layer -> Fully Connected layer without batches
111 // 2) Fully Connected layer -> Fully Connected layer without batches
112 // 3) Convolution layer -> Fully Connected layer with batches
113 // 4) Fully Connected layer -> Fully Connected layer with batches
114
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100115 // Expected shape before transpose and reshaping
116 // Input: In x B (In and B can be multi-dimensional)
117 // Weights: flat(In) x Out
118 // Biases: Out
119 // Output: Out x B (B can be multi-dimensional)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100120
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100121 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
122 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
123 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100125 const DataType data_type = input->info()->data_type();
126 const int fixed_point_position = input->info()->fixed_point_position();
127 const int num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
128 const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
129 const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
130
131 _linearize_input = input->info()->tensor_shape().x() != linear_input_size;
132 _are_weights_reshaped = are_weights_reshaped;
133 _accumulate_biases = biases != nullptr;
134 _is_batched_fc_layer = num_batch_dimensions > 0;
135
136 // Check if number of batches match
137 ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1));
138 ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
139
140 const size_t interleave_width = 16 / input->info()->element_size();
141 const ICLTensor *weights_to_use = weights;
142
143 if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100145 weights_to_use = &_reshape_weights_output;
146
147 TensorShape reshaped_weights_shape(weights->info()->tensor_shape());
148
149 // Transpose weights if the user hasn't done it
150 if(transpose_weights)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100151 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100152 const size_t shape_x = reshaped_weights_shape.x();
153 reshaped_weights_shape.set(0, reshaped_weights_shape.y());
154 reshaped_weights_shape.set(1, shape_x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155 }
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100156
157 // If the we run multiple batches we need 1xW transpose, too.
158 if(_is_batched_fc_layer)
159 {
160 const float shape_x = reshaped_weights_shape.x();
161 reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width);
162 reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(shape_x / interleave_width)));
163 }
164
165 _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position));
166
167 // Reshape the weights
168 _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
169 }
170
171 // Check correct shape of weights
172 if(_is_batched_fc_layer)
173 {
174 // Transpose + Transpose1xW
175 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width);
176 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->info()->tensor_shape().x()) / interleave_width)));
177 }
178 else
179 {
180 // Transpose
181 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x());
182 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size);
183 }
184
185 const ICLTensor *multiply_input = input;
186
187 if(_linearize_input)
188 {
189 TensorShape shape_im2col(input->info()->tensor_shape());
190 shape_im2col.collapse(num_input_dimensions);
191 _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position));
192
193 // Configure im2col kernel
194 _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
195
196 multiply_input = &_im2col_output;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197 }
198
199 if(_is_batched_fc_layer)
200 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100201 TensorShape shape_interleaved(multiply_input->info()->tensor_shape());
202 shape_interleaved.set(0, shape_interleaved.x() * 4);
203 shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
204 _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100205
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100206 // Configure interleave4x4 kernel
207 _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
208
209 multiply_input = &_interleave4x4_output;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100210 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100211
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100212 // Configure matrix multiply kernel
213 _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f);
214
215 if(_accumulate_biases)
216 {
217 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
218 ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x());
219
220 // Configure accumulate biases kernel
221 _accumulate_biases_kernel.configure(output, biases);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100222 }
223
224 // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100225 if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100226 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100227 // Allocate the tensor for the weights reshaped
228 _reshape_weights_output.allocator()->allocate();
229 }
230
231 if(_linearize_input)
232 {
233 _im2col_output.allocator()->allocate();
234 }
235
236 if(_is_batched_fc_layer)
237 {
238 _interleave4x4_output.allocator()->allocate();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100239 }
240}
241
242void CLFullyConnectedLayer::run()
243{
244 // Reshape of the weights (happens only once)
245 if(!_are_weights_reshaped)
246 {
247 _are_weights_reshaped = true;
248 _reshape_weights_kernel.run();
249 }
250
251 // Linearize input if it comes from a convolutional layer
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100252 if(_linearize_input)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100253 {
254 CLScheduler::get().enqueue(_im2col_kernel, false);
255 }
256
257 // Interleave input
258 if(_is_batched_fc_layer)
259 {
260 CLScheduler::get().enqueue(_interleave4x4_kernel, false);
261 }
262
263 // Run matrix multiply
264 CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
265
266 // Accumulate biases if provided
267 if(_accumulate_biases)
268 {
269 CLScheduler::get().enqueue(_accumulate_biases_kernel);
270 }
271}
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100272} // namespace arm_compute