/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/operators/CpuFullyConnected.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuTransposeKernel.h"
#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
#include "src/cpu/operators/CpuFlatten.h"
#include "src/cpu/operators/CpuGemm.h"
#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"

namespace arm_compute
{
namespace cpu
{
using namespace arm_compute::experimental;
using namespace arm_compute::misc::shape_calculator;

namespace
{
// Get min, max bound of a quantized asymmetric dst tensor, with the effect of fused activation
std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
{
    PixelValue type_min{};
    PixelValue type_max{};
    std::tie(type_min, type_max) = get_min_max(data_type);
    const UniformQuantizationInfo q_unif = q_info.uniform();

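    // A fused activation narrows the clamping bounds: the quantized representation of a real 0 is the
    // zero-point offset, while the real bounds a/b of the bounded variants are quantized with the dst info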
    if(act_info.enabled())
    {
        switch(act_info.activation())
        {
            case ActivationLayerInfo::ActivationFunction::RELU:
                type_min = PixelValue(q_unif.offset);
                break;
            case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
                type_min = PixelValue(q_unif.offset);
                type_max = PixelValue(act_info.a(), data_type, q_info);
                break;
            case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
                type_min = PixelValue(act_info.b(), data_type, q_info);
                type_max = PixelValue(act_info.a(), data_type, q_info);
                break;
            default:
                ARM_COMPUTE_ERROR("Activation function not supported.");
                break;
        }
    }

    return std::make_pair(type_min, type_max);
}

Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
                                      GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
    const auto                    data_type = src->data_type();
    const QuantizationInfo        oq_info   = dst->quantization_info();
    const UniformQuantizationInfo iq_unif   = src->quantization_info().uniform();
    const UniformQuantizationInfo wq_unif   = weights->quantization_info().uniform();
    const UniformQuantizationInfo oq_unif   = oq_info.uniform();

    float   multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
    int32_t output_multiplier;
    int32_t output_shift;

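    // Decompose the real requantization scale (src_scale * weights_scale / dst_scale) into an integer
    // multiplier and a shift so the output stage can requantize using integer arithmetic only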
    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));

    PixelValue type_min{};
    PixelValue type_max{};
    std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type);

    gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
    gemmlowp_output_stage_info.gemmlowp_shift      = output_shift;
    gemmlowp_output_stage_info.gemmlowp_offset     = oq_unif.offset;
    gemmlowp_output_stage_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    gemmlowp_output_stage_info.gemmlowp_min_bound  = type_min.get<int32_t>();
    gemmlowp_output_stage_info.gemmlowp_max_bound  = type_max.get<int32_t>();

    return Status{};
}

Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act)
{
    if(is_data_type_quantized_asymmetric(src->data_type()))
    {
        // Since we need negative offsets for computing the matrix multiplication, we need to change QuantizationInfo()
        // Extract and negate src and weights offset
        const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
        const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);

        GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
        ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));

        GEMMInfo gemm_info;
        gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);

        // Validate gemmlowp function
        TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
        TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
        ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
                                                                            &weights_info,
                                                                            biases,
                                                                            dst,
                                                                            gemm_info));
    }
    else
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
    }

    return Status{};
}
} // namespace

CpuFullyConnected::CpuFullyConnected()
    : _flatten(nullptr),
      _convert_weights(nullptr),
      _transpose_weights(nullptr),
      _mm_gemm(nullptr),
      _mm_gemmlowp(nullptr),
      _flattened_src(),
      _converted_weights(),
      _reshaped_weights(),
      _trans_weights(),
      _trans_weights_idx(AuxTensorIdx::Count),
      _aux_mem(Count),
      _needs_weights_conversion(false),
      _needs_weights_reshape(false),
      _is_fc_after_conv(false),
      _is_quantized_asymmetric(false),
      _is_prepared(false)
{
}

CpuFullyConnected::~CpuFullyConnected() = default;

void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
{
    if(_is_quantized_asymmetric)
    {
        // Since we need negative offsets for computing the matrix multiplication, we need to change QuantizationInfo()
        // Extract and negate src and weights offset
        const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
        const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);

        TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
        TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);

        // Configure gemmlowp function and output stage for asymmetric quantized types
        GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
        const Status            status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
        ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);

        GEMMInfo gemm_info;
        gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
        gemm_info.set_activation_info(act);
        _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
        _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info);
    }
    else
    {
        // Configure matrix multiply kernel
        GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */);
        gemm_info.set_activation_info(act);
        _mm_gemm = std::make_unique<CpuGemm>();
        _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info);
    }
}

void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
{
    ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));

    // If the fully connected layer is called after a convolution layer, the src tensor must be linearized

    // Initialize output tensor for flatten
    auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src)));

    _flatten = std::make_unique<CpuFlatten>();
    _flatten->configure(src, &_flattened_src);

    // Configure matrix multiply kernel
    configure_mm(&_flattened_src, weights, biases, dst, act);
}

void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
{
    ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));

    // Configure matrix multiply kernel
    configure_mm(src, weights, biases, dst, act);
}

void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
                                  FullyConnectedLayerInfo fc_info)
{
    // Perform validate step
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
    ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
                                                           weights,
                                                           biases,
                                                           dst,
                                                           fc_info));
    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);

    _needs_weights_conversion = false;
    _needs_weights_reshape    = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
    _needs_weights_reshape    = _needs_weights_reshape && !fc_info.retain_internal_weights;
    _is_fc_after_conv         = true;
    _is_quantized_asymmetric  = is_data_type_quantized_asymmetric(src->data_type());
    _is_prepared              = false;
    _trans_weights_idx        = AuxTensorIdx::Count;

    // With the Fully Connected layer we can have 4 different cases:
    //  1) Convolution layer -> Fully Connected layer without batches
    //  2) Fully Connected layer -> Fully Connected layer without batches
    //  3) Convolution layer -> Fully Connected layer with batches
    //  4) Fully Connected layer -> Fully Connected layer with batches

    const ITensorInfo *weights_to_use = weights;

    // Check if we have a fully connected layer with batches
    const bool is_batched_fc_layer = dst->dimension(1) > 1;
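    // With batches, src comes from a convolution if its dimensions from index 3 onwards match dst's
    // batch dimensions; without batches, a multi-dimensional src implies a preceding convolution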
    if(is_batched_fc_layer)
    {
        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
                                                                                  src->tensor_shape().cend(),
                                                                                  dst->tensor_shape().cbegin() + 1));
    }
    else
    {
        _is_fc_after_conv = src->num_dimensions() > 1;
    }

    // Reshape weights if needed
    if(_needs_weights_reshape)
    {
        // Reshape the weights
        _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
        _transpose_weights->configure(weights, &_reshaped_weights);
        weights_to_use     = &_reshaped_weights;
        _trans_weights_idx = AuxTensorIdx::TransposedWeights;
    }

    // Convert weights if needed
    if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
    {
        // Convert weights
        _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
        _convert_weights->configure(weights_to_use,
                                    &_converted_weights,
                                    src->tensor_shape(),
                                    fc_info.weights_trained_layout);

        weights_to_use            = &_converted_weights;
        _needs_weights_conversion = true;
        _trans_weights_idx        = AuxTensorIdx::ConvertedWeights;
    }

    if(_is_fc_after_conv)
    {
        // Fully Connected layer after a Convolution Layer without batches
        configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
    }
    else
    {
        // Fully Connected layer after a Fully Connected Layer without batches
        configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
    }

    // Retain the tensorinfo with the weights to use
    if(_needs_weights_reshape || _needs_weights_conversion)
    {
        _trans_weights = *weights_to_use;
    }

    // Set auxiliary memory requirements
    auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
    for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
    {
        _aux_mem[i] = gemm_mem_req[i];
    }

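    // When the GEMM backend reserves a Pretranspose buffer it keeps its own transformed copy of the
    // weights, so the reshaped/converted weights are generally only needed during prepare();
    // otherwise they have to persist across runs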
    if(_aux_mem[Pretranspose].size > 0)
    {
        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
        // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
                                                 (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare,
                                                 _reshaped_weights.total_size());
        _aux_mem[ConvertedWeights]  = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
    }
    else
    {
        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size());
        _aux_mem[ConvertedWeights]  = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size());
    }
    _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}

Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
                                   FullyConnectedLayerInfo fc_info)
{
    ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
    ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
                                && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
    ARM_COMPUTE_RETURN_ERROR_ON(!weights->are_values_constant() && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));

    bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
    bool is_fc_after_conv = true;

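    // Descriptors of the intermediate tensors that the flatten, transpose and layout-conversion stages
    // would produce; they are only used to validate those stages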
    const ITensorInfo &flatten_src       = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
    const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
    const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());

    // With the Fully Connected layer we can have 4 different cases:
    //  1) Convolution layer -> Fully Connected layer without batches
    //  2) Fully Connected layer -> Fully Connected layer without batches
    //  3) Convolution layer -> Fully Connected layer with batches
    //  4) Fully Connected layer -> Fully Connected layer with batches

    const ITensorInfo *src_to_use     = src;
    const ITensorInfo *weights_to_use = weights;

    // Check if we have a fully connected layer with batches
    const bool is_batched_fc_layer = dst->dimension(1) > 1;

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        if(is_data_type_quantized(src->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
        }
    }

    if(is_batched_fc_layer)
    {
        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
                                                                                 src->tensor_shape().cend(),
                                                                                 dst->tensor_shape().cbegin() + 1));
    }
    else
    {
        is_fc_after_conv = src->num_dimensions() > 1;
    }

    if(!weights_reshaped)
    {
        // Validate reshape weights kernel
        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
        weights_to_use = &reshaped_weights;
    }

    if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
    {
        // Validate convert weights kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
                                                                              &converted_weights,
                                                                              src->tensor_shape(),
                                                                              fc_info.weights_trained_layout));
        weights_to_use = &converted_weights;
    }

    if(is_fc_after_conv)
    {
        // Fully Connected layer after a Convolution Layer without batches
        ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));

        // Validate flatten kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
        src_to_use = &flatten_src;
    }
    else
    {
        // Fully Connected layer after a Fully Connected Layer without batches
        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
    }
    // Validate matrix multiply kernel
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info));

    return Status{};
}

void CpuFullyConnected::run(ITensorPack &tensors)
{
    prepare(tensors);

    auto src = tensors.get_const_tensor(ACL_SRC_0);

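    // Bind the auxiliary workspace buffers provided through the tensor pack to their internal descriptors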
    CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
    CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);

    // Linearize src if it comes from a convolutional layer
    if(_is_fc_after_conv)
    {
        ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
        _flatten->run(flatten_pack);
    }

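    // Point the GEMM inputs at the flattened src and, when a reshape/conversion was configured,
    // at the transformed weights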
    ITensorPack gemm_pack = tensors;
    gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
    if(_needs_weights_reshape || _needs_weights_conversion)
    {
        gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
    }

    // Run matrix multiply
    if(_is_quantized_asymmetric)
    {
        _mm_gemmlowp->run(gemm_pack);
    }
    else
    {
        _mm_gemm->run(gemm_pack);
    }
}

void CpuFullyConnected::prepare(ITensorPack &tensors)
{
    if(!_is_prepared)
    {
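        // One-shot weight preparation: transform the weights, hand the result to the GEMM backend's own
        // prepare step, then mark the originals as unused so their memory can be released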
        auto weights = tensors.get_const_tensor(ACL_SRC_1);

        CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
        CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);

        // Pointer to current weights
        const ITensor *cur_weights = weights;

        // Reshape of the weights (happens only once)
        if(_needs_weights_reshape)
        {
            // Run reshape weights kernel and mark weights as unused
            ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
            NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);

            cur_weights->mark_as_unused();
            cur_weights = reshaped_weights.get();
        }

        // Convert weights if needed (happens only once)
        if(_needs_weights_conversion)
        {
            ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
            _convert_weights->run(convert_pack);

            cur_weights->mark_as_unused();
            cur_weights = converted_weights.get();
        }

        ITensorPack gemm_pack = tensors;
        gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);

        // Prepare the GEMM backend and release unused weights
        if(!_is_quantized_asymmetric)
        {
            _mm_gemm->prepare(gemm_pack);
        }
        else
        {
            _mm_gemmlowp->prepare(gemm_pack);
        }

        _is_prepared = true;
    }
}

experimental::MemoryRequirements CpuFullyConnected::workspace() const
{
    return _aux_mem;
}
} // namespace cpu
} // namespace arm_compute