blob: c62e4b531f5c0233757b621b68cde54cd3ec748c [file] [log] [blame]
Georgios Pinitas529b5a22021-07-27 15:55:30 +01001/*
Jakub Sujak617ed502023-03-29 11:16:18 +01002 * Copyright (c) 2017-2021, 2023 Arm Limited.
Georgios Pinitas529b5a22021-07-27 15:55:30 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/gpu/cl/operators/ClFullyConnected.h"
Georgios Pinitas529b5a22021-07-27 15:55:30 +010025
26#include "arm_compute/core/Size2D.h"
27#include "arm_compute/core/Validate.h"
28#include "arm_compute/core/utils/misc/ShapeCalculator.h"
29#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
30#include "arm_compute/runtime/CL/CLScheduler.h"
31#include "src/core/CL/kernels/CLFillBorderKernel.h"
32
33#include "src/core/helpers/MemoryHelpers.h"
Georgios Pinitas7891a732021-08-20 21:39:25 +010034#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
35#include "src/gpu/cl/operators/ClFlatten.h"
36#include "src/gpu/cl/operators/ClGemm.h"
37#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
38#include "src/gpu/cl/operators/ClTranspose.h"
39#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
Georgios Pinitas529b5a22021-07-27 15:55:30 +010040
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +010041#include "src/gpu/cl/operators/ClMatMul.h"
42#include "utils/TypePrinter.h"
43
44#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
45#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
46
ramelg012e53f172021-09-22 10:48:25 +010047#include "src/common/utils/Log.h"
Georgios Pinitas529b5a22021-07-27 15:55:30 +010048#include "support/Cast.h"
49
50#include <algorithm>
51
52namespace arm_compute
53{
54namespace opencl
55{
56using namespace arm_compute::experimental;
57using namespace arm_compute::misc::shape_calculator;
58
59namespace
60{
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +010061// Function to calculate batched tensor shape in format [M, 1, B0, B1 ..] which is the format matmul expects
62inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src)
63{
64 return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation
65}
66
Georgios Pinitas529b5a22021-07-27 15:55:30 +010067Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst,
68 GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
69{
70 gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
71 gemmlowp_output_stage.gemmlowp_offset = 0;
72 gemmlowp_output_stage.gemmlowp_multiplier = 0;
73 gemmlowp_output_stage.gemmlowp_shift = 0;
74
75 const auto data_type = src.data_type();
76
77 // Configure output stage for quantized case
78 if(is_data_type_quantized_asymmetric(data_type))
79 {
80 const QuantizationInfo oq_info = dst.quantization_info();
81 const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
82 const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
83 const UniformQuantizationInfo oq_unif = oq_info.uniform();
84
85 const auto output_quant_info = (dst.total_size() == 0) ? iq_unif : oq_unif;
86
87 const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
88 int output_multiplier = 0;
89 int output_shift = 0;
90 ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
91
92 PixelValue type_min{};
93 PixelValue type_max{};
94 std::tie(type_min, type_max) = get_min_max(data_type);
95
96 if(activation_info.enabled())
97 {
98 std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
99 }
100
101 // Set the GEMMLowp output stage info
102 gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
103 gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
104 gemmlowp_output_stage.gemmlowp_shift = output_shift;
105 gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
106 gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
107 type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
108 type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
109 }
110
111 return Status{};
112}
113
114Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info)
115{
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100116 // If weights are dynamic, data is not batched, and bias is nullptr validate using matmul.
117 const bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
118 const bool use_matmul = !weights.are_values_constant() && !weights_reshaped && !(dst.dimension(1) > 1) && (bias != nullptr);
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100119
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100120 if(use_matmul)
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100121 {
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100122 MatMulInfo m_info{};
123 m_info.adj_rhs(fc_info.transpose_weights);
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100124
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100125 // Note: Currently, shape is [M, B0, B1]
126 // LHS is reshaped here to match ClMatMul expectations of batch index in format - [M, 1, B0, B1, .. ]
127 TensorInfo lhs_to_use{ src };
128 lhs_to_use.set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape()));
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100129
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100130 // Operator level validation.
131 ARM_COMPUTE_RETURN_ON_ERROR(ClMatMul::validate(&lhs_to_use, &weights, &dst, m_info, fc_info.activation_info));
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100132 }
133 else
134 {
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100135 GEMMLowpOutputStageInfo gemmlowp_output_stage;
136 ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
137
138 const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
139 false, // is_b_reshaped
140 true, // reshape_b_only_on_first_run
141 0, // depth_output_gemm3d
142 false, // reinterpret_input_as_3d
143 fc_info.retain_internal_weights, // retain_internal_weights
144 gemmlowp_output_stage, // gemmlowp_output_stage
145 fc_info.fp_mixed_precision, // fp_mixed_precision
146 false, // fast_math
147 true, // broadcast_bias
148 ActivationLayerInfo()); // activation_info
149
150 if(is_data_type_quantized_asymmetric(src.data_type()))
151 {
152 const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
153 const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
154
155 // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
156 // Extract and negate src and weights offset
157 const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset);
158 const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
159
160 // Validate gemmlowp function
161 ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info),
162 &weights.clone()->set_quantization_info(weights_quantization_info),
163 bias,
164 &dst,
165 gemm_info));
166 }
167 else
168 {
169 ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info));
170 }
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100171 }
172
173 return Status{};
174}
175} // namespace
176
// Default constructor: all sub-operators/kernels start out empty and are created
// lazily by configure(); _aux_mem is sized by the Count enumerator so each
// auxiliary tensor has a MemoryInfo slot.
ClFullyConnected::ClFullyConnected()
    : _convert_weights(nullptr),
      _flatten(nullptr),
      _reshape_weights(nullptr),
      _mm_gemm(nullptr),
      _mm_gemmlowp(nullptr),
      _matmul_native_kernel(nullptr),
      _matmul_lowp_native_kernel(nullptr),
      _aux_mem(Count)
{
}
188
// Defaulted out-of-line; NOTE(review): presumably kept in the .cpp so the
// unique_ptr members can be destroyed with complete kernel types — confirm against the header.
ClFullyConnected::~ClFullyConnected() = default;
190
// Configures the matrix-multiply stage. When _use_matmul is set a native (or
// lowp-native, for quantized types) MatMul kernel is configured directly;
// otherwise a ClGemm / ClGemmLowpMatrixMultiplyCore operator is configured,
// negating the quantization offsets for the quantized GEMM case.
void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
                                    const FullyConnectedLayerInfo &fc_info)
{
    // If weights are dynamic, configure matmul operator - else use gemm
    if(_use_matmul)
    {
        // Transpose RHS as _are_weights_reshaped == false when mat_mul is used.
        const MatMulInfo mat_info = MatMulInfo().adj_rhs(fc_info.transpose_weights);

        // Note: MatMul does not need offset negation unlike gemm
        // 1. Change shape when calling matmul to fit batch expectations.
        _lhs_to_use = *src->clone();
        _lhs_to_use.set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); // Collapse all dims > 2 into final dimension.
        _is_quantized = is_data_type_quantized_asymmetric(_lhs_to_use.data_type());

        // 2. Call kernel for matmul directly.
        const GPUTarget gpu_target = CLScheduler::get().target();
        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);

        // Configure relevant matmul kernel.
        // NOTE(review): kernel_config->configure and the kernel configure calls below use
        // the original 'src', not the reshaped '_lhs_to_use' — verify that is intended.
        MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info);
        if(_is_quantized)
        {
            _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>();
            _matmul_lowp_native_kernel->set_target(gpu_target);
            _matmul_lowp_native_kernel->configure(compile_context, src, weights, dst, kernel_info, fc_info.activation_info);
        }
        else
        {
            _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>();
            _matmul_native_kernel->set_target(gpu_target);
            _matmul_native_kernel->configure(compile_context, src, weights, dst, kernel_info, fc_info.activation_info);
        }
    }
    else
    {
        // Configure GEMM
        GEMMLowpOutputStageInfo gemmlowp_output_stage;
        construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info);

        const GEMMInfo &gemm_info = GEMMInfo(false,                           // is_a_reshaped
                                             false,                           // is_b_reshaped
                                             !_dynamic_weights,               // reshape_b_only_on_first_run
                                             0,                               // depth_output_gemm3d
                                             false,                           // reinterpret_input_as_3d
                                             fc_info.retain_internal_weights, // retain_internal_weights
                                             gemmlowp_output_stage,           // gemmlowp_output_stage
                                             fc_info.fp_mixed_precision,      // fp_mixed_precision
                                             false,                           // fast_math
                                             true,                            // broadcast_bias
                                             fc_info.activation_info);        // activation_info

        if(_is_quantized)
        {
            // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
            // Extract and negate input and weights offset
            const QuantizationInfo src_quantization_info     = src->quantization_info();
            const QuantizationInfo weights_quantization_info = weights->quantization_info();

            TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
            TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);

            src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
            weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));

            // Configure gemmlowp function
            _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
            _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info);
        }
        else
        {
            // Configure matrix multiply kernel
            _mm_gemm = std::make_unique<ClGemm>();
            _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info);
        }
    }
}
268
269void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
270 const FullyConnectedLayerInfo &fc_info)
271{
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100272 ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100273
274 // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
275
276 // Initialize output tensor for flatten
277 _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW);
278
279 // Configure flatten kernel
280 _flatten = std::make_unique<ClFlatten>();
281 _flatten->configure(compile_context, src, &_flattened_src);
282
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100283 // Note: if flatten has > 1 dimensions after, these dimensions are batch
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100284 // Configure matrix multiply kernel
285 configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
286}
287
288void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
289 const FullyConnectedLayerInfo &fc_info)
290{
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100291 // Compare first dimension when using matmul, as it performs transpose operation
292 ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension((_use_matmul) ? 0 : 1));
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100293
294 // Configure matrix multiply kernel
295 configure_mm(compile_context, src, weights, bias, dst, fc_info);
296}
297
// Sets up the whole fully-connected operator: validates arguments, decides
// between the MatMul path (dynamic, non-reshaped weights, unbatched output, no
// bias) and the GEMM path, wires up the optional weight reshape/convert stages,
// and records the auxiliary memory requirements in _aux_mem.
void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
                                 FullyConnectedLayerInfo fc_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);

    // Perform validate step
    ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);

    _are_weights_converted = true;
    _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
    _is_fc_after_conv      = true;
    _is_quantized          = is_data_type_quantized_asymmetric(src->data_type());
    _is_prepared           = fc_info.retain_internal_weights;
    _weights_to_use        = TensorInfo(*weights);
    _weights_to_use_idx    = ACL_SRC_1;

    // When using dynamic weights - use matmul kernels.
    // Note: We don't appear to support dynamic weights with pre-reshaped RHS.
    // Note: No matmul with biases for the moment.
    const bool is_batched_fc_layer = dst->dimension(1) > 1;
    _dynamic_weights               = !weights->are_values_constant() && !_are_weights_reshaped;
    _use_matmul                    = _dynamic_weights && !is_batched_fc_layer && !(biases);

    // With the Fully Connected layer we can have 4 different cases:
    //  1) Convolution layer -> Fully Connected layer without batches
    //  2) Fully Connected layer -> Fully Connected layer without batches
    //  3) Convolution layer -> Fully Connected layer with batches
    //  4) Fully Connected layer -> Fully Connected layer with batches

    // Check if we have a fully connected layer with batches
    if(is_batched_fc_layer)
    {
        // Batched case: FC follows a conv iff the batch dimensions of src (from dim 3) match dst (from dim 1).
        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
                                                                                  src->tensor_shape().cend(),
                                                                                  dst->tensor_shape().cbegin() + 1));
    }
    else
    {
        // Unbatched case: a multi-dimensional input implies the previous layer was a convolution.
        _is_fc_after_conv = src->num_dimensions() > 1;
    }

    ITensorInfo *weights_used = weights;

    // Reshape weights if needed
    // Not needed when matmul is in use - MatMul has transpose RHS flags.
    if(!_are_weights_reshaped && !_use_matmul)
    {
        // Reshape the weights
        _reshape_weights = std::make_unique<ClTranspose>();
        _reshape_weights->configure(compile_context, weights, &_reshaped_weights);
        weights_used        = &_reshaped_weights;
        _weights_to_use_idx = offset_int_vec(TransposedWeights);
    }

    // Convert weights if needed (layout of trained weights differs from the runtime src layout)
    if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
    {
        // Convert weights
        _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
        _convert_weights->configure(compile_context,
                                    weights_used,
                                    &_converted_weights,
                                    src->tensor_shape(),
                                    fc_info.weights_trained_layout);

        weights_used           = &_converted_weights;
        _weights_to_use_idx    = offset_int_vec(ConvertedWeights);
        _are_weights_converted = false;
    }

    if(_is_fc_after_conv)
    {
        // Fully Connected layer after a Convolution Layer without batches
        configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
    }
    else
    {
        // Fully Connected layer after a Fully Connected Layer without batches
        configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info);
    }
    // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion)
    _weights_to_use = *weights_used;

    if(_use_matmul)
    {
        // Note: MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem
        _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
    }
    else
    {
        // Set auxiliary memory requirements for gemm operators
        auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
        for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
        {
            _aux_mem[i] = gemm_mem_req[i];
        }
        if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
        {
            // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
            // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
            _aux_mem[TransposedWeights] = MemoryInfo(
                                              offset_int_vec(TransposedWeights),
                                              _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
                                              _reshaped_weights.total_size());
            _aux_mem[ConvertedWeights] = MemoryInfo(
                                             offset_int_vec(ConvertedWeights),
                                             _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
                                             _converted_weights.total_size());
        }
        else
        {
            // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
            const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
            const auto converted_wei_lft  = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;

            _aux_mem[TransposedWeights] = MemoryInfo(
                                              offset_int_vec(TransposedWeights),
                                              _dynamic_weights ? MemoryLifetime::Temporary : transposed_wei_lft,
                                              _reshaped_weights.total_size());
            _aux_mem[ConvertedWeights] = MemoryInfo(
                                             offset_int_vec(ConvertedWeights),
                                             _dynamic_weights ? MemoryLifetime::Temporary : converted_wei_lft,
                                             _converted_weights.total_size());
        }
    }
    // Scratch for the flattened input (only actually used when _is_fc_after_conv).
    _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
426
427Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
428 FullyConnectedLayerInfo fc_info)
429{
430 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
431 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
432 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
433 ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
434 ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
435 && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100436
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100437 const bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
438 bool is_fc_after_conv = true;
439
440 // When using dynamic weights - use matmul kernels.
441 // Note: MatMul does not support broadcasting or biases so fallback with batched cases or when biases != nullptr.
442 // Note: Pre-Shaped RHS is a deprecated use case and is therefore not supported with matmul.
443 const bool dynamic_weights = !weights->are_values_constant() && !weights_reshaped;
444 const bool is_batched_fc_layer = dst->dimension(1) > 1;
445 const bool use_matmul = dynamic_weights && !is_batched_fc_layer && (biases != nullptr);
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100446
447 const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW));
448 const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
449 const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
450
451 // With the Fully Connected layer we can have 4 different cases:
452 // 1) Convolution layer -> Fully Connected layer without batches
453 // 2) Fully Connected layer -> Fully Connected layer without batches
454 // 3) Convolution layer -> Fully Connected layer with batches
455 // 4) Fully Connected layer -> Fully Connected layer with batches
456
457 const ITensorInfo *src_to_use = src;
458 const ITensorInfo *weights_to_use = weights;
459
Giorgio Arena63e0beb2021-09-24 14:04:27 +0100460 if(biases != nullptr)
461 {
462 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
463 if(is_data_type_quantized(src->data_type()))
464 {
465 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
466 }
467 else
468 {
469 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
470 }
471 }
472
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100473 // Check if FC is after conv (flatten kernel is run in case where FC is after conv.)
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100474 if(is_batched_fc_layer)
475 {
476 is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
477 src->tensor_shape().cend(),
478 dst->tensor_shape().cbegin() + 1));
479 }
480 else
481 {
482 is_fc_after_conv = src->num_dimensions() > 1;
483 }
484
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100485 if(!weights_reshaped && !use_matmul)
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100486 {
487 // Validate reshape weights kernel
488 ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
489 weights_to_use = &reshaped_weights;
490 }
491
492 if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
493 {
494 // Validate convert weights kernel
495 ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use,
496 &converted_weights,
497 src->tensor_shape(),
498 fc_info.weights_trained_layout));
499 weights_to_use = &converted_weights;
500 }
501
502 if(is_fc_after_conv)
503 {
504 // Fully Connected layer after a Convolution Layer without batches
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100505 if(use_matmul)
506 {
507 ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(0) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
508 }
509 else
510 {
511 ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
512 }
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100513
514 // Validate flatten kernel
515 ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
516 src_to_use = &flatten_src;
517 }
518 else
519 {
520 // Fully Connected layer after a Fully Connected Layer without batches
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100521 ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension((use_matmul) ? 0 : 1));
Georgios Pinitas529b5a22021-07-27 15:55:30 +0100522 }
523
524 // Validate matrix multiply kernel
525 ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info));
526
527 return Status{};
528}
529
// Executes the operator: ensures weights are prepared, optionally flattens the
// input, then dispatches either the MatMul kernel or the GEMM operator chosen
// during configure().
void ClFullyConnected::run(ITensorPack &tensors)
{
    prepare(tensors);

#ifdef ARM_COMPUTE_ASSERTS_ENABLED
    // Debug builds check that with dynamic weights each run() is preceded by exactly one prepare().
    ++_asrt_run_count;
    ARM_COMPUTE_ERROR_ON(_dynamic_weights && _asrt_prepare_count != _asrt_run_count);
#endif // ARM_COMPUTE_ASSERTS_ENABLED

    auto src = tensors.get_const_tensor(ACL_SRC_0);

    // Bind auxiliary buffers (no initialization/import of contents; last arg false).
    CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
    CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);

    // Linearize input if it comes from a convolutional layer
    if(_is_fc_after_conv)
    {
        ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
        _flatten->run(flatten_pack);
    }

    // Build the pack for the matrix multiply: flattened input (if any) and the
    // reshaped/converted weights (only when they differ from the caller's ACL_SRC_1).
    ITensorPack gemm_pack = tensors;
    gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
    if(_weights_to_use_idx != ACL_SRC_1)
    {
        gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
    }

    // Run MatMul Op
    if(_use_matmul)
    {
        // Run matmul kernels for matrix multiplication
        if(_is_quantized)
        {
            CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true);
        }
        else
        {
            CLScheduler::get().enqueue_op(*_matmul_native_kernel, gemm_pack, true);
        }
    }
    else
    {
        // Run matrix multiply
        if(_is_quantized)
        {
            _mm_gemmlowp->run(gemm_pack);
        }
        else
        {
            _mm_gemm->run(gemm_pack);
        }
    }
}
584
// One-off (or per-run, for dynamic weights) weight preparation: transposes and/or
// layout-converts the weights as decided in configure(), and lets the GEMM
// operator prepare its own internal state. Re-entered every run when
// _dynamic_weights is set.
void ClFullyConnected::prepare(ITensorPack &tensors)
{
    if(!_is_prepared || _dynamic_weights)
    {
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
        // Debug builds verify prepare() runs once for static weights.
        ++_asrt_prepare_count;
        ARM_COMPUTE_ERROR_ON(!_dynamic_weights && _asrt_prepare_count > 1);
#endif // ARM_COMPUTE_ASSERTS_ENABLED

        auto weights = tensors.get_const_tensor(ACL_SRC_1);

        CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
        CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);

        // Pointer to current weights
        const ITensor *cur_weights = weights;

        // Reshape of the weights if needed (skipped with matmul, which transposes in-kernel)
        if(!_are_weights_reshaped && !_use_matmul)
        {
            // Run reshape weights kernel and mark weights as unused
            ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
            _reshape_weights->run(transpose_pack);

            cur_weights->mark_as_unused();
            cur_weights = reshaped_weights.get();
        }

        // Convert weights if needed
        if(!_are_weights_converted)
        {
            ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
            _convert_weights->run(convert_pack);

            cur_weights->mark_as_unused();
            cur_weights = converted_weights.get();
        }

        // Hand the final weights to the downstream operator.
        ITensorPack gemm_pack = tensors;
        gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);

        // Prepare GEMM prepare and release unused weights (If not using matmul)
        if(!_use_matmul)
        {
            if(!_is_quantized)
            {
                _mm_gemm->prepare(gemm_pack);
            }
            else
            {
                _mm_gemmlowp->prepare(gemm_pack);
            }
        }

        _is_prepared = true;
    }
}
642
// Reports the auxiliary (workspace) memory requirements populated by configure().
experimental::MemoryRequirements ClFullyConnected::workspace() const
{
    return _aux_mem;
}
647} // namespace opencl
648} // namespace arm_compute