Blame - src/gpu/cl/operators/ClFullyConnected.cpp - ml/ComputeLibrary

blob: bd2fddad0b55166caa1c7c26f8bdc63a29af472b [file] [log] [blame]

Georgios Pinitas	529b5a2	2021-07-27 15:55:30 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017-2021 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Georgios Pinitas	7891a73	2021-08-20 21:39:25 +0100	[diff] [blame]	24	#include "src/gpu/cl/operators/ClFullyConnected.h"
Georgios Pinitas	529b5a2	2021-07-27 15:55:30 +0100	[diff] [blame]	25
				26	#include "arm_compute/core/Size2D.h"
				27	#include "arm_compute/core/Validate.h"
				28	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
				29	#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
				30	#include "arm_compute/runtime/CL/CLScheduler.h"
				31	#include "src/core/CL/kernels/CLFillBorderKernel.h"
				32
				33	#include "src/core/helpers/MemoryHelpers.h"
Georgios Pinitas	7891a73	2021-08-20 21:39:25 +0100	[diff] [blame]	34	#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
				35	#include "src/gpu/cl/operators/ClFlatten.h"
				36	#include "src/gpu/cl/operators/ClGemm.h"
				37	#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
				38	#include "src/gpu/cl/operators/ClTranspose.h"
				39	#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
Georgios Pinitas	529b5a2	2021-07-27 15:55:30 +0100	[diff] [blame]	40
				41	#include "support/Cast.h"
				42
				43	#include <algorithm>
				44
				45	namespace arm_compute
				46	{
				47	namespace opencl
				48	{
				49	using namespace arm_compute::experimental;
				50	using namespace arm_compute::misc::shape_calculator;
				51
				52	namespace
				53	{
				54	Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst,
				55	GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
				56	{
				57	gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
				58	gemmlowp_output_stage.gemmlowp_offset = 0;
				59	gemmlowp_output_stage.gemmlowp_multiplier = 0;
				60	gemmlowp_output_stage.gemmlowp_shift = 0;
				61
				62	const auto data_type = src.data_type();
				63
				64	// Configure output stage for quantized case
				65	if(is_data_type_quantized_asymmetric(data_type))
				66	{
				67	const QuantizationInfo oq_info = dst.quantization_info();
				68	const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
				69	const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
				70	const UniformQuantizationInfo oq_unif = oq_info.uniform();
				71
				72	const auto output_quant_info = (dst.total_size() == 0) ? iq_unif : oq_unif;
				73
				74	const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
				75	int output_multiplier = 0;
				76	int output_shift = 0;
				77	ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
				78
				79	PixelValue type_min{};
				80	PixelValue type_max{};
				81	std::tie(type_min, type_max) = get_min_max(data_type);
				82
				83	if(activation_info.enabled())
				84	{
				85	std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
				86	}
				87
				88	// Set the GEMMLowp output stage info
				89	gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
				90	gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
				91	gemmlowp_output_stage.gemmlowp_shift = output_shift;
				92	gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
				93	gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
				94	type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
				95	type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
				96	}
				97
				98	return Status{};
				99	}
				100
				101	Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info)
				102	{
				103	GEMMLowpOutputStageInfo gemmlowp_output_stage;
				104	ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
				105
				106	const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
				107	false, // is_b_reshaped
				108	true, // reshape_b_only_on_first_run
				109	0, // depth_output_gemm3d
				110	false, // reinterpret_input_as_3d
				111	fc_info.retain_internal_weights, // retain_internal_weights
				112	gemmlowp_output_stage, // gemmlowp_output_stage
				113	fc_info.fp_mixed_precision, // fp_mixed_precision
				114	false, // fast_math
				115	true, // broadcast_bias
				116	ActivationLayerInfo()); // activation_info
				117
				118	if(is_data_type_quantized_asymmetric(src.data_type()))
				119	{
				120	const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
				121	const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
				122
				123	// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
				124	// Extract and negate src and weights offset
				125	const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset);
				126	const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
				127
				128	// Validate gemmlowp function
				129	ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info),
				130	&weights.clone()->set_quantization_info(weights_quantization_info),
				131	bias,
				132	&dst,
				133	gemm_info));
				134	}
				135	else
				136	{
				137	ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info));
				138	}
				139
				140	return Status{};
				141	}
				142	} // namespace
				143
				144	ClFullyConnected::ClFullyConnected()
				145	: _convert_weights(nullptr),
				146	_flatten(nullptr),
				147	_reshape_weights(nullptr),
				148	_mm_gemm(nullptr),
				149	_mm_gemmlowp(nullptr),
				150	_aux_mem(Count)
				151	{
				152	}
				153
				154	ClFullyConnected::~ClFullyConnected() = default;
				155
				156	void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo src, ITensorInfo weights, ITensorInfo bias, ITensorInfo dst,
				157	const FullyConnectedLayerInfo &fc_info)
				158	{
				159	GEMMLowpOutputStageInfo gemmlowp_output_stage;
				160	construct_gemmlowp_output_stage(src, weights, *dst, gemmlowp_output_stage, fc_info.activation_info);
				161
				162	const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
				163	false, // is_b_reshaped
				164	true, // reshape_b_only_on_first_run
				165	0, // depth_output_gemm3d
				166	false, // reinterpret_input_as_3d
				167	fc_info.retain_internal_weights, // retain_internal_weights
				168	gemmlowp_output_stage, // gemmlowp_output_stage
				169	fc_info.fp_mixed_precision, // fp_mixed_precision
				170	false, // fast_math
				171	true, // broadcast_bias
Michele Di Giorgio	aed63ee	2021-07-26 13:18:50 +0100	[diff] [blame^]	172	fc_info.activation_info); // activation_info
Georgios Pinitas	529b5a2	2021-07-27 15:55:30 +0100	[diff] [blame]	173
				174	if(_is_quantized)
				175	{
				176	// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
				177	// Extract and negate input and weights offset
				178	const QuantizationInfo src_quantization_info = src->quantization_info();
				179	const QuantizationInfo weights_quantization_info = weights->quantization_info();
				180
				181	TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
				182	TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
				183
				184	src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
				185	weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
				186
				187	// Configure gemmlowp function
				188	_mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
				189	_mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info);
				190	}
				191	else
				192	{
				193	// Configure matrix multiply kernel
				194	_mm_gemm = std::make_unique<ClGemm>();
				195	_mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info);
				196	}
				197	}
				198
				199	void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo src, ITensorInfo weights, ITensorInfo bias, ITensorInfo dst,
				200	const FullyConnectedLayerInfo &fc_info)
				201	{
				202	ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
				203
				204	// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
				205
				206	// Initialize output tensor for flatten
				207	_flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW);
				208
				209	// Configure flatten kernel
				210	_flatten = std::make_unique<ClFlatten>();
				211	_flatten->configure(compile_context, src, &_flattened_src);
				212
				213	// Configure matrix multiply kernel
				214	configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
				215	}
				216
				217	void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo src, ITensorInfo weights, ITensorInfo bias, ITensorInfo dst,
				218	const FullyConnectedLayerInfo &fc_info)
				219	{
				220	ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
				221
				222	// Configure matrix multiply kernel
				223	configure_mm(compile_context, src, weights, bias, dst, fc_info);
				224	}
				225
				226	void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo src, ITensorInfo weights, ITensorInfo biases, ITensorInfo dst,
				227	FullyConnectedLayerInfo fc_info)
				228	{
				229	ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
				230
				231	// Perform validate step
				232	ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
				233
				234	_are_weights_converted = true;
				235	_are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
				236	_is_fc_after_conv = true;
				237	_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
				238	_is_prepared = fc_info.retain_internal_weights;
				239	_weights_to_use = TensorInfo(*weights);
				240	_weights_to_use_idx = ACL_SRC_1;
				241
				242	// With the Fully Connected layer we can have 4 different cases:
				243	// 1) Convolution layer -> Fully Connected layer without batches
				244	// 2) Fully Connected layer -> Fully Connected layer without batches
				245	// 3) Convolution layer -> Fully Connected layer with batches
				246	// 4) Fully Connected layer -> Fully Connected layer with batches
				247
				248	// Check if we have a fully connected layer with batches
				249	const bool is_batched_fc_layer = dst->dimension(1) > 1;
				250	if(is_batched_fc_layer)
				251	{
				252	_is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
				253	src->tensor_shape().cend(),
				254	dst->tensor_shape().cbegin() + 1));
				255	}
				256	else
				257	{
				258	_is_fc_after_conv = src->num_dimensions() > 1;
				259	}
				260
				261	ITensorInfo *weights_used = weights;
				262
				263	// Reshape weights if needed
				264	if(!_are_weights_reshaped)
				265	{
				266	// Reshape the weights
				267	_reshape_weights = std::make_unique<ClTranspose>();
				268	_reshape_weights->configure(compile_context, weights, &_reshaped_weights);
				269	weights_used = &_reshaped_weights;
				270	_weights_to_use_idx = offset_int_vec(TransposedWeights);
				271	}
				272
				273	// Convert weights if needed
				274	if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
				275	{
				276	// Convert weights
				277	_convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
				278	_convert_weights->configure(compile_context,
				279	weights_used,
				280	&_converted_weights,
				281	src->tensor_shape(),
				282	fc_info.weights_trained_layout);
				283
				284	weights_used = &_converted_weights;
				285	_weights_to_use_idx = offset_int_vec(ConvertedWeights);
				286	_are_weights_converted = false;
				287	}
				288
				289	if(_is_fc_after_conv)
				290	{
				291	// Fully Connected layer after a Convolution Layer without batches
				292	configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
				293	}
				294	else
				295	{
				296	// Fully Connected layer after a Fully Connected Layer without batches
				297	configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info);
				298	}
				299	// Update TensorInfo of final weights used (Need to be done in the end due to padding expansion)
				300	_weights_to_use = *weights_used;
				301
				302	// Set auxiliary memory requirements
				303	auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
				304	for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
				305	{
				306	_aux_mem[i] = gemm_mem_req[i];
				307	}
				308	if(_aux_mem[1].size > 0 \|\| _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
				309	{
				310	// Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
				311	_aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
				312	_aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
				313	}
				314	else
				315	{
				316	// Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
				317	const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
				318	const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
				319
				320	_aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size());
				321	_aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size());
				322	}
				323	_aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
				324	}
				325
				326	Status ClFullyConnected::validate(const ITensorInfo src, const ITensorInfo weights, const ITensorInfo biases, const ITensorInfo dst,
				327	FullyConnectedLayerInfo fc_info)
				328	{
				329	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
				330	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
				331	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
				332	ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
				333	ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
				334	&& fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
Michele Di Giorgio	aed63ee	2021-07-26 13:18:50 +0100	[diff] [blame^]	335	ARM_COMPUTE_RETURN_ERROR_ON(!weights->are_values_constant() && (!fc_info.are_weights_reshaped \|\| fc_info.transpose_weights));
Georgios Pinitas	529b5a2	2021-07-27 15:55:30 +0100	[diff] [blame]	336
				337	bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
				338	bool is_fc_after_conv = true;
				339
				340	const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW));
				341	const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
				342	const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
				343
				344	// With the Fully Connected layer we can have 4 different cases:
				345	// 1) Convolution layer -> Fully Connected layer without batches
				346	// 2) Fully Connected layer -> Fully Connected layer without batches
				347	// 3) Convolution layer -> Fully Connected layer with batches
				348	// 4) Fully Connected layer -> Fully Connected layer with batches
				349
				350	const ITensorInfo *src_to_use = src;
				351	const ITensorInfo *weights_to_use = weights;
				352
Michele Di Giorgio	aed63ee	2021-07-26 13:18:50 +0100	[diff] [blame^]	353	if(biases != nullptr)
				354	{
				355	ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
				356	if(is_data_type_quantized(src->data_type()))
				357	{
				358	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
				359	}
				360	else
				361	{
				362	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
				363	}
				364	}
				365
Georgios Pinitas	529b5a2	2021-07-27 15:55:30 +0100	[diff] [blame]	366	// Check if we have a fully connected layer with batches
				367	const bool is_batched_fc_layer = dst->dimension(1) > 1;
				368	if(is_batched_fc_layer)
				369	{
				370	is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
				371	src->tensor_shape().cend(),
				372	dst->tensor_shape().cbegin() + 1));
				373	}
				374	else
				375	{
				376	is_fc_after_conv = src->num_dimensions() > 1;
				377	}
				378
				379	if(!weights_reshaped)
				380	{
				381	// Validate reshape weights kernel
				382	ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
				383	weights_to_use = &reshaped_weights;
				384	}
				385
				386	if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
				387	{
				388	// Validate convert weights kernel
				389	ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use,
				390	&converted_weights,
				391	src->tensor_shape(),
				392	fc_info.weights_trained_layout));
				393	weights_to_use = &converted_weights;
				394	}
				395
				396	if(is_fc_after_conv)
				397	{
				398	// Fully Connected layer after a Convolution Layer without batches
				399	ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
				400
				401	// Validate flatten kernel
				402	ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
				403	src_to_use = &flatten_src;
				404	}
				405	else
				406	{
				407	// Fully Connected layer after a Fully Connected Layer without batches
				408	ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
				409	}
				410
				411	// Validate matrix multiply kernel
				412	ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, *dst, fc_info));
				413
				414	return Status{};
				415	}
				416
				417	void ClFullyConnected::run(ITensorPack &tensors)
				418	{
				419	prepare(tensors);
				420
				421	auto src = tensors.get_const_tensor(ACL_SRC_0);
				422
				423	CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
				424	CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
				425
				426	// Linearize input if it comes from a convolutional layer
				427	if(_is_fc_after_conv)
				428	{
				429	ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
				430	_flatten->run(flatten_pack);
				431	}
				432
				433	ITensorPack gemm_pack = tensors;
				434	gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
				435	if(_weights_to_use_idx != ACL_SRC_1)
				436	{
				437	gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
				438	}
				439
				440	// Run matrix multiply
				441	if(_is_quantized)
				442	{
				443	_mm_gemmlowp->run(gemm_pack);
				444	}
				445	else
				446	{
				447	_mm_gemm->run(gemm_pack);
				448	}
				449	}
				450
				451	void ClFullyConnected::prepare(ITensorPack &tensors)
				452	{
				453	if(!_is_prepared)
				454	{
				455	auto weights = tensors.get_const_tensor(ACL_SRC_1);
				456
				457	CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
				458	CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
				459
				460	// Pointer to current weights
				461	const ITensor *cur_weights = weights;
				462
				463	// Reshape of the weights if needed (happens only once)
				464	if(!_are_weights_reshaped)
				465	{
				466	// Run reshape weights kernel and mark weights as unused
				467	ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
				468	_reshape_weights->run(transpose_pack);
				469
				470	cur_weights->mark_as_unused();
				471	cur_weights = reshaped_weights.get();
				472
				473	_are_weights_reshaped = true;
				474	}
				475
				476	// Convert weights if needed (happens only once)
				477	if(!_are_weights_converted)
				478	{
				479	ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
				480	_convert_weights->run(convert_pack);
				481
				482	cur_weights->mark_as_unused();
				483	cur_weights = converted_weights.get();
				484
				485	_are_weights_converted = true;
				486	}
				487
				488	tensors.add_const_tensor(ACL_SRC_1, cur_weights);
				489
				490	// Prepare GEMM prepare and release unused weights
				491	if(!_is_quantized)
				492	{
				493	_mm_gemm->prepare(tensors);
				494	}
				495	else
				496	{
				497	_mm_gemmlowp->prepare(tensors);
				498	}
				499	_is_prepared = true;
				500	}
				501	}
				502
				503	experimental::MemoryRequirements ClFullyConnected::workspace() const
				504	{
				505	return _aux_mem;
				506	}
				507	} // namespace opencl
				508	} // namespace arm_compute