/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"

#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "src/runtime/cpu/operators/CpuActivation.h"
#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::experimental;

namespace arm_compute
{
namespace cpu
{
namespace
{
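// Translate the public GEMMInfo descriptor into the metadata consumed by CpuGemmAssemblyDispatch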
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
    cpu::AsmGemmInfo asm_info;
    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
    asm_info.activation_info         = info.activation_info();
    asm_info.output_stage            = info.gemmlowp_output_stage();
    asm_info.fast_mode               = info.fast_math();

    return asm_info;
}
} // namespace

CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
    : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
      _mm_kernel(),
      _mtx_a_reshape_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _activation_func(),
      _convert_to_signed_asymm(),
      _convert_from_signed_asymm(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_a(),
      _tmp_b(),
      _mm_result_s32(),
      _signed_a(),
      _signed_output(),
      _a_offset(0),
      _b_offset(0),
      _run_vector_matrix_multiplication(false),
      _assembly_path(false),
      _fused_assembly_path(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _fuse_output_stage(false),
      _run_activation(false),
      _flip_signedness(false),
      _gemm_info(),
      _aux_mem(Count)
{
}
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;

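// Typical usage (sketch with illustrative names; tensor allocation/info setup and the binding of the
// workspace tensors reported by workspace() are omitted here):
//   CpuGemmLowpMatrixMultiplyCore gemm;
//   gemm.configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);
//   ITensorPack pack = { { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &dst } };
//   gemm.prepare(pack); // one-off reshape/reduction of the constant B matrix
//   gemm.run(pack);     // per-call execution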
void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));

    const ITensorInfo *matrix_a = a;
    const ITensorInfo *matrix_b = b;
    GEMMInfo           info     = gemm_info;

    // Set internal variables
    _a_offset                         = a->quantization_info().uniform().offset;
    _b_offset                         = b->quantization_info().uniform().offset;
    _run_vector_matrix_multiplication = a->dimension(1) < 2;
    _reshape_b_only_on_first_run      = info.reshape_b_only_on_first_run();
    _is_prepared                      = false;
    _fused_assembly_path              = false;
    _flip_signedness                  = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
    _gemm_info                        = gemm_info;

    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();

    const ITensorInfo *a_to_use = a;

    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
    if(_flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        _signed_a                = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
        a_to_use  = &_signed_a;
        _a_offset = _signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
        _signed_output                       = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = _signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a = &_signed_a;
    }

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;
        _mm_result_s32     = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
    }

    // Initialize assembly kernel meta-data
    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
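    // The optimized assembly path is only available on AArch64 builds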
#ifdef __aarch64__
    switch(a->data_type())
    {
        case DataType::QASYMM8:
        case DataType::QASYMM8_SIGNED:
        case DataType::U8:
        case DataType::S8:
        {
            if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
            {
                _asm_glue->configure(a_to_use, b, c, dst, asm_info);
                _fused_assembly_path = _asm_glue->is_configured();
            }
            else
            {
                auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
                _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
            }
            _assembly_path = _asm_glue->is_configured();
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Datatype not supported");
            break;
        }
    }
#endif /* __aarch64__ */
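    // Fall back to the reshaped (interleave + transpose1xW) kernels when neither the assembly path
    // nor the vector-by-matrix special case is used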
    if(!(_assembly_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
        _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());

        // Configure interleave kernel
        _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);

        // Configure transpose kernel
        _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
    }

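    // Outside the fused assembly path the quantization offsets are handled explicitly: row/column sums
    // of A and B feed the offset contribution (and optional output stage) kernels configured below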
    if(!_fused_assembly_path)
    {
        // Build reduction info
        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0)
        {
            _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
            _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
        }

        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);

            // Configure matrix A reduction kernel
            _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
            _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
        }

        if(_fuse_output_stage)
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
            }

            _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
                                                                _flip_signedness ? &_signed_output : dst,
                                                                a->dimension(0),
                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());

            if(_flip_signedness)
            {
                _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
                _convert_from_signed_asymm->configure(&_signed_output, dst);
            }
        }
        else
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, dst);
            }
            // Configure offset contribution kernel
            _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
            _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
                                                   _a_offset, _b_offset);
        }
    }
    // Configure activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    _run_activation                       = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
    if(_run_activation)
    {
        _activation_func = std::make_unique<CpuActivation>();
        _activation_func->configure(dst, nullptr, activation);
    }

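    // Forward the workspace requirements of the assembly kernel so that callers of workspace() can allocate them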
    if(_assembly_path)
    {
        auto asm_mem_req           = _asm_glue->workspace();
        _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
        _aux_mem[Pretranspose]     = asm_mem_req[Pretranspose];
    }

    // Request memory for LHS and RHS reshape matrix
    _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),
                                        !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
                                        _vector_sum_col.total_size());
    _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
    _aux_mem[TmpA]         = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
    _aux_mem[TmpB]         = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
    _aux_mem[MMResultS32]  = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
    _aux_mem[SignedA]      = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
    _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
}

Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in CpuGemmLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    GEMMInfo           info          = gemm_info;
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(info);

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;
    if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
    {
        run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
        run_optimised_requantized = run_optimised;
    }
    else
    {
        run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
    }

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(info.depth_output_gemm3d() != 0)
        {
            if(info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    if(!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if(a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if(b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if(fuse_output_stage)
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                          a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                          b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                          c,
                                                                                                          flip_signedness ? &signed_output : output,
                                                                                                          a_offset, b_offset,
                                                                                                          info.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
                                                                                               a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                               b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                               a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
    }

    return Status{};
}

void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    prepare(tensors);
    auto a        = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto b        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto c        = tensors.get_const_tensor(TensorType::ACL_SRC_2);
    auto dst      = tensors.get_tensor(TensorType::ACL_DST);
    auto a_to_use = a;
    auto matrix_a = a;
    auto matrix_b = b;

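    // Bind the auxiliary workspace tensors declared in configure() to the memory provided through the pack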
    CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
    CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
    CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
    CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
    CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
    CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
    CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);

    // Convert QASYMM8->QASYMM8_SIGNED
    if(_flip_signedness)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, a },
            { TensorType::ACL_DST, signed_a.get() }
        };
        NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
        a_to_use = signed_a.get();
    }

    // Run GEMM
    if(_asm_glue->is_configured())
    {
        ITensorPack asm_glue_tensors = tensors;
        auto        output_to_use    = (_fuse_output_stage ? mm_result_s32.get() : dst);
        if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
            asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
        }
        else
        {
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
            asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
        }
        _asm_glue->run(asm_glue_tensors);
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            matrix_a = tmp_a.get();
            matrix_b = tmp_b.get();
            // Run interleave kernel
            ITensorPack pack_a =
            {
                { TensorType::ACL_SRC, a_to_use },
                { TensorType::ACL_DST, tmp_a.get() }
            };
            NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);

            if(!_reshape_b_only_on_first_run)
            {
                ITensorPack pack_b =
                {
                    { TensorType::ACL_SRC, b },
                    { TensorType::ACL_DST, tmp_b.get() }
                };
                // Run transpose kernel
                NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
            }
        }
        ITensorPack pack_mm =
        {
            { TensorType::ACL_SRC_0, matrix_a },
            { TensorType::ACL_SRC_1, matrix_b }
        };
        if(_fuse_output_stage)
        {
            pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
        }
        else
        {
            pack_mm.add_tensor(TensorType::ACL_DST, dst);
        }
        NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
    }

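    // Apply the offset contributions (and, when fused, the output stage) that the assembly kernel did not handle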
    if(!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            ITensorPack pack =
            {
                { TensorType::ACL_SRC, a_to_use },
                { TensorType::ACL_DST, vector_sum_row.get() }
            };
            NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            ITensorPack pack =
            {
                { TensorType::ACL_SRC, b },
                { TensorType::ACL_DST, vector_sum_col.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
        }

        if(_fuse_output_stage)
        {
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
            pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
            pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
            pack.add_tensor(TensorType::ACL_SRC_3, c);
            pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
        }
        else
        {
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
            pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
            pack.add_tensor(TensorType::ACL_DST, dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, signed_output.get() },
            { TensorType::ACL_DST, dst }
        };
        NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
    }

    // Run fused activation unless already run in the fused assembly
    if(_run_activation)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, dst },
            { TensorType::ACL_DST, dst }
        };
        _activation_func->run(pack);
    }
}

void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
    if(!_is_prepared)
    {
        auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
        // Run assembly reshape
        if(_asm_glue->is_configured())
        {
            _asm_glue->prepare(tensors);

            auto has_reshape = std::find_if(_aux_mem.begin(),
                                            _aux_mem.end(),
                                            [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });

            if(has_reshape != std::end(_aux_mem))
            {
                original_b->mark_as_unused();
            }
        }
        // Run non-assembly reshape
        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
        {
            // Run reshape kernel and mark original weights tensor as unused
            ITensor            *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
            CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
            ITensorPack         pack =
            {
                { TensorType::ACL_SRC, original_b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
        {
            ITensor            *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
            CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
            ITensorPack         pack =
            {
                { TensorType::ACL_SRC, original_b },
                { TensorType::ACL_DST, vector_sum_col.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
        }
        _is_prepared = true;
    }
}

experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
} // namespace cpu
} // namespace arm_compute