Blame - src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp - ml/ComputeLibrary

blob: 07ce0d3b55d5e0bcc83149163d4523be21e66fcb [file] [log] [blame]

Michele Di Giorgio	d02d5ed	2021-01-22 09:47:04 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2021 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	#pragma once
				26
				27	#include "depthwise_depthfirst_multiplier.hpp"
				28
				29	namespace arm_conv {
				30	namespace depthwise {
				31
				32	template <class strategy>
				33	class DepthwiseDepthfirstWithMultiplierQuantized :
				34	public DepthwiseCommon<typename strategy::input_type,
				35	typename strategy::weight_type,
				36	typename strategy::return_type>
				37	{
				38	using Parent = DepthwiseCommon<typename strategy::input_type,
				39	typename strategy::weight_type,
				40	typename strategy::return_type>;
				41	using TInput = typename strategy::input_type;
				42	using TWeight = typename strategy::weight_type;
				43	using TOutput = typename strategy::return_type;
				44
				45	const arm_gemm::Requantize32 m_qp;
				46
				47	size_t sizeof_output_buffer(unsigned int n_channels) const
				48	{
				49	const unsigned int vl = arm_gemm::utils::get_vector_length<typename strategy::return_type>(strategy::vl_type);
				50	const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
				51	return sizeof(typename strategy::return_type) * rounded_channels;
				52	}
				53
				54	public:
				55	DepthwiseDepthfirstWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
				56	: Parent(args), m_qp(qp)
				57	{
				58	}
				59
				60	DepthwiseDepthfirstWithMultiplierQuantized(DepthwiseDepthfirstWithMultiplierQuantized &) = delete;
				61	DepthwiseDepthfirstWithMultiplierQuantized &operator=(DepthwiseDepthfirstWithMultiplierQuantized &) = delete;
				62
				63	size_t get_storage_size(void) const override
				64	{
				65	// We produce VL<int32_t> channels at a time, for each of these blocks of
				66	// channels we store a vector of biases, weights (complicated) and
				67	// requantize parameters.
				68	const unsigned int iter_length =
				69	arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
				70	const unsigned int n_iters =
				71	this->m_args.input_channels * arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
				72
				73	// Compute the cost of storing the weights
				74	const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
				75
				76	return n_iters * iter_length * (
				77	sizeof(int32_t) + // Bias
				78	4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(TWeight) + // Weights
				79	2 * sizeof(int32_t) // Requantisation parameters
				80	);
				81	}
				82
				83	// We'll want an optimised version of this, but for now a C++ implementation
				84	// is probably sufficient.
				85	void pack_parameters(void _buffer, const void _biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
				86	{
				87	auto buffer = static_cast<uint8_t *>(_buffer);
				88	auto biases = static_cast<const int32_t *>(_biases);
				89	auto weights = static_cast<const TWeight *>(_weights);
				90	auto requant_muls = m_qp.per_channel_muls;
				91	auto requant_shifts = m_qp.per_channel_right_shifts;
				92
				93	const unsigned int iter_length =
				94	arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
				95	const unsigned int n_iters_per_input_channel =
				96	arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
				97
				98	const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
				99
				100	const size_t iter_stride = iter_length * (
				101	sizeof(int32_t) + // Bias
				102	4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights
				103	2 * sizeof(int32_t) // Requantisation parameters
				104	);
				105
				106	ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels * this->m_args.channel_multiplier : ld_weight_col;
				107	ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
				108
				109	for (unsigned int input_channel = 0; input_channel < this->m_args.input_channels; input_channel++)
				110	{
				111	auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
				112	auto weights_input_channel = weights + input_channel * this->m_args.channel_multiplier;
				113
				114	for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
				115	{
				116	// Get a pointer to the start of this portion of the buffer; consequently
				117	// derive pointers to the bias, weight and requantisation portions of
				118	// this frame.
				119	auto buffer_base = buffer_input_channel + iter_stride * iter;
				120	auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
				121	auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
				122	auto buffer_requant_mul = reinterpret_cast<int32_t *>(
				123	buffer_weights + strategy::kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
				124	auto buffer_requant_shift = buffer_requant_mul + iter_length;
				125	auto weights_base = weights_input_channel + iter * iter_length;
				126
				127	// Hence work through the data for this iteration, on a
				128	// channel-by-channel basis.
				129	const auto this_iter_length = std::min<unsigned int>(
				130	iter_length, this->m_args.channel_multiplier - iter * iter_length
				131	);
				132	for (unsigned int i = 0; i < this_iter_length; i++)
				133	{
				134	auto weights_channel = weights_base + i;
				135
				136	// Read the bias value, we modify this as we read the weights.
				137	auto bias_value = biases == nullptr ? 0 : *(biases++);
				138	int32_t elements_sum = 0;
				139
				140	// Read through the kernel; for each row, marshal together as many dot
				141	// product terms as are required.
				142	for (unsigned int ki = 0; ki < strategy::kernel_rows; ki++)
				143	{
				144	auto buffer_row = buffer_weights + i4 + ki 4 * n_dots_per_kernel_row * iter_length;
				145	auto weights_row = weights_channel + ki * ld_weight_row;
				146
				147	unsigned int kj = 0;
				148	for (; kj < strategy::kernel_cols; kj++)
				149	{
				150	// Determine which element to which we're writing
				151	const auto dot = kj / 4;
				152	const auto elem = kj % 4;
				153
				154	// Copy the value; include in the sum
				155	const auto val = weights_row[kj * ld_weight_col];
				156	buffer_row[dot * 4 * iter_length + elem] = val;
				157	elements_sum += val;
				158	}
				159	for (; kj < 4 * n_dots_per_kernel_row; kj++)
				160	{
				161	const auto dot = kj / 4;
				162	const auto elem = kj % 4;
				163	buffer_row[dot * 4 * iter_length + elem] = 0;
				164	}
				165
				166	buffer_row += 4 * n_dots_per_kernel_row * iter_length;
				167	}
				168
				169	// Write back the bias and offset values
				170	*(buffer_biases++) =
				171	bias_value - m_qp.a_offset * elements_sum +
				172	strategy::kernel_rows * strategy::kernel_cols * m_qp.a_offset * m_qp.b_offset;
				173
				174	// Write out the requantisation parameters
				175	(buffer_requant_mul++) = m_qp.per_channel_requant ? (requant_muls++) : m_qp.per_layer_mul;
				176	(buffer_requant_shift++) = m_qp.per_channel_requant ? (requant_shifts++) : m_qp.per_layer_right_shift;
				177	}
				178	}
				179	}
				180	}
				181
				182	size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
				183	{
				184	const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
				185	return n_threads * sizeof_output_buffer(n_output_channels);
				186	}
				187
				188	using Parent::execute;
				189	void execute(
				190	const unsigned int batches,
				191	const unsigned int input_height,
				192	const unsigned int input_width,
				193	const unsigned int input_channels,
				194	const PaddingValues &padding,
				195	const void *const _input,
				196	const size_t ld_input_col,
				197	const size_t ld_input_row,
				198	const size_t ld_input_batch,
				199	const void *const parameters,
				200	const unsigned int output_height,
				201	const unsigned int output_width,
				202	void *const _output,
				203	const size_t ld_output_col,
				204	const size_t ld_output_row,
				205	const size_t ld_output_batch,
				206	void *const _working_space,
				207	const unsigned int thread_id,
				208	const unsigned int n_threads
				209	) const override
				210	{
				211	strategy strat(this->m_args.cpu_info);
				212	#ifdef CYCLE_PROFILING
				213	arm_gemm::profiler prof;
				214	#endif
				215
				216	auto executefn = [strat, this] (
				217	const TInput const const inptrs,
				218	TOutput const const outptr_array,
				219	const void *const params
				220	) {
				221	strat.kernel(inptrs, outptr_array, params, this->m_args.channel_multiplier, m_qp);
				222	};
				223
				224	// Get working space for this thread
				225	uint8_t const working_space = static_cast<uint8_t >(_working_space) + get_working_size(1, input_channels) * thread_id;
				226
				227	// Determine the stride across blocks of parameters
				228	const unsigned int iter_length =
				229	arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
				230	const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
				231	const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
				232	const size_t param_stride = n_iters_per_input_channel * iter_length * (
				233	sizeof(int32_t) + // Bias
				234	4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights
				235	2 * sizeof(int32_t) // Requantisation parameters
				236	);
				237
				238	common::depthwise_multiplier_execute<strategy>(
				239	executefn, m_qp.a_offset, this->m_args,
				240	batches, input_height, input_width, input_channels, padding,
				241	_input, ld_input_col, ld_input_row, ld_input_batch,
				242	parameters, param_stride,
				243	output_height, output_width,
				244	_output, ld_output_col, ld_output_row, ld_output_batch,
				245	working_space, thread_id, n_threads
				246	);
				247	}
				248	};
				249
				250	} // namespace depthwise
				251	} // namespace arm_conv