/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "addressing.hpp"
#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

#include <algorithm>  // std::min
#include <cstring>    // memcpy

namespace arm_conv {
namespace addressing {
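
/* Fill an array_rows x array_cols array of pointers, one per input point.
 * In-bounds entries point into the tensor at base_ptr_raw; entries which fall
 * in the padding region (above or to the left of the valid area, or beyond
 * the last valid row or column) point at the shared pad_buffer_raw instead.
 * ld_row and ld_col are given in elements and converted to bytes below.
 */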
void fill_pointer_array(
  size_t element_size,
  void **dest_raw, const unsigned int array_rows, const unsigned int array_cols,
  void *base_ptr_raw, size_t ld_row, size_t ld_col,
  void *pad_buffer_raw,
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  auto dest = reinterpret_cast<char **>(dest_raw);
  auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
  auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);

  // Convert the leading dimensions from elements to bytes
  ld_row *= element_size;
  ld_col *= element_size;

  const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
  const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);

  // Rows above the valid region point entirely at the padding buffer
  unsigned int i = 0;
  for (; i < pad_top; i++)
  {
    for (unsigned int j = 0; j < array_cols; j++)
    {
      *(dest++) = pad_buffer;
    }
  }
  // Valid rows: left padding, then valid columns, then right padding
  for (; i < last_valid_row; i++)
  {
    unsigned int j = 0;
    auto colptr = base_ptr;
    base_ptr += ld_row;

    for (; j < pad_left; j++)
    {
      *(dest++) = pad_buffer;
    }
    for (; j < last_valid_col; j++)
    {
      *(dest++) = colptr;
      colptr += ld_col;
    }
    for (; j < array_cols; j++)
    {
      *(dest++) = pad_buffer;
    }
  }
  // Rows below the valid region point entirely at the padding buffer
  for (; i < array_rows; i++)
  {
    for (unsigned int j = 0; j < array_cols; j++)
    {
      *(dest++) = pad_buffer;
    }
  }
}
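
/* Usage sketch (illustrative only, not part of the library). For a 4x4
 * pointer array over a 3x3 single-channel float tensor with one row of top
 * padding and one column of left padding:
 *
 *   float tensor[3][3] = {};  // hypothetical source data
 *   float pad[4] = {};        // hypothetical buffer of zero padding values
 *   void *ptrs[4 * 4];
 *   fill_pointer_array(
 *     sizeof(float), ptrs, 4, 4,
 *     tensor, 3, 1,  // ld_row and ld_col are in ELEMENTS, not bytes
 *     pad,
 *     1, 3,  // one row of top padding, three valid rows
 *     1, 3   // one column of left padding, three valid columns
 *   );
 *
 * Afterwards the whole first row of ptrs and the first entry of every other
 * row alias pad, while ptrs[5] points at &tensor[0][0].
 */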
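
/* As above, but for generic kernels: fill one pointer per kernel point per
 * output point. The pointers for the kernel points of a given output point
 * are spaced output_rows * output_cols entries apart, so consecutive array
 * entries hold the same kernel point for consecutive output points.
 */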
void fill_pointer_array_generic_kernel(
  const size_t element_size,
  void **dest_raw,
  const unsigned int output_rows, const unsigned int output_cols,
  const unsigned int kernel_rows, const unsigned int kernel_cols,
  const unsigned int stride_rows, const unsigned int stride_cols,
  void *base_ptr_raw, size_t ld_row, size_t ld_col,
  void *pad_buffer_raw,
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  auto dest = reinterpret_cast<char **>(dest_raw);
  auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
  auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);

  // Convert the leading dimensions from elements to bytes
  ld_row *= element_size;
  ld_col *= element_size;

  const auto last_valid_row = pad_top + valid_rows;
  const auto last_valid_col = pad_left + valid_cols;
  const auto point_stride = output_rows * output_cols;

  // Iterate over the output points; after every point, increment the pointer
  // into the address array.
  for (unsigned int oi = 0; oi < output_rows; oi++)
  {
    for (unsigned int oj = 0; oj < output_cols; oj++)
    {
      auto point_dest = dest;
      dest++;

      // Iterate over kernel points and fill in the pointer array.
      unsigned int ki = 0, ii = oi*stride_rows;
      for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
      {
        // Fill with padding
        for (unsigned int j = 0; j < kernel_cols; j++)
        {
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
      for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
      {
        unsigned int kj = 0, ij = oj*stride_cols;
        for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
        {
          // Padding
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
        for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
        {
          *point_dest = base_ptr + (ii - pad_top)*ld_row + (ij - pad_left)*ld_col;
          point_dest += point_stride;
        }
        for (; kj < kernel_cols; kj++)
        {
          // Padding
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
      for (; ki < kernel_rows; ki++)
      {
        // Fill with padding
        for (unsigned int j = 0; j < kernel_cols; j++)
        {
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
    }
  }
}
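
/* Indexing sketch (derived from the loops above): the pointer for kernel
 * point (ki, kj) and output point (oi, oj) lands at
 *
 *   dest[(ki * kernel_cols + kj) * point_stride + (oi * output_cols + oj)]
 *
 * where point_stride == output_rows * output_cols.
 */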

/* Patch array constructor
 *
 * Some depthwise kernels require an NCHW-ordered patch of input. Here we
 * construct such a patch, and fill in an array of pointers to the rows of the
 * patch.
 */
void fill_nchw_patch_array(
  size_t element_size,
  const void **dest_row_pointers_raw,  // Array of pointers to each row of the patch
  void *dest_patch_raw,  // Pointer to space which can be used to construct the patch
  const unsigned int patch_rows, unsigned int patch_cols,  // Patch size
  const void *src_ptr_raw, size_t ld_row, size_t ld_col,  // Source tensor
  const void *pad_row,  // Pointer to a row of padding values
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  // Convert into more useful types
  auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
  auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
  auto src = reinterpret_cast<const char *>(src_ptr_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  // Round up the patch columns to be a full quad
  patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);

  const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
  const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);

  // Construct the patch and row pointer array together
  unsigned int i = 0;
  for (; i < pad_top; i++)
  {
    // Insert pointers into the padding row
    *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
  }
  for (; i < last_valid_row; i++)
  {
    // Get a copy of the pointer for this row
    auto colptr = src;
    src += ld_row;

    // If the input is already in NCHW format (ld_col == element_size) AND
    // there is no padding, then we just use a pointer to the source tensor;
    // otherwise we need to construct a patch and provide a pointer to it.
    if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
    {
      *(row_pointers++) = colptr;
    }
    else
    {
      auto patch_col = dest_patch;
      *(row_pointers++) = dest_patch;
      dest_patch += element_size * patch_cols;  // Move the patch pointer on

      // Construct the patch; fill the entirety with padding and then copy in
      // the valid elements.
      memcpy(patch_col, pad_row, element_size * patch_cols);
      patch_col += pad_left * element_size;  // Move over the left padding

      if (ld_col == element_size)
      {
        // If the input is NCHW then copy across as many columns as we can.
        memcpy(patch_col, colptr, (last_valid_col - pad_left) * element_size);
      }
      else
      {
        // If the input is NHWC then copy columns across in turn.
        for (auto j = pad_left; j < last_valid_col; j++)
        {
          memcpy(patch_col, colptr, element_size);  // Copy the valid element
          patch_col += element_size;  // Progress the patch destination
          colptr += ld_col;  // Progress the source pointer
        }
      }
    }
  }
  for (; i < patch_rows; i++)
  {
    // Insert pointers into the padding row
    *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
  }
}
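
/* Sizing note (derived from the code above): both pad_row and each
 * constructed patch row are accessed as patch_cols elements AFTER patch_cols
 * has been rounded up to a whole 16-byte quad, so the padding row and the
 * patch buffer should be sized for the rounded width. At most the valid rows
 * are materialised in the patch buffer; rows which can alias the source
 * tensor or the padding row directly consume no patch space.
 */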

/* Patch array constructor (generic kernels)
 *
 * Construct an array of pointers; one pointer for each output row for each
 * kernel point. Pointers should point at a whole number of QUADS containing an
 * input point for each output point. If the kernel column stride is 1 and the
 * data is NCHW then the input tensor might be addressed directly, otherwise a
 * new patch sample might need to be constructed.
 */
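
/* Note: as written, the loop below always materialises a patch sample; the
 * direct-addressing case described above is not special-cased here (compare
 * fill_nchw_patch_array, which aliases the source tensor when it can).
 */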
void fill_patch_array_generic_kernel(
  size_t element_size,
  const void **dest_pointers_raw,  // Pointers: one per output row per kernel point
  void *patch_raw,  // Pointer to space which can be used to construct the patch
  const unsigned int output_rows, const unsigned int output_cols,
  const unsigned int kernel_rows, const unsigned int kernel_cols,
  const unsigned int stride_rows, const unsigned int stride_cols,
  const void *src_ptr_raw, size_t ld_row, size_t ld_col,  // Source tensor
  const void *pad_row,  // Pointer to a row of padding values
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
  auto patch = reinterpret_cast<char *>(patch_raw);
  auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  // Round up the patch columns to a multiple of quad-length
  const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);

  const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
  const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);

  const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
  const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);

  for (auto ki = 0u; ki < kernel_rows; ki++)
  {
    for (auto kj = 0u; kj < kernel_cols; kj++)
    {
      auto oi = 0u, ii = ki;
      for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
      {
        // Insert a pointer to the padding row
        *(dest++) = reinterpret_cast<const char *>(pad_row);
      }
      for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
      {
        auto rowptr = src_ptr + (ii - pad_top) * ld_row;

        // Construct a sample of the input here
        auto patch_pos = patch;
        *(dest++) = patch;
        patch += patch_cols * element_size;

        // Fill with padding
        memcpy(patch_pos, pad_row, patch_cols * element_size);

        // Fill in the valid elements
        auto oj = 0u, ij = kj;
        for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
        {
          // Do nothing for padding
          patch_pos += element_size;
        }
        for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
        {
          // Copy from the source tensor
          memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
          patch_pos += element_size;
        }
        // No action required for right-hand padding
      }
      for (; oi < output_rows; oi++)
      {
        *(dest++) = reinterpret_cast<const char *>(pad_row);
      }
    }
  }
}
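
/* Layout sketch (derived from the loops above): the pointers are written in
 * kernel-point-major order, one per output row, so the row pointer for kernel
 * point (ki, kj) and output row oi lands at
 *
 *   dest[(ki * kernel_cols + kj) * output_rows + oi]
 *
 * Each pointer addresses a quad-padded row holding one input element per
 * output column.
 */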

} // namespace addressing
} // namespace arm_conv