Blame - src/core/NEON/kernels/arm_conv/addressing.cpp - ml/ComputeLibrary

blob: 246039888028ac7335909bfc7a21d5d865c6701a [file] [log] [blame]

/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
				24
				25	#include "addressing.hpp"
Michael Tyler	74921ee	2023-04-12 17:43:17 +0100	[diff] [blame^]	26	#include "utils.hpp"
				27	#include <algorithm>
ramelg01	c827e99	2022-04-08 03:52:28 +0100	[diff] [blame]	28	#include <cstring>
				29
				30	namespace arm_conv {
				31	namespace addressing {
				32
				33	void fill_pointer_array(
				34	size_t element_size,
				35	void **dest_raw, const unsigned int array_rows, const unsigned int array_cols,
				36	void *base_ptr_raw, size_t ld_row, size_t ld_col,
				37	void *pad_buffer_raw,
				38	const unsigned int pad_top, const unsigned int valid_rows,
				39	const unsigned int pad_left, const unsigned int valid_cols
				40	)
				41	{
				42	auto dest = reinterpret_cast<char **>(dest_raw);
				43	auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
				44	auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
				45	ld_row *= element_size;
				46	ld_col *= element_size;
				47
				48	const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
				49	const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);
				50
				51	unsigned int i = 0;
				52	for (; i < pad_top; i++)
				53	{
				54	for (unsigned int j = 0; j < array_cols; j++)
				55	{
				56	*(dest++) = pad_buffer;
				57	}
				58	}
				59	for (; i < last_valid_row; i++)
				60	{
				61	unsigned int j = 0;
				62	auto colptr = base_ptr;
				63	base_ptr += ld_row;
				64
				65	for (; j < pad_left; j++)
				66	{
				67	*(dest++) = pad_buffer;
				68	}
				69	for (; j < last_valid_col; j++)
				70	{
				71	*(dest++) = colptr;
				72	colptr += ld_col;
				73	}
				74	for (; j < array_cols; j++)
				75	{
				76	*(dest++) = pad_buffer;
				77	}
				78	}
				79	for (; i < array_rows; i++)
				80	{
				81	for (unsigned int j = 0; j < array_cols; j++)
				82	{
				83	*(dest++) = pad_buffer;
				84	}
				85	}
				86	}
				87
				88
				89	void fill_pointer_array_generic_kernel(
				90	const size_t element_size,
				91	void **dest_raw,
				92	const unsigned int output_rows, const unsigned int output_cols,
				93	const unsigned int kernel_rows, const unsigned int kernel_cols,
				94	const unsigned int stride_rows, const unsigned int stride_cols,
				95	void *base_ptr_raw, size_t ld_row, size_t ld_col,
				96	void *pad_buffer_raw,
				97	const unsigned int pad_top, const unsigned int valid_rows,
				98	const unsigned int pad_left, const unsigned int valid_cols
				99	)
				100	{
				101	auto dest = reinterpret_cast<char **>(dest_raw);
				102	auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
				103	auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
				104	ld_row *= element_size;
				105	ld_col *= element_size;
				106
				107	const auto last_valid_row = pad_top + valid_rows;
				108	const auto last_valid_col = pad_left + valid_cols;
				109	const auto point_stride = output_rows * output_cols;
				110
				111	// Iterate over the output points, after every point increment the pointer
				112	// into the address array.
				113	for (unsigned int oi = 0; oi < output_rows; oi++)
				114	{
				115	for (unsigned int oj = 0; oj < output_cols; oj++)
				116	{
				117	auto point_dest = dest;
				118	dest++;
				119
				120	// Iterate over kernel points and fill in the pointer array.
				121	unsigned int ki = 0, ii = oi*stride_rows;
				122	for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
				123	{
				124	// Fill with padding
				125	for (unsigned int j = 0; j < kernel_cols; j++)
				126	{
				127	*point_dest = pad_buffer;
				128	point_dest += point_stride;
				129	}
				130	}
				131	for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
				132	{
				133	unsigned int kj = 0, ij = oj*stride_cols;
				134	for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
				135	{
				136	// Padding
				137	*point_dest = pad_buffer;
				138	point_dest += point_stride;
				139	}
				140	for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
				141	{
				142	point_dest = base_ptr + (ii - pad_top)ld_row + (ij - pad_left)*ld_col;
				143	point_dest += point_stride;
				144	}
				145	for (; kj < kernel_cols; kj++)
				146	{
				147	// Padding
				148	*point_dest = pad_buffer;
				149	point_dest += point_stride;
				150	}
				151	}
				152	for (; ki < kernel_rows; ki++)
				153	{
				154	// Fill with padding
				155	for (unsigned int j = 0; j < kernel_cols; j++)
				156	{
				157	*point_dest = pad_buffer;
				158	point_dest += point_stride;
				159	}
				160	}
				161	}
				162	}
				163	}
				164
				165	/* Patch array constructor
				166	*
				167	* Some depthwise kernels require an NCHW-ordered patch of input. Here we
				168	* construct such a patch, and fill in an array of pointers to the rows of the
				169	* patch.
				170	*/
				171	void fill_nchw_patch_array(
				172	size_t element_size,
				173	const void **dest_row_pointers_raw, // Array of pointers to each row of the patch
				174	void *dest_patch_raw, // Pointer to space which can be used to construct the patch
				175	const unsigned int patch_rows, unsigned int patch_cols, // Patch size
				176	const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
				177	const void *pad_row, // Pointer to a row of padding values
				178	const unsigned int pad_top, const unsigned int valid_rows,
				179	const unsigned int pad_left, const unsigned int valid_cols
				180	)
				181	{
				182	// Convert into more useful types
				183	auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
				184	auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
				185	auto src = reinterpret_cast<const char *>(src_ptr_raw);
				186	ld_row *= element_size;
				187	ld_col *= element_size;
				188
				189	// Round up the patch columns to be a full quad
				190	patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);
				191
				192	const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
				193	const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);
				194
				195	// Construct the patch and row pointer array together
				196	unsigned int i = 0;
				197	for (; i < pad_top; i++)
				198	{
				199	// Insert pointers into the padding row
				200	(row_pointers++) = reinterpret_cast<const char >(pad_row);
				201	}
				202	for (; i < last_valid_row; i++)
				203	{
				204	// Get a copy of the pointer for this row
				205	auto colptr = src;
				206	src += ld_row;
				207
				208	// If the input is already in NCHW format (ld_col == element_size) AND
				209	// there is no padding, then we just use a pointer to the source tensor;
				210	// otherwise we need to construct a patch and provide a pointer to it.
				211	if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
				212	{
				213	*(row_pointers++) = colptr;
				214	}
				215	else
				216	{
				217	auto patch_col = dest_patch;
				218	*(row_pointers++) = dest_patch;
				219	dest_patch += element_size * patch_cols; // Move the patch pointer on
				220
				221	// Construct the patch; fill the entirety with padding and then copy in
				222	// the valid elements.
				223	memcpy(patch_col, pad_row, element_size * patch_cols);
				224	patch_col += pad_left * element_size; // Move over the left padding
				225
				226	if (ld_col == element_size)
				227	{
				228	// If the input is NCHW then copy across as many columns as we can.
				229	memcpy(patch_col, colptr, (last_valid_col - pad_left) * element_size);
				230	}
				231	else
				232	{
				233	// If the input is NHWC then copy columns across in turn.
				234	for (auto j = pad_left; j < last_valid_col; j++)
				235	{
				236	memcpy(patch_col, colptr, element_size); // Copy the valid element
				237	patch_col += element_size; // Progress the patch destination
				238	colptr += ld_col; // Progress the patch source
				239	}
				240	}
				241	}
				242	}
				243	for (; i < patch_rows; i++)
				244	{
				245	// Insert pointers into the padding row
				246	(row_pointers++) = reinterpret_cast<const char >(pad_row);
				247	}
				248	}
				249
				250
				251	/* Patch array constructor (generic kernels)
				252	*
				253	* Construct an array of pointers; one pointer for each output row for each
				254	* kernel point. Pointers should point at a whole number of QUADS containing an
				255	* input point for each output point. If the kernel column stride is 1 and the
				256	* data is NCHW then the input tensor might be addressed directly, otherwise a
				257	* new patch sample might need to be constructed.
				258	*/
				259	void fill_patch_array_generic_kernel(
				260	size_t element_size,
				261	const void **dest_pointers_raw, // Pointers: one per output row per kernel point
				262	void *patch_raw, // Pointer to space which can be used to construct the patch
				263	const unsigned int output_rows, const unsigned int output_cols,
				264	const unsigned int kernel_rows, const unsigned int kernel_cols,
				265	const unsigned int stride_rows, const unsigned int stride_cols,
				266	const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
				267	const void *pad_row, // Pointer to a row of padding values
				268	const unsigned int pad_top, const unsigned int valid_rows,
				269	const unsigned int pad_left, const unsigned int valid_cols
				270	)
				271	{
				272	auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
				273	auto patch = reinterpret_cast<char *>(patch_raw);
				274	auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
				275	ld_row *= element_size;
				276	ld_col *= element_size;
				277
				278	// Round up the patch columns to a multiple of quad-length
				279	const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);
				280
				281	const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
				282	const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);
				283
				284	const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
				285	const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);
				286
				287	for (auto ki = 0u; ki < kernel_rows; ki++)
				288	{
				289	for (auto kj = 0u; kj < kernel_cols; kj++)
				290	{
				291	auto oi = 0u, ii = ki;
				292	for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
				293	{
				294	// Insert a pointer to the padding row
				295	(dest++) = reinterpret_cast<const char >(pad_row);
				296	}
				297	for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
				298	{
				299	auto rowptr = src_ptr + (ii - pad_top) * ld_row;
				300
				301	// Construct a sample of the input here
				302	auto patch_pos = patch;
				303	*(dest++) = patch;
				304	patch += patch_cols * element_size;
				305
				306	// Fill with padding
				307	memcpy(patch_pos, pad_row, patch_cols * element_size);
				308
				309	// Fill in the valid elements
				310	auto oj = 0u, ij = kj;
				311	for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
				312	{
				313	// Do nothing for padding
				314	patch_pos += element_size;
				315	}
				316	for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
				317	{
				318	// Copy from the source tensor
				319	memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
				320	patch_pos += element_size;
				321	}
				322	// No action required for right-hand padding
				323	}
				324	for (; oi < output_rows; oi++)
				325	{
				326	(dest++) = reinterpret_cast<const char >(pad_row);
				327	}
				328	}
				329	}
				330	}
				331
				332	} // namespace addressing
				333	} // namespace arm_conv