blob: bac27430f972ea510c60bc1e27f01fe8d39ead35 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +01002 * Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEConvolutionKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Utils.h"
33#include "arm_compute/core/Validate.h"
34#include "arm_compute/core/Window.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010035#include "src/core/helpers/AutoConfiguration.h"
36#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037
38#include <algorithm>
39#include <arm_neon.h>
40#include <array>
41#include <cstdint>
42#include <cstring>
43#include <tuple>
44
45namespace arm_compute
46{
47namespace
48{
// Upper clamp used when narrowing unsigned 16-bit accumulators into an S16 output.
const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
50
/** Saturate two S32 accumulators down to S16 and store 8 results contiguously. */
inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
{
    const int16x4_t lo = vqmovn_s32(out);
    const int16x4_t hi = vqmovn_s32(out2);
    vst1q_s16(output, vcombine_s16(lo, hi));
}
57
/** Saturate two S32 accumulators to U8 (via an unsigned U16 narrow) and store 8 bytes. */
inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
{
    const uint16x4_t lo = vqmovun_s32(out);
    const uint16x4_t hi = vqmovun_s32(out2);
    vst1_u8(output, vqmovn_u16(vcombine_u16(lo, hi)));
}
64
/** Narrow two U32 accumulators to U16 with saturation, clamp to INT16_MAX,
 *  then reinterpret as S16 so the stored values are never negative. */
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
{
    const uint16x8_t narrowed = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
    const uint16x8_t clamped  = vminq_u16(narrowed, max_int16);
    vst1q_s16(output, vreinterpretq_s16_u16(clamped));
}
71
/** Two saturating narrowing steps, U32 -> U16 -> U8, then one 8-byte store. */
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
{
    const uint16x4_t lo = vqmovn_u32(out);
    const uint16x4_t hi = vqmovn_u32(out2);
    vst1_u8(output, vqmovn_u16(vcombine_u16(lo, hi)));
}
78
/** Store 16 S16 results (two 8-lane registers) back to back; no conversion needed. */
inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
{
    vst1q_s16(output, out);
    vst1q_s16(output + 8, out2);
}
84
/** Saturate 16 signed 16-bit values into the U8 range and store them in one shot. */
inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
{
    const uint8x8_t lo = vqmovun_s16(out);
    const uint8x8_t hi = vqmovun_s16(out2);
    vst1q_u8(output, vcombine_u8(lo, hi));
}
91
/** Saturating narrow of both U16 halves to U8 followed by a single 16-byte store. */
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
{
    const uint8x8_t lo = vqmovn_u16(out);
    const uint8x8_t hi = vqmovn_u16(out2);
    vst1q_u8(output, vcombine_u8(lo, hi));
}
98
/** Clamp unsigned 16-bit values to INT16_MAX so reinterpreting as S16 cannot
 *  produce negatives, then store all 16 results. */
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
{
    const int16x8_t lo = vreinterpretq_s16_u16(vminq_u16(out, max_int16));
    const int16x8_t hi = vreinterpretq_s16_u16(vminq_u16(out2, max_int16));
    vst1q_s16(output, lo);
    vst1q_s16(output + 8, hi);
}
104
/** Accumulate one image row against three pre-broadcast coefficients.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7]. */
inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
{
    // Widen the 16 input bytes to signed 16-bit; only the first 12 values are
    // used, handled as three blocks of 4 (samples 0-3, 4-7 and 8-11).
    const int16x8_t lower_half = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t upper_half = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t blk0 = vget_low_s16(lower_half);
    const int16x4_t blk1 = vget_high_s16(lower_half);
    const int16x4_t blk2 = vget_low_s16(upper_half);

    // Pixels [0,3]: left, middle and right taps.
    out = vmlal_s16(out, blk0, mat0);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 1), mat1);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 2), mat2);

    // Pixels [4,7]: the same taps shifted one block to the right.
    out2 = vmlal_s16(out2, blk1, mat0);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 2), mat2);
}
134
/** 3-tap row convolution: broadcast the coefficients, then reuse the unrolled kernel. */
inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t k0 = vld1_dup_s16(&convolution[0]);
    const int16x4_t k1 = vld1_dup_s16(&convolution[1]);
    const int16x4_t k2 = vld1_dup_s16(&convolution[2]);

    convolve_row3x1_unrolled(out, out2, row_data, k0, k1, k2);
}
143
/** Accumulate one image row against a 5-tap row of the convolution matrix.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7]. */
inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    // Broadcast the five row coefficients into one register each.
    const int16x4_t k0 = vld1_dup_s16(&convolution[0]);
    const int16x4_t k1 = vld1_dup_s16(&convolution[1]);
    const int16x4_t k2 = vld1_dup_s16(&convolution[2]);
    const int16x4_t k3 = vld1_dup_s16(&convolution[3]);
    const int16x4_t k4 = vld1_dup_s16(&convolution[4]);

    // Widen the input bytes to signed 16-bit, as three blocks of 4 values.
    const int16x8_t lower_half = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t upper_half = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t blk0 = vget_low_s16(lower_half);
    const int16x4_t blk1 = vget_high_s16(lower_half);
    const int16x4_t blk2 = vget_low_s16(upper_half);

    // Pixels [0,3]: taps at offsets 0..4 from the leftmost sample.
    out = vmlal_s16(out, blk0, k0);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 1), k1);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 2), k2);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 3), k3);
    out = vmlal_s16(out, blk1, k4);

    // Pixels [4,7]: identical taps shifted one block to the right.
    out2 = vmlal_s16(out2, blk1, k0);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 1), k1);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 2), k2);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 3), k3);
    out2 = vmlal_s16(out2, blk2, k4);
}
187
/** Accumulate one image row against a 7-tap row of the convolution matrix.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7]. */
inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    // Broadcast the seven row coefficients into one register each.
    const int16x4_t k0 = vld1_dup_s16(&convolution[0]);
    const int16x4_t k1 = vld1_dup_s16(&convolution[1]);
    const int16x4_t k2 = vld1_dup_s16(&convolution[2]);
    const int16x4_t k3 = vld1_dup_s16(&convolution[3]);
    const int16x4_t k4 = vld1_dup_s16(&convolution[4]);
    const int16x4_t k5 = vld1_dup_s16(&convolution[5]);
    const int16x4_t k6 = vld1_dup_s16(&convolution[6]);

    // Widen the 16 input bytes to signed 16-bit, as four blocks of 4 values.
    const int16x8_t lower_half = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t upper_half = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t blk0 = vget_low_s16(lower_half);
    const int16x4_t blk1 = vget_high_s16(lower_half);
    const int16x4_t blk2 = vget_low_s16(upper_half);
    const int16x4_t blk3 = vget_high_s16(upper_half);

    // Pixels [0,3]: taps at offsets 0..6 from the leftmost sample.
    out = vmlal_s16(out, blk0, k0);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 1), k1);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 2), k2);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 3), k3);
    out = vmlal_s16(out, blk1, k4);
    out = vmlal_s16(out, vext_s16(blk1, blk2, 1), k5);
    out = vmlal_s16(out, vext_s16(blk1, blk2, 2), k6);

    // Pixels [4,7]: identical taps shifted one block to the right.
    out2 = vmlal_s16(out2, blk1, k0);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 1), k1);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 2), k2);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 3), k3);
    out2 = vmlal_s16(out2, blk2, k4);
    out2 = vmlal_s16(out2, vext_s16(blk2, blk3, 1), k5);
    out2 = vmlal_s16(out2, vext_s16(blk2, blk3, 2), k6);
}
242
/** Accumulate one image row against a 9-tap row of the convolution matrix.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7].
 *  NOTE(review): the original comments labelled the out2 section "[0,3]";
 *  it computes pixels [4,7] (one block to the right), matching the 7x1 variant. */
inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
    const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
    const int16x4_t mat8 = vld1_dup_s16(convolution + 8);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x4_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1),
            vget_high_s16(s16_tmp1)
        }
    };

    // Calculate row left 4 value for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    // Calculate row left 3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    // Calculate row left 2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    // Calculate row left 1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    // Calculate row middle value for pixels [0,3]
    out = vmlal_s16(out, row.val[1], mat4);
    // Calculate row right +1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
    // Calculate row right +2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
    // Calculate row right +3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
    // Calculate row right +4 value for pixels [0,3]
    out = vmlal_s16(out, row.val[2], mat8);

    // Calculate row left 4 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    // Calculate row left 3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    // Calculate row left 2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    // Calculate row left 1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    // Calculate row middle value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[2], mat4);
    // Calculate row right +1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
    // Calculate row right +2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
    // Calculate row right +3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
    // Calculate row right +4 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[3], mat8);
}
307} // namespace
308
309/****************************************************************************************\
310 * Square Convolution *
311\****************************************************************************************/
312
// Default-construct with a zero scale and an all-zero coefficient matrix;
// the real values are filled in by configure().
template <unsigned int matrix_size>
NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
    : INESimpleKernel(), _scale(0), _convolution{ {} }
{
}
318
319template <unsigned int matrix_size>
320BorderSize NEConvolutionKernel<matrix_size>::border_size() const
321{
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100322 return BorderSize{ matrix_size / 2 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100323}
324
325template <unsigned int matrix_size>
326void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
327{
328 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
329
330 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
331
332 ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
333 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
334 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
335
336 _input = input;
337 _output = output;
338
339 std::copy_n(conv, _convolution.size(), _convolution.begin());
340
341 if(scale == 0)
342 {
343 _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
344 }
345 else
346 {
347 _scale = scale;
348 }
349
350 // Configure kernel window
351 constexpr unsigned int num_elems_processed_per_iteration = 8;
352 constexpr unsigned int num_elems_read_per_iteration = 16;
353 constexpr unsigned int num_elems_written_per_iteration = 8;
354
355 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
356 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
357
358 update_window_and_padding(win,
359 AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
360 output_access);
361
362 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
363
364 INEKernel::configure(win);
365}
366
/** 3x3 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert).
 *  Fixes a copy-paste comment: the third load reads the low row, not the middle one. */
template <>
template <typename OutputType>
void NEConvolutionKernel<3>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Load the matrix's coefficients into NEON registers:
    const int16x4_t   mat00     = vld1_dup_s16(_convolution.data());
    const int16x4_t   mat01     = vld1_dup_s16(_convolution.data() + 1);
    const int16x4_t   mat02     = vld1_dup_s16(_convolution.data() + 2);
    const int16x4_t   mat10     = vld1_dup_s16(_convolution.data() + 3);
    const int16x4_t   mat11     = vld1_dup_s16(_convolution.data() + 4);
    const int16x4_t   mat12     = vld1_dup_s16(_convolution.data() + 5);
    const int16x4_t   mat20     = vld1_dup_s16(_convolution.data() + 6);
    const int16x4_t   mat21     = vld1_dup_s16(_convolution.data() + 7);
    const int16x4_t   mat22     = vld1_dup_s16(_convolution.data() + 8);
    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (1).
    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top row:
        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);

        // Load 16 bytes from the middle row:
        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);

        // Load 16 bytes from the low row:
        const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
423
/** 5x5 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert). */
template <>
template <typename OutputType>
void NEConvolutionKernel<5>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (2).
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row5x1(out, out2, data_t2, _convolution.data());

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row5x1(out, out2, data_m, _convolution.data() + 10);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
480
/** 7x7 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert). */
template <>
template <typename OutputType>
void NEConvolutionKernel<7>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (3).
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top3 row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row7x1(out, out2, data_t3, _convolution.data());

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row7x1(out, out2, data_m, _convolution.data() + 21);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);

        // Load 16 bytes from the low3 row:
        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
547
/** 9x9 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert). */
template <>
template <typename OutputType>
void NEConvolutionKernel<9>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (4).
    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top4 row:
        const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
        convolve_row9x1(out, out2, data_t4, _convolution.data());

        // Load 16 bytes from the top3 row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row9x1(out, out2, data_m, _convolution.data() + 36);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);

        // Load 16 bytes from the low3 row:
        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);

        // Load 16 bytes from the low4 row:
        const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
        convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
624
/** Execute the kernel on a sub-window, dispatching on the configured output type
 *  (U8 or S16, validated in configure()). */
template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_output->info()->data_type())
    {
        case DataType::U8:
            convolution<uint8_t>(window);
            break;
        case DataType::S16:
            convolution<int16_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported Data type!");
            break;
    }
}
645
646template class arm_compute::NEConvolutionKernel<3>;
647template class arm_compute::NEConvolutionKernel<5>;
648template class arm_compute::NEConvolutionKernel<7>;
649template class arm_compute::NEConvolutionKernel<9>;
650
651/****************************************************************************************\
652 * Separable Square Convolution *
653\****************************************************************************************/
654
// Default-construct with zeroed row coefficients and no border;
// configure() fills in both.
template <unsigned int matrix_size>
NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
    : _conv_row{ { 0 } }, _border_size(0)
{
}
660
template <unsigned int matrix_size>
BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
{
    // Computed in configure(); depends on matrix_size and border_undefined.
    return _border_size;
}
666
/** Validate the tensors, store the row coefficients and set up the execution
 *  window for the horizontal pass of the separable convolution. */
template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);

    _input  = input;
    _output = output;
    // Keep a private copy of the matrix_size row coefficients.
    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
    // Horizontal pass: left/right border is always needed; top/bottom only when defined.
    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 8;

    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}
699
/** Execute the horizontal pass, dispatching on the configured intermediate
 *  data type (U16, S16 or S32, validated in configure()). */
template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    switch(_output->info()->data_type())
    {
        case DataType::U16:
            convolve<uint16_t>(window);
            break;
        case DataType::S16:
            convolve<int16_t>(window);
            break;
        case DataType::S32:
            convolve<int32_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
            break;
    }
}
722
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
{
    // 5-tap horizontal pass with U16 accumulation.
    // Shift the read window 2 elements left so the first load covers the
    // leftmost tap of the 5-wide neighbourhood around each output pixel.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels and widen to two U16 vectors: lanes [0..7] and [8..15].
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        // Multiply-accumulate the 5 taps; vextq_u16(lo, hi, i) selects the
        // 8-lane window starting at pixel offset i (input shifted by one tap per step).
        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);

        // Store 8 U16 intermediate results.
        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
755
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
{
    // 5-tap horizontal pass with S16 accumulation (same scheme as the U16
    // variant, but the widened pixels are reinterpreted as signed so the
    // S16 coefficients can produce negative partial sums).
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Accumulate the 5 taps; vextq_s16(lo, hi, i) is the 8-lane window at offset i.
        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
788
template <>
template <>
void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
{
    // 5-tap horizontal pass with S32 accumulation: widening multiplies
    // (S16 x S16 -> S32) avoid the overflow possible in the 16-bit variants.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Pre-compute the shifted 8-lane windows for taps 1..4
        // (l/m/r = left of / at / right of the centre pixel).
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);

        // Accumulate lanes 0..3 with widening multiply-accumulates.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Accumulate lanes 4..7.
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
834
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
{
    // 7-tap horizontal pass with U16 accumulation.
    // Shift the read window 3 elements left to reach the leftmost tap.
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels and widen to two U16 vectors.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        // Accumulate the 7 taps; vextq_u16(lo, hi, i) is the 8-lane window at offset i.
        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
869
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
{
    // 7-tap horizontal pass with S16 accumulation.
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Accumulate the 7 taps; each vextq_s16 offset selects the next tap's pixels.
        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
904
template <>
template <>
void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
{
    // 7-tap horizontal pass with S32 accumulation via widening multiplies.
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Shifted 8-lane windows for taps 1..6 (l = left of centre, m = centre, r = right).
        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);

        // Accumulate lanes 0..3 into S32.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Accumulate lanes 4..7.
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
956
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
{
    // 9-tap horizontal pass with U16 accumulation.
    // Shift the read window 4 elements left to reach the leftmost tap; the
    // 16-pixel load then covers all 9 taps for 8 outputs (offsets 0..8).
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels and widen to two U16 vectors.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        // Accumulate the 9 taps; tap 8 is exactly the upper half, so no vextq is needed.
        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
        out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
993
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
{
    // 9-tap horizontal pass with S16 accumulation.
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Accumulate the 9 taps; tap 8 is the upper half itself (offset 8).
        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
        out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
1030
template <>
template <>
void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
{
    // 9-tap horizontal pass with S32 accumulation via widening multiplies.
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Shifted 8-lane windows for taps 1..7; tap 8 is data_s16.val[1] itself.
        const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);

        // Accumulate lanes 0..3 into S32.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Accumulate lanes 4..7.
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
1087
// Explicit instantiations of the horizontal kernel for the supported matrix sizes.
template class arm_compute::NESeparableConvolutionHorKernel<5>;
template class arm_compute::NESeparableConvolutionHorKernel<7>;
template class arm_compute::NESeparableConvolutionHorKernel<9>;
1091
// Default constructor: zero-initialise the column coefficients and the scale;
// the real values are provided later by configure().
template <unsigned int matrix_size>
NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
    : _conv_col{ { 0 } }, _scale(0)
{
}
1097
template <unsigned int matrix_size>
BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
{
    // The vertical pass reads matrix_size rows per output row, so it needs
    // matrix_size / 2 extra rows of border; no horizontal border is required.
    return BorderSize{ matrix_size / 2, 0 };
}
1103
1104template <unsigned int matrix_size>
1105void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
1106{
1107 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
1108
1109 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
1110
1111 ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
1112 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
1113 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
1114 ARM_COMPUTE_ERROR_ON(scale == 0);
1115
1116 _input = input;
1117 _output = output;
1118 std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
1119 _scale = scale;
1120
1121 // Configure kernel window
1122 constexpr unsigned int num_elems_processed_per_iteration = 16;
1123 constexpr unsigned int num_elems_read_per_iteration = 16;
1124 constexpr unsigned int num_elems_written_per_iteration = 16;
1125
1126 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
1127 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
1128
1129 update_window_and_padding(win,
1130 AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
1131 output_access);
1132
1133 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
1134
1135 INEKernel::configure(win);
1136}
1137
1138template <unsigned int matrix_size>
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001139void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001140{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001141 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001142 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1143 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1144
1145 switch(_input->info()->data_type())
1146 {
1147 case DataType::U16:
1148 switch(_output->info()->data_type())
1149 {
1150 case DataType::U8:
1151 convolution_u16<uint8_t>(window);
1152 break;
1153 case DataType::S16:
1154 convolution_u16<int16_t>(window);
1155 break;
1156 default:
1157 ARM_COMPUTE_ERROR("Not supported");
1158 }
1159 break;
1160 case DataType::S16:
1161 switch(_output->info()->data_type())
1162 {
1163 case DataType::U8:
1164 convolution_s16<uint8_t>(window);
1165 break;
1166 case DataType::S16:
1167 convolution_s16<int16_t>(window);
1168 break;
1169 default:
1170 ARM_COMPUTE_ERROR("Not supported");
1171 }
1172 break;
1173 case DataType::S32:
1174 switch(_output->info()->data_type())
1175 {
1176 case DataType::U8:
1177 convolution_s32<uint8_t>(window);
1178 break;
1179 case DataType::S16:
1180 convolution_s32<int16_t>(window);
1181 break;
1182 default:
1183 ARM_COMPUTE_ERROR("Not supported");
1184 }
1185 break;
1186 default:
1187 ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
1188 break;
1189 }
1190}
1191
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
{
    // Vertical pass over a U16 intermediate: accumulate matrix_size rows per
    // output element, then optionally divide by _scale before narrowing to
    // OutputType via store_results().
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // The output window steps 16 elements; the input iterator steps 8 so each
    // loop iteration handles the 16 outputs as two 8-lane halves.
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    // Base pointer of each contributing row; in.offset() is added per iteration.
    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int k_half = matrix_size / 2;

    // Set row pointers: rows -k_half..k_half relative to the current output row.
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        uint16x8_t out0 = vdupq_n_u16(0);
        uint16x8_t out1 = vdupq_n_u16(0);

        // First half: accumulate lanes 0..7 over all matrix_size rows.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
        }

        // Advance the input iterator to the second group of 8 elements.
        in.increment(Window::DimX);

        // Second half: lanes 8..15.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
        }

        // Scale the result if needed: widen to F32, multiply by 1/scale and
        // convert back to integer before the narrowing store.
        if(_scale != 1)
        {
            float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
            float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
            out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
            out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
            store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

            float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
            float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
            out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
            out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
            store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
        }
        else
        {
            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
        }
    },
    in, out);
}
1257
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
{
    // Vertical pass over an S16 intermediate; mirrors convolution_u16 with
    // signed arithmetic throughout.
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // Output window steps 16 elements, input iterator steps 8: two 8-lane halves.
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    // Base pointer of each contributing row; in.offset() is added per iteration.
    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int k_half = matrix_size / 2;

    // Set row pointers: rows -k_half..k_half relative to the current output row.
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        int16x8_t out0 = vdupq_n_s16(0);
        int16x8_t out1 = vdupq_n_s16(0);

        // First half: accumulate lanes 0..7 over all matrix_size rows.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half: lanes 8..15.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
        }

        // Scale the result if needed: widen to F32, multiply by 1/scale and
        // convert back to integer before the narrowing store.
        if(_scale != 1)
        {
            float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
            float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
            out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
            out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
            store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

            float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
            float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
            out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
            out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
            store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
        }
        else
        {
            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
        }
    },
    in, out);
}
1323
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
{
    // Vertical pass over an S32 intermediate. vld2q_s32 loads 8 consecutive S32
    // values de-interleaved (even indices in val[0], odd in val[1]); vzipq_s32
    // restores the original element order just before storing.
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // Output window steps 16 elements, input iterator steps 8: two 8-element halves.
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    // Base pointer of each contributing row; in.offset() is added per iteration.
    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int k_half = matrix_size / 2;

    // Set row pointers: rows -k_half..k_half relative to the current output row.
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    const int32x4_t zero = vdupq_n_s32(0);

    execute_window_loop(win, [&](const Coordinates &)
    {
        // Accumulators for the first 8 elements (kept de-interleaved).
        int32x4x2_t out0 =
        {
            {
                zero,
                zero
            }
        };

        // Accumulators for the second 8 elements.
        int32x4x2_t out1 =
        {
            {
                zero,
                zero
            }
        };

        // First half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
            out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
            out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
            out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
            out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
        }

        // Scale the result if needed: convert to F32, multiply by 1/scale,
        // convert back to S32 (still de-interleaved).
        if(_scale != 1)
        {
            float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
            float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
            out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
            out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
            out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
            out0.val[1] = vcvtq_s32_f32(out0_f32_even);

            float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
            float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
            out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
            out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
            out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
            out1.val[1] = vcvtq_s32_f32(out1_f32_even);
        }

        // Re-interleave to the original element order and narrow-store.
        const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
        store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));

        const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
        store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
    },
    in, out);
}
1410
// Explicit instantiations of the vertical kernel for the supported matrix sizes.
template class arm_compute::NESeparableConvolutionVertKernel<5>;
template class arm_compute::NESeparableConvolutionVertKernel<7>;
template class arm_compute::NESeparableConvolutionVertKernel<9>;
1414
1415/****************************************************************************************\
1416 * Rectangle Convolution *
1417\****************************************************************************************/
1418
// Default constructor: all state is null/zero until configure() is called.
NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
    : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
{
}
1423
BorderSize NEConvolutionRectangleKernel::border_size() const
{
    // Computed in configure() from the rectangle dimensions (height/2, width/2).
    return _border_size;
}
1428
1429void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
1430{
1431 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
1432
1433 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
1434
1435 ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
1436 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
1437 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
1438 ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
1439 ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
1440 ARM_COMPUTE_ERROR_ON(0 == scale);
1441
1442 _input = input;
1443 _output = output;
1444 _scale = scale;
1445 _border_size = BorderSize(height / 2, width / 2);
1446
1447 // Setup the convolution matrix
1448 const uint32_t nr_elements = width * height;
1449 _convolution.resize(nr_elements);
1450 std::copy_n(conv, nr_elements, _convolution.begin());
1451
1452 // Set function index to help choose appropriate function in run()
1453 _func_idx = get_index(height) * 4 + get_index(width);
1454 ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
1455
1456 // Configure kernel window
1457 constexpr unsigned int num_elems_processed_per_iteration = 8;
1458 constexpr unsigned int num_elems_read_per_iteration = 16;
1459 constexpr unsigned int num_elems_written_per_iteration = 8;
1460
Diego Lopez Recas0021d752017-12-18 14:42:56 +00001461 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
1462 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001463
1464 update_window_and_padding(win,
1465 AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
1466 output_access);
1467
1468 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
1469
1470 INEKernel::configure(win);
1471}
1472
/** Execute the rectangle convolution over the given window.
 *
 * Dispatches to the templated convolution<OutputType, rows, cols> instantiation
 * selected by @p _func_idx and the output tensor's data type (U8 or S16).
 *
 * @param[in] window Region on which to execute the kernel (validated against the configured window).
 * @param[in] info   Thread meta-data (unused here; kernel is data-parallel over the window).
 */
void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    // Pointer-to-member type for the templated convolution implementations.
    using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);

    // uint8_t function table
    // Entries are ordered rows-major over the supported dimensions {3,5,7,9}:
    // index = 4 * row_index + col_index, with {3,5,7,9} -> {0,1,2,3} (see get_index).
    // NOTE(review): _func_idx is presumably built with this same formula in configure() — outside this view; confirm there.
    static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
        }
    };
    // int16_t function table
    // Same index layout as func_table_u8, for S16 output.
    static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
        }
    };

    // Run appropriate function
    // The output data type selects which table to index with _func_idx.
    switch(_output->info()->data_type())
    {
        case DataType::U8:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
            (this->*func_table_u8[_func_idx])(window);
            break;
        case DataType::S16:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
            (this->*func_table_s16[_func_idx])(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
}
1541
1542unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
1543{
1544 switch(val)
1545 {
1546 case 3:
1547 return 0;
1548 case 5:
1549 return 1;
1550 case 7:
1551 return 2;
1552 case 9:
1553 return 3;
1554 default:
1555 ARM_COMPUTE_ERROR("Not supported dimension size");
1556 return 0;
1557 }
1558}
1559
1560template <typename OutputType, unsigned int rows, unsigned int cols>
1561void NEConvolutionRectangleKernel::convolution(const Window &win)
1562{
1563 static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
1564 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
1565
1566 Iterator input(_input, win);
1567 Iterator output(_output, win);
1568
1569 std::array<unsigned char *, rows> input_ptrs{ {} };
1570 const int16_t *conv = _convolution.data();
1571 const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
1572 const int k_row_half = rows / 2;
1573 const int k_col_half = cols / 2;
1574
1575 // Set row pointers
1576 for(int i = -k_row_half; i <= k_row_half; ++i)
1577 {
1578 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
1579 }
1580
Michalis Spyroua4f378d2019-04-26 14:54:54 +01001581 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001582 {
1583 int32x4_t out = vdupq_n_s32(0);
1584 int32x4_t out2 = vdupq_n_s32(0);
1585
1586 // Perform appropriate convolution
1587 for(unsigned int r = 0; r < rows; ++r)
1588 {
1589 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
1590 if(3 == cols)
1591 {
1592 convolve_row3x1(out, out2, data, conv + r * cols);
1593 }
1594 else if(5 == cols)
1595 {
1596 convolve_row5x1(out, out2, data, conv + r * cols);
1597 }
1598 else if(7 == cols)
1599 {
1600 convolve_row7x1(out, out2, data, conv + r * cols);
1601 }
1602 else if(9 == cols)
1603 {
1604 convolve_row9x1(out, out2, data, conv + r * cols);
1605 }
1606 else
1607 {
1608 ARM_COMPUTE_ERROR("Unsupported number of columns");
1609 }
1610 }
1611
1612 // Apply scale
1613 if(_scale != 1)
1614 {
1615 // Convert to F32, scale and convert back to S32
1616 out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
1617 out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
1618 }
1619
1620 // Clamp and store as U8 or S16:
1621 store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
1622 },
1623 input, output);
1624}
1625} // namespace arm_compute