/*
 * Copyright (c) 2016-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <algorithm>
#include <arm_neon.h>
#include <array>
#include <cstdint>
#include <cstring>
#include <tuple>

namespace arm_compute
{
namespace
{
const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);

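// The store_results() overloads below narrow two wider accumulators down to the
// requested output type. Saturation is done either with the vqmov* narrowing
// instructions or, for the u16/u32 to s16 cases, with an explicit clamp against
// INT16_MAX before reinterpreting the bits as signed.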
inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
{
    const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
                                              vqmovn_s32(out2));
    vst1q_s16(output, s16results);
}

inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
{
    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
                                                        vqmovun_s32(out2)));
    vst1_u8(output, u8results);
}

inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
{
    const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
    const int16x8_t  s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
    vst1q_s16(output, s16results);
}

inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
{
    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
                                                        vqmovn_u32(out2)));
    vst1_u8(output, u8results);
}

inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
{
    vst1q_s16(output, out);
    vst1q_s16(output + 8, out2);
}

inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
{
    const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
                                             vqmovun_s16(out2));
    vst1q_u8(output, u8results);
}

inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
{
    const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
                                             vqmovn_u16(out2));
    vst1q_u8(output, u8results);
}

inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
{
    vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
    vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
}

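// The convolve_rowNx1* helpers apply one row of an N x N matrix to 16 input
// pixels at a time: the u8 data is widened to s16, vext_s16 builds the shifted
// pixel windows, and vmlal_s16 accumulates the widening multiplies into two s32
// accumulators covering output pixels [0,3] and [4,7].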
inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
{
    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x3_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1)
        }
    };

    // Calculate row left value for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    // Calculate row middle value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    // Calculate row right value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);

    // Calculate row left value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    // Calculate row middle value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    // Calculate row right value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
}

inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);

    convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
}

inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x3_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1)
        }
    };

    // Calculate row left 2 value for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    // Calculate row left 1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    // Calculate row middle value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    // Calculate row right +1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    // Calculate row right +2 value for pixels [0,3]
    out = vmlal_s16(out, row.val[1], mat4);

    // Calculate row left 2 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    // Calculate row left 1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    // Calculate row middle value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    // Calculate row right +1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    // Calculate row right +2 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[2], mat4);
}

inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x4_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1),
            vget_high_s16(s16_tmp1)
        }
    };

    // Calculate row left 3 value for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    // Calculate row left 2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    // Calculate row left 1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    // Calculate row middle value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    // Calculate row right +1 value for pixels [0,3]
    out = vmlal_s16(out, row.val[1], mat4);
    // Calculate row right +2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
    // Calculate row right +3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);

    // Calculate row left 3 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    // Calculate row left 2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    // Calculate row left 1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    // Calculate row middle value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    // Calculate row right +1 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[2], mat4);
    // Calculate row right +2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
    // Calculate row right +3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
}

inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
    const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
    const int16x4_t mat8 = vld1_dup_s16(convolution + 8);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x4_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1),
            vget_high_s16(s16_tmp1)
        }
    };

    // Calculate row left 4 value for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    // Calculate row left 3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    // Calculate row left 2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    // Calculate row left 1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    // Calculate row middle value for pixels [0,3]
    out = vmlal_s16(out, row.val[1], mat4);
    // Calculate row right +1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
    // Calculate row right +2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
    // Calculate row right +3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
    // Calculate row right +4 value for pixels [0,3]
    out = vmlal_s16(out, row.val[2], mat8);

    // Calculate row left 4 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    // Calculate row left 3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    // Calculate row left 2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    // Calculate row left 1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    // Calculate row middle value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[2], mat4);
    // Calculate row right +1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
    // Calculate row right +2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
    // Calculate row right +3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
    // Calculate row right +4 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[3], mat8);
}
} // namespace

/****************************************************************************************\
 *                                  Square Convolution                                  *
\****************************************************************************************/

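// For an N x N matrix, each output pixel is computed as
//   dst(x, y) = saturate((1 / scale) * sum_{i,j} conv[j * N + i] * src(x + i - N/2, y + j - N/2))
// with the sum accumulated in 32 bits and the result stored as U8 or S16.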
template <unsigned int matrix_size>
NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
    : INESimpleKernel(), _scale(0), _convolution{ {} }
{
}

template <unsigned int matrix_size>
BorderSize NEConvolutionKernel<matrix_size>::border_size() const
{
    return BorderSize(matrix_size / 2);
}

template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);

    _input  = input;
    _output = output;

    std::copy_n(conv, _convolution.size(), _convolution.begin());

    if(scale == 0)
    {
        _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
    }
    else
    {
        _scale = scale;
    }

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 8;

    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

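// convolution<OutputType>() is specialized for each supported matrix size below.
// Every iteration loads 16 bytes per matrix row but produces only 8 output
// pixels, since the shifted windows consume up to matrix_size - 1 extra pixels.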
template <>
template <typename OutputType>
void NEConvolutionKernel<3>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Load the matrix's coefficients into NEON registers:
    const int16x4_t   mat00     = vld1_dup_s16(_convolution.data());
    const int16x4_t   mat01     = vld1_dup_s16(_convolution.data() + 1);
    const int16x4_t   mat02     = vld1_dup_s16(_convolution.data() + 2);
    const int16x4_t   mat10     = vld1_dup_s16(_convolution.data() + 3);
    const int16x4_t   mat11     = vld1_dup_s16(_convolution.data() + 4);
    const int16x4_t   mat12     = vld1_dup_s16(_convolution.data() + 5);
    const int16x4_t   mat20     = vld1_dup_s16(_convolution.data() + 6);
    const int16x4_t   mat21     = vld1_dup_s16(_convolution.data() + 7);
    const int16x4_t   mat22     = vld1_dup_s16(_convolution.data() + 8);
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top row:
        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);

        // Load 16 bytes from the middle row:
        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);

        // Load 16 bytes from the low row:
        const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}

template <>
template <typename OutputType>
void NEConvolutionKernel<5>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row5x1(out, out2, data_t2, _convolution.data());

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row5x1(out, out2, data_m, _convolution.data() + 10);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}

template <>
template <typename OutputType>
void NEConvolutionKernel<7>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top3 row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row7x1(out, out2, data_t3, _convolution.data());

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row7x1(out, out2, data_m, _convolution.data() + 21);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);

        // Load 16 bytes from the low3 row:
        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}

template <>
template <typename OutputType>
void NEConvolutionKernel<9>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top4 row:
        const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
        convolve_row9x1(out, out2, data_t4, _convolution.data());

        // Load 16 bytes from the top3 row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row9x1(out, out2, data_m, _convolution.data() + 36);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);

        // Load 16 bytes from the low3 row:
        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);

        // Load 16 bytes from the low4 row:
        const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
        convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}

template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_output->info()->data_type())
    {
        case DataType::U8:
            convolution<uint8_t>(window);
            break;
        case DataType::S16:
            convolution<int16_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type!");
            break;
    }
}

template class arm_compute::NEConvolutionKernel<3>;
template class arm_compute::NEConvolutionKernel<5>;
template class arm_compute::NEConvolutionKernel<7>;
template class arm_compute::NEConvolutionKernel<9>;

/****************************************************************************************\
 *                             Separable Square Convolution                             *
\****************************************************************************************/

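// The separable path handles N x N matrices that factor into an outer product
// of a column and a row vector: a horizontal 1D pass writes a wider
// intermediate image (U16, S16 or S32), and a vertical 1D pass applies the
// column coefficients and the final scaling. This cuts the work per pixel from
// N * N down to 2 * N multiply-accumulates.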
template <unsigned int matrix_size>
NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
    : _conv_row{ { 0 } }, _border_size(0)
{
}

template <unsigned int matrix_size>
BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
{
    return _border_size;
}

template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);

    _input  = input;
    _output = output;
    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 8;

    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    switch(_output->info()->data_type())
    {
        case DataType::U16:
            convolve<uint16_t>(window);
            break;
        case DataType::S16:
            convolve<int16_t>(window);
            break;
        case DataType::S32:
            convolve<int32_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
            break;
    }
}

template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
{
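    // Horizontal pass: the input window is shifted left by the matrix radius so
    // that each 16-byte load starts at x - 2; border handling is expected to
    // have populated those pixels. Eight intermediate values are written per
    // iteration.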
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}

template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}

template <>
template <>
void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
{
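    // S32 intermediate: widen to s16 first, then use the vmull_n_s16/vmlal_n_s16
    // widening multiply-accumulates so products are kept at full 32-bit precision.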
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);

        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}

template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}

template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}

template <>
template <>
void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);

        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}

template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
        out            = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}

template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
        out           = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
        out           = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}

template <>
template <>
void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);

        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}

template class arm_compute::NESeparableConvolutionHorKernel<5>;
template class arm_compute::NESeparableConvolutionHorKernel<7>;
template class arm_compute::NESeparableConvolutionHorKernel<9>;

template <unsigned int matrix_size>
NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
    : _conv_col{ { 0 } }, _scale(0)
{
}

template <unsigned int matrix_size>
BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
{
    return BorderSize(matrix_size / 2, 0);
}

template <unsigned int matrix_size>
void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
    ARM_COMPUTE_ERROR_ON(scale == 0);

    _input  = input;
    _output = output;
    std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
    _scale = scale;

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 16;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 16;

    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

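// run() dispatches on the intermediate (input) type produced by the horizontal
// pass - U16, S16 or S32 - and, within each case, on the requested U8 or S16
// output type.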
template <unsigned int matrix_size>
void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_input->info()->data_type())
    {
        case DataType::U16:
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                    convolution_u16<uint8_t>(window);
                    break;
                case DataType::S16:
                    convolution_u16<int16_t>(window);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
            break;
        case DataType::S16:
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                    convolution_s16<uint8_t>(window);
                    break;
                case DataType::S16:
                    convolution_s16<int16_t>(window);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
            break;
        case DataType::S32:
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                    convolution_s32<uint8_t>(window);
                    break;
                case DataType::S16:
                    convolution_s32<int16_t>(window);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
            break;
    }
}

template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
{
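    // Each iteration produces 16 outputs as two halves of 8: the input window
    // step is set to 8 so that in.increment(Window::DimX) advances to the second
    // half. input_ptrs caches one base pointer per matrix row (rows -k_half to
    // k_half); in.offset() then selects the current column.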
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int         k_half       = matrix_size / 2;

    // Set row pointers
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates & id)
    {
        uint16x8_t out0 = vdupq_n_u16(0);
        uint16x8_t out1 = vdupq_n_u16(0);

        // First half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out0                  = vmlaq_n_u16(out0, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out1                  = vmlaq_n_u16(out1, data, _conv_col[r]);
        }

        // Scale the result if needed
        if(_scale != 1)
        {
            float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
            float32x4_t out0_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
            store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

            float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
            float32x4_t out1_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
            store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
        }
        else
        {
            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
        }
    },
    in, out);
}

template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int         k_half       = matrix_size / 2;

    // Set row pointers
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int16x8_t out0 = vdupq_n_s16(0);
        int16x8_t out1 = vdupq_n_s16(0);

        // First half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out0                 = vmlaq_n_s16(out0, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out1                 = vmlaq_n_s16(out1, data, _conv_col[r]);
        }

        // Scale the result if needed
        if(_scale != 1)
        {
            float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
            float32x4_t out0_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
            store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

            float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
            float32x4_t out1_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
            store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
        }
        else
        {
            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
        }
    },
    in, out);
}

template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int         k_half       = matrix_size / 2;

    // Set row pointers
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    const int32x4_t zero = vdupq_n_s32(0);

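    // vld2q_s32 loads 8 values per row but de-interleaves them into even/odd
    // lanes; the lanes are re-interleaved with vzipq_s32 after accumulation and
    // scaling, restoring the original element order before the store.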
1346 execute_window_loop(win, [&](const Coordinates & id)
1347 {
1348 int32x4x2_t out0 =
1349 {
1350 {
1351 zero,
1352 zero
1353 }
1354 };
1355
1356 int32x4x2_t out1 =
1357 {
1358 {
1359 zero,
1360 zero
1361 }
1362 };
1363
1364 // First half
1365 for(unsigned int r = 0; r < matrix_size; ++r)
1366 {
1367 const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
1368 out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
1369 out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
1370 }
1371
1372 in.increment(Window::DimX);
1373
1374 // Second half
1375 for(unsigned int r = 0; r < matrix_size; ++r)
1376 {
1377 const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
1378 out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
1379 out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
1380 }
1381
1382 //scale the result if needed
1383 if(_scale != 1)
1384 {
1385 float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
1386 float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
1387 out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
1388 out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
1389 out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
1390 out0.val[1] = vcvtq_s32_f32(out0_f32_even);
1391
1392 float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
1393 float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
1394 out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
1395 out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
1396 out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
1397 out1.val[1] = vcvtq_s32_f32(out1_f32_even);
1398 }
1399
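
        // vld2q_s32 de-interleaved each 8-element block into two registers of
        // alternating lanes; zipping the accumulators back together restores
        // the original element order before storing.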
        const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
        store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));

        const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
        store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
    },
    in, out);
}

template class arm_compute::NESeparableConvolutionVertKernel<5>;
template class arm_compute::NESeparableConvolutionVertKernel<7>;
template class arm_compute::NESeparableConvolutionVertKernel<9>;
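
// Note: only the 5x5, 7x7 and 9x9 separable vertical kernels are instantiated
// here; the 3x3 case is covered by the non-separable NEConvolutionKernel<3>
// defined earlier in this file.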

/****************************************************************************************\
 *                                 Rectangle Convolution                                 *
\****************************************************************************************/

NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
    : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
{
}

BorderSize NEConvolutionRectangleKernel::border_size() const
{
    return _border_size;
}

void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
    ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
    ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
    ARM_COMPUTE_ERROR_ON(0 == scale);

    _input       = input;
    _output      = output;
    _scale       = scale;
    _border_size = BorderSize(height / 2, width / 2);

    // Setup the convolution matrix
    const uint32_t nr_elements = width * height;
    _convolution.resize(nr_elements);
    std::copy_n(conv, nr_elements, _convolution.begin());

    // Set function index to help choose appropriate function in run()
    _func_idx = get_index(height) * 4 + get_index(width);
    ARM_COMPUTE_ERROR_ON(_func_idx >= (_nr_supported_sizes * _nr_supported_sizes));
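
    // With four supported sizes per dimension, the index selects one of the 16
    // entries in the 4x4 function tables in run(). For example, height = 5 and
    // width = 7 give get_index(5) * 4 + get_index(7) = 1 * 4 + 2 = 6, i.e. the
    // convolution<T, 5, 7> entry.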

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 8;
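
    // 16 input bytes are read per iteration: the 8 pixels being produced plus
    // enough horizontal margin for kernel widths of up to 9.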

    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);

    INEKernel::configure(win);
}

void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);

    // uint8_t function table
    static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
        }
    };
    // int16_t function table
    static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
        }
    };

    // Run appropriate function
    switch(_output->info()->data_type())
    {
        case DataType::U8:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
            (this->*func_table_u8[_func_idx])(window);
            break;
        case DataType::S16:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
            (this->*func_table_s16[_func_idx])(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
}

unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
{
    switch(val)
    {
        case 3:
            return 0;
        case 5:
            return 1;
        case 7:
            return 2;
        case 9:
            return 3;
        default:
            ARM_COMPUTE_ERROR("Not supported dimension size");
            return 0;
    }
}

template <typename OutputType, unsigned int rows, unsigned int cols>
void NEConvolutionRectangleKernel::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    std::array<unsigned char *, rows> input_ptrs{ {} };
    const int16_t    *conv       = _convolution.data();
    const float32x4_t scale_val  = vdupq_n_f32(1.0f / _scale);
    const int         k_row_half = rows / 2;
    const int         k_col_half = cols / 2;

    // Set row pointers
    for(int i = -k_row_half; i <= k_row_half; ++i)
    {
        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
    }
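
    // Unlike the separable kernels, each row pointer is pre-offset by
    // -k_col_half so that every 16-byte load covers the full cols-wide
    // neighbourhood of the leftmost output pixel.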

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Perform appropriate convolution
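        // Each convolve_rowNx1 helper (defined earlier in this file) widens the
        // u8 input, multiplies it by one row of coefficients and accumulates
        // into the two s32 registers that together hold the 8 output pixels.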
        for(unsigned int r = 0; r < rows; ++r)
        {
            const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
            if(3 == cols)
            {
                convolve_row3x1(out, out2, data, conv + r * cols);
            }
            else if(5 == cols)
            {
                convolve_row5x1(out, out2, data, conv + r * cols);
            }
            else if(7 == cols)
            {
                convolve_row7x1(out, out2, data, conv + r * cols);
            }
            else if(9 == cols)
            {
                convolve_row9x1(out, out2, data, conv + r * cols);
            }
            else
            {
                ARM_COMPUTE_ERROR("Unsupported number of columns");
            }
        }

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
} // namespace arm_compute
1623} // namespace arm_compute