Blame - src/core/NEON/kernels/NEConvolutionKernel.cpp - ml/ComputeLibrary

blob: 30e91ef2535496dfb6d11806eba64873e1dfda60 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
				25
				26	#include "arm_compute/core/Coordinates.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/ITensor.h"
				30	#include "arm_compute/core/TensorInfo.h"
				31	#include "arm_compute/core/Types.h"
				32	#include "arm_compute/core/Utils.h"
				33	#include "arm_compute/core/Validate.h"
				34	#include "arm_compute/core/Window.h"
				35
				36	#include <algorithm>
				37	#include <arm_neon.h>
				38	#include <array>
				39	#include <cstdint>
				40	#include <cstring>
				41	#include <tuple>
				42
				43	namespace arm_compute
				44	{
				45	namespace
				46	{
				47	const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
				48
				49	inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
				50	{
				51	const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
				52	vqmovn_s32(out2));
				53	vst1q_s16(output, s16results);
				54	}
				55
				56	inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
				57	{
				58	const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
				59	vqmovun_s32(out2)));
				60	vst1_u8(output, u8results);
				61	}
				62
				63	inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
				64	{
				65	const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
				66	const int16x8_t s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
				67	vst1q_s16(output, s16results);
				68	}
				69
				70	inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
				71	{
				72	const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
				73	vqmovn_u32(out2)));
				74	vst1_u8(output, u8results);
				75	}
				76
				77	inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
				78	{
				79	vst1q_s16(output, out);
				80	vst1q_s16(output + 8, out2);
				81	}
				82
				83	inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
				84	{
				85	const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
				86	vqmovun_s16(out2));
				87	vst1q_u8(output, u8results);
				88	}
				89
				90	inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
				91	{
				92	const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
				93	vqmovn_u16(out2));
				94	vst1q_u8(output, u8results);
				95	}
				96
				97	inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
				98	{
				99	vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
				100	vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
				101	}
				102
				103	inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
				104	{
				105	// Convert to s16 and split in blocks of 4 values:
				106	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				107	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				108
				109	const int16x4x3_t row =
				110	{
				111	{
				112	vget_low_s16(s16_tmp0),
				113	vget_high_s16(s16_tmp0),
				114	vget_low_s16(s16_tmp1)
				115	}
				116	};
				117
				118	// Calculate row left value for pixels [0,3]
				119	out = vmlal_s16(out, row.val[0], mat0);
				120	// Calculate row middle value for pixels [0,3]
				121	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				122	// Calculate row right value for pixels [0,3]
				123	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				124
				125	// Calculate row left value for pixels [4,7]
				126	out2 = vmlal_s16(out2, row.val[1], mat0);
				127	// Calculate row middle value for pixels [4,7]
				128	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				129	// Calculate row right value for pixels [4,7]
				130	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				131	}
				132
				133	inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				134	{
				135	const int16x4_t mat0 = vld1_dup_s16(convolution);
				136	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				137	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				138
				139	convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
				140	}
				141
				142	inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				143	{
				144	const int16x4_t mat0 = vld1_dup_s16(convolution);
				145	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				146	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				147	const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
				148	const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
				149
				150	// Convert to s16 and split in blocks of 4 values:
				151	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				152	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				153
				154	const int16x4x3_t row =
				155	{
				156	{
				157	vget_low_s16(s16_tmp0),
				158	vget_high_s16(s16_tmp0),
				159	vget_low_s16(s16_tmp1)
				160	}
				161	};
				162
				163	// Calculate row left 2 value for pixels [0,3]
				164	out = vmlal_s16(out, row.val[0], mat0);
				165	// Calculate row left 1 value for pixels [0,3]
				166	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				167	// Calculate row middle value for pixels [0,3]
				168	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				169	// Calculate row right +1 value for pixels [0,3]
				170	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
				171	// Calculate row right +2 value for pixels [0,3]
				172	out = vmlal_s16(out, row.val[1], mat4);
				173
				174	// Calculate row left 2 value for pixels [4,7]
				175	out2 = vmlal_s16(out2, row.val[1], mat0);
				176	// Calculate row left 1 value for pixels [4,7]
				177	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				178	// Calculate row middle value for pixels [4,7]
				179	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				180	// Calculate row right +1 value for pixels [4,7]
				181	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
				182	// Calculate row right +2 value for pixels [4,7]
				183	out2 = vmlal_s16(out2, row.val[2], mat4);
				184	}
				185
				186	inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				187	{
				188	const int16x4_t mat0 = vld1_dup_s16(convolution);
				189	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				190	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				191	const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
				192	const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
				193	const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
				194	const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
				195
				196	// Convert to s16 and split in blocks of 4 values:
				197	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				198	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				199
				200	const int16x4x4_t row =
				201	{
				202	{
				203	vget_low_s16(s16_tmp0),
				204	vget_high_s16(s16_tmp0),
				205	vget_low_s16(s16_tmp1),
				206	vget_high_s16(s16_tmp1)
				207	}
				208	};
				209
				210	// Calculate row left 3 value for pixels [0,3]
				211	out = vmlal_s16(out, row.val[0], mat0);
				212	// Calculate row left 2 value for pixels [0,3]
				213	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				214	// Calculate row left 1 value for pixels [0,3]
				215	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				216	// Calculate row middle value for pixels [0,3]
				217	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
				218	// Calculate row right +1 value for pixels [0,3]
				219	out = vmlal_s16(out, row.val[1], mat4);
				220	// Calculate row right +2 value for pixels [0,3]
				221	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
				222	// Calculate row right +3 value for pixels [0,3]
				223	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
				224
				225	// Calculate row left 3 value for pixels [4,7]
				226	out2 = vmlal_s16(out2, row.val[1], mat0);
				227	// Calculate row left 2 value for pixels [4,7]
				228	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				229	// Calculate row left 1 value for pixels [4,7]
				230	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				231	// Calculate row middle value for pixels [4,7]
				232	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
				233	// Calculate row right +1 value for pixels [4,7]
				234	out2 = vmlal_s16(out2, row.val[2], mat4);
				235	// Calculate row right +2 value for pixels [4,7]
				236	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
				237	// Calculate row right +3 value for pixels [4,7]
				238	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
				239	}
				240
				241	inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				242	{
				243	const int16x4_t mat0 = vld1_dup_s16(convolution);
				244	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				245	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				246	const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
				247	const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
				248	const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
				249	const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
				250	const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
				251	const int16x4_t mat8 = vld1_dup_s16(convolution + 8);
				252
				253	// Convert to s16 and split in blocks of 4 values:
				254	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				255	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				256
				257	const int16x4x4_t row =
				258	{
				259	{
				260	vget_low_s16(s16_tmp0),
				261	vget_high_s16(s16_tmp0),
				262	vget_low_s16(s16_tmp1),
				263	vget_high_s16(s16_tmp1)
				264	}
				265	};
				266
				267	// Calculate row left 4 value for pixels [0,3]
				268	out = vmlal_s16(out, row.val[0], mat0);
				269	// Calculate row left 3 value for pixels [0,3]
				270	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				271	// Calculate row left 2 value for pixels [0,3]
				272	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				273	// Calculate row left 1 value for pixels [0,3]
				274	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
				275	// Calculate row middle value for pixels [0,3]
				276	out = vmlal_s16(out, row.val[1], mat4);
				277	// Calculate row right +1 value for pixels [0,3]
				278	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
				279	// Calculate row right +2 value for pixels [0,3]
				280	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
				281	// Calculate row right +3 value for pixels [0,3]
				282	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
				283	// Calculate row right +4 value for pixels [0,3]
				284	out = vmlal_s16(out, row.val[2], mat8);
				285
				286	// Calculate row left 4 value for pixels [0,3]
				287	out2 = vmlal_s16(out2, row.val[1], mat0);
				288	// Calculate row left 3 value for pixels [0,3]
				289	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				290	// Calculate row left 2 value for pixels [0,3]
				291	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				292	// Calculate row left 1 value for pixels [0,3]
				293	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
				294	// Calculate row middle value for pixels [0,3]
				295	out2 = vmlal_s16(out2, row.val[2], mat4);
				296	// Calculate row right +1 value for pixels [0,3]
				297	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
				298	// Calculate row right +2 value for pixels [0,3]
				299	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
				300	// Calculate row right +3 value for pixels [0,3]
				301	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
				302	// Calculate row right +4 value for pixels [0,3]
				303	out2 = vmlal_s16(out2, row.val[3], mat8);
				304	}
				305	} // namespace
				306
				307	/****************************************************************************************\
				308	* Square Convolution *
				309	\****************************************************************************************/
				310
				311	template <unsigned int matrix_size>
				312	NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
				313	: INESimpleKernel(), _scale(0), _convolution{ {} }
				314	{
				315	}
				316
				317	template <unsigned int matrix_size>
				318	BorderSize NEConvolutionKernel<matrix_size>::border_size() const
				319	{
				320	return BorderSize(matrix_size / 2);
				321	}
				322
				323	template <unsigned int matrix_size>
				324	void NEConvolutionKernel<matrix_size>::configure(const ITensor input, ITensor output, const int16_t *conv, uint32_t scale, bool border_undefined)
				325	{
				326	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
				327
				328	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				329
				330	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				331	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				332	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
				333
				334	_input = input;
				335	_output = output;
				336
				337	std::copy_n(conv, _convolution.size(), _convolution.begin());
				338
				339	if(scale == 0)
				340	{
				341	_scale = calculate_matrix_scale(_convolution.data(), matrix_size);
				342	}
				343	else
				344	{
				345	_scale = scale;
				346	}
				347
				348	// Configure kernel window
				349	constexpr unsigned int num_elems_processed_per_iteration = 8;
				350	constexpr unsigned int num_elems_read_per_iteration = 16;
				351	constexpr unsigned int num_elems_written_per_iteration = 8;
				352
				353	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				354	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				355
				356	update_window_and_padding(win,
				357	AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
				358	output_access);
				359
				360	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				361
				362	INEKernel::configure(win);
				363	}
				364
				365	template <>
				366	template <typename OutputType>
				367	void NEConvolutionKernel<3>::convolution(const Window &win)
				368	{
				369	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				370	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				371
				372	Iterator input(_input, win);
				373	Iterator output(_output, win);
				374
				375	// Load the matrix's coefficients into NEON registers:
				376	const int16x4_t mat00 = vld1_dup_s16(_convolution.data());
				377	const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1);
				378	const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2);
				379	const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3);
				380	const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4);
				381	const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5);
				382	const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6);
				383	const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7);
				384	const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8);
				385	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				386
				387	const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
				388	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
				389	const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
				390
				391	execute_window_loop(win, [&](const Coordinates & id)
				392	{
				393	int32x4_t out = vdupq_n_s32(0);
				394	int32x4_t out2 = vdupq_n_s32(0);
				395
				396	// Load 16 bytes from the top row:
				397	const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
				398	convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);
				399
				400	// Load 16 bytes from the middle row:
				401	const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
				402	convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);
				403
				404	// Load 16 bytes from the middle row:
				405	const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
				406	convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);
				407
				408	// Apply scale
				409	if(_scale != 1)
				410	{
				411	// Convert to F32, scale and convert back to S32
				412	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				413	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				414	}
				415
				416	// Clamp and store as U8 or S16:
				417	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				418	},
				419	input, output);
				420	}
				421
				422	template <>
				423	template <typename OutputType>
				424	void NEConvolutionKernel<5>::convolution(const Window &win)
				425	{
				426	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				427	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				428
				429	Iterator input(_input, win);
				430	Iterator output(_output, win);
				431
				432	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				433
				434	const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
				435	const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
				436	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
				437	const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
				438	const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
				439
				440	execute_window_loop(win, [&](const Coordinates & id)
				441	{
				442	int32x4_t out = vdupq_n_s32(0);
				443	int32x4_t out2 = vdupq_n_s32(0);
				444
				445	// Load 16 bytes from the top2 row:
				446	const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
				447	convolve_row5x1(out, out2, data_t2, _convolution.data());
				448
				449	// Load 16 bytes from the top1 row:
				450	const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
				451	convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);
				452
				453	// Load 16 bytes from the middle row:
				454	const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
				455	convolve_row5x1(out, out2, data_m, _convolution.data() + 10);
				456
				457	// Load 16 bytes from the low1 row:
				458	const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
				459	convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);
				460
				461	// Load 16 bytes from the low2 row:
				462	const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
				463	convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);
				464
				465	// Apply scale
				466	if(_scale != 1)
				467	{
				468	// Convert to F32, scale and convert back to S32
				469	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				470	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				471	}
				472
				473	// Clamp and store as U8 or S16:
				474	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				475	},
				476	input, output);
				477	}
				478
				479	template <>
				480	template <typename OutputType>
				481	void NEConvolutionKernel<7>::convolution(const Window &win)
				482	{
				483	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				484	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				485
				486	Iterator input(_input, win);
				487	Iterator output(_output, win);
				488
				489	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				490
				491	const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
				492	const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
				493	const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
				494	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
				495	const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
				496	const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
				497	const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
				498
				499	execute_window_loop(win, [&](const Coordinates & id)
				500	{
				501	int32x4_t out = vdupq_n_s32(0);
				502	int32x4_t out2 = vdupq_n_s32(0);
				503
				504	// Load 16 bytes from the top3 row:
				505	const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
				506	convolve_row7x1(out, out2, data_t3, _convolution.data());
				507
				508	// Load 16 bytes from the top2 row:
				509	const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
				510	convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);
				511
				512	// Load 16 bytes from the top1 row:
				513	const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
				514	convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);
				515
				516	// Load 16 bytes from the middle row:
				517	const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
				518	convolve_row7x1(out, out2, data_m, _convolution.data() + 21);
				519
				520	// Load 16 bytes from the low1 row:
				521	const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
				522	convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);
				523
				524	// Load 16 bytes from the low2 row:
				525	const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
				526	convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);
				527
				528	// Load 16 bytes from the low3 row:
				529	const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
				530	convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);
				531
				532	// Apply scale
				533	if(_scale != 1)
				534	{
				535	// Convert to F32, scale and convert back to S32
				536	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				537	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				538	}
				539
				540	// Clamp and store as U8 or S16:
				541	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				542	},
				543	input, output);
				544	}
				545
				546	template <>
				547	template <typename OutputType>
				548	void NEConvolutionKernel<9>::convolution(const Window &win)
				549	{
				550	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				551	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				552
				553	Iterator input(_input, win);
				554	Iterator output(_output, win);
				555
				556	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				557
				558	const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
				559	const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
				560	const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
				561	const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
				562	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
				563	const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
				564	const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
				565	const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
				566	const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
				567
				568	execute_window_loop(win, [&](const Coordinates & id)
				569	{
				570	int32x4_t out = vdupq_n_s32(0);
				571	int32x4_t out2 = vdupq_n_s32(0);
				572
				573	// Load 16 bytes from the top4 row:
				574	const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
				575	convolve_row9x1(out, out2, data_t4, _convolution.data());
				576
				577	// Load 16 bytes from the top3 row:
				578	const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
				579	convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);
				580
				581	// Load 16 bytes from the top2 row:
				582	const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
				583	convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);
				584
				585	// Load 16 bytes from the top1 row:
				586	const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
				587	convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);
				588
				589	// Load 16 bytes from the middle row:
				590	const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
				591	convolve_row9x1(out, out2, data_m, _convolution.data() + 36);
				592
				593	// Load 16 bytes from the low1 row:
				594	const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
				595	convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);
				596
				597	// Load 16 bytes from the low2 row:
				598	const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
				599	convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);
				600
				601	// Load 16 bytes from the low3 row:
				602	const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
				603	convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);
				604
				605	// Load 16 bytes from the low4 row:
				606	const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
				607	convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);
				608
				609	// Apply scale
				610	if(_scale != 1)
				611	{
				612	// Convert to F32, scale and convert back to S32
				613	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				614	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				615	}
				616
				617	// Clamp and store as U8 or S16:
				618	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				619	},
				620	input, output);
				621	}
				622
				623	template <unsigned int matrix_size>
				624	void NEConvolutionKernel<matrix_size>::run(const Window &window)
				625	{
				626	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				627	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				628
				629	switch(_output->info()->format())
				630	{
				631	case Format::U8:
				632	convolution<uint8_t>(window);
				633	break;
				634	case Format::S16:
				635	convolution<int16_t>(window);
				636	break;
				637	default:
				638	ARM_COMPUTE_ERROR("Not supported");
				639	}
				640	}
				641
				642	template class arm_compute::NEConvolutionKernel<3>;
				643	template class arm_compute::NEConvolutionKernel<5>;
				644	template class arm_compute::NEConvolutionKernel<7>;
				645	template class arm_compute::NEConvolutionKernel<9>;
				646
				647	/****************************************************************************************\
				648	* Separable Square Convolution *
				649	\****************************************************************************************/
				650
				651	template <unsigned int matrix_size>
				652	NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
				653	: _conv_row{ { 0 } }, _border_size(0)
				654	{
				655	}
				656
				657	template <unsigned int matrix_size>
				658	BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
				659	{
				660	return _border_size;
				661	}
				662
				663	template <unsigned int matrix_size>
				664	void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor input, ITensor output, const int16_t *conv_row, bool border_undefined)
				665	{
				666	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
				667
				668	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				669
				670	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				671	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				672	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
				673
				674	_input = input;
				675	_output = output;
				676	std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
				677	_border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
				678
				679	// Configure kernel window
				680	constexpr unsigned int num_elems_processed_per_iteration = 8;
				681	constexpr unsigned int num_elems_read_per_iteration = 16;
				682	constexpr unsigned int num_elems_written_per_iteration = 8;
				683
				684	Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				685	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				686
				687	update_window_and_padding(win,
				688	AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
				689	output_access);
				690
				691	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				692
				693	INEKernel::configure(win);
				694	}
				695
				696	template <unsigned int matrix_size>
				697	void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
				698	{
				699	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				700	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				701	switch(_output->info()->data_type())
				702	{
				703	case DataType::U16:
				704	convolve<uint16_t>(window);
				705	break;
				706	case DataType::S16:
				707	convolve<int16_t>(window);
				708	break;
				709	case DataType::S32:
				710	convolve<int32_t>(window);
				711	break;
				712	default:
				713	ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
				714	break;
				715	}
				716	}
				717
				718	template <>
				719	template <>
				720	inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
				721	{
				722	Window win_in(window);
				723	win_in.shift(Window::DimX, -2);
				724
				725	Iterator input(_input, win_in);
				726	Iterator output(_output, window);
				727
				728	execute_window_loop(window, [&](const Coordinates & id)
				729	{
				730	const uint8x16_t data = vld1q_u8(input.ptr());
				731
				732	const uint16x8x2_t data_u16 =
				733	{
				734	{
				735	vmovl_u8(vget_low_u8(data)),
				736	vmovl_u8(vget_high_u8(data))
				737	}
				738	};
				739
				740	uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
				741	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
				742	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
				743	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
				744	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
				745
				746	vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
				747	},
				748	input, output);
				749	}
				750
				751	template <>
				752	template <>
				753	inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
				754	{
				755	Window win_in(window);
				756	win_in.shift(Window::DimX, -2);
				757
				758	Iterator input(_input, win_in);
				759	Iterator output(_output, window);
				760
				761	execute_window_loop(window, [&](const Coordinates & id)
				762	{
				763	const uint8x16_t data = vld1q_u8(input.ptr());
				764
				765	const int16x8x2_t data_s16 =
				766	{
				767	{
				768	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				769	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				770	}
				771	};
				772
				773	int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
				774	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
				775	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
				776	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
				777	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
				778
				779	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
				780	},
				781	input, output);
				782	}
				783
				784	template <>
				785	template <>
				786	void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
				787	{
				788	Window win_in(window);
				789	win_in.shift(Window::DimX, -2);
				790
				791	Iterator input(_input, win_in);
				792	Iterator output(_output, window);
				793
				794	execute_window_loop(window, [&](const Coordinates & id)
				795	{
				796	const uint8x16_t data = vld1q_u8(input.ptr());
				797
				798	const int16x8x2_t data_s16 =
				799	{
				800	{
				801	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				802	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				803	}
				804	};
				805
				806	const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
				807	const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
				808	const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
				809	const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
				810
				811	int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
				812	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
				813	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
				814	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
				815	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);
				816
				817	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
				818
				819	int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
				820	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
				821	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
				822	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
				823	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);
				824
				825	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
				826	},
				827	input, output);
				828	}
				829
				830	template <>
				831	template <>
				832	inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
				833	{
				834	Window win_in(window);
				835	win_in.shift(Window::DimX, -3);
				836
				837	Iterator input(_input, win_in);
				838	Iterator output(_output, window);
				839
				840	execute_window_loop(window, [&](const Coordinates & id)
				841	{
				842	const uint8x16_t data = vld1q_u8(input.ptr());
				843
				844	const uint16x8x2_t data_u16 =
				845	{
				846	{
				847	vmovl_u8(vget_low_u8(data)),
				848	vmovl_u8(vget_high_u8(data))
				849	}
				850	};
				851
				852	uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
				853	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
				854	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
				855	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
				856	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
				857	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
				858	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
				859
				860	vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
				861	},
				862	input, output);
				863	}
				864
				865	template <>
				866	template <>
				867	inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
				868	{
				869	Window win_in(window);
				870	win_in.shift(Window::DimX, -3);
				871
				872	Iterator input(_input, win_in);
				873	Iterator output(_output, window);
				874
				875	execute_window_loop(window, [&](const Coordinates & id)
				876	{
				877	const uint8x16_t data = vld1q_u8(input.ptr());
				878
				879	const int16x8x2_t data_s16 =
				880	{
				881	{
				882	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				883	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				884	}
				885	};
				886
				887	int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
				888	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
				889	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
				890	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
				891	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
				892	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
				893	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
				894
				895	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
				896	},
				897	input, output);
				898	}
				899
				900	template <>
				901	template <>
				902	void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
				903	{
				904	Window win_in(window);
				905	win_in.shift(Window::DimX, -3);
				906
				907	Iterator input(_input, win_in);
				908	Iterator output(_output, window);
				909
				910	execute_window_loop(window, [&](const Coordinates & id)
				911	{
				912	const uint8x16_t data = vld1q_u8(input.ptr());
				913
				914	const int16x8x2_t data_s16 =
				915	{
				916	{
				917	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				918	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				919	}
				920	};
				921
				922	const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
				923	const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
				924	const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
				925	const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
				926	const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
				927	const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
				928
				929	int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
				930	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
				931	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
				932	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
				933	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
				934	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
				935	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
				936
				937	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
				938
				939	int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
				940	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
				941	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
				942	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
				943	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
				944	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
				945	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
				946
				947	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
				948	},
				949	input, output);
				950	}
				951
				952	template <>
				953	template <>
				954	inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
				955	{
				956	Window win_in(window);
				957	win_in.shift(Window::DimX, -4);
				958
				959	Iterator input(_input, win_in);
				960	Iterator output(_output, window);
				961
				962	execute_window_loop(window, [&](const Coordinates & id)
				963	{
				964	const uint8x16_t data = vld1q_u8(input.ptr());
				965
				966	const uint16x8x2_t data_u16 =
				967	{
				968	{
				969	vmovl_u8(vget_low_u8(data)),
				970	vmovl_u8(vget_high_u8(data))
				971	}
				972	};
				973
				974	uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
				975	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
				976	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
				977	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
				978	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
				979	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
				980	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
				981	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
				982	out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
				983
				984	vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
				985	},
				986	input, output);
				987	}
				988
				989	template <>
				990	template <>
				991	inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
				992	{
				993	Window win_in(window);
				994	win_in.shift(Window::DimX, -4);
				995
				996	Iterator input(_input, win_in);
				997	Iterator output(_output, window);
				998
				999	execute_window_loop(window, [&](const Coordinates & id)
				1000	{
				1001	const uint8x16_t data = vld1q_u8(input.ptr());
				1002
				1003	const int16x8x2_t data_s16 =
				1004	{
				1005	{
				1006	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				1007	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				1008	}
				1009	};
				1010
				1011	int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
				1012	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
				1013	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
				1014	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
				1015	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
				1016	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
				1017	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
				1018	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
				1019	out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
				1020
				1021	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
				1022	},
				1023	input, output);
				1024	}
				1025
				1026	template <>
				1027	template <>
				1028	void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
				1029	{
				1030	Window win_in(window);
				1031	win_in.shift(Window::DimX, -4);
				1032
				1033	Iterator input(_input, win_in);
				1034	Iterator output(_output, window);
				1035
				1036	execute_window_loop(window, [&](const Coordinates & id)
				1037	{
				1038	const uint8x16_t data = vld1q_u8(input.ptr());
				1039
				1040	const int16x8x2_t data_s16 =
				1041	{
				1042	{
				1043	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				1044	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				1045	}
				1046	};
				1047
				1048	const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
				1049	const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
				1050	const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
				1051	const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
				1052	const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
				1053	const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
				1054	const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
				1055
				1056	int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
				1057	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
				1058	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
				1059	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
				1060	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
				1061	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
				1062	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
				1063	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
				1064	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
				1065
				1066	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
				1067
				1068	int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
				1069	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
				1070	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
				1071	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
				1072	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
				1073	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
				1074	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
				1075	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
				1076	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
				1077
				1078	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
				1079	},
				1080	input, output);
				1081	}
				1082
				// Explicit instantiations of the horizontal separable kernel for matrix sizes 5, 7 and 9,
				// matching the sizes for which convolve<>() is specialised above.
				1083	template class arm_compute::NESeparableConvolutionHorKernel<5>;
				1084	template class arm_compute::NESeparableConvolutionHorKernel<7>;
				1085	template class arm_compute::NESeparableConvolutionHorKernel<9>;
				1086
				1087	template <unsigned int matrix_size>
				1088	NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
				1089	: _conv_col{ { 0 } }, _scale(0)
				1090	{
				1091	}
				1092
				1093	template <unsigned int matrix_size>
				1094	BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
				1095	{
				1096	return BorderSize(matrix_size / 2, 0);
				1097	}
				1098
				1099	template <unsigned int matrix_size>
				1100	void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor input, ITensor output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
				1101	{
				1102	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
				1103
				1104	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				1105
				1106	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				1107	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
				1108	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
				1109	ARM_COMPUTE_ERROR_ON(scale == 0);
				1110
				1111	_input = input;
				1112	_output = output;
				1113	std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
				1114	_scale = scale;
				1115
				1116	// Configure kernel window
				1117	constexpr unsigned int num_elems_processed_per_iteration = 16;
				1118	constexpr unsigned int num_elems_read_per_iteration = 16;
				1119	constexpr unsigned int num_elems_written_per_iteration = 16;
				1120
				1121	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				1122	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				1123
				1124	update_window_and_padding(win,
				1125	AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
				1126	output_access);
				1127
				1128	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				1129
				1130	INEKernel::configure(win);
				1131	}
				1132
				1133	template <unsigned int matrix_size>
				1134	void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
				1135	{
				1136	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				1137	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				1138
				1139	switch(_input->info()->data_type())
				1140	{
				1141	case DataType::U16:
				1142	switch(_output->info()->data_type())
				1143	{
				1144	case DataType::U8:
				1145	convolution_u16<uint8_t>(window);
				1146	break;
				1147	case DataType::S16:
				1148	convolution_u16<int16_t>(window);
				1149	break;
				1150	default:
				1151	ARM_COMPUTE_ERROR("Not supported");
				1152	}
				1153	break;
				1154	case DataType::S16:
				1155	switch(_output->info()->data_type())
				1156	{
				1157	case DataType::U8:
				1158	convolution_s16<uint8_t>(window);
				1159	break;
				1160	case DataType::S16:
				1161	convolution_s16<int16_t>(window);
				1162	break;
				1163	default:
				1164	ARM_COMPUTE_ERROR("Not supported");
				1165	}
				1166	break;
				1167	case DataType::S32:
				1168	switch(_output->info()->data_type())
				1169	{
				1170	case DataType::U8:
				1171	convolution_s32<uint8_t>(window);
				1172	break;
				1173	case DataType::S16:
				1174	convolution_s32<int16_t>(window);
				1175	break;
				1176	default:
				1177	ARM_COMPUTE_ERROR("Not supported");
				1178	}
				1179	break;
				1180	default:
				1181	ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
				1182	break;
				1183	}
				1184	}
				1185
				1186	template <unsigned int matrix_size>
				1187	template <typename OutputType>
				1188	void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
				1189	{
				1190	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1191
				1192	Window win_in(win);
				1193	win_in.set_dimension_step(Window::DimX, 8);
				1194
				1195	Iterator in(_input, win_in);
				1196	Iterator out(_output, win);
				1197
				1198	std::array<unsigned char *, matrix_size> input_ptrs{ {} };
				1199	const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
				1200	const int k_half = matrix_size / 2;
				1201
				1202	// Set row pointers
				1203	for(int i = -k_half; i <= k_half; ++i)
				1204	{
				1205	input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
				1206	}
				1207
				1208	execute_window_loop(win, [&](const Coordinates & id)
				1209	{
				1210	uint16x8_t out0 = vdupq_n_u16(0);
				1211	uint16x8_t out1 = vdupq_n_u16(0);
				1212
				1213	// First half
				1214	for(unsigned int r = 0; r < matrix_size; ++r)
				1215	{
				1216	const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
				1217	out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
				1218	}
				1219
				1220	in.increment(Window::DimX);
				1221
				1222	// Second half
				1223	for(unsigned int r = 0; r < matrix_size; ++r)
				1224	{
				1225	const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
				1226	out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
				1227	}
				1228
				1229	//scale the result if needed
				1230	if(_scale != 1)
				1231	{
				1232	float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
				1233	float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
				1234	out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
				1235	out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
				1236	store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
				1237
				1238	float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
				1239	float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
				1240	out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
				1241	out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
				1242	store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
				1243	}
				1244	else
				1245	{
				1246	store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
				1247	}
				1248	},
				1249	in, out);
				1250	}
				1251
				1252	template <unsigned int matrix_size>
				1253	template <typename OutputType>
				1254	void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
				1255	{
				1256	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1257
				1258	Window win_in(win);
				1259	win_in.set_dimension_step(Window::DimX, 8);
				1260
				1261	Iterator in(_input, win_in);
				1262	Iterator out(_output, win);
				1263
				1264	std::array<unsigned char *, matrix_size> input_ptrs{ {} };
				1265	const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
				1266	const int k_half = matrix_size / 2;
				1267
				1268	// Set row pointers
				1269	for(int i = -k_half; i <= k_half; ++i)
				1270	{
				1271	input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
				1272	}
				1273
				1274	execute_window_loop(win, [&](const Coordinates & id)
				1275	{
				1276	int16x8_t out0 = vdupq_n_s16(0);
				1277	int16x8_t out1 = vdupq_n_s16(0);
				1278
				1279	// First half
				1280	for(unsigned int r = 0; r < matrix_size; ++r)
				1281	{
				1282	const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
				1283	out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
				1284	}
				1285
				1286	in.increment(Window::DimX);
				1287
				1288	// Second half
				1289	for(unsigned int r = 0; r < matrix_size; ++r)
				1290	{
				1291	const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
				1292	out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
				1293	}
				1294
				1295	//scale the result if needed
				1296	if(_scale != 1)
				1297	{
				1298	float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
				1299	float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
				1300	out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
				1301	out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
				1302	store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
				1303
				1304	float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
				1305	float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
				1306	out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
				1307	out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
				1308	store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
				1309	}
				1310	else
				1311	{
				1312	store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
				1313	}
				1314	},
				1315	in, out);
				1316	}
				1317
				1318	template <unsigned int matrix_size>
				1319	template <typename OutputType>
				1320	void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
				1321	{
				1322	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1323
				1324	Window win_in(win);
				1325	win_in.set_dimension_step(Window::DimX, 8);
				1326
				1327	Iterator in(_input, win_in);
				1328	Iterator out(_output, win);
				1329
				1330	std::array<unsigned char *, matrix_size> input_ptrs{ {} };
				1331	const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
				1332	const int k_half = matrix_size / 2;
				1333
				1334	// Set row pointers
				1335	for(int i = -k_half; i <= k_half; ++i)
				1336	{
				1337	input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
				1338	}
				1339
				1340	const int32x4_t zero = vdupq_n_s32(0);
				1341
				1342	execute_window_loop(win, [&](const Coordinates & id)
				1343	{
				1344	int32x4x2_t out0 =
				1345	{
				1346	{
				1347	zero,
				1348	zero
				1349	}
				1350	};
				1351
				1352	int32x4x2_t out1 =
				1353	{
				1354	{
				1355	zero,
				1356	zero
				1357	}
				1358	};
				1359
				1360	// First half
				1361	for(unsigned int r = 0; r < matrix_size; ++r)
				1362	{
				1363	const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
				1364	out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
				1365	out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
				1366	}
				1367
				1368	in.increment(Window::DimX);
				1369
				1370	// Second half
				1371	for(unsigned int r = 0; r < matrix_size; ++r)
				1372	{
				1373	const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
				1374	out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
				1375	out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
				1376	}
				1377
				1378	//scale the result if needed
				1379	if(_scale != 1)
				1380	{
				1381	float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
				1382	float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
				1383	out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
				1384	out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
				1385	out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
				1386	out0.val[1] = vcvtq_s32_f32(out0_f32_even);
				1387
				1388	float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
				1389	float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
				1390	out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
				1391	out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
				1392	out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
				1393	out1.val[1] = vcvtq_s32_f32(out1_f32_even);
				1394	}
				1395
				1396	const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
				1397	store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
				1398
				1399	const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
				1400	store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
				1401	},
				1402	in, out);
				1403	}
				1404
				// Explicit instantiations of the vertical separable kernel for matrix sizes 5, 7 and 9,
				// so that the member templates defined above are emitted for those sizes.
				1405	template class arm_compute::NESeparableConvolutionVertKernel<5>;
				1406	template class arm_compute::NESeparableConvolutionVertKernel<7>;
				1407	template class arm_compute::NESeparableConvolutionVertKernel<9>;
				1408
				1409	/****************************************************************************************\
				1410	* Rectangle Convolution *
				1411	\****************************************************************************************/
				1412
				1413	NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
				1414	: _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
				1415	{
				1416	}
				1417
				1418	BorderSize NEConvolutionRectangleKernel::border_size() const
				1419	{
				1420	return _border_size;
				1421	}
				1422
				1423	void NEConvolutionRectangleKernel::configure(const ITensor input, ITensor output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
				1424	{
				1425	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
				1426
				1427	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				1428
				1429	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				1430	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				1431	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
				1432	ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
				1433	ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
				1434	ARM_COMPUTE_ERROR_ON(0 == scale);
				1435
				1436	_input = input;
				1437	_output = output;
				1438	_scale = scale;
				1439	_border_size = BorderSize(height / 2, width / 2);
				1440
				1441	// Setup the convolution matrix
				1442	const uint32_t nr_elements = width * height;
				1443	_convolution.resize(nr_elements);
				1444	std::copy_n(conv, nr_elements, _convolution.begin());
				1445
				1446	// Set function index to help choose appropriate function in run()
				1447	_func_idx = get_index(height) * 4 + get_index(width);
				1448	ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
				1449
				1450	// Configure kernel window
				1451	constexpr unsigned int num_elems_processed_per_iteration = 8;
				1452	constexpr unsigned int num_elems_read_per_iteration = 16;
				1453	constexpr unsigned int num_elems_written_per_iteration = 8;
				1454
				1455	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
				1456	AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, num_elems_written_per_iteration);
				1457
				1458	update_window_and_padding(win,
				1459	AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
				1460	output_access);
				1461
				1462	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
				1463
				1464	INEKernel::configure(win);
				1465	}
				1466
				1467	void NEConvolutionRectangleKernel::run(const Window &window)
				1468	{
				1469	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				1470	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				1471
				1472	using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
				1473
				1474	// uint8_t function table
				1475	static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
				1476	{
				1477	{
				1478	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
				1479	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
				1480	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
				1481	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
				1482	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
				1483	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
				1484	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
				1485	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
				1486	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
				1487	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
				1488	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
				1489	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
				1490	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
				1491	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
				1492	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
				1493	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
				1494	}
				1495	};
				1496	// int16_t function table
				1497	static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
				1498	{
				1499	{
				1500	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
				1501	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
				1502	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
				1503	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
				1504	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
				1505	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
				1506	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
				1507	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
				1508	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
				1509	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
				1510	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
				1511	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
				1512	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
				1513	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
				1514	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
				1515	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
				1516	}
				1517	};
				1518
				1519	// Run appropriate function
				1520	switch(_output->info()->format())
				1521	{
				1522	case Format::U8:
				1523	ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
				1524	(this->*func_table_u8[_func_idx])(window);
				1525	break;
				1526	case Format::S16:
				1527	ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
				1528	(this->*func_table_s16[_func_idx])(window);
				1529	break;
				1530	default:
				1531	ARM_COMPUTE_ERROR("Not supported");
				1532	}
				1533	}
				1534
				1535	unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
				1536	{
				1537	switch(val)
				1538	{
				1539	case 3:
				1540	return 0;
				1541	case 5:
				1542	return 1;
				1543	case 7:
				1544	return 2;
				1545	case 9:
				1546	return 3;
				1547	default:
				1548	ARM_COMPUTE_ERROR("Not supported dimension size");
				1549	return 0;
				1550	}
				1551	}
				1552
				1553	template <typename OutputType, unsigned int rows, unsigned int cols>
				1554	void NEConvolutionRectangleKernel::convolution(const Window &win)
				1555	{
				1556	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1557	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				1558
				1559	Iterator input(_input, win);
				1560	Iterator output(_output, win);
				1561
				1562	std::array<unsigned char *, rows> input_ptrs{ {} };
				1563	const int16_t *conv = _convolution.data();
				1564	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				1565	const int k_row_half = rows / 2;
				1566	const int k_col_half = cols / 2;
				1567
				1568	// Set row pointers
				1569	for(int i = -k_row_half; i <= k_row_half; ++i)
				1570	{
				1571	input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
				1572	}
				1573
				1574	execute_window_loop(win, [&](const Coordinates & id)
				1575	{
				1576	int32x4_t out = vdupq_n_s32(0);
				1577	int32x4_t out2 = vdupq_n_s32(0);
				1578
				1579	// Perform appropriate convolution
				1580	for(unsigned int r = 0; r < rows; ++r)
				1581	{
				1582	const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
				1583	if(3 == cols)
				1584	{
				1585	convolve_row3x1(out, out2, data, conv + r * cols);
				1586	}
				1587	else if(5 == cols)
				1588	{
				1589	convolve_row5x1(out, out2, data, conv + r * cols);
				1590	}
				1591	else if(7 == cols)
				1592	{
				1593	convolve_row7x1(out, out2, data, conv + r * cols);
				1594	}
				1595	else if(9 == cols)
				1596	{
				1597	convolve_row9x1(out, out2, data, conv + r * cols);
				1598	}
				1599	else
				1600	{
				1601	ARM_COMPUTE_ERROR("Unsupported number of columns");
				1602	}
				1603	}
				1604
				1605	// Apply scale
				1606	if(_scale != 1)
				1607	{
				1608	// Convert to F32, scale and convert back to S32
				1609	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				1610	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				1611	}
				1612
				1613	// Clamp and store as U8 or S16:
				1614	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				1615	},
				1616	input, output);
				1617	}
				1618	} // namespace arm_compute