Blame - src/core/NEON/kernels/NEConvolutionKernel.cpp - ml/ComputeLibrary

blob: 263fbe058ad63767043285d55234037f186261fa [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
				25
				26	#include "arm_compute/core/Coordinates.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/ITensor.h"
				30	#include "arm_compute/core/TensorInfo.h"
				31	#include "arm_compute/core/Types.h"
				32	#include "arm_compute/core/Utils.h"
				33	#include "arm_compute/core/Validate.h"
				34	#include "arm_compute/core/Window.h"
				35
				36	#include <algorithm>
				37	#include <arm_neon.h>
				38	#include <array>
				39	#include <cstdint>
				40	#include <cstring>
				41	#include <tuple>
				42
				43	namespace arm_compute
				44	{
				45	namespace
				46	{
				47	const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
				48
				49	inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
				50	{
				51	const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
				52	vqmovn_s32(out2));
				53	vst1q_s16(output, s16results);
				54	}
				55
				56	inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
				57	{
				58	const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
				59	vqmovun_s32(out2)));
				60	vst1_u8(output, u8results);
				61	}
				62
				63	inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
				64	{
				65	const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
				66	const int16x8_t s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
				67	vst1q_s16(output, s16results);
				68	}
				69
				70	inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
				71	{
				72	const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
				73	vqmovn_u32(out2)));
				74	vst1_u8(output, u8results);
				75	}
				76
				77	inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
				78	{
				79	vst1q_s16(output, out);
				80	vst1q_s16(output + 8, out2);
				81	}
				82
				83	inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
				84	{
				85	const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
				86	vqmovun_s16(out2));
				87	vst1q_u8(output, u8results);
				88	}
				89
				90	inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
				91	{
				92	const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
				93	vqmovn_u16(out2));
				94	vst1q_u8(output, u8results);
				95	}
				96
				97	inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
				98	{
				99	vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
				100	vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
				101	}
				102
				103	inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
				104	{
				105	// Convert to s16 and split in blocks of 4 values:
				106	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				107	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				108
				109	const int16x4x3_t row =
				110	{
				111	{
				112	vget_low_s16(s16_tmp0),
				113	vget_high_s16(s16_tmp0),
				114	vget_low_s16(s16_tmp1)
				115	}
				116	};
				117
				118	// Calculate row left value for pixels [0,3]
				119	out = vmlal_s16(out, row.val[0], mat0);
				120	// Calculate row middle value for pixels [0,3]
				121	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				122	// Calculate row right value for pixels [0,3]
				123	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				124
				125	// Calculate row left value for pixels [4,7]
				126	out2 = vmlal_s16(out2, row.val[1], mat0);
				127	// Calculate row middle value for pixels [4,7]
				128	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				129	// Calculate row right value for pixels [4,7]
				130	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				131	}
				132
				133	inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				134	{
				135	const int16x4_t mat0 = vld1_dup_s16(convolution);
				136	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				137	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				138
				139	convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
				140	}
				141
				142	inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				143	{
				144	const int16x4_t mat0 = vld1_dup_s16(convolution);
				145	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				146	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				147	const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
				148	const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
				149
				150	// Convert to s16 and split in blocks of 4 values:
				151	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				152	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				153
				154	const int16x4x3_t row =
				155	{
				156	{
				157	vget_low_s16(s16_tmp0),
				158	vget_high_s16(s16_tmp0),
				159	vget_low_s16(s16_tmp1)
				160	}
				161	};
				162
				163	// Calculate row left 2 value for pixels [0,3]
				164	out = vmlal_s16(out, row.val[0], mat0);
				165	// Calculate row left 1 value for pixels [0,3]
				166	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				167	// Calculate row middle value for pixels [0,3]
				168	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				169	// Calculate row right +1 value for pixels [0,3]
				170	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
				171	// Calculate row right +2 value for pixels [0,3]
				172	out = vmlal_s16(out, row.val[1], mat4);
				173
				174	// Calculate row left 2 value for pixels [4,7]
				175	out2 = vmlal_s16(out2, row.val[1], mat0);
				176	// Calculate row left 1 value for pixels [4,7]
				177	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				178	// Calculate row middle value for pixels [4,7]
				179	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				180	// Calculate row right +1 value for pixels [4,7]
				181	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
				182	// Calculate row right +2 value for pixels [4,7]
				183	out2 = vmlal_s16(out2, row.val[2], mat4);
				184	}
				185
				186	inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				187	{
				188	const int16x4_t mat0 = vld1_dup_s16(convolution);
				189	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				190	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				191	const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
				192	const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
				193	const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
				194	const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
				195
				196	// Convert to s16 and split in blocks of 4 values:
				197	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				198	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				199
				200	const int16x4x4_t row =
				201	{
				202	{
				203	vget_low_s16(s16_tmp0),
				204	vget_high_s16(s16_tmp0),
				205	vget_low_s16(s16_tmp1),
				206	vget_high_s16(s16_tmp1)
				207	}
				208	};
				209
				210	// Calculate row left 3 value for pixels [0,3]
				211	out = vmlal_s16(out, row.val[0], mat0);
				212	// Calculate row left 2 value for pixels [0,3]
				213	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				214	// Calculate row left 1 value for pixels [0,3]
				215	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				216	// Calculate row middle value for pixels [0,3]
				217	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
				218	// Calculate row right +1 value for pixels [0,3]
				219	out = vmlal_s16(out, row.val[1], mat4);
				220	// Calculate row right +2 value for pixels [0,3]
				221	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
				222	// Calculate row right +3 value for pixels [0,3]
				223	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
				224
				225	// Calculate row left 3 value for pixels [4,7]
				226	out2 = vmlal_s16(out2, row.val[1], mat0);
				227	// Calculate row left 2 value for pixels [4,7]
				228	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				229	// Calculate row left 1 value for pixels [4,7]
				230	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				231	// Calculate row middle value for pixels [4,7]
				232	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
				233	// Calculate row right +1 value for pixels [4,7]
				234	out2 = vmlal_s16(out2, row.val[2], mat4);
				235	// Calculate row right +2 value for pixels [4,7]
				236	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
				237	// Calculate row right +3 value for pixels [4,7]
				238	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
				239	}
				240
				241	inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
				242	{
				243	const int16x4_t mat0 = vld1_dup_s16(convolution);
				244	const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
				245	const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
				246	const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
				247	const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
				248	const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
				249	const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
				250	const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
				251	const int16x4_t mat8 = vld1_dup_s16(convolution + 8);
				252
				253	// Convert to s16 and split in blocks of 4 values:
				254	const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
				255	const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
				256
				257	const int16x4x4_t row =
				258	{
				259	{
				260	vget_low_s16(s16_tmp0),
				261	vget_high_s16(s16_tmp0),
				262	vget_low_s16(s16_tmp1),
				263	vget_high_s16(s16_tmp1)
				264	}
				265	};
				266
				267	// Calculate row left 4 value for pixels [0,3]
				268	out = vmlal_s16(out, row.val[0], mat0);
				269	// Calculate row left 3 value for pixels [0,3]
				270	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
				271	// Calculate row left 2 value for pixels [0,3]
				272	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
				273	// Calculate row left 1 value for pixels [0,3]
				274	out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
				275	// Calculate row middle value for pixels [0,3]
				276	out = vmlal_s16(out, row.val[1], mat4);
				277	// Calculate row right +1 value for pixels [0,3]
				278	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
				279	// Calculate row right +2 value for pixels [0,3]
				280	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
				281	// Calculate row right +3 value for pixels [0,3]
				282	out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
				283	// Calculate row right +4 value for pixels [0,3]
				284	out = vmlal_s16(out, row.val[2], mat8);
				285
				286	// Calculate row left 4 value for pixels [0,3]
				287	out2 = vmlal_s16(out2, row.val[1], mat0);
				288	// Calculate row left 3 value for pixels [0,3]
				289	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
				290	// Calculate row left 2 value for pixels [0,3]
				291	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
				292	// Calculate row left 1 value for pixels [0,3]
				293	out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
				294	// Calculate row middle value for pixels [0,3]
				295	out2 = vmlal_s16(out2, row.val[2], mat4);
				296	// Calculate row right +1 value for pixels [0,3]
				297	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
				298	// Calculate row right +2 value for pixels [0,3]
				299	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
				300	// Calculate row right +3 value for pixels [0,3]
				301	out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
				302	// Calculate row right +4 value for pixels [0,3]
				303	out2 = vmlal_s16(out2, row.val[3], mat8);
				304	}
				305	} // namespace
				306
				307	/****************************************************************************************\
				308	* Square Convolution *
				309	\****************************************************************************************/
				310
				311	template <unsigned int matrix_size>
				312	NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
				313	: INESimpleKernel(), _scale(0), _convolution{ {} }
				314	{
				315	}
				316
				317	template <unsigned int matrix_size>
				318	BorderSize NEConvolutionKernel<matrix_size>::border_size() const
				319	{
				320	return BorderSize(matrix_size / 2);
				321	}
				322
				323	template <unsigned int matrix_size>
				324	void NEConvolutionKernel<matrix_size>::configure(const ITensor input, ITensor output, const int16_t *conv, uint32_t scale, bool border_undefined)
				325	{
				326	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
				327
				328	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				329
				330	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				331	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				332	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
				333
				334	_input = input;
				335	_output = output;
				336
				337	std::copy_n(conv, _convolution.size(), _convolution.begin());
				338
				339	if(scale == 0)
				340	{
				341	_scale = calculate_matrix_scale(_convolution.data(), matrix_size);
				342	}
				343	else
				344	{
				345	_scale = scale;
				346	}
				347
				348	// Configure kernel window
				349	constexpr unsigned int num_elems_processed_per_iteration = 8;
				350	constexpr unsigned int num_elems_read_per_iteration = 16;
				351	constexpr unsigned int num_elems_written_per_iteration = 8;
				352
				353	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				354	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				355
				356	update_window_and_padding(win,
				357	AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
				358	output_access);
				359
				360	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				361
				362	INEKernel::configure(win);
				363	}
				364
				365	template <>
				366	template <typename OutputType>
				367	void NEConvolutionKernel<3>::convolution(const Window &win)
				368	{
				369	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				370	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				371
				372	Iterator input(_input, win);
				373	Iterator output(_output, win);
				374
				375	// Load the matrix's coefficients into NEON registers:
				376	const int16x4_t mat00 = vld1_dup_s16(_convolution.data());
				377	const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1);
				378	const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2);
				379	const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3);
				380	const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4);
				381	const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5);
				382	const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6);
				383	const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7);
				384	const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8);
				385	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				386
				387	const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
				388	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
				389	const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
				390
				391	execute_window_loop(win, [&](const Coordinates & id)
				392	{
				393	int32x4_t out = vdupq_n_s32(0);
				394	int32x4_t out2 = vdupq_n_s32(0);
				395
				396	// Load 16 bytes from the top row:
				397	const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
				398	convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);
				399
				400	// Load 16 bytes from the middle row:
				401	const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
				402	convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);
				403
				404	// Load 16 bytes from the middle row:
				405	const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
				406	convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);
				407
				408	// Apply scale
				409	if(_scale != 1)
				410	{
				411	// Convert to F32, scale and convert back to S32
				412	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				413	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				414	}
				415
				416	// Clamp and store as U8 or S16:
				417	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				418	},
				419	input, output);
				420	}
				421
				422	template <>
				423	template <typename OutputType>
				424	void NEConvolutionKernel<5>::convolution(const Window &win)
				425	{
				426	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				427	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				428
				429	Iterator input(_input, win);
				430	Iterator output(_output, win);
				431
				432	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				433
				434	const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
				435	const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
				436	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
				437	const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
				438	const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
				439
				440	execute_window_loop(win, [&](const Coordinates & id)
				441	{
				442	int32x4_t out = vdupq_n_s32(0);
				443	int32x4_t out2 = vdupq_n_s32(0);
				444
				445	// Load 16 bytes from the top2 row:
				446	const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
				447	convolve_row5x1(out, out2, data_t2, _convolution.data());
				448
				449	// Load 16 bytes from the top1 row:
				450	const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
				451	convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);
				452
				453	// Load 16 bytes from the middle row:
				454	const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
				455	convolve_row5x1(out, out2, data_m, _convolution.data() + 10);
				456
				457	// Load 16 bytes from the low1 row:
				458	const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
				459	convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);
				460
				461	// Load 16 bytes from the low2 row:
				462	const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
				463	convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);
				464
				465	// Apply scale
				466	if(_scale != 1)
				467	{
				468	// Convert to F32, scale and convert back to S32
				469	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				470	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				471	}
				472
				473	// Clamp and store as U8 or S16:
				474	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				475	},
				476	input, output);
				477	}
				478
				479	template <>
				480	template <typename OutputType>
				481	void NEConvolutionKernel<7>::convolution(const Window &win)
				482	{
				483	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				484	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				485
				486	Iterator input(_input, win);
				487	Iterator output(_output, win);
				488
				489	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				490
				491	const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
				492	const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
				493	const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
				494	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
				495	const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
				496	const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
				497	const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
				498
				499	execute_window_loop(win, [&](const Coordinates & id)
				500	{
				501	int32x4_t out = vdupq_n_s32(0);
				502	int32x4_t out2 = vdupq_n_s32(0);
				503
				504	// Load 16 bytes from the top3 row:
				505	const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
				506	convolve_row7x1(out, out2, data_t3, _convolution.data());
				507
				508	// Load 16 bytes from the top2 row:
				509	const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
				510	convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);
				511
				512	// Load 16 bytes from the top1 row:
				513	const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
				514	convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);
				515
				516	// Load 16 bytes from the middle row:
				517	const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
				518	convolve_row7x1(out, out2, data_m, _convolution.data() + 21);
				519
				520	// Load 16 bytes from the low1 row:
				521	const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
				522	convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);
				523
				524	// Load 16 bytes from the low2 row:
				525	const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
				526	convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);
				527
				528	// Load 16 bytes from the low3 row:
				529	const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
				530	convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);
				531
				532	// Apply scale
				533	if(_scale != 1)
				534	{
				535	// Convert to F32, scale and convert back to S32
				536	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				537	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				538	}
				539
				540	// Clamp and store as U8 or S16:
				541	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				542	},
				543	input, output);
				544	}
				545
				546	template <>
				547	template <typename OutputType>
				548	void NEConvolutionKernel<9>::convolution(const Window &win)
				549	{
				550	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				551	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				552
				553	Iterator input(_input, win);
				554	Iterator output(_output, win);
				555
				556	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				557
				558	const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
				559	const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
				560	const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
				561	const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
				562	const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
				563	const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
				564	const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
				565	const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
				566	const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
				567
				568	execute_window_loop(win, [&](const Coordinates & id)
				569	{
				570	int32x4_t out = vdupq_n_s32(0);
				571	int32x4_t out2 = vdupq_n_s32(0);
				572
				573	// Load 16 bytes from the top4 row:
				574	const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
				575	convolve_row9x1(out, out2, data_t4, _convolution.data());
				576
				577	// Load 16 bytes from the top3 row:
				578	const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
				579	convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);
				580
				581	// Load 16 bytes from the top2 row:
				582	const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
				583	convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);
				584
				585	// Load 16 bytes from the top1 row:
				586	const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
				587	convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);
				588
				589	// Load 16 bytes from the middle row:
				590	const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
				591	convolve_row9x1(out, out2, data_m, _convolution.data() + 36);
				592
				593	// Load 16 bytes from the low1 row:
				594	const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
				595	convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);
				596
				597	// Load 16 bytes from the low2 row:
				598	const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
				599	convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);
				600
				601	// Load 16 bytes from the low3 row:
				602	const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
				603	convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);
				604
				605	// Load 16 bytes from the low4 row:
				606	const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
				607	convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);
				608
				609	// Apply scale
				610	if(_scale != 1)
				611	{
				612	// Convert to F32, scale and convert back to S32
				613	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				614	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				615	}
				616
				617	// Clamp and store as U8 or S16:
				618	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				619	},
				620	input, output);
				621	}
				622
				623	template <unsigned int matrix_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	624	void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	625	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	626	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	627	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				628	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				629
				630	switch(_output->info()->format())
				631	{
				632	case Format::U8:
				633	convolution<uint8_t>(window);
				634	break;
				635	case Format::S16:
				636	convolution<int16_t>(window);
				637	break;
				638	default:
				639	ARM_COMPUTE_ERROR("Not supported");
				640	}
				641	}
				642
				643	template class arm_compute::NEConvolutionKernel<3>;
				644	template class arm_compute::NEConvolutionKernel<5>;
				645	template class arm_compute::NEConvolutionKernel<7>;
				646	template class arm_compute::NEConvolutionKernel<9>;
				647
				648	/****************************************************************************************\
				649	* Separable Square Convolution *
				650	\****************************************************************************************/
				651
				652	template <unsigned int matrix_size>
				653	NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
				654	: _conv_row{ { 0 } }, _border_size(0)
				655	{
				656	}
				657
				658	template <unsigned int matrix_size>
				659	BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
				660	{
				661	return _border_size;
				662	}
				663
				664	template <unsigned int matrix_size>
				665	void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor input, ITensor output, const int16_t *conv_row, bool border_undefined)
				666	{
				667	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
				668
				669	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				670
				671	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				672	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				673	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
				674
				675	_input = input;
				676	_output = output;
				677	std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
				678	_border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
				679
				680	// Configure kernel window
				681	constexpr unsigned int num_elems_processed_per_iteration = 8;
				682	constexpr unsigned int num_elems_read_per_iteration = 16;
				683	constexpr unsigned int num_elems_written_per_iteration = 8;
				684
				685	Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				686	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				687
				688	update_window_and_padding(win,
				689	AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
				690	output_access);
				691
				692	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				693
				694	INEKernel::configure(win);
				695	}
				696
				697	template <unsigned int matrix_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	698	void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	699	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	700	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	701	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				702	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				703	switch(_output->info()->data_type())
				704	{
				705	case DataType::U16:
				706	convolve<uint16_t>(window);
				707	break;
				708	case DataType::S16:
				709	convolve<int16_t>(window);
				710	break;
				711	case DataType::S32:
				712	convolve<int32_t>(window);
				713	break;
				714	default:
				715	ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
				716	break;
				717	}
				718	}
				719
				720	template <>
				721	template <>
				722	inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
				723	{
				724	Window win_in(window);
				725	win_in.shift(Window::DimX, -2);
				726
				727	Iterator input(_input, win_in);
				728	Iterator output(_output, window);
				729
				730	execute_window_loop(window, [&](const Coordinates & id)
				731	{
				732	const uint8x16_t data = vld1q_u8(input.ptr());
				733
				734	const uint16x8x2_t data_u16 =
				735	{
				736	{
				737	vmovl_u8(vget_low_u8(data)),
				738	vmovl_u8(vget_high_u8(data))
				739	}
				740	};
				741
				742	uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
				743	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
				744	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
				745	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
				746	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
				747
				748	vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
				749	},
				750	input, output);
				751	}
				752
				753	template <>
				754	template <>
				755	inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
				756	{
				757	Window win_in(window);
				758	win_in.shift(Window::DimX, -2);
				759
				760	Iterator input(_input, win_in);
				761	Iterator output(_output, window);
				762
				763	execute_window_loop(window, [&](const Coordinates & id)
				764	{
				765	const uint8x16_t data = vld1q_u8(input.ptr());
				766
				767	const int16x8x2_t data_s16 =
				768	{
				769	{
				770	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				771	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				772	}
				773	};
				774
				775	int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
				776	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
				777	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
				778	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
				779	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
				780
				781	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
				782	},
				783	input, output);
				784	}
				785
				786	template <>
				787	template <>
				788	void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
				789	{
				790	Window win_in(window);
				791	win_in.shift(Window::DimX, -2);
				792
				793	Iterator input(_input, win_in);
				794	Iterator output(_output, window);
				795
				796	execute_window_loop(window, [&](const Coordinates & id)
				797	{
				798	const uint8x16_t data = vld1q_u8(input.ptr());
				799
				800	const int16x8x2_t data_s16 =
				801	{
				802	{
				803	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				804	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				805	}
				806	};
				807
				808	const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
				809	const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
				810	const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
				811	const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
				812
				813	int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
				814	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
				815	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
				816	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
				817	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);
				818
				819	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
				820
				821	int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
				822	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
				823	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
				824	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
				825	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);
				826
				827	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
				828	},
				829	input, output);
				830	}
				831
				832	template <>
				833	template <>
				834	inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
				835	{
				836	Window win_in(window);
				837	win_in.shift(Window::DimX, -3);
				838
				839	Iterator input(_input, win_in);
				840	Iterator output(_output, window);
				841
				842	execute_window_loop(window, [&](const Coordinates & id)
				843	{
				844	const uint8x16_t data = vld1q_u8(input.ptr());
				845
				846	const uint16x8x2_t data_u16 =
				847	{
				848	{
				849	vmovl_u8(vget_low_u8(data)),
				850	vmovl_u8(vget_high_u8(data))
				851	}
				852	};
				853
				854	uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
				855	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
				856	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
				857	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
				858	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
				859	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
				860	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
				861
				862	vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
				863	},
				864	input, output);
				865	}
				866
				867	template <>
				868	template <>
				869	inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
				870	{
				871	Window win_in(window);
				872	win_in.shift(Window::DimX, -3);
				873
				874	Iterator input(_input, win_in);
				875	Iterator output(_output, window);
				876
				877	execute_window_loop(window, [&](const Coordinates & id)
				878	{
				879	const uint8x16_t data = vld1q_u8(input.ptr());
				880
				881	const int16x8x2_t data_s16 =
				882	{
				883	{
				884	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				885	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				886	}
				887	};
				888
				889	int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
				890	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
				891	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
				892	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
				893	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
				894	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
				895	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
				896
				897	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
				898	},
				899	input, output);
				900	}
				901
				902	template <>
				903	template <>
				904	void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
				905	{
				906	Window win_in(window);
				907	win_in.shift(Window::DimX, -3);
				908
				909	Iterator input(_input, win_in);
				910	Iterator output(_output, window);
				911
				912	execute_window_loop(window, [&](const Coordinates & id)
				913	{
				914	const uint8x16_t data = vld1q_u8(input.ptr());
				915
				916	const int16x8x2_t data_s16 =
				917	{
				918	{
				919	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				920	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				921	}
				922	};
				923
				924	const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
				925	const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
				926	const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
				927	const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
				928	const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
				929	const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
				930
				931	int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
				932	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
				933	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
				934	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
				935	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
				936	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
				937	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
				938
				939	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
				940
				941	int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
				942	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
				943	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
				944	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
				945	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
				946	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
				947	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
				948
				949	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
				950	},
				951	input, output);
				952	}
				953
				954	template <>
				955	template <>
				956	inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
				957	{
				958	Window win_in(window);
				959	win_in.shift(Window::DimX, -4);
				960
				961	Iterator input(_input, win_in);
				962	Iterator output(_output, window);
				963
				964	execute_window_loop(window, [&](const Coordinates & id)
				965	{
				966	const uint8x16_t data = vld1q_u8(input.ptr());
				967
				968	const uint16x8x2_t data_u16 =
				969	{
				970	{
				971	vmovl_u8(vget_low_u8(data)),
				972	vmovl_u8(vget_high_u8(data))
				973	}
				974	};
				975
				976	uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
				977	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
				978	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
				979	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
				980	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
				981	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
				982	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
				983	out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
				984	out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
				985
				986	vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
				987	},
				988	input, output);
				989	}
				990
				991	template <>
				992	template <>
				993	inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
				994	{
				995	Window win_in(window);
				996	win_in.shift(Window::DimX, -4);
				997
				998	Iterator input(_input, win_in);
				999	Iterator output(_output, window);
				1000
				1001	execute_window_loop(window, [&](const Coordinates & id)
				1002	{
				1003	const uint8x16_t data = vld1q_u8(input.ptr());
				1004
				1005	const int16x8x2_t data_s16 =
				1006	{
				1007	{
				1008	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				1009	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				1010	}
				1011	};
				1012
				1013	int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
				1014	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
				1015	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
				1016	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
				1017	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
				1018	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
				1019	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
				1020	out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
				1021	out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
				1022
				1023	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
				1024	},
				1025	input, output);
				1026	}
				1027
				1028	template <>
				1029	template <>
				1030	void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
				1031	{
				1032	Window win_in(window);
				1033	win_in.shift(Window::DimX, -4);
				1034
				1035	Iterator input(_input, win_in);
				1036	Iterator output(_output, window);
				1037
				1038	execute_window_loop(window, [&](const Coordinates & id)
				1039	{
				1040	const uint8x16_t data = vld1q_u8(input.ptr());
				1041
				1042	const int16x8x2_t data_s16 =
				1043	{
				1044	{
				1045	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
				1046	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
				1047	}
				1048	};
				1049
				1050	const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
				1051	const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
				1052	const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
				1053	const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
				1054	const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
				1055	const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
				1056	const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
				1057
				1058	int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
				1059	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
				1060	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
				1061	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
				1062	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
				1063	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
				1064	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
				1065	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
				1066	out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
				1067
				1068	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
				1069
				1070	int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
				1071	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
				1072	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
				1073	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
				1074	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
				1075	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
				1076	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
				1077	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
				1078	out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
				1079
				1080	vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
				1081	},
				1082	input, output);
				1083	}
				1084
				// Explicit instantiations of the horizontal kernel for matrix sizes 5, 7 and 9, matching
				// the convolve() specialisations defined above.
				1085	template class arm_compute::NESeparableConvolutionHorKernel<5>;
				1086	template class arm_compute::NESeparableConvolutionHorKernel<7>;
				1087	template class arm_compute::NESeparableConvolutionHorKernel<9>;
				1088
				1089	template <unsigned int matrix_size>
				1090	NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
				1091	: _conv_col{ { 0 } }, _scale(0)
				1092	{
				1093	}
				1094
				1095	template <unsigned int matrix_size>
				1096	BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
				1097	{
				1098	return BorderSize(matrix_size / 2, 0);
				1099	}
				1100
				1101	template <unsigned int matrix_size>
				1102	void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor input, ITensor output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
				1103	{
				1104	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
				1105
				1106	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				1107
				1108	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				1109	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
				1110	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
				1111	ARM_COMPUTE_ERROR_ON(scale == 0);
				1112
				1113	_input = input;
				1114	_output = output;
				1115	std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
				1116	_scale = scale;
				1117
				1118	// Configure kernel window
				1119	constexpr unsigned int num_elems_processed_per_iteration = 16;
				1120	constexpr unsigned int num_elems_read_per_iteration = 16;
				1121	constexpr unsigned int num_elems_written_per_iteration = 16;
				1122
				1123	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				1124	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				1125
				1126	update_window_and_padding(win,
				1127	AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
				1128	output_access);
				1129
				1130	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				1131
				1132	INEKernel::configure(win);
				1133	}
				1134
				1135	template <unsigned int matrix_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	1136	void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1137	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	1138	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1139	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				1140	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				1141
				1142	switch(_input->info()->data_type())
				1143	{
				1144	case DataType::U16:
				1145	switch(_output->info()->data_type())
				1146	{
				1147	case DataType::U8:
				1148	convolution_u16<uint8_t>(window);
				1149	break;
				1150	case DataType::S16:
				1151	convolution_u16<int16_t>(window);
				1152	break;
				1153	default:
				1154	ARM_COMPUTE_ERROR("Not supported");
				1155	}
				1156	break;
				1157	case DataType::S16:
				1158	switch(_output->info()->data_type())
				1159	{
				1160	case DataType::U8:
				1161	convolution_s16<uint8_t>(window);
				1162	break;
				1163	case DataType::S16:
				1164	convolution_s16<int16_t>(window);
				1165	break;
				1166	default:
				1167	ARM_COMPUTE_ERROR("Not supported");
				1168	}
				1169	break;
				1170	case DataType::S32:
				1171	switch(_output->info()->data_type())
				1172	{
				1173	case DataType::U8:
				1174	convolution_s32<uint8_t>(window);
				1175	break;
				1176	case DataType::S16:
				1177	convolution_s32<int16_t>(window);
				1178	break;
				1179	default:
				1180	ARM_COMPUTE_ERROR("Not supported");
				1181	}
				1182	break;
				1183	default:
				1184	ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
				1185	break;
				1186	}
				1187	}
				1188
				1189	template <unsigned int matrix_size>
				1190	template <typename OutputType>
				1191	void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
				1192	{
				1193	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1194
				1195	Window win_in(win);
				1196	win_in.set_dimension_step(Window::DimX, 8);
				1197
				1198	Iterator in(_input, win_in);
				1199	Iterator out(_output, win);
				1200
				1201	std::array<unsigned char *, matrix_size> input_ptrs{ {} };
				1202	const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
				1203	const int k_half = matrix_size / 2;
				1204
				1205	// Set row pointers
				1206	for(int i = -k_half; i <= k_half; ++i)
				1207	{
				1208	input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
				1209	}
				1210
				1211	execute_window_loop(win, [&](const Coordinates & id)
				1212	{
				1213	uint16x8_t out0 = vdupq_n_u16(0);
				1214	uint16x8_t out1 = vdupq_n_u16(0);
				1215
				1216	// First half
				1217	for(unsigned int r = 0; r < matrix_size; ++r)
				1218	{
				1219	const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
				1220	out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
				1221	}
				1222
				1223	in.increment(Window::DimX);
				1224
				1225	// Second half
				1226	for(unsigned int r = 0; r < matrix_size; ++r)
				1227	{
				1228	const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
				1229	out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
				1230	}
				1231
				1232	//scale the result if needed
				1233	if(_scale != 1)
				1234	{
				1235	float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
				1236	float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
				1237	out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
				1238	out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
				1239	store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
				1240
				1241	float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
				1242	float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
				1243	out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
				1244	out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
				1245	store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
				1246	}
				1247	else
				1248	{
				1249	store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
				1250	}
				1251	},
				1252	in, out);
				1253	}
				1254
				1255	template <unsigned int matrix_size>
				1256	template <typename OutputType>
				1257	void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
				1258	{
				1259	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1260
				1261	Window win_in(win);
				1262	win_in.set_dimension_step(Window::DimX, 8);
				1263
				1264	Iterator in(_input, win_in);
				1265	Iterator out(_output, win);
				1266
				1267	std::array<unsigned char *, matrix_size> input_ptrs{ {} };
				1268	const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
				1269	const int k_half = matrix_size / 2;
				1270
				1271	// Set row pointers
				1272	for(int i = -k_half; i <= k_half; ++i)
				1273	{
				1274	input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
				1275	}
				1276
				1277	execute_window_loop(win, [&](const Coordinates & id)
				1278	{
				1279	int16x8_t out0 = vdupq_n_s16(0);
				1280	int16x8_t out1 = vdupq_n_s16(0);
				1281
				1282	// First half
				1283	for(unsigned int r = 0; r < matrix_size; ++r)
				1284	{
				1285	const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
				1286	out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
				1287	}
				1288
				1289	in.increment(Window::DimX);
				1290
				1291	// Second half
				1292	for(unsigned int r = 0; r < matrix_size; ++r)
				1293	{
				1294	const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
				1295	out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
				1296	}
				1297
				1298	//scale the result if needed
				1299	if(_scale != 1)
				1300	{
				1301	float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
				1302	float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
				1303	out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
				1304	out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
				1305	store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
				1306
				1307	float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
				1308	float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
				1309	out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
				1310	out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
				1311	store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
				1312	}
				1313	else
				1314	{
				1315	store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
				1316	}
				1317	},
				1318	in, out);
				1319	}
				1320
				1321	template <unsigned int matrix_size>
				1322	template <typename OutputType>
				1323	void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
				1324	{
				1325	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1326
				1327	Window win_in(win);
				1328	win_in.set_dimension_step(Window::DimX, 8);
				1329
				1330	Iterator in(_input, win_in);
				1331	Iterator out(_output, win);
				1332
				1333	std::array<unsigned char *, matrix_size> input_ptrs{ {} };
				1334	const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
				1335	const int k_half = matrix_size / 2;
				1336
				1337	// Set row pointers
				1338	for(int i = -k_half; i <= k_half; ++i)
				1339	{
				1340	input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
				1341	}
				1342
				1343	const int32x4_t zero = vdupq_n_s32(0);
				1344
				1345	execute_window_loop(win, [&](const Coordinates & id)
				1346	{
				1347	int32x4x2_t out0 =
				1348	{
				1349	{
				1350	zero,
				1351	zero
				1352	}
				1353	};
				1354
				1355	int32x4x2_t out1 =
				1356	{
				1357	{
				1358	zero,
				1359	zero
				1360	}
				1361	};
				1362
				1363	// First half
				1364	for(unsigned int r = 0; r < matrix_size; ++r)
				1365	{
				1366	const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
				1367	out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
				1368	out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
				1369	}
				1370
				1371	in.increment(Window::DimX);
				1372
				1373	// Second half
				1374	for(unsigned int r = 0; r < matrix_size; ++r)
				1375	{
				1376	const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
				1377	out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
				1378	out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
				1379	}
				1380
				1381	//scale the result if needed
				1382	if(_scale != 1)
				1383	{
				1384	float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
				1385	float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
				1386	out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
				1387	out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
				1388	out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
				1389	out0.val[1] = vcvtq_s32_f32(out0_f32_even);
				1390
				1391	float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
				1392	float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
				1393	out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
				1394	out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
				1395	out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
				1396	out1.val[1] = vcvtq_s32_f32(out1_f32_even);
				1397	}
				1398
				1399	const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
				1400	store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
				1401
				1402	const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
				1403	store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
				1404	},
				1405	in, out);
				1406	}
				1407
				// Explicit instantiations of the vertical kernel for matrix sizes 5, 7 and 9, the same
				// sizes for which the horizontal kernel is instantiated.
				1408	template class arm_compute::NESeparableConvolutionVertKernel<5>;
				1409	template class arm_compute::NESeparableConvolutionVertKernel<7>;
				1410	template class arm_compute::NESeparableConvolutionVertKernel<9>;
				1411
				1412	/****************************************************************************************\
				1413	* Rectangle Convolution *
				1414	\****************************************************************************************/
				1415
				1416	NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
				1417	: _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
				1418	{
				1419	}
				1420
				1421	BorderSize NEConvolutionRectangleKernel::border_size() const
				1422	{
				1423	return _border_size;
				1424	}
				1425
				1426	void NEConvolutionRectangleKernel::configure(const ITensor input, ITensor output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
				1427	{
				1428	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
				1429
				1430	set_shape_if_empty(*output->info(), input->info()->tensor_shape());
				1431
				1432	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
				1433	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				1434	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
				1435	ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
				1436	ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
				1437	ARM_COMPUTE_ERROR_ON(0 == scale);
				1438
				1439	_input = input;
				1440	_output = output;
				1441	_scale = scale;
				1442	_border_size = BorderSize(height / 2, width / 2);
				1443
				1444	// Setup the convolution matrix
				1445	const uint32_t nr_elements = width * height;
				1446	_convolution.resize(nr_elements);
				1447	std::copy_n(conv, nr_elements, _convolution.begin());
				1448
				1449	// Set function index to help choose appropriate function in run()
				1450	_func_idx = get_index(height) * 4 + get_index(width);
				1451	ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
				1452
				1453	// Configure kernel window
				1454	constexpr unsigned int num_elems_processed_per_iteration = 8;
				1455	constexpr unsigned int num_elems_read_per_iteration = 16;
				1456	constexpr unsigned int num_elems_written_per_iteration = 8;
				1457
				1458	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
				1459	AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, num_elems_written_per_iteration);
				1460
				1461	update_window_and_padding(win,
				1462	AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
				1463	output_access);
				1464
				1465	output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
				1466
				1467	INEKernel::configure(win);
				1468	}
				1469
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	1470	void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1471	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	1472	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1473	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				1474	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				1475
				1476	using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
				1477
				1478	// uint8_t function table
				1479	static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
				1480	{
				1481	{
				1482	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
				1483	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
				1484	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
				1485	&NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
				1486	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
				1487	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
				1488	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
				1489	&NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
				1490	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
				1491	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
				1492	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
				1493	&NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
				1494	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
				1495	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
				1496	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
				1497	&NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
				1498	}
				1499	};
				1500	// int16_t function table
				1501	static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
				1502	{
				1503	{
				1504	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
				1505	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
				1506	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
				1507	&NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
				1508	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
				1509	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
				1510	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
				1511	&NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
				1512	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
				1513	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
				1514	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
				1515	&NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
				1516	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
				1517	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
				1518	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
				1519	&NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
				1520	}
				1521	};
				1522
				1523	// Run appropriate function
				1524	switch(_output->info()->format())
				1525	{
				1526	case Format::U8:
				1527	ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
				1528	(this->*func_table_u8[_func_idx])(window);
				1529	break;
				1530	case Format::S16:
				1531	ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
				1532	(this->*func_table_s16[_func_idx])(window);
				1533	break;
				1534	default:
				1535	ARM_COMPUTE_ERROR("Not supported");
				1536	}
				1537	}
				1538
				1539	unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
				1540	{
				1541	switch(val)
				1542	{
				1543	case 3:
				1544	return 0;
				1545	case 5:
				1546	return 1;
				1547	case 7:
				1548	return 2;
				1549	case 9:
				1550	return 3;
				1551	default:
				1552	ARM_COMPUTE_ERROR("Not supported dimension size");
				1553	return 0;
				1554	}
				1555	}
				1556
				1557	template <typename OutputType, unsigned int rows, unsigned int cols>
				1558	void NEConvolutionRectangleKernel::convolution(const Window &win)
				1559	{
				1560	static_assert(sizeof(OutputType) == sizeof(uint8_t) \|\| sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
				1561	ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
				1562
				1563	Iterator input(_input, win);
				1564	Iterator output(_output, win);
				1565
				1566	std::array<unsigned char *, rows> input_ptrs{ {} };
				1567	const int16_t *conv = _convolution.data();
				1568	const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
				1569	const int k_row_half = rows / 2;
				1570	const int k_col_half = cols / 2;
				1571
				1572	// Set row pointers
				1573	for(int i = -k_row_half; i <= k_row_half; ++i)
				1574	{
				1575	input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
				1576	}
				1577
				1578	execute_window_loop(win, [&](const Coordinates & id)
				1579	{
				1580	int32x4_t out = vdupq_n_s32(0);
				1581	int32x4_t out2 = vdupq_n_s32(0);
				1582
				1583	// Perform appropriate convolution
				1584	for(unsigned int r = 0; r < rows; ++r)
				1585	{
				1586	const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
				1587	if(3 == cols)
				1588	{
				1589	convolve_row3x1(out, out2, data, conv + r * cols);
				1590	}
				1591	else if(5 == cols)
				1592	{
				1593	convolve_row5x1(out, out2, data, conv + r * cols);
				1594	}
				1595	else if(7 == cols)
				1596	{
				1597	convolve_row7x1(out, out2, data, conv + r * cols);
				1598	}
				1599	else if(9 == cols)
				1600	{
				1601	convolve_row9x1(out, out2, data, conv + r * cols);
				1602	}
				1603	else
				1604	{
				1605	ARM_COMPUTE_ERROR("Unsupported number of columns");
				1606	}
				1607	}
				1608
				1609	// Apply scale
				1610	if(_scale != 1)
				1611	{
				1612	// Convert to F32, scale and convert back to S32
				1613	out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
				1614	out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
				1615	}
				1616
				1617	// Clamp and store as U8 or S16:
				1618	store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
				1619	},
				1620	input, output);
				1621	}
				1622	} // namespace arm_compute