/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

#include <arm_neon.h>
#include <cstdint>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute

44namespace
45{
46const int32x4_t minusfour = vdupq_n_s32(-4);
47const int32x4_t minusfive = vdupq_n_s32(-5);
48const int32x4_t four = vdupq_n_s32(4);
49const int32x4_t five = vdupq_n_s32(5);
50const int32x4_t six = vdupq_n_s32(6);
51const int32x4_t fifteen = vdupq_n_s32(15);
52const int32x4_t twenty = vdupq_n_s32(20);
53
54inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
55{
56 int32x4x2_t out =
57 {
58 {
59 vnegq_s32(data.val[0]),
60 vnegq_s32(data.val[1])
61 }
62 };
63
64 out.val[0] = vmlaq_s32(out.val[0],
65 vextq_s32(data.val[0], data.val[1], 1), minusfour);
66
67 out.val[0] = vmlaq_s32(out.val[0],
68 vextq_s32(data.val[0], data.val[1], 2), minusfive);
69
70 out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
71
72 out.val[0] = vmlaq_s32(out.val[0],
73 vextq_s32(data.val[1], data.val[2], 1), four);
74
75 out.val[0] = vaddq_s32(out.val[0],
76 vextq_s32(data.val[1], data.val[2], 2));
77
78 out.val[1] = vmlaq_s32(out.val[1],
79 vextq_s32(data.val[1], data.val[2], 1), minusfour);
80
81 out.val[1] = vmlaq_s32(out.val[1],
82 vextq_s32(data.val[1], data.val[2], 2), minusfive);
83
84 out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
85
86 out.val[1] = vmlaq_s32(out.val[1],
87 vextq_s32(data.val[2], data.val[3], 1), four);
88
89 out.val[1] = vaddq_s32(out.val[1],
90 vextq_s32(data.val[2], data.val[3], 2));
91
92 return out;
93}
94
95inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
96{
97 int32x4x2_t out =
98 {
99 {
100 data.val[0],
101 data.val[1]
102 }
103 };
104
105 out.val[0] = vmlaq_s32(out.val[0],
106 vextq_s32(data.val[0], data.val[1], 1), six);
107
108 out.val[0] = vmlaq_s32(out.val[0],
109 vextq_s32(data.val[0], data.val[1], 2), fifteen);
110
111 out.val[0] = vmlaq_s32(out.val[0],
112 vextq_s32(data.val[0], data.val[1], 3), twenty);
113
114 out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
115
116 out.val[0] = vmlaq_s32(out.val[0],
117 vextq_s32(data.val[1], data.val[2], 1), six);
118
119 out.val[0] = vaddq_s32(out.val[0],
120 vextq_s32(data.val[1], data.val[2], 2));
121
122 out.val[1] = vmlaq_s32(out.val[1],
123 vextq_s32(data.val[1], data.val[2], 1), six);
124
125 out.val[1] = vmlaq_s32(out.val[1],
126 vextq_s32(data.val[1], data.val[2], 2), fifteen);
127
128 out.val[1] = vmlaq_s32(out.val[1],
129 vextq_s32(data.val[1], data.val[2], 3), twenty);
130
131 out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
132
133 out.val[1] = vmlaq_s32(out.val[1],
134 vextq_s32(data.val[2], data.val[3], 1), six);
135
136 out.val[1] = vaddq_s32(out.val[1],
137 vextq_s32(data.val[2], data.val[3], 2));
138
139 return out;
140}
141} // namespace
142
143NESobel7x7HorKernel::NESobel7x7HorKernel()
144 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
145{
146}
147
148BorderSize NESobel7x7HorKernel::border_size() const
149{
150 return _border_size;
151}
152
153void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
154{
155 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
156 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
157
158 _run_sobel_x = output_x != nullptr;
159 _run_sobel_y = output_y != nullptr;
160
161 if(_run_sobel_x)
162 {
163 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
164 }
165
166 if(_run_sobel_y)
167 {
168 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
169 }
170
171 _input = input;
172 _output_x = output_x;
173 _output_y = output_y;
174 _border_size = BorderSize(border_undefined ? 0 : 3, 3);
175
176 // Configure kernel window
177 constexpr unsigned int num_elems_processed_per_iteration = 8;
178 constexpr unsigned int num_elems_read_per_iteration = 16;
179 constexpr unsigned int num_elems_written_per_iteration = 8;
180
181 Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
182 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
183 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
184
185 update_window_and_padding(win,
186 AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
187 output_x_access,
188 output_y_access);
189
190 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
191 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
192
193 INEKernel::configure(win);
194}
195
196void NESobel7x7HorKernel::run(const Window &window)
197{
198 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
199 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
200
201 Iterator input(_input, window);
202 Iterator output_x;
203 Iterator output_y;
204
205 if(_run_sobel_x)
206 {
207 output_x = Iterator(_output_x, window);
208 }
209
210 if(_run_sobel_y)
211 {
212 output_y = Iterator(_output_y, window);
213 }
214
215 if(_run_sobel_y && _run_sobel_x)
216 {
217 execute_window_loop(window, [&](const Coordinates & id)
218 {
219 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
220
221 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
222 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
223
224 const int32x4x4_t data_s32 =
225 {
226 {
227 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
228 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
229 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
230 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
231 }
232 };
233
234 const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
235 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
236 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);
237
238 const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
239 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
240 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
241 },
242 input, output_x, output_y);
243 }
244 else if(_run_sobel_x)
245 {
246 execute_window_loop(window, [&](const Coordinates & id)
247 {
248 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
249
250 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
251 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
252
253 const int32x4x4_t data_s32 =
254 {
255 {
256 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
257 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
258 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
259 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
260 }
261 };
262
263 const int32x4x2_t out = compute_hor_sobel_x(data_s32);
264 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
265 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
266 },
267 input, output_x);
268 }
269 else if(_run_sobel_y)
270 {
271 execute_window_loop(window, [&](const Coordinates & id)
272 {
273 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
274
275 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
276 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
277
278 const int32x4x4_t data_s32 =
279 {
280 {
281 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
282 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
283 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
284 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
285 }
286 };
287
288 const int32x4x2_t out = compute_hor_sobel_x(data_s32);
289 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
290 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
291 },
292 input, output_y);
293 }
294}
295
296NESobel7x7VertKernel::NESobel7x7VertKernel()
297 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
298{
299}
300
301BorderSize NESobel7x7VertKernel::border_size() const
302{
303 return BorderSize(3, 0);
304}
305
306void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
307{
308 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
309
310 _run_sobel_x = (output_x != nullptr);
311 _run_sobel_y = (output_y != nullptr);
312
313 if(_run_sobel_x)
314 {
315 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
316 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
317 }
318
319 if(_run_sobel_y)
320 {
321 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
322 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
323 }
324
325 _input_x = input_x;
326 _input_y = input_y;
327 _output_x = output_x;
328 _output_y = output_y;
329
330 const ITensor *const input = _run_sobel_x ? input_x : input_y;
331
332 // Configure kernel window
333 constexpr unsigned int num_elems_processed_per_iteration = 8;
334 constexpr unsigned int num_elems_read_per_iteration = 8;
335 constexpr unsigned int num_elems_written_per_iteration = 8;
336 constexpr unsigned int num_rows_read_per_iteration = 7;
337
338 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
339 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
340 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
341
342 update_window_and_padding(win,
343 AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
344 AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
345 output_x_access,
346 output_y_access);
347
348 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
349 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
350
351 INEKernel::configure(win);
352}
353
354void NESobel7x7VertKernel::run(const Window &window)
355{
356 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
357 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
358
359 Iterator input_x;
360 Iterator input_y;
361 Iterator output_x;
362 Iterator output_y;
363
364 int32_t in_x_stride = 0;
365 int32_t in_y_stride = 0;
366
367 if(_run_sobel_x)
368 {
369 input_x = Iterator(_input_x, window);
370 output_x = Iterator(_output_x, window);
371 in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
372 }
373
374 if(_run_sobel_y)
375 {
376 input_y = Iterator(_input_y, window);
377 output_y = Iterator(_output_y, window);
378 in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
379 }
380
381 if(_run_sobel_x)
382 {
383 execute_window_loop(window, [&](const Coordinates & id)
384 {
385 auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
386
387 //top3
388 int32x4x2_t data =
389 {
390 {
391 vld1q_s32(in_ptr),
392 vld1q_s32(in_ptr + 4)
393 }
394 };
395
396 int32x4x2_t out = data;
397
398 //top2
399 in_ptr += in_x_stride;
400 data.val[0] = vld1q_s32(in_ptr);
401 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
402
403 data.val[1] = vld1q_s32(in_ptr + 4);
404 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
405
406 //top
407 in_ptr += in_x_stride;
408 data.val[0] = vld1q_s32(in_ptr);
409 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
410
411 data.val[1] = vld1q_s32(in_ptr + 4);
412 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
413
414 //mid
415 in_ptr += in_x_stride;
416 data.val[0] = vld1q_s32(in_ptr);
417 out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty);
418
419 data.val[1] = vld1q_s32(in_ptr + 4);
420 out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty);
421
422 //low
423 in_ptr += in_x_stride;
424 data.val[0] = vld1q_s32(in_ptr);
425 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
426
427 data.val[1] = vld1q_s32(in_ptr + 4);
428 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
429
430 //low2
431 in_ptr += in_x_stride;
432 data.val[0] = vld1q_s32(in_ptr);
433 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
434
435 data.val[1] = vld1q_s32(in_ptr + 4);
436 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
437
438 //low3
439 in_ptr += in_x_stride;
440 data.val[0] = vld1q_s32(in_ptr);
441 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
442
443 data.val[1] = vld1q_s32(in_ptr + 4);
444 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
445
446 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
447 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
448 },
449 input_x, output_x);
450 }
451
452 if(_run_sobel_y)
453 {
454 execute_window_loop(window, [&](const Coordinates & id)
455 {
456 auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
457
458 //top3
459 int32x4x2_t data =
460 {
461 {
462 vld1q_s32(in_ptr),
463 vld1q_s32(in_ptr + 4)
464 }
465 };
466
467 int32x4x2_t out =
468 {
469 {
470 vnegq_s32(data.val[0]),
471 vnegq_s32(data.val[1])
472 }
473 };
474
475 //top2
476 in_ptr += in_y_stride;
477 data.val[0] = vld1q_s32(in_ptr);
478 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour);
479
480 data.val[1] = vld1q_s32(in_ptr + 4);
481 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour);
482
483 //top
484 in_ptr += in_y_stride;
485 data.val[0] = vld1q_s32(in_ptr);
486 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive);
487
488 data.val[1] = vld1q_s32(in_ptr + 4);
489 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive);
490
491 //low
492 in_ptr += (2 * in_y_stride);
493 data.val[0] = vld1q_s32(in_ptr);
494 out.val[0] = vmlaq_s32(out.val[0], data.val[0], five);
495
496 data.val[1] = vld1q_s32(in_ptr + 4);
497 out.val[1] = vmlaq_s32(out.val[1], data.val[1], five);
498
499 //low2
500 in_ptr += in_y_stride;
501 data.val[0] = vld1q_s32(in_ptr);
502 out.val[0] = vmlaq_s32(out.val[0], data.val[0], four);
503
504 data.val[1] = vld1q_s32(in_ptr + 4);
505 out.val[1] = vmlaq_s32(out.val[1], data.val[1], four);
506
507 //low3
508 in_ptr += in_y_stride;
509 data.val[0] = vld1q_s32(in_ptr);
510 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
511
512 data.val[1] = vld1q_s32(in_ptr + 4);
513 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
514
515 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
516 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
517 },
518 input_y, output_y);
519 }
520}