Blame - src/core/NEON/kernels/NESobel7x7Kernel.cpp - ml/ComputeLibrary

blob: 40a3e31a3989f2d0c00b5d2c382cae27da1bd770 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Utils.h"
				32	#include "arm_compute/core/Validate.h"
				33
				34	#include <arm_neon.h>
				35	#include <cstdint>
				36
				37	using namespace arm_compute;
				38
				39	namespace arm_compute
				40	{
				41	class Coordinates;
				42	} // namespace arm_compute
				43
				44	namespace
				45	{
				46	const int32x4_t minusfour = vdupq_n_s32(-4);
				47	const int32x4_t minusfive = vdupq_n_s32(-5);
				48	const int32x4_t four = vdupq_n_s32(4);
				49	const int32x4_t five = vdupq_n_s32(5);
				50	const int32x4_t six = vdupq_n_s32(6);
				51	const int32x4_t fifteen = vdupq_n_s32(15);
				52	const int32x4_t twenty = vdupq_n_s32(20);
				53
				54	inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
				55	{
				56	int32x4x2_t out =
				57	{
				58	{
				59	vnegq_s32(data.val[0]),
				60	vnegq_s32(data.val[1])
				61	}
				62	};
				63
				64	out.val[0] = vmlaq_s32(out.val[0],
				65	vextq_s32(data.val[0], data.val[1], 1), minusfour);
				66
				67	out.val[0] = vmlaq_s32(out.val[0],
				68	vextq_s32(data.val[0], data.val[1], 2), minusfive);
				69
				70	out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
				71
				72	out.val[0] = vmlaq_s32(out.val[0],
				73	vextq_s32(data.val[1], data.val[2], 1), four);
				74
				75	out.val[0] = vaddq_s32(out.val[0],
				76	vextq_s32(data.val[1], data.val[2], 2));
				77
				78	out.val[1] = vmlaq_s32(out.val[1],
				79	vextq_s32(data.val[1], data.val[2], 1), minusfour);
				80
				81	out.val[1] = vmlaq_s32(out.val[1],
				82	vextq_s32(data.val[1], data.val[2], 2), minusfive);
				83
				84	out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
				85
				86	out.val[1] = vmlaq_s32(out.val[1],
				87	vextq_s32(data.val[2], data.val[3], 1), four);
				88
				89	out.val[1] = vaddq_s32(out.val[1],
				90	vextq_s32(data.val[2], data.val[3], 2));
				91
				92	return out;
				93	}
				94
				95	inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
				96	{
				97	int32x4x2_t out =
				98	{
				99	{
				100	data.val[0],
				101	data.val[1]
				102	}
				103	};
				104
				105	out.val[0] = vmlaq_s32(out.val[0],
				106	vextq_s32(data.val[0], data.val[1], 1), six);
				107
				108	out.val[0] = vmlaq_s32(out.val[0],
				109	vextq_s32(data.val[0], data.val[1], 2), fifteen);
				110
				111	out.val[0] = vmlaq_s32(out.val[0],
				112	vextq_s32(data.val[0], data.val[1], 3), twenty);
				113
				114	out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
				115
				116	out.val[0] = vmlaq_s32(out.val[0],
				117	vextq_s32(data.val[1], data.val[2], 1), six);
				118
				119	out.val[0] = vaddq_s32(out.val[0],
				120	vextq_s32(data.val[1], data.val[2], 2));
				121
				122	out.val[1] = vmlaq_s32(out.val[1],
				123	vextq_s32(data.val[1], data.val[2], 1), six);
				124
				125	out.val[1] = vmlaq_s32(out.val[1],
				126	vextq_s32(data.val[1], data.val[2], 2), fifteen);
				127
				128	out.val[1] = vmlaq_s32(out.val[1],
				129	vextq_s32(data.val[1], data.val[2], 3), twenty);
				130
				131	out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
				132
				133	out.val[1] = vmlaq_s32(out.val[1],
				134	vextq_s32(data.val[2], data.val[3], 1), six);
				135
				136	out.val[1] = vaddq_s32(out.val[1],
				137	vextq_s32(data.val[2], data.val[3], 2));
				138
				139	return out;
				140	}
				141	} // namespace
				142
				143	NESobel7x7HorKernel::NESobel7x7HorKernel()
				144	: _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
				145	{
				146	}
				147
				148	BorderSize NESobel7x7HorKernel::border_size() const
				149	{
				150	return _border_size;
				151	}
				152
				153	void NESobel7x7HorKernel::configure(const ITensor input, ITensor output_x, ITensor *output_y, bool border_undefined)
				154	{
				155	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
				156	ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
				157
				158	_run_sobel_x = output_x != nullptr;
				159	_run_sobel_y = output_y != nullptr;
				160
				161	if(_run_sobel_x)
				162	{
				163	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
				164	}
				165
				166	if(_run_sobel_y)
				167	{
				168	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
				169	}
				170
				171	_input = input;
				172	_output_x = output_x;
				173	_output_y = output_y;
				174	_border_size = BorderSize(border_undefined ? 0 : 3, 3);
				175
				176	// Configure kernel window
				177	constexpr unsigned int num_elems_processed_per_iteration = 8;
				178	constexpr unsigned int num_elems_read_per_iteration = 16;
				179	constexpr unsigned int num_elems_written_per_iteration = 8;
				180
				181	Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				182	AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
				183	AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
				184
				185	update_window_and_padding(win,
				186	AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
				187	output_x_access,
				188	output_y_access);
				189
				190	output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				191	output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				192
				193	INEKernel::configure(win);
				194	}
				195
/** Execute the horizontal pass over @p window.
 *
 * Each step loads sixteen U8 values starting three elements before the current
 * input position, widens them to S32 and stores eight S32 results per enabled output.
 *
 * @param[in] window Region to process; must be a valid sub-window of the configured window.
 * @param[in] info   Unused.
 */
void NESobel7x7HorKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    Iterator input(_input, window);
    Iterator output_x;
    Iterator output_y;

    if(_run_sobel_x)
    {
        output_x = Iterator(_output_x, window);
    }

    if(_run_sobel_y)
    {
        output_y = Iterator(_output_y, window);
    }

    // Read 16 bytes beginning three elements left of the current position and zero-extend them to S32
    const auto load_widened = [&input]() -> int32x4x4_t
    {
        const uint8x16_t pixels  = vld1q_u8(input.ptr() - 3);
        const uint16x8_t lo_half = vmovl_u8(vget_low_u8(pixels));
        const uint16x8_t hi_half = vmovl_u8(vget_high_u8(pixels));

        const int32x4x4_t widened =
        {
            {
                vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo_half))),
                vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lo_half))),
                vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi_half))),
                vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(hi_half)))
            }
        };

        return widened;
    };

    // Write eight S32 results at the iterator's current position
    const auto store_results = [](Iterator &destination, const int32x4x2_t &results)
    {
        vst1q_s32(reinterpret_cast<int32_t *>(destination.ptr()), results.val[0]);
        vst1q_s32(reinterpret_cast<int32_t *>(destination.ptr()) + 4, results.val[1]);
    };

    const bool run_both = _run_sobel_y && _run_sobel_x;

    if(run_both)
    {
        // One load feeds both filters; Y is stored before X
        execute_window_loop(window, [&](const Coordinates &)
        {
            const int32x4x4_t row = load_widened();

            const int32x4x2_t result_y = compute_hor_sobel_y(row);
            store_results(output_y, result_y);

            const int32x4x2_t result_x = compute_hor_sobel_x(row);
            store_results(output_x, result_x);
        },
        input, output_x, output_y);
    }
    else if(_run_sobel_x)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            const int32x4x2_t result_x = compute_hor_sobel_x(load_widened());
            store_results(output_x, result_x);
        },
        input, output_x);
    }
    else if(_run_sobel_y)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            const int32x4x2_t result_y = compute_hor_sobel_y(load_widened());
            store_results(output_y, result_y);
        },
        input, output_y);
    }
}
				296
				297	NESobel7x7VertKernel::NESobel7x7VertKernel()
				298	: _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
				299	{
				300	}
				301
				302	BorderSize NESobel7x7VertKernel::border_size() const
				303	{
				304	return BorderSize(3, 0);
				305	}
				306
				307	void NESobel7x7VertKernel::configure(const ITensor input_x, const ITensor input_y, ITensor output_x, ITensor output_y, bool border_undefined)
				308	{
				309	ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
				310
				311	_run_sobel_x = (output_x != nullptr);
				312	_run_sobel_y = (output_y != nullptr);
				313
				314	if(_run_sobel_x)
				315	{
				316	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
				317	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
				318	}
				319
				320	if(_run_sobel_y)
				321	{
				322	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
				323	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
				324	}
				325
				326	_input_x = input_x;
				327	_input_y = input_y;
				328	_output_x = output_x;
				329	_output_y = output_y;
				330
				331	const ITensor *const input = _run_sobel_x ? input_x : input_y;
				332
				333	// Configure kernel window
				334	constexpr unsigned int num_elems_processed_per_iteration = 8;
				335	constexpr unsigned int num_elems_read_per_iteration = 8;
				336	constexpr unsigned int num_elems_written_per_iteration = 8;
				337	constexpr unsigned int num_rows_read_per_iteration = 7;
				338
				339	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				340	AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
				341	AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
				342
				343	update_window_and_padding(win,
				344	AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				345	AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				346	output_x_access,
				347	output_y_access);
				348
				349	output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				350	output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
				351
				352	INEKernel::configure(win);
				353	}
				354
/** Execute the vertical pass over @p window.
 *
 * For each enabled pass, every step combines seven rows of eight S32 values
 * (three rows above the current row through three rows below it) and stores
 * eight S32 results. The X pass weights the rows 1, 6, 15, 20, 15, 6, 1; the
 * Y pass weights them -1, -4, -5, 0, 5, 4, 1. The two passes run as separate
 * window loops, X first.
 *
 * @param[in] window Region to process; must be a valid sub-window of the configured window.
 * @param[in] info   Unused.
 */
void NESobel7x7VertKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    Iterator input_x;
    Iterator input_y;
    Iterator output_x;
    Iterator output_y;

    // Row strides expressed in elements (byte stride divided by the pixel size), not bytes
    int32_t in_x_stride = 0;
    int32_t in_y_stride = 0;

    if(_run_sobel_x)
    {
        input_x     = Iterator(_input_x, window);
        output_x    = Iterator(_output_x, window);
        in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
    }

    if(_run_sobel_y)
    {
        input_y     = Iterator(_input_y, window);
        output_y    = Iterator(_output_y, window);
        in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
    }

    if(_run_sobel_x)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            // Start three rows above the current row
            auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;

            //top3
            int32x4x2_t data =
            {
                {
                    vld1q_s32(in_ptr),
                    vld1q_s32(in_ptr + 4)
                }
            };

            // The first row has weight one, so it seeds the accumulator directly
            int32x4x2_t out = data;

            //top2
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], six);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], six);

            //top
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], fifteen);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], fifteen);

            //mid
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], twenty);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], twenty);

            //low
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], fifteen);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], fifteen);

            //low2
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], six);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], six);

            //low3
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vaddq_s32(out.val[0], data.val[0]);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vaddq_s32(out.val[1], data.val[1]);

            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
        },
        input_x, output_x);
    }

    if(_run_sobel_y)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            // Start three rows above the current row
            auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;

            //top3
            int32x4x2_t data =
            {
                {
                    vld1q_s32(in_ptr),
                    vld1q_s32(in_ptr + 4)
                }
            };

            // The first row has weight minus one, so the accumulator starts from its negation
            int32x4x2_t out =
            {
                {
                    vnegq_s32(data.val[0]),
                    vnegq_s32(data.val[1])
                }
            };

            //top2
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], minusfour);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], minusfour);

            //top
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], minusfive);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], minusfive);

            //low (the middle row has weight zero, so step over it)
            in_ptr += (2 * in_y_stride);
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], five);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], five);

            //low2
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], four);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], four);

            //low3
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vaddq_s32(out.val[0], data.val[0]);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vaddq_s32(out.val[1], data.val[1]);

            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
        },
        input_y, output_y);
    }
}