Blame - src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp - ml/ComputeLibrary

blob: 8e55994aaa738fb6c8aca90b6f4c0130ebe3a41e [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/ITensor.h"
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	30	#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	31	#include "arm_compute/core/TensorInfo.h"
				32	#include "arm_compute/core/Validate.h"
				33
				34	#include <algorithm>
				35	#include <arm_neon.h>
				36	#include <cstdint>
				37	#include <map>
				38	#include <string>
				39
				40	using namespace arm_compute;
				41
				42	namespace arm_compute
				43	{
				44	class Coordinates;
				45	} // namespace arm_compute
				46
				47	namespace
				48	{
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	49	void add_wrap_QS8_QS8_QS8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				50	{
				51	Iterator input1(in1, window);
				52	Iterator input2(in2, window);
				53	Iterator output(out, window);
				54
				55	execute_window_loop(window, [&](const Coordinates & id)
				56	{
				57	const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
				58	const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
				59
				60	vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vaddq_qs8(a, b));
				61	},
				62	input1, input2, output);
				63	}
				64
				65	void add_saturate_QS8_QS8_QS8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				66	{
				67	Iterator input1(in1, window);
				68	Iterator input2(in2, window);
				69	Iterator output(out, window);
				70
				71	execute_window_loop(window, [&](const Coordinates & id)
				72	{
				73	const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
				74	const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
				75
				76	vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqaddq_qs8(a, b));
				77	},
				78	input1, input2, output);
				79	}
				80
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	81	void add_wrap_U8_U8_U8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				82	{
				83	Iterator input1(in1, window);
				84	Iterator input2(in2, window);
				85	Iterator output(out, window);
				86
				87	execute_window_loop(window, [&](const Coordinates & id)
				88	{
				89	vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
				90	},
				91	input1, input2, output);
				92	}
				93
				94	void add_saturate_U8_U8_U8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				95	{
				96	Iterator input1(in1, window);
				97	Iterator input2(in2, window);
				98	Iterator output(out, window);
				99
				100	execute_window_loop(window, [&](const Coordinates & id)
				101	{
				102	vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
				103	},
				104	input1, input2, output);
				105	}
				106
				107	inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
				108	{
				109	const int16x8x2_t res =
				110	{
				111	{
				112	vaddq_s16(a.val[0], b.val[0]),
				113	vaddq_s16(a.val[1], b.val[1])
				114	}
				115	};
				116
				117	return res;
				118	}
				119
				120	inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
				121	{
				122	const float32x4x4_t res =
				123	{
				124	{
				125	vaddq_f32(a.val[0], b.val[0]),
				126	vaddq_f32(a.val[1], b.val[1]),
				127	vaddq_f32(a.val[2], b.val[2]),
				128	vaddq_f32(a.val[3], b.val[3])
				129	}
				130	};
				131
				132	return res;
				133	}
				134
				135	inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
				136	{
				137	const int16x8x2_t res =
				138	{
				139	{
				140	vqaddq_s16(a.val[0], b.val[0]),
				141	vqaddq_s16(a.val[1], b.val[1])
				142	}
				143	};
				144
				145	return res;
				146	}
				147
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Adds two pairs of float16x8_t vectors lane by lane with vaddq_f16.
 *
 * Only compiled when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined.
 *
 * @param[in] a First operand (two vectors).
 * @param[in] b Second operand (two vectors).
 *
 * @return A pair whose val[i] is vaddq_f16(a.val[i], b.val[i]).
 */
inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
{
    float16x8x2_t sum;
    sum.val[0] = vaddq_f16(a.val[0], b.val[0]);
    sum.val[1] = vaddq_f16(a.val[1], b.val[1]);
    return sum;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	162
				163	void add_F16_F16_F16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				164	{
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	165	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	166	Iterator input1(in1, window);
				167	Iterator input2(in2, window);
				168	Iterator output(out, window);
				169
				170	execute_window_loop(window, [&](const Coordinates & id)
				171	{
				172	const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
				173	const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
				174
				175	vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
				176	},
				177	input1, input2, output);
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	178	#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	179	ARM_COMPUTE_UNUSED(in1);
				180	ARM_COMPUTE_UNUSED(in2);
				181	ARM_COMPUTE_UNUSED(out);
				182	ARM_COMPUTE_UNUSED(window);
				183	ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	184	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	185	}
				186
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	187	void add_F32_F32_F32(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				188	{
				189	Iterator input1(in1, window);
				190	Iterator input2(in2, window);
				191	Iterator output(out, window);
				192
				193	execute_window_loop(window, [&](const Coordinates & id)
				194	{
				195	const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
				196	const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
				197
				198	vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
				199	},
				200	input1, input2, output);
				201	}
				202
				203	void add_wrap_S16_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				204	{
				205	Iterator input1(in1, window);
				206	Iterator input2(in2, window);
				207	Iterator output(out, window);
				208
				209	execute_window_loop(window, [&](const Coordinates & id)
				210	{
				211	const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				212	const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				213
				214	vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
				215	},
				216	input1, input2, output);
				217	}
				218
				219	void add_saturate_S16_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				220	{
				221	Iterator input1(in1, window);
				222	Iterator input2(in2, window);
				223	Iterator output(out, window);
				224
				225	execute_window_loop(window, [&](const Coordinates & id)
				226	{
				227	const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				228	const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				229
				230	vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
				231	},
				232	input1, input2, output);
				233	}
				234
				235	void add_wrap_S16_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				236	{
				237	Iterator input1(in1, window);
				238	Iterator input2(in2, window);
				239	Iterator output(out, window);
				240
				241	execute_window_loop(window, [&](const Coordinates & id)
				242	{
				243	const int16x8x2_t a =
				244	{
				245	{
				246	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
				247	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
				248	}
				249	};
				250	const uint8x16_t b = vld1q_u8(input2.ptr());
				251
				252	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
				253	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
				254	},
				255	input1, input2, output);
				256	}
				257
				258	void add_saturate_S16_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				259	{
				260	Iterator input1(in1, window);
				261	Iterator input2(in2, window);
				262	Iterator output(out, window);
				263
				264	execute_window_loop(window, [&](const Coordinates & id)
				265	{
				266	const int16x8x2_t a =
				267	{
				268	{
				269	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
				270	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
				271	}
				272	};
				273	const uint8x16_t b = vld1q_u8(input2.ptr());
				274
				275	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
				276	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
				277	},
				278	input1, input2, output);
				279	}
				280
				281	inline void add_wrap_U8_S16_S16(const ITensor input1, const ITensor input2, ITensor *output, const Window &window)
				282	{
				283	//Simply swap the two input buffers:
				284	add_wrap_S16_U8_S16(input2, input1, output, window);
				285	}
				286
				287	inline void add_saturate_U8_S16_S16(const ITensor input1, const ITensor input2, ITensor *output, const Window &window)
				288	{
				289	//Simply swap the two input buffers:
				290	add_saturate_S16_U8_S16(input2, input1, output, window);
				291	}
				292
				293	void add_wrap_U8_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				294	{
				295	Iterator input1(in1, window);
				296	Iterator input2(in2, window);
				297	Iterator output(out, window);
				298
				299	execute_window_loop(window, [&](const Coordinates & id)
				300	{
				301	const uint8x16_t a = vld1q_u8(input1.ptr());
				302	const uint8x16_t b = vld1q_u8(input2.ptr());
				303
				304	const int16x8x2_t a_s16 =
				305	{
				306	{
				307	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
				308	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
				309	}
				310	};
				311
				312	const int16x8x2_t b_s16 =
				313	{
				314	{
				315	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
				316	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
				317	}
				318	};
				319
				320	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
				321	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
				322	},
				323	input1, input2, output);
				324	}
				325
				326	void add_saturate_U8_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				327	{
				328	Iterator input1(in1, window);
				329	Iterator input2(in2, window);
				330	Iterator output(out, window);
				331
				332	execute_window_loop(window, [&](const Coordinates & id)
				333	{
				334	const uint8x16_t a = vld1q_u8(input1.ptr());
				335	const uint8x16_t b = vld1q_u8(input2.ptr());
				336
				337	const int16x8x2_t a_s16 =
				338	{
				339	{
				340	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
				341	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
				342	}
				343	};
				344
				345	const int16x8x2_t b_s16 =
				346	{
				347	{
				348	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
				349	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
				350	}
				351	};
				352
				353	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
				354	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
				355	},
				356	input1, input2, output);
				357	}
				358	} // namespace
				359
				360	NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
				361	: _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
				362	{
				363	}
				364
				365	void NEArithmeticAdditionKernel::configure(const ITensor input1, const ITensor input2, ITensor *output, ConvertPolicy policy)
				366	{
				367	ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
				368
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	369	// Auto initialize output if not initialized
				370	{
				371	set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	372
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	373	if(input1->info()->data_type() == DataType::S16 \|\| input2->info()->data_type() == DataType::S16)
				374	{
				375	set_format_if_unknown(*output->info(), Format::S16);
				376	}
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	377	else if(input1->info()->data_type() == DataType::F16 \|\| input2->info()->data_type() == DataType::F16)
				378	{
				379	set_format_if_unknown(*output->info(), Format::F16);
				380	}
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	381	else if(input1->info()->data_type() == DataType::F32 \|\| input2->info()->data_type() == DataType::F32)
				382	{
				383	set_format_if_unknown(*output->info(), Format::F32);
				384	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	385	}
				386
				387	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	388	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
				389	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
				390	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	391	ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 \|\| input2->info()->data_type() != DataType::U8),
				392	"Output can only be U8 if both inputs are U8");
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	393	if(is_data_type_fixed_point(input1->info()->data_type()) \|\| is_data_type_fixed_point(input2->info()->data_type()) \|\| is_data_type_fixed_point(output->info()->data_type()))
				394	{
				395	// Check that all data types are the same and all fixed-point positions are the same
				396	ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
				397	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	398
				399	static std::map<std::string, AddFunction *> map_function =
				400	{
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	401	{ "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 },
				402	{ "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 },
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	403	{ "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
				404	{ "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
				405	{ "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
				406	{ "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
				407	{ "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
				408	{ "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
				409	{ "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
				410	{ "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	411	{ "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 },
				412	{ "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 },
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	413	{ "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
				414	{ "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
				415	{ "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
				416	{ "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	417	{ "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
				418	{ "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
				419
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	420	};
				421
				422	_input1 = input1;
				423	_input2 = input2;
				424	_output = output;
				425
				426	std::string function_to_call("add_");
				427	function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
				428	function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
				429	function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
				430	function_to_call += string_from_data_type(output->info()->data_type());
				431
				432	auto it = map_function.find(function_to_call);
				433
				434	if(it != map_function.end())
				435	{
				436	_func = it->second;
				437	}
				438	else
				439	{
				440	ARM_COMPUTE_ERROR("You called arithmetic addition with the wrong tensor data type");
				441	}
				442
				443	constexpr unsigned int num_elems_processed_per_iteration = 16;
				444
				445	// Configure kernel window
				446	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
				447	AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
				448
				449	update_window_and_padding(win,
				450	AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
				451	AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
				452	output_access);
				453
				454	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				455	input2->info()->valid_region());
				456
				457	output_access.set_valid_region(win, valid_region);
				458
				459	INEKernel::configure(win);
				460	}
				461
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	462	void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	463	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	464	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	465	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				466	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				467	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				468
				469	(*_func)(_input1, _input2, _output, window);
				470	}