Blame - src/core/CL/cl_kernels/reduction_operation.cl - ml/ComputeLibrary

blob: 451b962b01d10264a812653b28c89298993b64d4 [file] [log] [blame]

Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	1	/*
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	2	* Copyright (c) 2016-2019 ARM Limited.
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
				26	/** Calculate square sum of a vector
				27	*
				28	* @param[in] input Pointer to the first pixel.
				29	*
				30	* @return square sum of vector.
				31	*/
				32	inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
				33	{
				34	VEC_DATA_TYPE(DATA_TYPE, 16)
				35	in = vload16(0, input);
				36
				37	in *= in;
				38
				39	in.s01234567 += in.s89ABCDEF;
				40	in.s0123 += in.s4567;
				41	in.s01 += in.s23;
				42
				43	return (in.s0 + in.s1);
				44	}
				45
				46	/** Calculate sum of a vector
				47	*
				48	* @param[in] input Pointer to the first pixel.
				49	*
				50	* @return sum of vector.
				51	*/
				52	inline DATA_TYPE sum(__global const DATA_TYPE *input)
				53	{
				54	VEC_DATA_TYPE(DATA_TYPE, 16)
				55	in = vload16(0, input);
				56
				57	in.s01234567 += in.s89ABCDEF;
				58	in.s0123 += in.s4567;
				59	in.s01 += in.s23;
				60
				61	return (in.s0 + in.s1);
				62	}
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	63
				64	/** Calculate product of a vector
				65	*
				66	* @param[in] input Pointer to the first pixel.
				67	*
				68	* @return product of vector.
				69	*/
				70	inline DATA_TYPE product(__global const DATA_TYPE *input)
				71	{
				72	VEC_DATA_TYPE(DATA_TYPE, 16)
				73	in = vload16(0, input);
				74
				75	in.s01234567 *= in.s89ABCDEF;
				76	in.s0123 *= in.s4567;
				77	in.s01 *= in.s23;
				78
				79	return (in.s0 * in.s1);
				80	}
#if defined(OPERATION)
/** Work-group parallel reduction along the x-axis.
 *
 * Every work-item applies OPERATION to its own slice of the current row and
 * stores the value in local memory. The work-group then combines the local
 * values by repeatedly halving the number of active work-items, and work-item 0
 * writes one partial result per work-group and row.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The per-work-item operation must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
 * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
 * @note The product flag must be passed at compile time using -DPROD if the local values are to be multiplied; otherwise they are added
 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
 * @note The halving loop only folds in every local value when the local size in dimension 0 is a power of two.
 * @note With -DMEAN and -DWIDTH the division by WIDTH is applied only on the last row processed by the work-group (row index get_local_size(1) - 1).
 *
 * @param[in]  src_ptr                                   Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                              Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes         The offset of the first element in the source tensor
 * @param[out] partial_res_ptr                           The buffer receiving the partial results. Supported data types: same as @p src_ptr
 * @param[in]  partial_res_stride_x                      Stride of the partial result tensor in X dimension (in bytes)
 * @param[in]  partial_res_step_x                        partial_res_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  partial_res_stride_y                      Stride of the partial result tensor in Y dimension (in bytes)
 * @param[in]  partial_res_step_y                        partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  partial_res_offset_first_element_in_bytes The offset of the first element in the partial result tensor
 * @param[in]  local_results                             Local-memory scratch buffer, indexed by local id in dimension 0
 */
__kernel void reduction_operation_x(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(partial_res),
    __local DATA_TYPE *local_results)
{
    Image in_img  = CONVERT_TO_IMAGE_STRUCT(src);
    Image out_img = CONVERT_TO_IMAGE_STRUCT(partial_res);

    const unsigned int group_width = get_local_size(0);
    const unsigned int lane        = get_local_id(0);

    for(unsigned int row = 0; row < get_local_size(1); ++row)
    {
        // Seed this work-item's slot with its own contribution for the row
        __global DATA_TYPE *row_ptr = (__global DATA_TYPE *)offset(&in_img, 0, row);
        local_results[lane]         = OPERATION(row_ptr);
        barrier(CLK_LOCAL_MEM_FENCE);

        // Tree reduction: the lower half of the active lanes absorbs the upper half
        unsigned int stride = group_width >> 1;
        while(stride > 0)
        {
            if(lane < stride)
            {
#if defined(PROD)
                local_results[lane] *= local_results[lane + stride];
#else  // !defined(PROD)
                local_results[lane] += local_results[lane + stride];
#endif // defined(PROD)
            }
            // All work-items must reach the barrier, including the idle ones
            barrier(CLK_LOCAL_MEM_FENCE);
            stride >>= 1;
        }

        // Slot 0 now holds the combined value; a single work-item publishes it
        if(lane == 0)
        {
#if defined(MEAN) && defined(WIDTH)
            if(row == get_local_size(1) - 1)
            {
                local_results[0] /= WIDTH;
            }
#endif // defined(MEAN) && defined(WIDTH)
            __global DATA_TYPE *dst = (__global DATA_TYPE *)offset(&out_img, get_group_id(0), row);
            *dst                    = local_results[0];
        }
    }
}
#endif // defined(OPERATION)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	147
				148	#if defined(WIDTH)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	149	/** This kernel performs reduction on x-axis. (Non parallel)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	150	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	151	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	152	* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	153	* @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	154	* @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	155	*
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	156	* @param[in] src_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8 for operation MEAN
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	157	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				158	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				159	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				160	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
				161	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				162	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				163	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				164	*/
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	165	__kernel void reduction_operation_non_parallel_x(
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	166	VECTOR_DECLARATION(src),
				167	VECTOR_DECLARATION(output))
				168	{
				169	Vector src = CONVERT_TO_VECTOR_STRUCT(src);
				170	Vector output = CONVERT_TO_VECTOR_STRUCT(output);
				171
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	172	DATA_TYPE_PROMOTED res = ((__global DATA_TYPE )vector_offset(&src, 0));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	173
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	174	for(unsigned int x = 1; x < WIDTH; ++x)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	175	{
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	176	DATA_TYPE_PROMOTED in = ((__global DATA_TYPE )vector_offset(&src, x));
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	177	#if defined(MIN)
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	178	res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
				179	#elif defined(MAX)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	180	res = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
				181	#else // !(defined(MAX) \|\| defined(MIN))
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	182	res += in;
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	183	#endif // defined(MAX) \|\| defined(MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	184	}
				185
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	186	// Store result
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	187	#if defined(MEAN)
				188	res /= WIDTH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	189	#endif // defined(MEAN)
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	190	#if defined(MIN) \|\| defined(MAX)
Usama Arif	b289050	2019-05-21 11:48:37 +0100	[diff] [blame]	191	((__global DATA_TYPE_PROMOTED )output.ptr) = res;
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	192	#else // defined(MIN) \|\| defined(MAX)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	193	((__global uchar )output.ptr) = convert_uchar(res);
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	194	#endif // defined(MIN) \|\| defined(MAX)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	195	}
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	196	#endif // defined(WIDTH)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	197
				198	#if defined(HEIGHT)
				199	/** This kernel performs reduction on y-axis.
				200	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	201	* @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	202	* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
				203	*
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	204	* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	205	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				206	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				207	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				208	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				209	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				210	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
				211	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				212	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				213	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				214	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				215	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				216	*/
				217	__kernel void reduction_operation_y(
				218	IMAGE_DECLARATION(src),
				219	IMAGE_DECLARATION(output))
				220	{
				221	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				222	Image output = CONVERT_TO_IMAGE_STRUCT(output);
				223
				224	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	225	res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	226
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	227	#if defined(SUM_SQUARE)
				228	res *= res;
				229	#endif // defined(SUM_SQUARE)
				230
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	231	for(unsigned int y = 1; y < HEIGHT; ++y)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	232	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	233	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				234	in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	235	#if defined(MIN)
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	236	res = select(res, in, ISLESS(in, res));
				237	#elif defined(MAX)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	238	res = select(res, in, ISGREATER(in, res));
				239	#else // !(defined(MAX) \|\| defined(MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	240	#if defined(SUM_SQUARE)
				241	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	242	#endif // defined(SUM_SQUARE)
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	243	#if defined(PROD)
				244	res *= in;
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	245	#else // !defined(PROD)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	246	res += in;
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	247	#endif // defined(PROD)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	248	#endif // defined(MAX) \|\| defined(MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	249	}
				250
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	251	// Store result
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	252	#if defined(MEAN)
				253	res /= HEIGHT;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	254	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	255	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
				256	}
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	257	#endif // defined(HEIGHT)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	258
				259	#if defined(DEPTH)
				260	/** This kernel performs reduction on z-axis.
				261	*
				262	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				263	* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
				264	*
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	265	* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	266	* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
				267	* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
				268	* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
				269	* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
				270	* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
				271	* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
				272	* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
				273	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
				274	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				275	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				276	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				277	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				278	* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
				279	* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
				280	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				281	*/
				282	__kernel void reduction_operation_z(
				283	TENSOR3D_DECLARATION(input),
				284	TENSOR3D_DECLARATION(output))
				285	{
				286	Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
				287	Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
				288
				289	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	290	res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	291
Georgios Pinitas	8be9148	2019-03-26 17:23:28 +0000	[diff] [blame]	292	#if defined(COMPLEX)
				293	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				294	res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
				295	#endif // defined(COMPLEX)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	296	#if defined(SUM_SQUARE)
				297	res *= res;
				298	#endif // defined(SUM_SQUARE)
				299
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	300	for(unsigned int z = 1; z < DEPTH; ++z)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	301	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	302	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				303	in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	304
Georgios Pinitas	8be9148	2019-03-26 17:23:28 +0000	[diff] [blame]	305	#if defined(COMPLEX)
				306	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				307	in1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
				308	#endif // defined(COMPLEX)
				309
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	310	#if defined(MIN)
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	311	res = select(res, in, ISLESS(in, res));
				312	#elif defined(MAX)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	313	res = select(res, in, ISGREATER(in, res));
				314	#else // !(defined(MAX) \|\| defined(MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	315	#if defined(SUM_SQUARE)
				316	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	317	#endif // defined(SUM_SQUARE)
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	318	#if defined(PROD)
				319	res *= in;
Georgios Pinitas	8be9148	2019-03-26 17:23:28 +0000	[diff] [blame]	320	#else //!defined(PROD)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	321	res += in;
Georgios Pinitas	8be9148	2019-03-26 17:23:28 +0000	[diff] [blame]	322	#if defined(COMPLEX)
				323	res1 += in1;
				324	#endif // defined(COMPLEX)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	325	#endif // defined(PROD)
				326	#endif // defined(MAX) \|\| defined(MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	327	}
				328
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	329	// Store result
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	330	#if defined(MEAN)
				331	res /= DEPTH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	332	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	333	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Georgios Pinitas	8be9148	2019-03-26 17:23:28 +0000	[diff] [blame]	334	#if defined(COMPLEX)
				335	vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
				336	#endif // defined(COMPLEX)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	337	}
				338	#endif /* defined(DEPTH) */
				339
				340	#if defined(BATCH) && defined(DEPTH)
				341	/** This kernel performs reduction on w-axis.
				342	*
				343	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				344	* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
Manuel Bottini	34f88dd	2019-10-18 10:37:46 +0000	[diff] [blame]	345	* @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	346	*
Michalis Spyrou	b9626ab	2019-05-13 17:41:01 +0100	[diff] [blame]	347	* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	348	* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
				349	* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
				350	* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
				351	* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
				352	* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
				353	* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
				354	* @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
				355	* @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
				356	* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
				357	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
				358	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				359	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				360	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				361	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				362	* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
				363	* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
				364	* @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
				365	* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
				366	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				367	*/
				368	__kernel void reduction_operation_w(
				369	TENSOR4D_DECLARATION(input),
				370	TENSOR4D_DECLARATION(output))
				371	{
				372	Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
				373	Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
				374
				375	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	376	res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	377
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	378	#if defined(SUM_SQUARE)
				379	res *= res;
				380	#endif // defined(SUM_SQUARE)
				381
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	382	for(unsigned int w = 1; w < BATCH; ++w)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	383	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	384	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				385	in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	386
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	387	#if defined(MIN)
Usama Arif	048b0f3	2019-05-22 16:32:27 +0100	[diff] [blame]	388	res = select(res, in, ISLESS(in, res));
				389	#elif defined(MAX)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	390	res = select(res, in, ISGREATER(in, res));
				391	#else // !(defined(MAX) \|\| defined(MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	392	#if defined(SUM_SQUARE)
				393	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	394	#endif // defined(SUM_SQUARE)
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	395	#if defined(PROD)
				396	res *= in;
				397	#else //!defined(PROD)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	398	res += in;
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	399	#endif //defined(PROD)
Manuel Bottini	7b9998d	2019-10-21 17:59:07 +0100	[diff] [blame^]	400	#endif // defined(MAX) \|\| defined(MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	401	}
				402
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	403	// Store result
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	404	#if defined(MEAN)
				405	res /= BATCH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	406	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	407	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
				408	}
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame]	409	#endif /* defined(BATCH) && defined(DEPTH) */