Blame - src/core/CL/cl_kernels/reduction_operation.cl - ml/ComputeLibrary

blob: b4ede2529609c25402d228c17d923f27b5c9698a [file] [log] [blame]

Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	1	/*
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	2	* Copyright (c) 2016-2019 ARM Limited.
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
				26	/** Calculate square sum of a vector
				27	*
				28	* @param[in] input Pointer to the first pixel.
				29	*
				30	* @return square sum of vector.
				31	*/
				32	inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
				33	{
				34	VEC_DATA_TYPE(DATA_TYPE, 16)
				35	in = vload16(0, input);
				36
				37	in *= in;
				38
				39	in.s01234567 += in.s89ABCDEF;
				40	in.s0123 += in.s4567;
				41	in.s01 += in.s23;
				42
				43	return (in.s0 + in.s1);
				44	}
				45
				46	/** Calculate sum of a vector
				47	*
				48	* @param[in] input Pointer to the first pixel.
				49	*
				50	* @return sum of vector.
				51	*/
				52	inline DATA_TYPE sum(__global const DATA_TYPE *input)
				53	{
				54	VEC_DATA_TYPE(DATA_TYPE, 16)
				55	in = vload16(0, input);
				56
				57	in.s01234567 += in.s89ABCDEF;
				58	in.s0123 += in.s4567;
				59	in.s01 += in.s23;
				60
				61	return (in.s0 + in.s1);
				62	}
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	63
				64	/** Calculate product of a vector
				65	*
				66	* @param[in] input Pointer to the first pixel.
				67	*
				68	* @return product of vector.
				69	*/
				70	inline DATA_TYPE product(__global const DATA_TYPE *input)
				71	{
				72	VEC_DATA_TYPE(DATA_TYPE, 16)
				73	in = vload16(0, input);
				74
				75	in.s01234567 *= in.s89ABCDEF;
				76	in.s0123 *= in.s4567;
				77	in.s01 *= in.s23;
				78
				79	return (in.s0 * in.s1);
				80	}
#if defined(OPERATION)
/** Work-group parallel reduction along the x-axis.
 *
 * Every work-item applies OPERATION to the 16 elements it points at and stores the value in
 * @p local_results. The work-group then combines those values by repeatedly halving the active
 * range, and work-item 0 writes the combined value to @p partial_res at column get_group_id(0).
 * This is repeated for each of the get_local_size(1) rows.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The per-work-item operation must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
 * @note If -DPROD is passed the per-work-item values are multiplied together, otherwise they are added
 * @note If both -DMEAN and -DWIDTH are passed (e.g. -DWIDTH=128) the combined value is divided by WIDTH,
 *       but only in the iteration that handles the last row (y == get_local_size(1) - 1)
 * @note The halving loop folds every entry of @p local_results only when get_local_size(0) is a power of two
 *
 * @param[in]  src_ptr                                   Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                              Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes         The offset of the first element in the source tensor
 * @param[out] partial_res_ptr                           Pointer to the tensor receiving one value per work-group and row. Supported data types: same as @p src_ptr
 * @param[in]  partial_res_stride_x                      Stride of the partial result tensor in X dimension (in bytes)
 * @param[in]  partial_res_step_x                        partial_res_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  partial_res_stride_y                      Stride of the partial result tensor in Y dimension (in bytes)
 * @param[in]  partial_res_step_y                        partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  partial_res_offset_first_element_in_bytes The offset of the first element in the partial result tensor
 * @param[in]  local_results                             Local scratch buffer, indexed by get_local_id(0)
 */
__kernel void reduction_operation_x(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(partial_res),
    __local DATA_TYPE *local_results)
{
    Image src         = CONVERT_TO_IMAGE_STRUCT(src);
    Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);

    const unsigned int group_width = get_local_size(0);
    const unsigned int local_x     = get_local_id(0);
    const size_t       rows        = get_local_size(1);

    for(unsigned int row = 0; row < rows; ++row)
    {
        // Each work-item contributes the reduction of its own 16 elements
        local_results[local_x] = OPERATION((__global DATA_TYPE *)offset(&src, 0, row));
        barrier(CLK_LOCAL_MEM_FENCE);

        // Combine the contributions, halving the active range on every pass
        unsigned int stride = group_width >> 1;
        while(stride > 0)
        {
            if(local_x < stride)
            {
#if defined(PROD)
                local_results[local_x] *= local_results[local_x + stride];
#else  //!defined(PROD)
                local_results[local_x] += local_results[local_x + stride];
#endif //defined(PROD)
            }
            // Executed by every work-item, including the inactive ones
            barrier(CLK_LOCAL_MEM_FENCE);
            stride >>= 1;
        }

        // Only the first work-item of the group publishes the combined value
        if(local_x == 0)
        {
#if defined(MEAN) && defined(WIDTH)
            if(row == rows - 1)
            {
                local_results[0] /= WIDTH;
            }
#endif /* defined(MEAN) && defined(WIDTH) */
            __global DATA_TYPE *dst = (__global DATA_TYPE *)offset(&partial_res, get_group_id(0), row);
            dst[0]                  = local_results[0];
        }
    }
}
#endif // defined(OPERATION)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	147
				148	#if defined(WIDTH)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	149	/** This kernel performs reduction on x-axis. (Non parallel)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	150	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	151	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	152	* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	153	* @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	154	* @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	155	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	156	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	157	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				158	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				159	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				160	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
				161	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				162	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				163	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				164	*/
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	165	__kernel void reduction_operation_non_parallel_x(
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	166	VECTOR_DECLARATION(src),
				167	VECTOR_DECLARATION(output))
				168	{
				169	Vector src = CONVERT_TO_VECTOR_STRUCT(src);
				170	Vector output = CONVERT_TO_VECTOR_STRUCT(output);
				171
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	172	DATA_TYPE_PROMOTED res = ((__global DATA_TYPE )vector_offset(&src, 0));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	173
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	174	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				175	uint indx = 0;
				176	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				177
				178	for(unsigned int x = 1; x < WIDTH; ++x)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	179	{
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	180	DATA_TYPE_PROMOTED in = ((__global DATA_TYPE )vector_offset(&src, x));
				181	#if defined(ARG_MAX)
				182	indx = select(indx, x, isgreater(in, res));
				183	res = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
				184	#elif defined(ARG_MIN)
				185	indx = select(indx, x, isless(in, res));
				186	res = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
				187	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
				188	res += in;
				189	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	190	}
				191
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	192	// Store result
				193	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				194	((__global uint )output.ptr) = indx;
				195	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	196	#if defined(MEAN)
				197	res /= WIDTH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	198	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	199	((__global uchar )output.ptr) = convert_uchar(res);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	200	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	201	}
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	202	#endif /* defined(WIDTH) */
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	203
				204	#if defined(HEIGHT)
				205	/** This kernel performs reduction on y-axis.
				206	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	207	* @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	208	* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
				209	*
				210	* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
				211	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				212	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				213	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				214	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				215	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				216	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
				217	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				218	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				219	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				220	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				221	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				222	*/
				223	__kernel void reduction_operation_y(
				224	IMAGE_DECLARATION(src),
				225	IMAGE_DECLARATION(output))
				226	{
				227	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				228	Image output = CONVERT_TO_IMAGE_STRUCT(output);
				229
				230	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	231	res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	232
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	233	#if defined(SUM_SQUARE)
				234	res *= res;
				235	#endif // defined(SUM_SQUARE)
				236
				237	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				238	uint16 indx = 0;
				239	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				240
				241	for(unsigned int y = 1; y < HEIGHT; ++y)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	242	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	243	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				244	in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	245	#if defined(ARG_MAX)
				246	uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
				247	indx = select(indx, y, cond_conv);
				248	res = select(res, in, isgreater(in, res));
				249	#elif defined(ARG_MIN)
				250	uint16 cond_conv = CONVERT(isless(in, res), uint16);
				251	indx = select(indx, y, cond_conv);
				252	res = select(res, in, isless(in, res));
				253	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	254	#if defined(SUM_SQUARE)
				255	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	256	#endif // defined(SUM_SQUARE)
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	257	#if defined(PROD)
				258	res *= in;
				259	#else //!defined(PROD)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	260	res += in;
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	261	#endif //defined(PROD)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	262	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	263	}
				264
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	265	// Store result
				266	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				267	vstore16(indx, 0, (__global uint *)output.ptr);
				268	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	269	#if defined(MEAN)
				270	res /= HEIGHT;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	271	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	272	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	273	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	274	}
				275	#endif /* defined(HEIGHT) */
				276
				277	#if defined(DEPTH)
				278	/** This kernel performs reduction on z-axis.
				279	*
				280	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				281	* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
				282	*
				283	* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
				284	* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
				285	* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
				286	* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
				287	* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
				288	* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
				289	* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
				290	* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
				291	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
				292	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				293	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				294	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				295	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				296	* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
				297	* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
				298	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				299	*/
				300	__kernel void reduction_operation_z(
				301	TENSOR3D_DECLARATION(input),
				302	TENSOR3D_DECLARATION(output))
				303	{
				304	Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
				305	Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
				306
				307	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	308	res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	309
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	310	#if defined(SUM_SQUARE)
				311	res *= res;
				312	#endif // defined(SUM_SQUARE)
				313
				314	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				315	uint16 indx = 0;
				316	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				317
				318	for(unsigned int z = 1; z < DEPTH; ++z)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	319	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	320	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				321	in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	322
				323	#if defined(ARG_MAX)
				324	uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
				325	indx = select(indx, z, cond_conv);
				326	res = select(res, in, isgreater(in, res));
				327	#elif defined(ARG_MIN)
				328	uint16 cond_conv = CONVERT(isless(in, res), uint16);
				329	indx = select(indx, z, cond_conv);
				330	res = select(res, in, isless(in, res));
				331	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	332	#if defined(SUM_SQUARE)
				333	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	334	#endif // defined(SUM_SQUARE)
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	335	#if defined(PROD)
				336	res *= in;
				337	#else //!defined(PROD)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	338	res += in;
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	339	#endif //defined(PROD)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	340	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	341	}
				342
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	343	// Store result
				344	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				345	vstore16(indx, 0, (__global uint *)output.ptr);
				346	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	347	#if defined(MEAN)
				348	res /= DEPTH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	349	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	350	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	351	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	352	}
				353	#endif /* defined(DEPTH) */
				354
				355	#if defined(BATCH) && defined(DEPTH)
				356	/** This kernel performs reduction on w-axis.
				357	*
				358	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				359	* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
				360	* @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
				361	*
				362	* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
				363	* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
				364	* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
				365	* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
				366	* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
				367	* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
				368	* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
				369	* @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
				370	* @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
				371	* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
				372	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
				373	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				374	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				375	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				376	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				377	* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
				378	* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
				379	* @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
				380	* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
				381	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				382	*/
				383	__kernel void reduction_operation_w(
				384	TENSOR4D_DECLARATION(input),
				385	TENSOR4D_DECLARATION(output))
				386	{
				387	Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
				388	Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
				389
				390	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	391	res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	392
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	393	#if defined(SUM_SQUARE)
				394	res *= res;
				395	#endif // defined(SUM_SQUARE)
				396
				397	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				398	uint16 indx = 0;
				399	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				400
				401	for(unsigned int w = 1; w < BATCH; ++w)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	402	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	403	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				404	in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	405
				406	#if defined(ARG_MAX)
				407	uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
				408	indx = select(indx, w, cond_conv);
				409	res = select(res, in, isgreater(in, res));
				410	#elif defined(ARG_MIN)
				411	uint16 cond_conv = CONVERT(isless(in, res), uint16);
				412	indx = select(indx, w, cond_conv);
				413	res = select(res, in, isless(in, res));
				414	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	415	#if defined(SUM_SQUARE)
				416	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	417	#endif // defined(SUM_SQUARE)
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	418	#if defined(PROD)
				419	res *= in;
				420	#else //!defined(PROD)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	421	res += in;
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	422	#endif //defined(PROD)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	423	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	424	}
				425
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	426	// Store result
				427	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				428	vstore16(indx, 0, (__global uint *)output.ptr);
				429	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	430	#if defined(MEAN)
				431	res /= BATCH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	432	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	433	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	434	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	435	}
Manuel Bottini	b412fab	2018-12-10 17:40:23 +0000	[diff] [blame^]	436	#endif /* defined(BATCH) && defined(DEPTH) */