Blame - src/core/CL/cl_kernels/reduction_operation.cl - ml/ComputeLibrary

blob: d1f47beda70402a2176f8f6668a3552f8a5ad2d3 [file] [log] [blame]

Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	1	/*
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	2	* Copyright (c) 2016-2018 ARM Limited.
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
				26	/** Calculate square sum of a vector
				27	*
				28	* @param[in] input Pointer to the first pixel.
				29	*
				30	* @return square sum of vector.
				31	*/
				32	inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
				33	{
				34	VEC_DATA_TYPE(DATA_TYPE, 16)
				35	in = vload16(0, input);
				36
				37	in *= in;
				38
				39	in.s01234567 += in.s89ABCDEF;
				40	in.s0123 += in.s4567;
				41	in.s01 += in.s23;
				42
				43	return (in.s0 + in.s1);
				44	}
				45
				46	/** Calculate sum of a vector
				47	*
				48	* @param[in] input Pointer to the first pixel.
				49	*
				50	* @return sum of vector.
				51	*/
				52	inline DATA_TYPE sum(__global const DATA_TYPE *input)
				53	{
				54	VEC_DATA_TYPE(DATA_TYPE, 16)
				55	in = vload16(0, input);
				56
				57	in.s01234567 += in.s89ABCDEF;
				58	in.s0123 += in.s4567;
				59	in.s01 += in.s23;
				60
				61	return (in.s0 + in.s1);
				62	}
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	63	#if defined(OPERATION)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	64	/** This kernel performs parallel reduction given an operation on x-axis.
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	65	*
				66	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	67	* @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	68	* @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
				69	* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	70	*
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	71	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	72	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				73	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	74	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				75	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	76	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				77	* @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	78	* @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes)
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	79	* @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	80	* @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes)
				81	* @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	82	* @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	83	* @param[in] local_sums Local buffer for storing the partial sum
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	84	*/
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	85	__kernel void reduction_operation_x(
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	86	IMAGE_DECLARATION(src),
				87	IMAGE_DECLARATION(partial_sum),
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	88	__local DATA_TYPE *local_sums)
				89	{
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	90	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				91	Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	92
				93	unsigned int lsize = get_local_size(0);
				94	unsigned int lid = get_local_id(0);
				95
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	96	for(unsigned int y = 0; y < get_local_size(1); ++y)
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	97	{
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	98	local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	99	barrier(CLK_LOCAL_MEM_FENCE);
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	100
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	101	// Perform parallel reduction
				102	for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
				103	{
				104	if(lid < i)
				105	{
				106	local_sums[lid] += local_sums[lid + i];
				107	}
				108	barrier(CLK_LOCAL_MEM_FENCE);
				109	}
				110
				111	if(lid == 0)
				112	{
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	113	#if defined(MEAN) && defined(WIDTH)
				114	if(y == get_local_size(1) - 1)
				115	{
				116	local_sums[0] /= WIDTH;
				117	}
				118	#endif /* defined(MEAN) && defined(WIDTH) */
Michalis Spyrou	f6402dd	2018-01-26 15:06:19 +0000	[diff] [blame]	119	((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
				120	}
Michalis Spyrou	04f089c	2017-08-08 17:42:38 +0100	[diff] [blame]	121	}
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	122	}
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	123	#endif // defined(OPERATION)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	124
				125	#if defined(WIDTH)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	126	/** This kernel performs reduction on x-axis. (Non parallel)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	127	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	128	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	129	* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	130	* @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	131	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	132	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	133	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				134	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				135	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				136	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
				137	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				138	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				139	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				140	*/
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	141	__kernel void reduction_operation_non_parallel_x(
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	142	VECTOR_DECLARATION(src),
				143	VECTOR_DECLARATION(output))
				144	{
				145	Vector src = CONVERT_TO_VECTOR_STRUCT(src);
				146	Vector output = CONVERT_TO_VECTOR_STRUCT(output);
				147
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	148	DATA_TYPE_PROMOTED res = ((__global DATA_TYPE )vector_offset(&src, 0));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	149
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	150	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				151	uint indx = 0;
				152	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				153
				154	for(unsigned int x = 1; x < WIDTH; ++x)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	155	{
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	156	DATA_TYPE_PROMOTED in = ((__global DATA_TYPE )vector_offset(&src, x));
				157	#if defined(ARG_MAX)
				158	indx = select(indx, x, isgreater(in, res));
				159	res = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
				160	#elif defined(ARG_MIN)
				161	indx = select(indx, x, isless(in, res));
				162	res = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
				163	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
				164	res += in;
				165	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	166	}
				167
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	168	// Store result
				169	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				170	((__global uint )output.ptr) = indx;
				171	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	172	#if defined(MEAN)
				173	res /= WIDTH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	174	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	175	((__global uchar )output.ptr) = convert_uchar(res);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	176	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	177	}
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	178	#endif /* defined(WIDTH) */
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	179
				180	#if defined(HEIGHT)
				181	/** This kernel performs reduction on y-axis.
				182	*
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	183	* @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	184	* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
				185	*
				186	* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
				187	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				188	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				189	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				190	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				191	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				192	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
				193	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				194	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				195	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				196	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				197	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				198	*/
				199	__kernel void reduction_operation_y(
				200	IMAGE_DECLARATION(src),
				201	IMAGE_DECLARATION(output))
				202	{
				203	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				204	Image output = CONVERT_TO_IMAGE_STRUCT(output);
				205
				206	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	207	res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	208
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	209	#if defined(SUM_SQUARE)
				210	res *= res;
				211	#endif // defined(SUM_SQUARE)
				212
				213	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				214	uint16 indx = 0;
				215	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				216
				217	for(unsigned int y = 1; y < HEIGHT; ++y)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	218	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	219	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				220	in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	221	#if defined(ARG_MAX)
				222	uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
				223	indx = select(indx, y, cond_conv);
				224	res = select(res, in, isgreater(in, res));
				225	#elif defined(ARG_MIN)
				226	uint16 cond_conv = CONVERT(isless(in, res), uint16);
				227	indx = select(indx, y, cond_conv);
				228	res = select(res, in, isless(in, res));
				229	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	230	#if defined(SUM_SQUARE)
				231	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	232	#endif // defined(SUM_SQUARE)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	233	res += in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	234	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	235	}
				236
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	237	// Store result
				238	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				239	vstore16(indx, 0, (__global uint *)output.ptr);
				240	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	241	#if defined(MEAN)
				242	res /= HEIGHT;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	243	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	244	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	245	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	246	}
				247	#endif /* defined(HEIGHT) */
				248
				249	#if defined(DEPTH)
				250	/** This kernel performs reduction on z-axis.
				251	*
				252	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				253	* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
				254	*
				255	* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
				256	* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
				257	* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
				258	* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
				259	* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
				260	* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
				261	* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
				262	* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
				263	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
				264	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				265	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				266	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				267	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				268	* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
				269	* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
				270	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				271	*/
				272	__kernel void reduction_operation_z(
				273	TENSOR3D_DECLARATION(input),
				274	TENSOR3D_DECLARATION(output))
				275	{
				276	Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
				277	Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
				278
				279	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	280	res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	281
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	282	#if defined(SUM_SQUARE)
				283	res *= res;
				284	#endif // defined(SUM_SQUARE)
				285
				286	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				287	uint16 indx = 0;
				288	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				289
				290	for(unsigned int z = 1; z < DEPTH; ++z)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	291	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	292	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				293	in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	294
				295	#if defined(ARG_MAX)
				296	uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
				297	indx = select(indx, z, cond_conv);
				298	res = select(res, in, isgreater(in, res));
				299	#elif defined(ARG_MIN)
				300	uint16 cond_conv = CONVERT(isless(in, res), uint16);
				301	indx = select(indx, z, cond_conv);
				302	res = select(res, in, isless(in, res));
				303	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	304	#if defined(SUM_SQUARE)
				305	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	306	#endif // defined(SUM_SQUARE)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	307	res += in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	308	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	309	}
				310
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	311	// Store result
				312	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				313	vstore16(indx, 0, (__global uint *)output.ptr);
				314	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	315	#if defined(MEAN)
				316	res /= DEPTH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	317	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	318	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	319	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	320	}
				321	#endif /* defined(DEPTH) */
				322
				323	#if defined(BATCH) && defined(DEPTH)
				324	/** This kernel performs reduction on w-axis.
				325	*
				326	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				327	* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
				328	* @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
				329	*
				330	* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
				331	* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
				332	* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
				333	* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
				334	* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
				335	* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
				336	* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
				337	* @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
				338	* @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
				339	* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
				340	* @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
				341	* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
				342	* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
				343	* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
				344	* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
				345	* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
				346	* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
				347	* @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
				348	* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
				349	* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
				350	*/
				351	__kernel void reduction_operation_w(
				352	TENSOR4D_DECLARATION(input),
				353	TENSOR4D_DECLARATION(output))
				354	{
				355	Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
				356	Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
				357
				358	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	359	res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	360
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	361	#if defined(SUM_SQUARE)
				362	res *= res;
				363	#endif // defined(SUM_SQUARE)
				364
				365	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				366	uint16 indx = 0;
				367	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
				368
				369	for(unsigned int w = 1; w < BATCH; ++w)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	370	{
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	371	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
				372	in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	373
				374	#if defined(ARG_MAX)
				375	uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
				376	indx = select(indx, w, cond_conv);
				377	res = select(res, in, isgreater(in, res));
				378	#elif defined(ARG_MIN)
				379	uint16 cond_conv = CONVERT(isless(in, res), uint16);
				380	indx = select(indx, w, cond_conv);
				381	res = select(res, in, isless(in, res));
				382	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	383	#if defined(SUM_SQUARE)
				384	in *= in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	385	#endif // defined(SUM_SQUARE)
Michalis Spyrou	8aaf93e	2018-10-11 17:33:32 +0100	[diff] [blame]	386	res += in;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	387	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	388	}
				389
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	390	// Store result
				391	#if defined(ARG_MAX) \|\| defined(ARG_MIN)
				392	vstore16(indx, 0, (__global uint *)output.ptr);
				393	#else // !(defined(ARG_MAX) \|\| defined(ARG_MIN))
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	394	#if defined(MEAN)
				395	res /= BATCH;
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	396	#endif // defined(MEAN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	397	vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou	7930db4	2018-11-22 17:36:28 +0000	[diff] [blame]	398	#endif // defined(ARG_MAX) \|\| defined(ARG_MIN)
Michalis Spyrou	7e9391b	2018-10-05 14:49:28 +0100	[diff] [blame]	399	}
				400	#endif /* defined(BATCH) && defined(DEPTH) */