blob: d1f47beda70402a2176f8f6668a3552f8a5ad2d3 [file] [log] [blame]
Michalis Spyrou04f089c2017-08-08 17:42:38 +01001/*
Michalis Spyrouf6402dd2018-01-26 15:06:19 +00002 * Copyright (c) 2016-2018 ARM Limited.
Michalis Spyrou04f089c2017-08-08 17:42:38 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26/** Calculate square sum of a vector
27 *
28 * @param[in] input Pointer to the first pixel.
29 *
30 * @return square sum of vector.
31 */
32inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
33{
34 VEC_DATA_TYPE(DATA_TYPE, 16)
35 in = vload16(0, input);
36
37 in *= in;
38
39 in.s01234567 += in.s89ABCDEF;
40 in.s0123 += in.s4567;
41 in.s01 += in.s23;
42
43 return (in.s0 + in.s1);
44}
45
46/** Calculate sum of a vector
47 *
48 * @param[in] input Pointer to the first pixel.
49 *
50 * @return sum of vector.
51 */
52inline DATA_TYPE sum(__global const DATA_TYPE *input)
53{
54 VEC_DATA_TYPE(DATA_TYPE, 16)
55 in = vload16(0, input);
56
57 in.s01234567 += in.s89ABCDEF;
58 in.s0123 += in.s4567;
59 in.s01 += in.s23;
60
61 return (in.s0 + in.s1);
62}
Michalis Spyrou7930db42018-11-22 17:36:28 +000063#if defined(OPERATION)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010064/** This kernel performs parallel reduction given an operation on x-axis.
Michalis Spyrou04f089c2017-08-08 17:42:38 +010065 *
66 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou04f089c2017-08-08 17:42:38 +010067 * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010068 * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
69 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
Michalis Spyrou04f089c2017-08-08 17:42:38 +010070 *
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010071 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Michalis Spyrou04f089c2017-08-08 17:42:38 +010072 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
73 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000074 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
75 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michalis Spyrou04f089c2017-08-08 17:42:38 +010076 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
77 * @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000078 * @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes)
Michalis Spyrou04f089c2017-08-08 17:42:38 +010079 * @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000080 * @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes)
81 * @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
Michalis Spyrou04f089c2017-08-08 17:42:38 +010082 * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000083 * @param[in] local_sums Local buffer for storing the partial sum
Michalis Spyrou04f089c2017-08-08 17:42:38 +010084 */
Michalis Spyrou7e9391b2018-10-05 14:49:28 +010085__kernel void reduction_operation_x(
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000086 IMAGE_DECLARATION(src),
87 IMAGE_DECLARATION(partial_sum),
Michalis Spyrou04f089c2017-08-08 17:42:38 +010088 __local DATA_TYPE *local_sums)
89{
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000090 Image src = CONVERT_TO_IMAGE_STRUCT(src);
91 Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
Michalis Spyrou04f089c2017-08-08 17:42:38 +010092
93 unsigned int lsize = get_local_size(0);
94 unsigned int lid = get_local_id(0);
95
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000096 for(unsigned int y = 0; y < get_local_size(1); ++y)
Michalis Spyrou04f089c2017-08-08 17:42:38 +010097 {
Michalis Spyrouf6402dd2018-01-26 15:06:19 +000098 local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
Michalis Spyrou04f089c2017-08-08 17:42:38 +010099 barrier(CLK_LOCAL_MEM_FENCE);
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100100
Michalis Spyrouf6402dd2018-01-26 15:06:19 +0000101 // Perform parallel reduction
102 for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
103 {
104 if(lid < i)
105 {
106 local_sums[lid] += local_sums[lid + i];
107 }
108 barrier(CLK_LOCAL_MEM_FENCE);
109 }
110
111 if(lid == 0)
112 {
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100113#if defined(MEAN) && defined(WIDTH)
114 if(y == get_local_size(1) - 1)
115 {
116 local_sums[0] /= WIDTH;
117 }
118#endif /* defined(MEAN) && defined(WIDTH) */
Michalis Spyrouf6402dd2018-01-26 15:06:19 +0000119 ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
120 }
Michalis Spyrou04f089c2017-08-08 17:42:38 +0100121 }
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100122}
Michalis Spyrou7930db42018-11-22 17:36:28 +0000123#endif // defined(OPERATION)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100124
125#if defined(WIDTH)
Michalis Spyrou7930db42018-11-22 17:36:28 +0000126/** This kernel performs reduction on x-axis. (Non parallel)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100127 *
Michalis Spyrou7930db42018-11-22 17:36:28 +0000128 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100129 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
Michalis Spyrou7930db42018-11-22 17:36:28 +0000130 * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100131 *
Michalis Spyrou7930db42018-11-22 17:36:28 +0000132 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100133 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
134 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
135 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
136 * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
137 * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
138 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
139 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
140 */
Michalis Spyrou7930db42018-11-22 17:36:28 +0000141__kernel void reduction_operation_non_parallel_x(
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100142 VECTOR_DECLARATION(src),
143 VECTOR_DECLARATION(output))
144{
145 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
146 Vector output = CONVERT_TO_VECTOR_STRUCT(output);
147
Michalis Spyrou7930db42018-11-22 17:36:28 +0000148 DATA_TYPE_PROMOTED res = *((__global DATA_TYPE *)vector_offset(&src, 0));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100149
Michalis Spyrou7930db42018-11-22 17:36:28 +0000150#if defined(ARG_MAX) || defined(ARG_MIN)
151 uint indx = 0;
152#endif // defined(ARG_MAX) || defined(ARG_MIN)
153
154 for(unsigned int x = 1; x < WIDTH; ++x)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100155 {
Michalis Spyrou7930db42018-11-22 17:36:28 +0000156 DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
157#if defined(ARG_MAX)
158 indx = select(indx, x, isgreater(in, res));
159 res = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
160#elif defined(ARG_MIN)
161 indx = select(indx, x, isless(in, res));
162 res = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
163#else // !(defined(ARG_MAX) || defined(ARG_MIN))
164 res += in;
165#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100166 }
167
Michalis Spyrou7930db42018-11-22 17:36:28 +0000168 // Store result
169#if defined(ARG_MAX) || defined(ARG_MIN)
170 *((__global uint *)output.ptr) = indx;
171#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100172#if defined(MEAN)
173 res /= WIDTH;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000174#endif // defined(MEAN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100175 *((__global uchar *)output.ptr) = convert_uchar(res);
Michalis Spyrou7930db42018-11-22 17:36:28 +0000176#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100177}
Michalis Spyrou7930db42018-11-22 17:36:28 +0000178#endif /* defined(WIDTH) */
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100179
180#if defined(HEIGHT)
181/** This kernel performs reduction on y-axis.
182 *
Michalis Spyrou7930db42018-11-22 17:36:28 +0000183 * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100184 * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
185 *
186 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
187 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
188 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
189 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
190 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
191 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
192 * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
193 * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
194 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
195 * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
196 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
197 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
198 */
199__kernel void reduction_operation_y(
200 IMAGE_DECLARATION(src),
201 IMAGE_DECLARATION(output))
202{
203 Image src = CONVERT_TO_IMAGE_STRUCT(src);
204 Image output = CONVERT_TO_IMAGE_STRUCT(output);
205
206 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou7930db42018-11-22 17:36:28 +0000207 res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100208
Michalis Spyrou7930db42018-11-22 17:36:28 +0000209#if defined(SUM_SQUARE)
210 res *= res;
211#endif // defined(SUM_SQUARE)
212
213#if defined(ARG_MAX) || defined(ARG_MIN)
214 uint16 indx = 0;
215#endif // defined(ARG_MAX) || defined(ARG_MIN)
216
217 for(unsigned int y = 1; y < HEIGHT; ++y)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100218 {
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100219 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
220 in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou7930db42018-11-22 17:36:28 +0000221#if defined(ARG_MAX)
222 uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
223 indx = select(indx, y, cond_conv);
224 res = select(res, in, isgreater(in, res));
225#elif defined(ARG_MIN)
226 uint16 cond_conv = CONVERT(isless(in, res), uint16);
227 indx = select(indx, y, cond_conv);
228 res = select(res, in, isless(in, res));
229#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100230#if defined(SUM_SQUARE)
231 in *= in;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000232#endif // defined(SUM_SQUARE)
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100233 res += in;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000234#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100235 }
236
Michalis Spyrou7930db42018-11-22 17:36:28 +0000237 // Store result
238#if defined(ARG_MAX) || defined(ARG_MIN)
239 vstore16(indx, 0, (__global uint *)output.ptr);
240#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100241#if defined(MEAN)
242 res /= HEIGHT;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000243#endif // defined(MEAN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100244 vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou7930db42018-11-22 17:36:28 +0000245#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100246}
247#endif /* defined(HEIGHT) */
248
249#if defined(DEPTH)
250/** This kernel performs reduction on z-axis.
251 *
252 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
253 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
254 *
255 * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
256 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
257 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
258 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
259 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
260 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
261 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
262 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
263 * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
264 * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
265 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
266 * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
267 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
268 * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
269 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
270 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
271 */
272__kernel void reduction_operation_z(
273 TENSOR3D_DECLARATION(input),
274 TENSOR3D_DECLARATION(output))
275{
276 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
277 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
278
279 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou7930db42018-11-22 17:36:28 +0000280 res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100281
Michalis Spyrou7930db42018-11-22 17:36:28 +0000282#if defined(SUM_SQUARE)
283 res *= res;
284#endif // defined(SUM_SQUARE)
285
286#if defined(ARG_MAX) || defined(ARG_MIN)
287 uint16 indx = 0;
288#endif // defined(ARG_MAX) || defined(ARG_MIN)
289
290 for(unsigned int z = 1; z < DEPTH; ++z)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100291 {
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100292 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
293 in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou7930db42018-11-22 17:36:28 +0000294
295#if defined(ARG_MAX)
296 uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
297 indx = select(indx, z, cond_conv);
298 res = select(res, in, isgreater(in, res));
299#elif defined(ARG_MIN)
300 uint16 cond_conv = CONVERT(isless(in, res), uint16);
301 indx = select(indx, z, cond_conv);
302 res = select(res, in, isless(in, res));
303#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100304#if defined(SUM_SQUARE)
305 in *= in;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000306#endif // defined(SUM_SQUARE)
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100307 res += in;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000308#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100309 }
310
Michalis Spyrou7930db42018-11-22 17:36:28 +0000311 // Store result
312#if defined(ARG_MAX) || defined(ARG_MIN)
313 vstore16(indx, 0, (__global uint *)output.ptr);
314#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100315#if defined(MEAN)
316 res /= DEPTH;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000317#endif // defined(MEAN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100318 vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou7930db42018-11-22 17:36:28 +0000319#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100320}
321#endif /* defined(DEPTH) */
322
323#if defined(BATCH) && defined(DEPTH)
324/** This kernel performs reduction on w-axis.
325 *
326 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
327 * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
328 * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
329 *
330 * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
331 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
332 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
333 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
334 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
335 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
336 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
337 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
338 * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
339 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
340 * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt
341 * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
342 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
343 * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
344 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
345 * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
346 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
347 * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
348 * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
349 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
350 */
351__kernel void reduction_operation_w(
352 TENSOR4D_DECLARATION(input),
353 TENSOR4D_DECLARATION(output))
354{
355 Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
356 Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
357
358 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
Michalis Spyrou7930db42018-11-22 17:36:28 +0000359 res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100360
Michalis Spyrou7930db42018-11-22 17:36:28 +0000361#if defined(SUM_SQUARE)
362 res *= res;
363#endif // defined(SUM_SQUARE)
364
365#if defined(ARG_MAX) || defined(ARG_MIN)
366 uint16 indx = 0;
367#endif // defined(ARG_MAX) || defined(ARG_MIN)
368
369 for(unsigned int w = 1; w < BATCH; ++w)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100370 {
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100371 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
372 in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
Michalis Spyrou7930db42018-11-22 17:36:28 +0000373
374#if defined(ARG_MAX)
375 uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
376 indx = select(indx, w, cond_conv);
377 res = select(res, in, isgreater(in, res));
378#elif defined(ARG_MIN)
379 uint16 cond_conv = CONVERT(isless(in, res), uint16);
380 indx = select(indx, w, cond_conv);
381 res = select(res, in, isless(in, res));
382#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100383#if defined(SUM_SQUARE)
384 in *= in;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000385#endif // defined(SUM_SQUARE)
Michalis Spyrou8aaf93e2018-10-11 17:33:32 +0100386 res += in;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000387#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100388 }
389
Michalis Spyrou7930db42018-11-22 17:36:28 +0000390 // Store result
391#if defined(ARG_MAX) || defined(ARG_MIN)
392 vstore16(indx, 0, (__global uint *)output.ptr);
393#else // !(defined(ARG_MAX) || defined(ARG_MIN))
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100394#if defined(MEAN)
395 res /= BATCH;
Michalis Spyrou7930db42018-11-22 17:36:28 +0000396#endif // defined(MEAN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100397 vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
Michalis Spyrou7930db42018-11-22 17:36:28 +0000398#endif // defined(ARG_MAX) || defined(ARG_MIN)
Michalis Spyrou7e9391b2018-10-05 14:49:28 +0100399}
400#endif /* defined(BATCH) && defined(DEPTH) */