blob: d2e65408dcf6161975f83f49fd7860375b8b403f [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Giorgio Arenac90fcfe2020-11-25 11:51:30 +000026#if defined(VEC_SIZE)
27#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
28
Pablo Telloeb6c88a2019-02-07 15:53:19 +000029#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
30#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
Manuel Bottini8481d832019-12-10 15:28:40 +000031#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Pablo Telloeb6c88a2019-02-07 15:53:19 +000032#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
33#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
Manuel Bottini8481d832019-12-10 15:28:40 +000034inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale)
Pablo Telloeb6c88a2019-02-07 15:53:19 +000035{
36 const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
37 const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
Manuel Bottini8481d832019-12-10 15:28:40 +000038 const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT);
39 return res_q8;
Pablo Telloeb6c88a2019-02-07 15:53:19 +000040}
41#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +010042
Giorgio Arenac90fcfe2020-11-25 11:51:30 +000043#if defined(DATA_TYPE)
44#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Michele Di Giorgio27400b92018-11-01 13:44:05 +000045
Giorgio Arenac90fcfe2020-11-25 11:51:30 +000046#if defined(DEPTH) && defined(ELEMENT_SIZE)
Michele Di Giorgio27400b92018-11-01 13:44:05 +000047#if defined(INPUT1_WIDTH)
48
Giorgio Arenac90fcfe2020-11-25 11:51:30 +000049#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
50#define SEQ VEC_OFFS(int, VEC_SIZE)
Pablo Telloeb6c88a2019-02-07 15:53:19 +000051
Michele Di Giorgio27400b92018-11-01 13:44:05 +000052/** This kernel concatenates two input tensors into the output tensor along the first dimension
53 *
54 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
55 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Sheri Zhang72923622020-10-27 10:19:41 +000056 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgio27400b92018-11-01 13:44:05 +000057 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
58 * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
59 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +010060 * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All.
Michele Di Giorgio27400b92018-11-01 13:44:05 +000061 * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
62 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
63 * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
64 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
65 * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
66 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
67 * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes)
68 * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
69 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
70 * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
71 * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
72 * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
73 * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
74 * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
75 * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
76 * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
77 * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes)
78 * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
79 * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
80 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
81 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
82 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
83 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
84 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
85 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
86 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
87 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
88 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
89 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
90 */
91__kernel void concatenate_width_x2(
92 TENSOR4D_DECLARATION(src1),
93 TENSOR4D_DECLARATION(src2),
Sheri Zhang72923622020-10-27 10:19:41 +000094 TENSOR4D_DECLARATION(dst))
Michele Di Giorgio27400b92018-11-01 13:44:05 +000095{
Michele Di Giorgio27400b92018-11-01 13:44:05 +000096 // Calculate input indices
Sheri Zhang72923622020-10-27 10:19:41 +000097 const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
Michele Di Giorgio27400b92018-11-01 13:44:05 +000098 const int y = get_global_id(1);
99 const int z = get_global_id(2) % (int)DEPTH;
100 const int w = get_global_id(2) / (int)DEPTH;
Sheri Zhang72923622020-10-27 10:19:41 +0000101 const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
102 const int x2 = max(x - (int)INPUT1_WIDTH, 0);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000103
104 // Calculate inputs and output addresses
Sheri Zhang72923622020-10-27 10:19:41 +0000105 const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
106 const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
107 const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000108
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000109 VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
110 VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000111
Georgios Pinitas6631ac22019-04-17 12:12:56 +0100112#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000113 src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
114 src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
Georgios Pinitas6631ac22019-04-17 12:12:56 +0100115#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000116 const VEC_INT x_coords = SEQ + (VEC_INT)(x);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000117
Sheri Zhang72923622020-10-27 10:19:41 +0000118 // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000119 SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
120 src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
121 src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
Sheri Zhang72923622020-10-27 10:19:41 +0000122
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000123 cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
124 const VEC_TYPE values0 = select(src2_values, src1_values, cond);
Sheri Zhang72923622020-10-27 10:19:41 +0000125
126 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000127}
128
129#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
130/** This kernel concatenates four input tensors into the output tensor along the first dimension
131 *
132 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
133 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Sheri Zhang72923622020-10-27 10:19:41 +0000134 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000135 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
136 * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
137 * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
138 * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
139 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100140 * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000141 * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
142 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
143 * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
144 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
145 * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
146 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
147 * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes)
148 * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
149 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
150 * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
151 * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
152 * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
153 * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
154 * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
155 * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
156 * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
157 * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes)
158 * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
159 * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
160 * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
161 * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
162 * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
163 * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
164 * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
165 * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
166 * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
167 * @param[in] src3_stride_w Stride of the first source tensor in Z dimension (in bytes)
168 * @param[in] src3_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
169 * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
170 * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
171 * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
172 * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
173 * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
174 * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
175 * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
176 * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
177 * @param[in] src4_stride_w Stride of the first source tensor in Z dimension (in bytes)
178 * @param[in] src4_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
179 * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
180 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
181 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
182 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
183 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
184 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
185 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
186 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
187 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
188 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
189 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
190 */
191__kernel void concatenate_width_x4(
192 TENSOR4D_DECLARATION(src1),
193 TENSOR4D_DECLARATION(src2),
194 TENSOR4D_DECLARATION(src3),
195 TENSOR4D_DECLARATION(src4),
Sheri Zhang72923622020-10-27 10:19:41 +0000196 TENSOR4D_DECLARATION(dst))
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000197{
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000198 // Calculate input indices
Sheri Zhang72923622020-10-27 10:19:41 +0000199 const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000200 const int y = get_global_id(1);
201 const int z = get_global_id(2) % (int)DEPTH;
202 const int w = get_global_id(2) / (int)DEPTH;
203
Sheri Zhang72923622020-10-27 10:19:41 +0000204 const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
205 const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE);
206 const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE);
207 const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000208
209 // Calculate inputs and output addresses
Sheri Zhang72923622020-10-27 10:19:41 +0000210 const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
211 const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
212 const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
213 const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
214 const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000215
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000216 VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
217 VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
218 VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr);
219 VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr);
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000220
221#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
222 src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
223 src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
224 src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
225 src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
226#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000227
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000228 const VEC_INT x_coords = SEQ + (VEC_INT)(x);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000229
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000230 SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
231 SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE);
232 SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000233
Sheri Zhang72923622020-10-27 10:19:41 +0000234 // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000235 src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
236 src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
Sheri Zhang72923622020-10-27 10:19:41 +0000237 // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values.
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000238 src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
239 src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
Sheri Zhang72923622020-10-27 10:19:41 +0000240 // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values.
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000241 src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
242 src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000243
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000244 cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
245 cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE);
246 cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE);
247
248 VEC_TYPE values0 = select(src2_values, src1_values, cond_in2);
249 values0 = select(src3_values, values0, cond_in3);
250 values0 = select(src4_values, values0, cond_in4);
Sheri Zhang72923622020-10-27 10:19:41 +0000251
252 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000253}
254#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
255#endif /* defined(INPUT1_WIDTH) */
256#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
257
Sheri Zhang1b50bd42020-10-27 00:24:07 +0000258#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100259/** This kernel concatenates the input tensor into the output tensor along the first dimension
260 *
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100261 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100262 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Sheri Zhang1b50bd42020-10-27 00:24:07 +0000263 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100264 * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000265 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100266 *
267 * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100268 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
269 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
270 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
271 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
272 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
273 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100274 * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
275 * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100276 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
277 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
278 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
279 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
280 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
281 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
282 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
283 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100284 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
285 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100286 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100287 */
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000288
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100289__kernel void concatenate_width(
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100290 TENSOR4D_DECLARATION(src),
291 TENSOR4D_DECLARATION(dst))
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100292{
Sheri Zhang1b50bd42020-10-27 00:24:07 +0000293 // Calculate input indices
294 const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
295 const int y = get_global_id(1);
296 const int z = get_global_id(2) % (int)DEPTH;
297 const int w = get_global_id(2) / (int)DEPTH;
298
299 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w;
300 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w;
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100301
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000302 VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100303
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000304#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Sheri Zhang1b50bd42020-10-27 00:24:07 +0000305 const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
306 STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000307#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Sheri Zhang1b50bd42020-10-27 00:24:07 +0000308 STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000309#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100310}
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000311
Sheri Zhang1b50bd42020-10-27 00:24:07 +0000312#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100313
Giorgio Arena4112eed2020-10-23 14:24:26 +0100314#if defined(VEC_SIZE_LEFTOVER)
315
Pablo Tello6a14adb2019-03-05 17:33:08 +0000316#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
317/** This kernel concatenates the input tensor into the output tensor along the second dimension
318 *
319 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
320 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
321 * @note Vector sizes supported are 2,4,8 and 16.
322 * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
323 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
Giorgio Arena4112eed2020-10-23 14:24:26 +0100324 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Pablo Tello6a14adb2019-03-05 17:33:08 +0000325 *
326 * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
327 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
328 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
330 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
331 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
332 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
333 * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
334 * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
335 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
336 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
337 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
338 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
339 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
340 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
341 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
342 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
343 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
344 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
345 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
346 */
347
348__kernel void concatenate_height(
349 TENSOR4D_DECLARATION(src),
350 TENSOR4D_DECLARATION(dst))
351{
Giorgio Arena4112eed2020-10-23 14:24:26 +0100352 const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
353
354 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
355 2) / DEPTH) * src_stride_w;
356 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
357 2) / DEPTH) * dst_stride_w;
Pablo Tello6a14adb2019-03-05 17:33:08 +0000358
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000359 VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Pablo Tello6a14adb2019-03-05 17:33:08 +0000360
361#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Giorgio Arena4112eed2020-10-23 14:24:26 +0100362 const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
363 STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Pablo Tello6a14adb2019-03-05 17:33:08 +0000364#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Giorgio Arena4112eed2020-10-23 14:24:26 +0100365 STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Pablo Tello6a14adb2019-03-05 17:33:08 +0000366#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
367}
368
369#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) */
370
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100371/** This kernel concatenates the input tensor into the output tensor along the third dimension
372 *
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100373 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
374 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Giorgio Arena53048842020-10-07 16:03:43 +0100375 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100376 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100377 * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100378 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
379 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
380 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
381 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100382 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
383 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100384 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100385 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100386 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
387 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
388 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
389 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100390 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
391 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100392 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100393 * @param[in] offsets The offsets to the first valid element of the output tensor in bytes
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100394 */
Vidhya Sudhan Loganathan338595b2019-06-28 14:09:53 +0100395__kernel void concatenate(
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100396 TENSOR3D_DECLARATION(src),
397 TENSOR3D_DECLARATION(dst),
Michalis Spyroua9c44722019-04-05 17:18:36 +0100398 int offset)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100399{
Giorgio Arena53048842020-10-07 16:03:43 +0100400 uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
401
402 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
403 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100404
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000405 VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100406
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000407#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Giorgio Arena53048842020-10-07 16:03:43 +0100408 source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000409#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
410
Giorgio Arena53048842020-10-07 16:03:43 +0100411 STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100412}
Giorgio Arena53048842020-10-07 16:03:43 +0100413#endif /* defined(VEC_SIZE_LEFTOVER) */
Giorgio Arenac90fcfe2020-11-25 11:51:30 +0000414#endif /* defined(DATA_TYPE) */
415#endif /* defined(VEC_SIZE) */