blob: e5c7a5107dc3880fca5d2c34ef7a4a121bcf7b47 [file] [log] [blame]
steniu01db006682017-08-09 16:26:22 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2016-2018 Arm Limited.
steniu01db006682017-08-09 16:26:22 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26#undef CONVERT_SAT
27
Gian Marco Iodice1246b632017-08-16 18:38:32 +010028#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
29
steniu01db006682017-08-09 16:26:22 +010030#if STRIDE_X == 1
31#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
32#elif STRIDE_X == 2 /* STRIDE_X == 1 */
33#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
34#else /* STRIDE_X not equals 1 or 2 */
35#error "STRIDE_X larger than 2 is not supported"
36#endif /* STRIDE_X == 2 */
37
38#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
39 ({ \
40 VEC_DATA_TYPE(DATA_TYPE, 4) \
41 weights_values0 = vload4(0, weights_row_ptr); \
42 DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
43 VEC_DATA_TYPE(DATA_TYPE, 8) \
44 src0 = vload8(0, src_row_ptr); \
45 VEC_DATA_TYPE(DATA_TYPE, 4) \
46 src1 = vload4(0, src_row_ptr + 8); \
47 \
48 acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
49 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
50 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
51 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
52 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
53 })
54
55#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
56 ({ \
57 VEC_DATA_TYPE(DATA_TYPE, 4) \
58 weights_values0 = vload4(0, weights_row_ptr); \
59 DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
60 VEC_DATA_TYPE(DATA_TYPE, 16) \
61 src0 = vload16(0, src_row_ptr); \
62 VEC_DATA_TYPE(DATA_TYPE, 4) \
63 src1 = vload4(0, src_row_ptr + 16); \
64 acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
65 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
66 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
67 \
68 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
69 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
70 })
71
Pablo Tello3d319462018-06-21 15:13:17 +010072#if defined(DATA_LAYOUT_NHWC)
73
74#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
75
76#if STRIDE_X == 1
77#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr)
78#elif STRIDE_X == 2 /* STRIDE_X == 1 */
79#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr)
80#else /* STRIDE_X not equals 1 or 2 */
81#error "STRIDE_X larger than 2 is not supported"
82#endif /* STRIDE_X == 2 */
83
84#define CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr) \
85 ({ \
86 VEC_DATA_TYPE(DATA_TYPE, 8) \
87 src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
88 PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
89 PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
90 PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
91 PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE)); \
92 VEC_DATA_TYPE(DATA_TYPE, 4) \
93 src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
94 PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
95 PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE)); \
96 VEC_DATA_TYPE(DATA_TYPE, 4) \
97 weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
98 PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \
99 PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \
100 DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE); \
101 acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
102 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
103 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
104 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
105 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
106 })
107
108#define CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr) \
109 ({ \
110 VEC_DATA_TYPE(DATA_TYPE, 16) \
111 src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))( \
112 PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
113 PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
114 PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
115 PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE), \
116 PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
117 PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE), \
118 PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE), \
119 PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE)); \
120 VEC_DATA_TYPE(DATA_TYPE, 4) \
121 src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
122 PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 17 * src_stride_y, DATA_TYPE), \
123 PTR_TO_VALUE(row_ptr + 18 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 19 * src_stride_y, DATA_TYPE)); \
124 VEC_DATA_TYPE(DATA_TYPE, 4) \
125 weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
126 PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \
127 PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \
128 DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE); \
129 acc += src0.s02468ACE * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
130 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
131 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
132 \
133 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
134 acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
135 })
136
137/** This kernel performs a direct convolution to convolve the low three dimensions in a tensor with the NHWC data layout
138 *
139 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
140 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
141 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
142 *
143 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
144 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
145 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
146 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
147 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
148 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
149 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
150 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
151 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
152 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
153 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
154 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
155 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
156 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
157 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
158 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
159 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
160 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
161 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
162 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
163 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
164 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
165 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
166 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
167 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
168 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
169 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
170 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
171 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
172 */
173__kernel void direct_convolution5x5_nhwc(
174 TENSOR3D_DECLARATION(src),
175 TENSOR3D_DECLARATION(dst),
176 TENSOR3D_DECLARATION(weights),
177#ifdef HAS_BIAS
178 VECTOR_DECLARATION(biases),
179#endif /* defined(HAS_BIAS) */
180 unsigned int weights_stride_w)
181{
182 Image src = CONVERT_TO_IMAGE_STRUCT(src);
183 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
184 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
185
186 VEC_DATA_TYPE(DATA_TYPE, 8)
187 values0 = 0;
188
189 const int id0 = get_global_id(0);
190 const int id1 = get_global_id(1);
191 const int id2 = get_global_id(2);
192
193 __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
194 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
195
196 weights_addr += id0 * weights_stride_w;
Pablo Tello3d319462018-06-21 15:13:17 +0100197
Pablo Tellod041a832018-10-03 17:11:09 +0100198#if(PAD_TOP == 1)
199 const int coordy = id2 - PAD_TOP;
Pablo Tello3d319462018-06-21 15:13:17 +0100200 for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
201 {
Pablo Tello3d319462018-06-21 15:13:17 +0100202 if(coordy < 0) // special case Z = -1 doesn't exists
203 {
204 //skip first row and load the two next ones
205 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
206 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
207 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
208 CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
209 }
210 else if(coordy == (DST_HEIGHT - PAD_TOP - 1))
211 {
212 // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
213 // Z axis has no padding at all.
214 CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
215 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
216 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
217 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
218 }
219 else
220 {
221 CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
222 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
223 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
224 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
225 CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
226 }
Pablo Tellod041a832018-10-03 17:11:09 +0100227 src_addr += src_stride_x;
228 weights_addr += weights_stride_x;
229 }
230#elif(PAD_TOP == 2)
231 const int coordy = id2 * STRIDE_Y;
232 for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
233 {
234 if(coordy == 0) // special case Z = -2 doesn't exists
235 {
236 //skip first row and load the two next ones
237 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
238 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
239 CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
240 }
241 else if(coordy == 1) // special case Z = -1 doesn't exists
242 {
243 //skip first row and load the two next ones
244 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
245 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
246 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
247 CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
248 }
249 else if(coordy == (SRC_HEIGHT - 1))
250 {
251 // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
252 // Z axis has no padding at all.
253 CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
254 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
255 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
256 }
257 else if(coordy == (SRC_HEIGHT - 2))
258 {
259 // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
260 // Z axis has no padding at all.
261 CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
262 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
263 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
264 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
265 }
266 else
267 {
268 CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
269 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
270 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
271 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
272 CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
273 }
274 src_addr += src_stride_x;
275 weights_addr += weights_stride_x;
276 }
277
278#else /* PAD_TOP == 2 */
279 for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
280 {
Pablo Tello3d319462018-06-21 15:13:17 +0100281 CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
282 CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
283 CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
284 CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
285 CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
Pablo Tello3d319462018-06-21 15:13:17 +0100286 src_addr += src_stride_x;
287 weights_addr += weights_stride_x;
288 }
Pablo Tellod041a832018-10-03 17:11:09 +0100289#endif /* PAD_TOP == 1 */
Pablo Tello3d319462018-06-21 15:13:17 +0100290
291#ifdef HAS_BIAS
292 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
293 values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0)));
294#endif /* defined(HAS_BIAS) */
295
296 *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values0.s0;
297 *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values0.s1;
298 *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values0.s2;
299 *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values0.s3;
300 *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values0.s4;
301 *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values0.s5;
302 *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values0.s6;
303 *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values0.s7;
304}
305
306#endif // defined(DATA_LAYOUT_NHWC)
307
steniu01db006682017-08-09 16:26:22 +0100308/** This kernel performs a direct convolution to convolve the low three dimensions.
309 *
310 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
311 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100312 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
steniu01db006682017-08-09 16:26:22 +0100313 *
314 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
315 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
316 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
317 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
318 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
319 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
320 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
321 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
322 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
323 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
324 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
325 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
326 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
327 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
328 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
329 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800330 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
steniu01db006682017-08-09 16:26:22 +0100331 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
332 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
333 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
334 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
335 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
336 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
337 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
338 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
339 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
340 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
341 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
342 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
343 */
steniu01db006682017-08-09 16:26:22 +0100344__kernel void direct_convolution5x5(
345 TENSOR3D_DECLARATION(src),
346 TENSOR3D_DECLARATION(dst),
347 TENSOR3D_DECLARATION(weights),
348#ifdef HAS_BIAS
349 VECTOR_DECLARATION(biases),
350#endif /* defined(HAS_BIAS) */
351 unsigned int weights_stride_w)
352{
353 Image src = CONVERT_TO_IMAGE_STRUCT(src);
354 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
355 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
356
357 VEC_DATA_TYPE(DATA_TYPE, 8)
Pablo Tello3d319462018-06-21 15:13:17 +0100358 values0 = 0;
steniu01db006682017-08-09 16:26:22 +0100359
360 __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
361 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
362
363 const int kernel_index = get_global_id(2);
364 weights_addr += kernel_index * weights_stride_w;
365
Gian Marco Iodice744b5ed2017-10-06 15:44:27 +0100366 for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
steniu01db006682017-08-09 16:26:22 +0100367 {
Pablo Tello3d319462018-06-21 15:13:17 +0100368 CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
369 CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
370 CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
371 CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
372 CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
steniu01db006682017-08-09 16:26:22 +0100373
374 src_addr += src_stride_z;
375 weights_addr += weights_stride_z;
376 }
377
378#ifdef HAS_BIAS
379 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
380
Pablo Tello3d319462018-06-21 15:13:17 +0100381 values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
steniu01db006682017-08-09 16:26:22 +0100382#endif /* defined(HAS_BIAS) */
383
Pablo Tello3d319462018-06-21 15:13:17 +0100384 vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
steniu01db006682017-08-09 16:26:22 +0100385}
386#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100387
388#if defined(WEIGHTS_DEPTH)
389
390#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
391 ({ \
392 acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
393 acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
394 acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
395 acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
396 acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
397 acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
398 acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
399 acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
400 acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
401 acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
402 acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
403 acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
404 acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
405 acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
406 acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
407 acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
408 acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
409 acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
410 acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
411 acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
412 })
413
414/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
415 *
416 * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
417 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
418 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
419 *
420 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
421 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
422 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
423 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
424 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
425 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
426 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
427 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
428 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
429 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
430 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
431 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
432 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
433 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
434 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
435 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800436 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100437 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
438 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
439 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
440 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
441 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
442 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
443 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
444 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
445 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
446 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
447 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
448 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
449 */
450__kernel void direct_convolution5x5_f32_bifrost(
451 TENSOR3D_DECLARATION(src),
452 TENSOR3D_DECLARATION(dst),
453 TENSOR3D_DECLARATION(weights),
454#ifdef HAS_BIAS
455 VECTOR_DECLARATION(biases),
456#endif /* defined(HAS_BIAS) */
457 unsigned int weights_stride_w)
458{
459 // Get the kernel index
460 const int kernel_index = get_global_id(2);
461
462 Image src = CONVERT_TO_IMAGE_STRUCT(src);
463 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
464
Pablo Tello3d319462018-06-21 15:13:17 +0100465 float4 values0 = 0.0f;
466 float4 values1 = 0.0f;
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100467
468 __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
469 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
470
471 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
472
473 for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
474 {
475 // Load the weights from row0 and row1
476 float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
477 float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
478 float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
479 float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
480 float8 src0;
481
482 // Load values from row0 of input tensor
483 src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
484
485 // Accumulate
Pablo Tello3d319462018-06-21 15:13:17 +0100486 CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100487
488 // Load values from row1 of input tensor
489 src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
490
491 // Accumulate
Pablo Tello3d319462018-06-21 15:13:17 +0100492 CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
493 CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100494
495 // Load values from row2 of input tensor
496 src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
497
498 // Load weights from row2
499 weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
500 weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
501
502 // Accumulate
Pablo Tello3d319462018-06-21 15:13:17 +0100503 CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
504 CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100505
506 // Load values from row3 of input tensor
507 src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
508
509 // Load weights from row3
510 weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
511 weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
512
513 // Accumulate
Pablo Tello3d319462018-06-21 15:13:17 +0100514 CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
515 CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100516
517 // Load values from row4 of input tensor
518 src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
519
520 // Load weights from row4
521 weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
522 weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
523
Pablo Tello3d319462018-06-21 15:13:17 +0100524 CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
525 CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100526
527 // Load values from row5 of input tensor
528 src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
529
530 // Accumulate
Pablo Tello3d319462018-06-21 15:13:17 +0100531 CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100532
533 src_addr += src_stride_z;
534 weights_addr += weights_stride_z;
535 }
536
537#ifdef HAS_BIAS
538 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
539
540 float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
541
Pablo Tello3d319462018-06-21 15:13:17 +0100542 values0 += bias;
543 values1 += bias;
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100544#endif /* defined(HAS_BIAS) */
545
Pablo Tello3d319462018-06-21 15:13:17 +0100546 vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
547 vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100548}
549#endif // defined(WEIGHTS_DEPTH)