blob: d0eea5bfb4ed0d265eff8464174512d7b5b99431 [file] [log] [blame]
SiCong Lic51b72f2017-07-28 14:46:20 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2016-2018 Arm Limited.
SiCong Lic51b72f2017-07-28 14:46:20 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
steniu01db006682017-08-09 16:26:22 +010026#undef CONVERT_SAT
Michalis Spyroudef665a2017-08-14 11:26:37 +010027
28#define ADD_OP(a, b) ((a) + (b))
29#define MUL_OP(a, b) ((a) * (b))
30#define CONVERT_SAT(a, b) ((a))
31
Gian Marco Iodice1c8409d2017-09-06 17:24:25 +010032#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
33
Pablo Tello3d319462018-06-21 15:13:17 +010034#if defined(DATA_LAYOUT_NHWC)
35
36#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
37
38/** This kernel performs a direct convolution to convolve the low three dimensions of a tensor with data layout NHWC
39 *
40 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
41 * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
42 * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
43 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
44 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
45 *
46 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
47 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
48 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
49 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
50 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
51 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
52 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
53 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
54 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
55 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
56 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
57 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
58 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
59 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
60 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
61 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
62 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
63 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
64 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
65 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
66 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
67 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
68 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
69 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
70 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
71 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
72 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
73 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
74 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
75 */
76__kernel void direct_convolution1x1_nhwc(
77 TENSOR3D_DECLARATION(src),
78 TENSOR3D_DECLARATION(dst),
79 TENSOR3D_DECLARATION(weights),
80#ifdef HAS_BIAS
81 VECTOR_DECLARATION(biases),
82#endif /* defined(HAS_BIAS) */
83 unsigned int weights_stride_w)
84{
85 Image src = CONVERT_TO_IMAGE_STRUCT(src);
86 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
87 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
88
89#ifdef HAS_BIAS
90 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
91#endif /* defined(HAS_BIAS) */
92
93 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
94 values = 0;
95 const int id0 = get_global_id(0);
96 const int id1 = get_global_id(1);
97 const int id2 = get_global_id(2);
98 weights.ptr += id0 * weights_stride_w;
99 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + id2 * STRIDE_Y * (int)src_stride_z;
100
101 for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
102 {
103 DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
104#if STRIDE_X == 1
105 VEC_DATA_TYPE(DATA_TYPE, 8)
106 col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
107 PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),
108 PTR_TO_VALUE(src_addr + 1 * src_stride_y, DATA_TYPE),
109 PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),
110 PTR_TO_VALUE(src_addr + 3 * src_stride_y, DATA_TYPE),
111 PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),
112 PTR_TO_VALUE(src_addr + 5 * src_stride_y, DATA_TYPE),
113 PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),
114 PTR_TO_VALUE(src_addr + 7 * src_stride_y, DATA_TYPE));
115#elif STRIDE_X == 2 /* STRIDE_X == 1 */
116 VEC_DATA_TYPE(DATA_TYPE, 8)
117 col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
118 PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),
119 PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),
120 PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),
121 PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),
122 PTR_TO_VALUE(src_addr + 8 * src_stride_y, DATA_TYPE),
123 PTR_TO_VALUE(src_addr + 10 * src_stride_y, DATA_TYPE),
124 PTR_TO_VALUE(src_addr + 12 * src_stride_y, DATA_TYPE),
125 PTR_TO_VALUE(src_addr + 14 * src_stride_y, DATA_TYPE));
126#else /* STRIDE_X not equals 1 or 2 */
127#error "STRIDE_X larger than 2 is not supported"
128#endif /* STRIDE_X == 2 */
129 values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, col0));
130
131 src_addr += src_stride_x;
132 weights.ptr += weights_stride_x;
133 }
134
135#ifdef HAS_BIAS
136 values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0))));
137#endif /* defined(HAS_BIAS) */
138
139 *((__global DATA_TYPE *)dst.ptr) = values.s0;
140 *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values.s1;
141 *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values.s2;
142 *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values.s3;
143 *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values.s4;
144 *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values.s5;
145 *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values.s6;
146 *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;
147}
148#endif // defined(DATA_LAYOUT_NHWC)
149
SiCong Lic51b72f2017-07-28 14:46:20 +0100150#if STRIDE_X == 3
151#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
152#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
153#elif STRIDE_X == 2
154#define INPUT_PIXEL(data_size) extract_input_stride2
155#elif STRIDE_X == 1
156#define INPUT_PIXEL(data_size) extract_input_stride1
157#else /* STRIDE_X not equals 1, 2 or 3 */
158#error "Only support strides 1, 2 and 3"
159#endif /* STRIDE_X == 3 */
160
161/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
162 *
163 * @param[in] input_pixel Pointer to the first pixel.
164 *
Pablo Tello3d319462018-06-21 15:13:17 +0100165 * @return extracted input values.
SiCong Lic51b72f2017-07-28 14:46:20 +0100166 */
167inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
168{
169 return vload8(0, input_pixel);
170}
171
172/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
173 *
174 * @param[in] input_pixel Pointer to the first pixel.
175 *
Pablo Tello3d319462018-06-21 15:13:17 +0100176 * @return extracted input values.
SiCong Lic51b72f2017-07-28 14:46:20 +0100177 */
178inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
179{
180 VEC_DATA_TYPE(DATA_TYPE, 16)
181 temp = vload16(0, input_pixel);
182 return temp.s02468ace;
183}
184
185/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
186 *
187 * @param[in] input_pixel Pointer to the first pixel.
188 *
Pablo Tello3d319462018-06-21 15:13:17 +0100189 * @return extracted input values.
SiCong Lic51b72f2017-07-28 14:46:20 +0100190 */
191inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
192{
193 VEC_DATA_TYPE(DATA_TYPE, 4)
194 temp1 = vload4(0, input_pixel);
195 VEC_DATA_TYPE(DATA_TYPE, 4)
196 temp2 = vload4(0, input_pixel + 6);
197 VEC_DATA_TYPE(DATA_TYPE, 4)
198 temp3 = vload4(0, input_pixel + 12);
199 VEC_DATA_TYPE(DATA_TYPE, 4)
200 temp4 = vload4(0, input_pixel + 18);
201 return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
202}
203
204/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
205 *
206 * @param[in] input_pixel Pointer to the first pixel.
207 *
Pablo Tello3d319462018-06-21 15:13:17 +0100208 * @return extracted input values.
SiCong Lic51b72f2017-07-28 14:46:20 +0100209 */
210inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
211{
212 VEC_DATA_TYPE(DATA_TYPE, 8)
213 temp1 = vload8(0, input_pixel);
214 VEC_DATA_TYPE(DATA_TYPE, 8)
215 temp2 = vload8(0, input_pixel + 8);
216 VEC_DATA_TYPE(DATA_TYPE, 8)
217 temp3 = vload8(0, input_pixel + 16);
218 return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
219}
220
221/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
222 *
223 * @param[in] input_pixel Pointer to the first pixel.
224 *
Pablo Tello3d319462018-06-21 15:13:17 +0100225 * @return extracted input values.
SiCong Lic51b72f2017-07-28 14:46:20 +0100226 */
227inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
228{
229 VEC_DATA_TYPE(DATA_TYPE, 16)
230 temp1 = vload16(0, input_pixel);
231 VEC_DATA_TYPE(DATA_TYPE, 16)
232 temp2 = vload16(0, input_pixel + 12);
233 return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
234}
235
236/** This kernel performs a direct convolution to convolve the low three dimensions.
237 *
238 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
239 * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100240 * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
241 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
SiCong Lic51b72f2017-07-28 14:46:20 +0100242 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
243 *
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100244 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
SiCong Lic51b72f2017-07-28 14:46:20 +0100245 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
246 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
247 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
248 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
249 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
250 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
251 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
252 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
253 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
254 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
255 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
256 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
257 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
258 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
259 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800260 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
SiCong Lic51b72f2017-07-28 14:46:20 +0100261 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
262 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
263 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
264 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
265 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
266 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
267 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
268 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
269 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
270 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
271 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100272 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
SiCong Lic51b72f2017-07-28 14:46:20 +0100273 */
274__kernel void direct_convolution1x1(
275 TENSOR3D_DECLARATION(src),
276 TENSOR3D_DECLARATION(dst),
277 TENSOR3D_DECLARATION(weights),
278#ifdef HAS_BIAS
279 VECTOR_DECLARATION(biases),
280#endif /* defined(HAS_BIAS) */
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100281 unsigned int weights_stride_w)
SiCong Lic51b72f2017-07-28 14:46:20 +0100282{
283 Image src = CONVERT_TO_IMAGE_STRUCT(src);
284 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
285 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
286
287#ifdef HAS_BIAS
288 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
289#endif /* defined(HAS_BIAS) */
290
Michalis Spyroudef665a2017-08-14 11:26:37 +0100291 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
Pablo Tello3d319462018-06-21 15:13:17 +0100292 values = 0;
SiCong Lic51b72f2017-07-28 14:46:20 +0100293
294 const uint z_index = get_global_id(2);
295
296 weights.ptr += z_index * weights_stride_w;
Gian Marco Iodice744b5ed2017-10-06 15:44:27 +0100297 for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
SiCong Lic51b72f2017-07-28 14:46:20 +0100298 {
299 DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
300 VEC_DATA_TYPE(DATA_TYPE, 8)
301 input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
Pablo Tello3d319462018-06-21 15:13:17 +0100302 values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
SiCong Lic51b72f2017-07-28 14:46:20 +0100303 src.ptr += src_stride_z;
304 weights.ptr += weights_stride_z;
305 }
306
307#ifdef HAS_BIAS
Pablo Tello3d319462018-06-21 15:13:17 +0100308 values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
SiCong Lic51b72f2017-07-28 14:46:20 +0100309#endif /* defined(HAS_BIAS) */
310
Pablo Tello3d319462018-06-21 15:13:17 +0100311 vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
SiCong Lic51b72f2017-07-28 14:46:20 +0100312}
steniu01db006682017-08-09 16:26:22 +0100313#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
Gian Marco Iodice1c8409d2017-09-06 17:24:25 +0100314
315#if defined(WEIGHTS_DEPTH)
316
317#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
318 ({ \
319 acc.s0 = mad(src.s0, weight_value, acc.s0); \
320 acc.s1 = mad(src.s1, weight_value, acc.s1); \
321 acc.s2 = mad(src.s2, weight_value, acc.s2); \
322 acc.s3 = mad(src.s3, weight_value, acc.s3); \
323 })
324
325/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
326 *
327 * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
328 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
329 * @note In case biases, -DHAS_BIAS must to be passed at compile
330 *
331 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
332 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
333 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
334 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
335 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
336 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
337 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
338 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
339 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
340 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
341 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
342 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
343 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
344 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
345 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
346 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liangf1f3ebd2017-11-10 09:59:19 +0800347 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Gian Marco Iodice1c8409d2017-09-06 17:24:25 +0100348 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
349 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
350 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
351 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
352 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
353 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
354 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
355 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
356 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
357 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
358 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
359 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
360 */
361__kernel void direct_convolution1x1_f32_bifrost(
362 TENSOR3D_DECLARATION(src),
363 TENSOR3D_DECLARATION(dst),
364 TENSOR3D_DECLARATION(weights),
365#ifdef HAS_BIAS
366 VECTOR_DECLARATION(biases),
367#endif /* defined(HAS_BIAS) */
368 unsigned int weights_stride_w)
369{
370 // Get the kernel index
371 const int kernel_index = get_global_id(2);
372
373 Image src = CONVERT_TO_IMAGE_STRUCT(src);
374 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
375
376 float4 acc0 = 0.0f;
377 float4 acc1 = 0.0f;
378 float4 acc2 = 0.0f;
379 float4 acc3 = 0.0f;
380
381 __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
382 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
383
384 for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
385 {
386 // Load the weights
387 float weight = *((__global float *)weights_addr);
388
389 // Load values from row0 of input tensor
390 float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
391 float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
392 float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
393 float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
394
395 CONVOLUTION1x1_BIFROST(acc0, src0, weight);
396 CONVOLUTION1x1_BIFROST(acc1, src1, weight);
397 CONVOLUTION1x1_BIFROST(acc2, src2, weight);
398 CONVOLUTION1x1_BIFROST(acc3, src3, weight);
399
400 src_addr += src_stride_z;
401 weights_addr += weights_stride_z;
402 }
403
404#ifdef HAS_BIAS
405 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
406
407 float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
408
409 acc0.s0 += bias;
410 acc0.s1 += bias;
411 acc0.s2 += bias;
412 acc0.s3 += bias;
413 acc1.s0 += bias;
414 acc1.s1 += bias;
415 acc1.s2 += bias;
416 acc1.s3 += bias;
417 acc2.s0 += bias;
418 acc2.s1 += bias;
419 acc2.s2 += bias;
420 acc2.s3 += bias;
421 acc3.s0 += bias;
422 acc3.s1 += bias;
423 acc3.s2 += bias;
424 acc3.s3 += bias;
425#endif /* defined(HAS_BIAS) */
426
427 vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
428 vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
429 vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
430 vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
431}
Pablo Tello3d319462018-06-21 15:13:17 +0100432#endif // defined(WEIGHTS_DEPTH)