blob: 1a228dd74ccb2e873ca869058ae1f419c8af2e59 [file] [log] [blame]
steniu0127b386c2017-07-18 17:37:43 +01001/*
2 * Copyright (c) 2016, 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Michalis Spyroudef665a2017-08-14 11:26:37 +010026#if defined(FIXED_POINT_POSITION)
27#include "fixed_point.h"
28
29#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
30#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
31
32// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
33MULQ_SAT_IMPL(qs32x8, qs32x8)
34
35#else /* FIXED_POINT_POSITION */
36
37#define ADD_OP(a, b) ((a) + (b))
38#define MUL_OP(a, b) ((a) * (b))
39#define CONVERT_SAT(a, b) ((a))
40
41#endif /* FIXED_POINT_POSITION */
42
Gian Marco Iodice1246b632017-08-16 18:38:32 +010043#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
44
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010045#if STRIDE_X == 1
46#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
47#elif STRIDE_X == 2 /* STRIDE_X == 1 */
48#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
steniu0127b386c2017-07-18 17:37:43 +010049#else /* STRIDE_X not equals 1 or 2 */
50#error "STRIDE_X larger than 2 is not supported"
51#endif /* STRIDE_X == 2 */
52
Michalis Spyroudef665a2017-08-14 11:26:37 +010053#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
54 ({ \
steniu01db006682017-08-09 16:26:22 +010055 VEC_DATA_TYPE(DATA_TYPE, 3) \
56 weights_values0 = vload3(0, weights_row_ptr); \
Michalis Spyroudef665a2017-08-14 11:26:37 +010057 VEC_DATA_TYPE(DATA_TYPE, 8) \
58 src0 = vload8(0, src_row_ptr); \
59 VEC_DATA_TYPE(DATA_TYPE, 2) \
60 src1 = vload2(0, src_row_ptr + 8); \
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010061 \
Michalis Spyroudef665a2017-08-14 11:26:37 +010062 acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
63 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
64 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010065 })
steniu0127b386c2017-07-18 17:37:43 +010066
Michalis Spyroudef665a2017-08-14 11:26:37 +010067#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
68 ({ \
steniu01db006682017-08-09 16:26:22 +010069 VEC_DATA_TYPE(DATA_TYPE, 3) \
70 weights_values0 = vload3(0, weights_row_ptr); \
Michalis Spyroudef665a2017-08-14 11:26:37 +010071 VEC_DATA_TYPE(DATA_TYPE, 16) \
72 src0 = vload16(0, src_row_ptr); \
73 DATA_TYPE src1 = *(src_row_ptr + 16); \
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010074 \
Michalis Spyroudef665a2017-08-14 11:26:37 +010075 acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
76 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
77 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010078 })
steniu0127b386c2017-07-18 17:37:43 +010079
80/** This kernel performs a direct convolution to convolve the low three dimensions.
81 *
Gian Marco Iodice1246b632017-08-16 18:38:32 +010082 * @note This OpenCL kernel works with stride_x = 1 and 2
steniu0127b386c2017-07-18 17:37:43 +010083 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010084 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
Gian Marco Iodice1246b632017-08-16 18:38:32 +010085 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
steniu0127b386c2017-07-18 17:37:43 +010086 *
Gian Marco Iodice1246b632017-08-16 18:38:32 +010087 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
steniu0127b386c2017-07-18 17:37:43 +010088 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
89 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
90 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
91 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
92 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
93 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
94 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
95 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
96 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
97 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
98 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
99 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
100 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
101 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
102 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
103 * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr
104 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
105 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
106 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
107 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
108 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
109 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
110 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
111 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
112 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
113 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
114 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100115 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
steniu0127b386c2017-07-18 17:37:43 +0100116 */
117__kernel void direct_convolution3x3(
118 TENSOR3D_DECLARATION(src),
119 TENSOR3D_DECLARATION(dst),
120 TENSOR3D_DECLARATION(weights),
121#ifdef HAS_BIAS
122 VECTOR_DECLARATION(biases),
123#endif /* defined(HAS_BIAS) */
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100124 unsigned int weights_stride_w)
steniu0127b386c2017-07-18 17:37:43 +0100125{
126 Image src = CONVERT_TO_IMAGE_STRUCT(src);
127 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
128 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
129
Michalis Spyroudef665a2017-08-14 11:26:37 +0100130 VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100131 pixels0 = 0;
steniu0127b386c2017-07-18 17:37:43 +0100132
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100133 __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
134 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
steniu0127b386c2017-07-18 17:37:43 +0100135
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100136 const int kernel_index = get_global_id(2);
137 weights_addr += kernel_index * weights_stride_w;
steniu0127b386c2017-07-18 17:37:43 +0100138
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100139 for(int d = 0; d < WEIGHTS_DEPTH; ++d)
steniu0127b386c2017-07-18 17:37:43 +0100140 {
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100141 CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
142 CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
143 CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
steniu0127b386c2017-07-18 17:37:43 +0100144
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100145 src_addr += src_stride_z;
146 weights_addr += weights_stride_z;
steniu0127b386c2017-07-18 17:37:43 +0100147 }
148
149#ifdef HAS_BIAS
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +0100150 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
151
Michalis Spyroudef665a2017-08-14 11:26:37 +0100152 pixels0 = ADD_OP(pixels0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
steniu0127b386c2017-07-18 17:37:43 +0100153#endif /* defined(HAS_BIAS) */
154
Michalis Spyroudef665a2017-08-14 11:26:37 +0100155 vstore8(CONVERT_SAT(pixels0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
steniu0127b386c2017-07-18 17:37:43 +0100156}
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100157#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
158
159#if defined(WEIGHTS_DEPTH)
160
161#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
162 ({ \
163 acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
164 acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
165 acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
166 acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
167 acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
168 acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
169 acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
170 acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
171 acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
172 acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
173 acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
174 acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
175 })
176
177/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
178 *
179 * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
180 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
181 * @note In case biases, -DHAS_BIAS must to be passed at compile
182 *
183 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
184 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
185 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
186 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
187 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
188 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
189 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
190 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
191 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
192 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
193 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
194 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
195 * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
196 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
197 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
198 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
199 * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr
200 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
201 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
202 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
203 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
204 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
205 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
206 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
207 * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
208 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
209 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
210 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
211 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
212 */
213__kernel void direct_convolution3x3_f32_bifrost(
214 TENSOR3D_DECLARATION(src),
215 TENSOR3D_DECLARATION(dst),
216 TENSOR3D_DECLARATION(weights),
217#ifdef HAS_BIAS
218 VECTOR_DECLARATION(biases),
219#endif /* defined(HAS_BIAS) */
220 unsigned int weights_stride_w)
221{
222 // Get the kernel index
223 const int kernel_index = get_global_id(2);
224
225 Image src = CONVERT_TO_IMAGE_STRUCT(src);
226 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
227
228 float4 pixels0 = 0;
229 float4 pixels1 = 0;
230 float4 pixels2 = 0;
231
232 __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
233 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
234
235 // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
236
237 for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
238 {
239 // Load the weights
240 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
241 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
242 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
243 float4 src0;
244 float2 src1;
245
246 // Load values from row0 of input tensor
247 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
248 src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
249
250 CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row0);
251
252 // Load values from row1 of input tensor
253 src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
254 src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
255
256 // Accumulate
257 CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row1);
258 CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row0);
259
260 // Load values from row2 of input tensor
261 src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
262 src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
263
264 // Accumulate
265 CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row2);
266 CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row1);
267 CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row0);
268
269 // Load values from row3 of input tensor
270 src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
271 src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
272
273 // Accumulate
274 CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row2);
275 CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row1);
276
277 // Row4
278 src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
279 src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
280
281 // Accumulate
282 CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row2);
283
284 src_addr += src_stride_z;
285 weights_addr += weights_stride_z;
286 }
287
288#ifdef HAS_BIAS
289 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
290
Gian Marco Iodice1c8409d2017-09-06 17:24:25 +0100291 float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100292
Gian Marco Iodice1c8409d2017-09-06 17:24:25 +0100293 pixels0 += (float4)bias;
294 pixels1 += (float4)bias;
295 pixels2 += (float4)bias;
Gian Marco Iodice1246b632017-08-16 18:38:32 +0100296#endif /* defined(HAS_BIAS) */
297
298 vstore4(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
299 vstore4(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
300 vstore4(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
301}
302#endif // defined(WEIGHTS_DEPTH)