blob: 807b990e8243fe958352572155e27f5c28e144ea [file] [log] [blame]
Giorgio Arena945ae9e2021-10-13 11:13:04 +01001/*
Michalis Spyroub1fcefd2022-06-15 19:02:28 +01002 * Copyright (c) 2021-2022 Arm Limited.
Giorgio Arena945ae9e2021-10-13 11:13:04 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26#include "tile_helpers.h"
27
28//! @cond Doxygen_Suppress
Giorgio Arena273c28c2021-10-14 15:59:15 +010029/** OpenCL kernel to compute the direct convolution 3d.
Giorgio Arena945ae9e2021-10-13 11:13:04 +010030 *
31 * @note Data layout supported: NDHWC
Giorgio Arena51847d52021-10-19 15:45:57 +010032 * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
Giorgio Arena945ae9e2021-10-13 11:13:04 +010033 * @note The accumulation data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE_PROMOTED=half)
34 * @note The convolution padding (left, top and front) must be passed at compile time using -DPAD_LEFT, -DPAD_TOP and -DPAD_FRONT (e.g. -DPAD_LEFT=2, -DPAD_TOP=2, -DPAD_FRONT=2)
35 * @note The convolution strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2, -DSTRIDE_Z=2)
36 * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH, -DWEI_HEIGHT and -DWEI_DEPTH (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9, -DWEI_DEPTH=9)
37 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64, -DSRC_DEPTH=32)
38 * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH, -DDST_HEIGHT and -DDST_DEPTH (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64, -DDST_DEPTH=32)
39 * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
Giorgio Arena273c28c2021-10-14 15:59:15 +010040 * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
41 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
42 * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
Giorgio Arena945ae9e2021-10-13 11:13:04 +010043 * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
44 * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
45 * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
46 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
Giorgio Arena51847d52021-10-19 15:45:57 +010047 * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
Giorgio Arena945ae9e2021-10-13 11:13:04 +010048 * @note Only the following configurations of M0, N0 and K0 are currently supported:
49 * - M0 = 1, 2, 3, 4, 5, .... n
50 * - N0 = 2, 3, 4, 8, 16
51 * - K0 = 2, 3, 4, 8, 16
52 *
Giorgio Arena51847d52021-10-19 15:45:57 +010053 * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
54 * - -DIS_QUANTIZED
55 * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
56 * - The destination quantization shift e.g. -DDST_SHIFT=4
57 * - The destination offset e.g. -DDST_OFFSET=4
58 * - The source offset e.g. -DSRC_OFFSET=4
59 * - The weights offset e.g. -DWEI_OFFSET=4
60 * - The quantized zero value e.g. -DZERO_VALUE=4
61 *
62 * @note If biases are used then -DHAS_BIAS has to be passed at compile time along with its tensor type by using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=int).
Giorgio Arena945ae9e2021-10-13 11:13:04 +010063 *
64 * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
65 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
66 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
67 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
68 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
69 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
70 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
71 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
72 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
73 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
74 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
75 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
76 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
77 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
78 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
79 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
80 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
81 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
82 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
83 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
84 * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
85 * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
86 * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
87 * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
88 * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
89 * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
90 * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
91 * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
92 * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
93 * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
94 * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr
95 * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
96 * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
97 * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
98 */
99//! @endcond
100__kernel void direct_convolution3d_ndhwc(
101 TENSOR4D(src, BUFFER),
102 TENSOR4D(dst, BUFFER),
103 TENSOR4D(wei, BUFFER)
104#if defined(HAS_BIAS)
105 ,
106 VECTOR_DECLARATION(bia)
107#endif // defined(HAS_BIAS)
108)
109{
110#define _IWEI_WIDTH WEI_WIDTH
111#define _IWEI_HEIGHT WEI_HEIGHT
112#define _IWEI_DEPTH WEI_DEPTH
113#define _ISRC_WIDTH SRC_WIDTH
114#define _ISRC_HEIGHT SRC_HEIGHT
115#define _ISRC_DEPTH SRC_DEPTH
116#define _ISRC_CHANNELS SRC_CHANNELS
117#define _IDST_WIDTH DST_WIDTH
118#define _IDST_HEIGHT DST_HEIGHT
119#define _IDST_DEPTH DST_DEPTH
120#define _IDST_CHANNELS DST_CHANNELS
121#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH)
122
Giorgio Arena51847d52021-10-19 15:45:57 +0100123 // If quantized, the output tile has to be quantized first before being stored to global memory
124#if defined(IS_QUANTIZED)
125#define _IOUTPUT_TILE cq
126#else // defined(IS_QUANTIZED)
127#define _IOUTPUT_TILE c
128#endif // defined(IS_QUANTIZED)
129
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100130 const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
131 const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT x DEPTH
132 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
133
134 TILE(int, M0, 1, xi);
135 TILE(int, M0, 1, yi);
136 TILE(int, M0, 1, zi);
137
138 // Convert the linear index to coordinate
139 LOOP_UNROLLING(int, i, 0, 1, M0,
140 {
141 xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
142 yi[i].v = (((mout + i) / _IDST_WIDTH) % _IDST_HEIGHT) * STRIDE_Y;
143 zi[i].v = (((mout + i) / (_IDST_WIDTH * _IDST_HEIGHT)) % _IDST_DEPTH) * STRIDE_Z;
144
145 xi[i].v -= PAD_LEFT;
146 yi[i].v -= PAD_TOP;
147 zi[i].v -= PAD_FRONT;
148 })
149
150 // Initialize the accumulators
151 TILE(ACC_DATA_TYPE, M0, N0, c);
152
153 LOOP_UNROLLING(int, i, 0, 1, M0,
154 {
155 c[i].v = (ACC_DATA_TYPE)0;
156 })
157
158 for(int i = 0; i < _IY_MULTIPLIER; ++i)
159 {
160 int ck = 0;
161 int xk = i % _IWEI_WIDTH;
162 int yk = (i / _IWEI_WIDTH) % _IWEI_HEIGHT;
163 int zk = i / (_IWEI_WIDTH * _IWEI_HEIGHT);
164
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100165 int k = 0;
166 for(; k <= (_ISRC_CHANNELS - K0); k += K0)
167 {
168 TILE(DATA_TYPE, M0, K0, a);
169 TILE(DATA_TYPE, N0, K0, b);
170
171 LOOP_UNROLLING(int, i, 0, 1, M0,
172 {
Giorgio Arena51847d52021-10-19 15:45:57 +0100173 a[i].v = ZERO_VALUE;
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100174 })
175
176 // Load tile from the src tensor
177 T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, K0, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
178
179 // Load tile from the weights tensor
180 const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
181 LOOP_UNROLLING(int, i, 0, 1, N0,
182 {
183 if((cout + i) < _IDST_CHANNELS)
184 {
185 LOOP_UNROLLING(int, j, 0, 1, K0,
186 {
187 b[i].s[j] = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + j * wei_stride_y + b_offs * wei_stride_y);
188 })
189 }
190 })
191
192 // Compute the matrix multiplication between two tiles
193 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
194
Giorgio Arena51847d52021-10-19 15:45:57 +0100195 // Apply the offset correction (correction usually needed for asymmetric quantized computation)
196 // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
197 T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
198
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100199 ck += K0;
200 }
201
202#if((_ISRC_CHANNELS % K0) != 0)
203 // Left-over accumulations
204 for(; k < _ISRC_CHANNELS; ++k)
205 {
206 TILE(DATA_TYPE, M0, 1, a);
207 TILE(DATA_TYPE, N0, 1, b);
208
209 LOOP_UNROLLING(int, i, 0, 1, M0,
210 {
Giorgio Arena51847d52021-10-19 15:45:57 +0100211 a[i].v = ZERO_VALUE;
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100212 })
213
214 // Load tile from the src tensor
215 T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, 1, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
216
217 // Load tile from the weights tensor
218 const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
219 LOOP_UNROLLING(int, i, 0, 1, N0,
220 {
221 if((cout + i) < _IDST_CHANNELS)
222 {
223 b[i].v = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + b_offs * wei_stride_y);
224 }
225 })
226
227 // // Compute the matrix multiplication between two tiles
228 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
229
Giorgio Arena51847d52021-10-19 15:45:57 +0100230 // Apply the offset correction (operation usually needed for asymmetric quantized computation)
231 // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
232 T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
233
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100234 ++ck;
235 }
236#endif // ((_ISRC_CHANNELS % K0) != 0)
237 }
238
Giorgio Arena51847d52021-10-19 15:45:57 +0100239 // Offset correction required for the quantized asymmetric computation
240 // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
241 T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
242
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100243#if defined(HAS_BIAS)
Giorgio Arena51847d52021-10-19 15:45:57 +0100244 TILE(BIA_DATA_TYPE, 1, N0, bias0);
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100245
246 if((cout + N0) <= _IDST_CHANNELS)
247 {
Giorgio Arena51847d52021-10-19 15:45:57 +0100248 bias0[0].v = VLOAD(N0)(0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100249 }
250 else
251 {
252 VLOAD_PARTIAL(N0, PARTIAL_N0)
Giorgio Arena51847d52021-10-19 15:45:57 +0100253 (bias0[0].v, 0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100254 }
255
256 // c = c + bias[broadcasted]
Michalis Spyroub1fcefd2022-06-15 19:02:28 +0100257 T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100258
259#endif // HAS_BIAS
260
261 TILE(uint, M0, 1, dst_indirect_y);
262
263 // Calculate the destination indirect Y
264 LOOP_UNROLLING(int, i, 0, 1, M0,
265 {
266 dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH) - 1);
267 dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH);
268 })
269
Giorgio Arena51847d52021-10-19 15:45:57 +0100270#if defined(IS_QUANTIZED)
271 TILE(DATA_TYPE, M0, N0, cq);
272
273 // Quantize the tile
274 T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
275#endif // defined(IS_QUANTIZED)
276
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100277 bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
278
279 // Store the tile in reverse order so the invalid values are overwritten with the valid ones
Giorgio Arena51847d52021-10-19 15:45:57 +0100280 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
Giorgio Arena945ae9e2021-10-13 11:13:04 +0100281}