blob: 3a31cb80a76746976d7a63a9cfadaf2b63bcba98 [file] [log] [blame]
Anthony Barbier7068f992017-10-26 15:23:08 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
/* Work-group size declaration. LOCAL_SIZE_X/Y/Z are not defined anywhere in the visible source -
 * presumably they are injected as defines when the shader is built; confirm against the host code. */
25layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
26
27#include "helpers.h"
28
/** Uniform block holding the kernel arguments shared by the FP32 and FP16 variants.
 *
 * Contains the parameter sets emitted by the *_PARAM_DECLARATION macros for src, dst and weights
 * (plus biases when BIAS is defined), followed by two weights-specific scalars.
 *
 * NOTE(review): the block uses std140, so member order determines the offsets - presumably it has to
 * match what the host side uploads; confirm before reordering anything here.
 */
29layout(std140) uniform shader_params
30{
31 TENSOR3D_PARAM_DECLARATION(src);
32 TENSOR3D_PARAM_DECLARATION(dst);
33 TENSOR3D_PARAM_DECLARATION(weights);
34#ifdef BIAS
35 VECTOR_PARAM_DECLARATION(biases);
36#endif /* BIAS */
37 uint weights_stride_w; /* stride of the weights tensor in the 4th dimension; scaled by the Z invocation id in main() */
38 uint weights_depth; /* third dimension of the weights tensor; bounds the accumulation loop in main() */
39};
40
41#if defined(DATA_TYPE_FP32)
/* FP32 variant: default float precision is highp and every buffer is declared with float elements. */
42precision highp float;
43
/* src and weights are only read, dst is only written (see the access qualifiers).
 * NOTE(review): the second macro argument looks like the buffer binding index - confirm in helpers.h. */
44BUFFER_DECLARATION(src, 1, float, readonly);
45BUFFER_DECLARATION(dst, 2, float, writeonly);
46BUFFER_DECLARATION(weights, 3, float, readonly);
/* The biases buffer only exists when the shader is built with BIAS defined. */
47#ifdef BIAS
48BUFFER_DECLARATION(biases, 4, float, readonly);
49#endif /* BIAS */
50
51/** This kernel performs a direct convolution to convolve the low three dimensions.
52 *
53 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
54 * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
55 * @note In case biases are to be added to the convolution, "#define BIAS" has to be passed at compile time.
56 *
57 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
58 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
59 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
60 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
61 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
62 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
63 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
64 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
65 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
66 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
67 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
68 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
69 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
70 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
71 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
72 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
73 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
74 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
75 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
76 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
77 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
78 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
79 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
80 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
81 * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
82 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
83 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
84 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
85 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
86 * @param[in] weights_depth The third dimensions of the weights tensors
87 */
void main()
{
    /* Per-invocation views of the tensors. The locals deliberately keep the buffer names:
     * the helper macros are given these names and may paste them into other identifiers. */
    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);

#ifdef BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* BIAS */

    float acc       = CONVERT(0, float);
    uint  kernel_id = gl_GlobalInvocationID.z;

    /* Advance to the weights selected by the Z invocation id.
     * NOTE(review): the >> 2 presumably turns a byte stride into a float index - confirm in helpers.h. */
    weights.current_offset += (kernel_id * weights_stride_w) >> 2;

    /* One multiply-accumulate per slice along Z; src and weights move forward together. */
    for(int remaining = int(weights_depth); remaining > 0; --remaining)
    {
        float in_value = LOAD4(src, CURRENT_OFFSET(src));
        float w_value  = LOAD4(weights, CURRENT_OFFSET(weights));
        acc += in_value * w_value;

        src.current_offset += (src_stride_z >> 2);
        weights.current_offset += (weights_stride_z >> 2);
    }

#ifdef BIAS
    /* The bias is indexed by the same Z invocation id that selected the weights. */
    acc += LOAD4(biases, vector_offset(biases, int(kernel_id)));
#endif /* BIAS */

    STORE4(dst, CURRENT_OFFSET(dst), acc);
}
120#elif defined(DATA_TYPE_FP16)
/* FP16 variant: default float precision is mediump. */
121precision mediump float;
122
/* src and dst are declared with uvec4 elements, weights and biases with uint elements; the kernel
 * code below converts between these words and half-precision pairs with unpackHalf2x16()/packHalf2x16(). */
123BUFFER_DECLARATION(src, 1, uvec4, readonly);
124BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
125BUFFER_DECLARATION(weights, 3, uint, readonly);
/* The biases buffer only exists when the shader is built with BIAS defined. */
126#ifdef BIAS
127BUFFER_DECLARATION(biases, 4, uint, readonly);
128#endif /* BIAS */
129
/* CONVOLVE(s, w) resolves to the helper matching the compile-time STRIDE_X.
 * Any value other than 1 or 2 stops compilation; note that the #error text only mentions values larger than 2. */
130#if STRIDE_X == 2
131#define CONVOLVE(s, w) convolve_stride2(s, w)
132#elif STRIDE_X == 1 /* STRIDE_X == 1 */
133#define CONVOLVE(s, w) convolve_stride1(s, w)
134#else /* STRIDE_X not equals 1 or 2 */
135#error STRIDE_X larger than 2 is not supported
136#endif /* STRIDE_X == 2 */
137
/** Loads one uvec4 word of src (eight packed half-precision values) at offset (0, 0) and scales it by a weight.
 *
 * @param[in] src Image view to load from
 * @param[in] w   Weight applied to every loaded value
 *
 * @return The eight products, in load order, as two vec4
 */
vec4[2] convolve_stride1(Image src, float w)
{
    uvec4 packed_s;
    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);

    /* Each uint component carries two halves: x/y fill the first vec4, z/w the second. */
    vec4 scaled[2];
    scaled[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)) * w;
    scaled[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)) * w;

    return scaled;
}
153
/** Loads two uvec4 words of src (at x offsets 0 and 8), keeps every other half-precision value
 *  and scales the kept values by a weight.
 *
 * @param[in] src Image view to load from
 * @param[in] w   Weight applied to every kept value
 *
 * @return The eight products as two vec4: element 0 comes from the first load, element 1 from the second
 */
vec4[2] convolve_stride2(Image src, float w)
{
    uvec4 packed_s;
    vec4  picked[2];

    /* Only the first half of each packed pair is kept, i.e. the even-indexed values of the load. */
    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
    picked[0] = vec4(unpackHalf2x16(packed_s.x).x, unpackHalf2x16(packed_s.y).x,
                     unpackHalf2x16(packed_s.z).x, unpackHalf2x16(packed_s.w).x)
                * w;

    GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0);
    picked[1] = vec4(unpackHalf2x16(packed_s.x).x, unpackHalf2x16(packed_s.y).x,
                     unpackHalf2x16(packed_s.z).x, unpackHalf2x16(packed_s.w).x)
                * w;

    return picked;
}
177
178/** This kernel performs a direct convolution to convolve the low three dimensions.
179 *
180 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
181 * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
182 * @note In case biases are to be added to the convolution, "#define BIAS" has to be passed at compile time.
183 *
184 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
185 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
186 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
187 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
188 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
189 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
190 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
191 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
192 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
193 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
194 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
195 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
196 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
197 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
198 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
199 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
200 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
201 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
202 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
203 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
204 * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
205 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
206 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
207 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
208 * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
209 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
210 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
211 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
212 * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
213 * @param[in] weights_depth The third dimensions of the weights tensors
214 */
void main()
{
    /* Per-invocation views of the tensors. The locals deliberately keep the buffer names:
     * the helper macros are given these names and may paste them into other identifiers. */
    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

#ifdef BIAS
    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* BIAS */

    /* Eight running sums, held as two vec4 halves. */
    vec4 acc_lo = vec4(0.f);
    vec4 acc_hi = vec4(0.f);

    uint z_index = gl_GlobalInvocationID.z;

    /* Advance to the weights selected by the Z invocation id. */
    weights.current_offset += z_index * weights_stride_w;

    int depth = int(weights_depth);
    int d     = 0;
    while(d < depth)
    {
        /* Only the first half of the loaded word is used as the weight for this slice. */
        uint packed_w;
        GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
        float w = unpackHalf2x16(packed_w).x;

        vec4 partial[2] = CONVOLVE(src, w);
        acc_lo += partial[0];
        acc_hi += partial[1];

        src.current_offset += src_stride_z;
        weights.current_offset += weights_stride_z;
        ++d;
    }

#ifdef BIAS
    uint packed_b;
    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);

    /* A bias word holds two halves: even z_index takes the first, odd z_index the second. */
    vec2  bias_pair = unpackHalf2x16(packed_b);
    float b         = ((z_index % uint(2)) == uint(0)) ? bias_pair.x : bias_pair.y;

    acc_lo += vec4(b);
    acc_hi += vec4(b);
#endif /* BIAS */

    /* Repack the eight sums as half-precision pairs and write them as a single uvec4. */
    uvec4 packed_d = uvec4(packHalf2x16(acc_lo.xy), packHalf2x16(acc_lo.zw),
                           packHalf2x16(acc_hi.xy), packHalf2x16(acc_hi.zw));
    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
}
273#else /* DATA_TYPE_FP32 */
274#error Data type not supported
275#endif /* DATA_TYPE_FP32 */