blob: 9c2c3a5b37e594fd750866e0f5e39139c22eecf7 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Giorgio Arena9fe41442017-08-23 16:36:24 +010027#if defined(CONV_STRIDE_X)
28
Giorgio Arena93a690e2017-08-01 16:09:33 +010029#if CONV_STRIDE_X == 1
30#define convolution1x3 convolution1x3_stride_1
31#elif CONV_STRIDE_X == 2
32#define convolution1x3 convolution1x3_stride_2
33#elif CONV_STRIDE_X == 3
34#define convolution1x3 convolution1x3_stride_3
35#else /* CONV_STRIDE_X */
36#error "Stride not supported"
37#endif /* CONV_STRIDE_X */
38
39/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
40 *
41 * @param[in] left_pixel Pointer to the left pixel.
42 * @param[in] left_coeff Weight of the left pixel
43 * @param[in] middle_coeff Weight of the middle pixel
44 * @param[in] right_coeff Weight of the right pixel
45 *
46 * @return a float2 containing 2 convoluted values.
47 */
48inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
49 const float left_coeff,
50 const float middle_coeff,
51 const float right_coeff)
52{
53 float4 temp = vload4(0, (__global float *)left_pixel);
54
55 float2 left = CONVERT(temp.s01, float2);
56 float2 middle = CONVERT(temp.s12, float2);
57 float2 right = CONVERT(temp.s23, float2);
58
59 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
60}
61
62/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
63 *
64 * @param[in] left_pixel Pointer to the left pixel.
65 * @param[in] left_coeff Weight of the left pixel
66 * @param[in] middle_coeff Weight of the middle pixel
67 * @param[in] right_coeff Weight of the right pixel
68 *
69 * @return a float2 containing 2 convoluted values.
70 */
71inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
72 const float left_coeff,
73 const float middle_coeff,
74 const float right_coeff)
75{
76 float4 temp0 = vload4(0, (__global float *)left_pixel);
77 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
78
79 float2 left = CONVERT(temp0.s02, float2);
80 float2 middle = CONVERT(temp0.s13, float2);
81 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
82
83 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
84}
85
86/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
87 *
88 * @param[in] left_pixel Pointer to the left pixel.
89 * @param[in] left_coeff Weight of the left pixel
90 * @param[in] middle_coeff Weight of the middle pixel
91 * @param[in] right_coeff Weight of the right pixel
92 *
93 * @return a float2 containing 2 convoluted values.
94 */
95inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
96 const float left_coeff,
97 const float middle_coeff,
98 const float right_coeff)
99{
100 float4 temp0 = vload4(0, (__global float *)left_pixel);
101 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
102
103 float2 left = CONVERT(temp0.s03, float2);
104 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
105 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
106
107 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
108}
109
110/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
111 *
112 * Convolution matrix layout:
113 *
114 * [ mat0, mat1, mat2 ]\n
115 * [ mat3, mat4, mat5 ]\n
116 * [ mat6, mat7, mat8 ]\n
117 *
118 * @param[in] src A pointer to source Image structure
119 * @param[in] mat0 Coefficient from the convolution matrix
120 * @param[in] mat1 Coefficient from the convolution matrix
121 * @param[in] mat2 Coefficient from the convolution matrix
122 * @param[in] mat3 Coefficient from the convolution matrix
123 * @param[in] mat4 Coefficient from the convolution matrix
124 * @param[in] mat5 Coefficient from the convolution matrix
125 * @param[in] mat6 Coefficient from the convolution matrix
126 * @param[in] mat0 Coefficient from the convolution matrix
127 * @param[in] mat7 Coefficient from the convolution matrix
128 * @param[in] mat8 Coefficient from the convolution matrix
129 *
130 * @return a float2 containing 2 convoluted values.
131 */
132inline float2 convolution3x3(
133 Image *src,
134 const float mat0, const float mat1, const float mat2,
135 const float mat3, const float mat4, const float mat5,
136 const float mat6, const float mat7, const float mat8)
137{
138 float2 pixels;
139
140 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
141 pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
142 pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
143
144 return pixels;
145}
146
147/** This function computes the horizontal integral of the image.
148 *
149 * @param[in] src_ptr Pointer to the source image. Supported data types: U8
150 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
151 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
152 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
153 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
154 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
155 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
156 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
157 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F16/F32
158 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
159 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
160 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
161 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
162 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
163 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
164 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
165 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
166 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
167 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
168 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
169 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
170 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
171 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
172 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
173 */
174
175__kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights))
176{
177 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
178 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
179 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
180
181 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
182 float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
183 float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
184 float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
185
186 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
187 weights_values1.s0, weights_values1.s1, weights_values1.s2,
188 weights_values2.s0, weights_values2.s1, weights_values2.s2);
189
190 vstore2(pixels, 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100191}
192
193#endif //defined(CONV_STRIDE_X)
194
195#if defined(SRC_WIDTH) && defined(DATA_TYPE)
196/** This kernel reshapes each of the tensor's low three dimensions to single rows.
197 *
198 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
199 *
200 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
201 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
202 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
203 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
204 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
205 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
206 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
207 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
208 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
209 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
210 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
211 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
212 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
213 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
214 */
215__kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
216{
217 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
218
219 __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
220 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
221
222 for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
223 {
224 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
225 }
226}
227#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
228
229#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
230/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
231 *
232 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
233 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_X, -DPAD_Y, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
234 *
235 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
236 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
237 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
238 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
239 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
240 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
241 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
242 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
243 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
244 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
245 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
246 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
247 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
248 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
249 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
250 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
251 */
252
253__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
254{
255 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
256
257 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
258 const int full_length = SRC_WIDTH + 2 * PAD_X;
259 const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
260
261 const int src_x = -PAD_X + src_pixel_linear % max_initial_x;
262 const int src_y = -PAD_Y + src_pixel_linear / max_initial_x * STRIDE_Y;
263 const int src_z = get_global_id(2);
264
265 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
266 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
267
268 for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
269 {
270 for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
271 {
272 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
273 {
274 *output_ptr = 0;
275 }
276 else
277 {
278 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
279 }
280 }
281 }
282}
283
284#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE)
285
286#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
287
288/** This kernel performs a reshaping of the output of the depthwise generic convolution.
289 *
290 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
291 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
292 *
293 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
294 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
295 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
296 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
297 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
298 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
299 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
300 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
301 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
302 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
303 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
304 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
305 */
306__kernel void depthwise_vector_to_tensor(
307 VECTOR_DECLARATION(src),
308 TENSOR3D_DECLARATION(dst))
309{
310 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
311
312 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
313 const int id0 = get_global_id(0);
314 const int z = id0 / patch_size;
315 const int index2D = id0 - z * patch_size;
316
317 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
318 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
319}
320
321#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)