/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "helpers.h"
25
26#if STRIDE_X == 3
27#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
28#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
29#elif STRIDE_X == 2
30#define INPUT_PIXEL(data_size) extract_input_stride2
31#elif STRIDE_X == 1
32#define INPUT_PIXEL(data_size) extract_input_stride1
33#else /* STRIDE_X not equals 1, 2 or 3 */
34#error "Only support strides 1, 2 and 3"
35#endif /* STRIDE_X == 3 */
36
37/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
38 *
39 * @param[in] input_pixel Pointer to the first pixel.
40 *
41 * @return extracted input pixels.
42 */
43inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
44{
45 return vload8(0, input_pixel);
46}
47
48/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
49 *
50 * @param[in] input_pixel Pointer to the first pixel.
51 *
52 * @return extracted input pixels.
53 */
54inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
55{
56 VEC_DATA_TYPE(DATA_TYPE, 16)
57 temp = vload16(0, input_pixel);
58 return temp.s02468ace;
59}
60
61/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
62 *
63 * @param[in] input_pixel Pointer to the first pixel.
64 *
65 * @return extracted input pixels.
66 */
67inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
68{
69 VEC_DATA_TYPE(DATA_TYPE, 4)
70 temp1 = vload4(0, input_pixel);
71 VEC_DATA_TYPE(DATA_TYPE, 4)
72 temp2 = vload4(0, input_pixel + 6);
73 VEC_DATA_TYPE(DATA_TYPE, 4)
74 temp3 = vload4(0, input_pixel + 12);
75 VEC_DATA_TYPE(DATA_TYPE, 4)
76 temp4 = vload4(0, input_pixel + 18);
77 return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
78}
79
80/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
81 *
82 * @param[in] input_pixel Pointer to the first pixel.
83 *
84 * @return extracted input pixels.
85 */
86inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
87{
88 VEC_DATA_TYPE(DATA_TYPE, 8)
89 temp1 = vload8(0, input_pixel);
90 VEC_DATA_TYPE(DATA_TYPE, 8)
91 temp2 = vload8(0, input_pixel + 8);
92 VEC_DATA_TYPE(DATA_TYPE, 8)
93 temp3 = vload8(0, input_pixel + 16);
94 return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
95}
96
97/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
98 *
99 * @param[in] input_pixel Pointer to the first pixel.
100 *
101 * @return extracted input pixels.
102 */
103inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
104{
105 VEC_DATA_TYPE(DATA_TYPE, 16)
106 temp1 = vload16(0, input_pixel);
107 VEC_DATA_TYPE(DATA_TYPE, 16)
108 temp2 = vload16(0, input_pixel + 12);
109 return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
110}
111
/** This kernel performs a direct convolution to convolve the low three dimensions.
 *
 * Each work-item accumulates, over WEIGHTS_DEPTH input planes, the product of one scalar weight
 * and 8 horizontally strided input pixels, and stores the 8 results to the destination tensor.
 * get_global_id(2) selects the set of weights (through @p weights_stride_w) and, when enabled,
 * the bias element.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
 * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed. One bias value is then added to all 8 results of the work-item.
 *
 * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
 * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
 * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension (added directly to the weights pointer; presumably in bytes like the other strides)
 */
#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
__kernel void direct_convolution1x1(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#ifdef HAS_BIAS
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    unsigned int weights_stride_w)
{
    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* defined(HAS_BIAS) */

    /* Accumulator for the 8 output values produced by this work-item */
    VEC_DATA_TYPE(DATA_TYPE, 8)
    pixels = 0;

    const uint z_index = get_global_id(2);

    /* Move to the set of weights selected by the third global id */
    weights.ptr += z_index * weights_stride_w;

    for(int d = 0; d < WEIGHTS_DEPTH; ++d)
    {
        /* One scalar weight per plane, applied to all 8 extracted input pixels */
        DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
        VEC_DATA_TYPE(DATA_TYPE, 8)
        input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
        pixels += weight * input_pixel;

        /* Advance both tensors to the next plane along Z */
        src.ptr += src_stride_z;
        weights.ptr += weights_stride_z;
    }

#ifdef HAS_BIAS
    /* Broadcast the single bias value at index z_index to all 8 lanes */
    pixels += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index)));
#endif /* defined(HAS_BIAS) */

    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
}
#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)