blob: cbcdbf2a341536616793d4d01b1b4aed5af30194 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
27#if CONV_STRIDE_X == 1
28#define convolution1x3 convolution1x3_stride_1
29#elif CONV_STRIDE_X == 2
30#define convolution1x3 convolution1x3_stride_2
31#elif CONV_STRIDE_X == 3
32#define convolution1x3 convolution1x3_stride_3
33#else /* CONV_STRIDE_X */
34#error "Stride not supported"
35#endif /* CONV_STRIDE_X */
36
37/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
38 *
39 * @param[in] left_pixel Pointer to the left pixel.
40 * @param[in] left_coeff Weight of the left pixel
41 * @param[in] middle_coeff Weight of the middle pixel
42 * @param[in] right_coeff Weight of the right pixel
43 *
44 * @return a float2 containing 2 convoluted values.
45 */
46inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
47 const float left_coeff,
48 const float middle_coeff,
49 const float right_coeff)
50{
51 float4 temp = vload4(0, (__global float *)left_pixel);
52
53 float2 left = CONVERT(temp.s01, float2);
54 float2 middle = CONVERT(temp.s12, float2);
55 float2 right = CONVERT(temp.s23, float2);
56
57 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
58}
59
60/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
61 *
62 * @param[in] left_pixel Pointer to the left pixel.
63 * @param[in] left_coeff Weight of the left pixel
64 * @param[in] middle_coeff Weight of the middle pixel
65 * @param[in] right_coeff Weight of the right pixel
66 *
67 * @return a float2 containing 2 convoluted values.
68 */
69inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
70 const float left_coeff,
71 const float middle_coeff,
72 const float right_coeff)
73{
74 float4 temp0 = vload4(0, (__global float *)left_pixel);
75 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
76
77 float2 left = CONVERT(temp0.s02, float2);
78 float2 middle = CONVERT(temp0.s13, float2);
79 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
80
81 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
82}
83
84/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
85 *
86 * @param[in] left_pixel Pointer to the left pixel.
87 * @param[in] left_coeff Weight of the left pixel
88 * @param[in] middle_coeff Weight of the middle pixel
89 * @param[in] right_coeff Weight of the right pixel
90 *
91 * @return a float2 containing 2 convoluted values.
92 */
93inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
94 const float left_coeff,
95 const float middle_coeff,
96 const float right_coeff)
97{
98 float4 temp0 = vload4(0, (__global float *)left_pixel);
99 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
100
101 float2 left = CONVERT(temp0.s03, float2);
102 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
103 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
104
105 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
106}
107
108/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
109 *
110 * Convolution matrix layout:
111 *
112 * [ mat0, mat1, mat2 ]\n
113 * [ mat3, mat4, mat5 ]\n
114 * [ mat6, mat7, mat8 ]\n
115 *
116 * @param[in] src A pointer to source Image structure
117 * @param[in] mat0 Coefficient from the convolution matrix
118 * @param[in] mat1 Coefficient from the convolution matrix
119 * @param[in] mat2 Coefficient from the convolution matrix
120 * @param[in] mat3 Coefficient from the convolution matrix
121 * @param[in] mat4 Coefficient from the convolution matrix
122 * @param[in] mat5 Coefficient from the convolution matrix
123 * @param[in] mat6 Coefficient from the convolution matrix
124 * @param[in] mat0 Coefficient from the convolution matrix
125 * @param[in] mat7 Coefficient from the convolution matrix
126 * @param[in] mat8 Coefficient from the convolution matrix
127 *
128 * @return a float2 containing 2 convoluted values.
129 */
130inline float2 convolution3x3(
131 Image *src,
132 const float mat0, const float mat1, const float mat2,
133 const float mat3, const float mat4, const float mat5,
134 const float mat6, const float mat7, const float mat8)
135{
136 float2 pixels;
137
138 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
139 pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
140 pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
141
142 return pixels;
143}
144
145/** This function computes the horizontal integral of the image.
146 *
147 * @param[in] src_ptr Pointer to the source image. Supported data types: U8
148 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
149 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
150 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
151 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
152 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
153 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
154 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
155 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F16/F32
156 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
157 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
158 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
159 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
160 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
161 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
162 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
163 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
164 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
165 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
166 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
167 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
168 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
169 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
170 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
171 */
172
173__kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights))
174{
175 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
176 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
177 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
178
179 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
180 float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
181 float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
182 float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
183
184 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
185 weights_values1.s0, weights_values1.s1, weights_values1.s2,
186 weights_values2.s0, weights_values2.s1, weights_values2.s2);
187
188 vstore2(pixels, 0, (__global float *)dst.ptr);
189}