blob: 3ac67c58aee7e79a58241d7e3fba1acbec5d3398 [file] [log] [blame]
Michele Di Giorgio72175632018-05-01 16:52:00 +01001/*
2* Copyright (c) 2018 ARM Limited.
3*
4* SPDX-License-Identifier: MIT
5*
6* Permission is hereby granted, free of charge, to any person obtaining a copy
7* of this software and associated documentation files (the "Software"), to
8* deal in the Software without restriction, including without limitation the
9* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10* sell copies of the Software, and to permit persons to whom the Software is
11* furnished to do so, subject to the following conditions:
12*
13* The above copyright notice and this permission notice shall be included in all
14* copies or substantial portions of the Software.
15*
16* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22* SOFTWARE.
23*/
24#include "helpers.h"
25
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010026#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
Michele Di Giorgio72175632018-05-01 16:52:00 +010027
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010028// Check valid VEC_SIZES
29#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
30#error "Only vector sizes 4, 8 and 16 are supported"
31#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
Michele Di Giorgio72175632018-05-01 16:52:00 +010032
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010033#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Michele Di Giorgio72175632018-05-01 16:52:00 +010034
/** Unsigned integer division with remainder.
 *
 * Stores the quotient of @p x / @p y in @p div_res and the remainder in @p mod_res.
 *
 * The division is done with exact integer arithmetic. A single-precision
 * reciprocal multiply (x * (1.0f / y)) is not exact: float carries a 24-bit
 * significand, so operands above 2^24 lose bits and a rounded reciprocal can
 * truncate to one less than the true quotient. Both results are used to build
 * global-memory addresses, so an off-by-one here reads/writes the wrong element.
 *
 * @note @p x and @p y are evaluated more than once; pass side-effect-free expressions.
 * @note @p div_res and @p mod_res must be assignable lvalues; @p mod_res is computed
 *       from the value just stored in @p div_res.
 * @note @p y must be non-zero.
 *
 * @param[in]  x       Dividend (converted to uint)
 * @param[in]  y       Divisor (converted to uint)
 * @param[out] div_res Receives x / y
 * @param[out] mod_res Receives x % y
 */
#define DIV_MOD_UINT(x, y, div_res, mod_res)         \
    do                                               \
    {                                                \
        div_res = (uint)(x) / (uint)(y);             \
        mod_res = (uint)(x) - (div_res) * (uint)(y); \
    } while(0)
41
42/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
Michele Di Giorgio72175632018-05-01 16:52:00 +010043 *
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010044 * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
45 * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
46 * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
47 * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
Michele Di Giorgio72175632018-05-01 16:52:00 +010048 * K is equal to num_channels / num_groups.
49 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010050 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
Michele Di Giorgio72175632018-05-01 16:52:00 +010051 * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
52 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
53 * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
54 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
55 * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
56 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010057 * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
58 * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgio72175632018-05-01 16:52:00 +010059 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
60 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
61 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
62 * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
63 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
64 * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
65 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
66 * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010067 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
68 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgio72175632018-05-01 16:52:00 +010069 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
70 */
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010071__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
72 TENSOR4D_DECLARATION(dst))
Michele Di Giorgio72175632018-05-01 16:52:00 +010073{
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010074 uint curr_channel = 0; // channel id of input
75 uint batch_id = 0; // batch id
76 uint group_id = 0; // group id
77 uint channel_id = 0; // channel id within the group
Michele Di Giorgio72175632018-05-01 16:52:00 +010078
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010079 // Compute curr_channel and batch_id
80 DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
Michele Di Giorgio72175632018-05-01 16:52:00 +010081
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010082 // Compute group_id and channel_id
83 DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
84
85 const uint x = get_global_id(0) * VEC_SIZE;
86 const uint y = get_global_id(1) * 2;
Michele Di Giorgioefac7c62018-05-16 00:02:35 +010087 const uint z = channel_id * NUM_GROUPS + group_id;
Michele Di Giorgio72175632018-05-01 16:52:00 +010088
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010089 // Load the Nx2 block
90 const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
91 TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
92 TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
Michele Di Giorgio72175632018-05-01 16:52:00 +010093
94 // Store blocks
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +010095 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
96 VSTORE(VEC_SIZE)
97 (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
98 VSTORE(VEC_SIZE)
99 (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
Michele Di Giorgio72175632018-05-01 16:52:00 +0100100}
Gian Marco Iodice8bab0ee2018-09-13 11:51:56 +0100101
102#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
103/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
104 *
105 * @note This implementation is only defined for VEC_SIZE = 4
106 * @note This last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64 in order to prevent out-of-bound writes.
107 * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
108 * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
109 * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
110 * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
111 * K is equal to num_channels / num_groups.
112 *
113 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
114 * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
115 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
116 * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
117 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
118 * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
119 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
120 * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
121 * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
122 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
123 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
124 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
125 * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
126 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
127 * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
128 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
129 * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
130 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
131 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
132 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
133 */
134__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
135 TENSOR4D_DECLARATION(dst))
136{
137 const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
138 uint channel_id0 = 0;
139 uint channel_id1 = 0;
140 uint channel_id2 = 0;
141 uint channel_id3 = 0;
142 uint group_id0 = 0;
143 uint group_id1 = 0;
144 uint group_id2 = 0;
145 uint group_id3 = 0;
146 uint y = 0;
147 uint batch_id = 0;
148
149 // Compute curr_channel and batch_id
150 DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
151
152 // Compute group_id and channel_id
153 DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
154 DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
155 DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
156 DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
157
158 const uint x = get_global_id(1) * 2;
159 const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
160 const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
161 const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
162 const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
163
164 // Load the Nx2 block
165 const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
166 TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
167 TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
168
169 // Store blocks
170 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
171 *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
172 *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
173 *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
174 *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
175 *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
176 *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
177 *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
178 *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
179}
180#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
181#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)