Blame - src/core/CL/cl_kernels/channel_shuffle.cl - ml/ComputeLibrary

const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;

91

TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

92

TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

Michele Di Giorgio

7217563

2018-05-01 16:52:00 +0100

[diff] [blame]

93

94

// Store blocks

Gian Marco Iodice

8bab0ee

2018-09-13 11:51:56 +0100

[diff] [blame]

95

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;

96

VSTORE(VEC_SIZE)

97

(u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));

98

VSTORE(VEC_SIZE)

99

(u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));

Michele Di Giorgio

7217563

2018-05-01 16:52:00 +0100

[diff] [blame]

100

}

Gian Marco Iodice

8bab0ee

2018-09-13 11:51:56 +0100

[diff] [blame]

101

102

#if VEC_SIZE == 4 && defined(LAST_ACCESSED)

103

/** Channel shuffle kernel for tensors stored in NHWC data layout. See https://arxiv.org/pdf/1707.01083.pdf for details.
 *
 * Each work-item loads VEC_SIZE consecutive channels at two consecutive positions along Y and writes
 * every loaded element, one scalar at a time, to its shuffled channel position in the destination.
 *
 * @note This kernel is only compiled when VEC_SIZE == 4 and LAST_ACCESSED is defined.
 * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
 * @note The last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64.
 *       The first channel handled by a work-item is clamped to this value in order to prevent out-of-bound writes.
 * @note The size of the source tensor along Z must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64.
 *       It is used to split get_global_id(2) into a Z coordinate and a batch index.
 * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
 * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
 *       K is equal to num_channels / num_groups.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
                                   TENSOR4D_DECLARATION(dst))
{
    // First input channel handled by this work-item, clamped to LAST_ACCESSED
    const uint first_ch = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED);

    // Every work-item covers two consecutive positions along Y
    const uint y_pos = get_global_id(1) * 2;

    // Split the third global id by SRC_DIM_Z into the batch index and the Z coordinate
    uint z_pos = 0;
    uint batch = 0;
    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch, z_pos);

    // Split each of the four input channels by K into a group index and a channel index within the group
    uint grp0 = 0, grp1 = 0, grp2 = 0, grp3 = 0;
    uint idx0 = 0, idx1 = 0, idx2 = 0, idx3 = 0;
    DIV_MOD_UINT(first_ch + (uint)0, K, grp0, idx0);
    DIV_MOD_UINT(first_ch + (uint)1, K, grp1, idx1);
    DIV_MOD_UINT(first_ch + (uint)2, K, grp2, idx2);
    DIV_MOD_UINT(first_ch + (uint)3, K, grp3, idx3);

    // Destination channel of every input channel
    const uint out_ch0 = idx0 * (uint)NUM_GROUPS + grp0;
    const uint out_ch1 = idx1 * (uint)NUM_GROUPS + grp1;
    const uint out_ch2 = idx2 * (uint)NUM_GROUPS + grp2;
    const uint out_ch3 = idx3 * (uint)NUM_GROUPS + grp3;

    // Load VEC_SIZE channels from each of the two source rows
    const __global uchar *in_row0 = src_ptr + src_offset_first_element_in_bytes + first_ch * sizeof(DATA_TYPE) + y_pos * src_stride_y + z_pos * src_stride_z + batch * src_stride_w;
    const __global uchar *in_row1 = in_row0 + 1 * src_stride_y;

    TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in_row0);
    TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in_row1);

    // Destination rows start at channel 0; elements are scattered to their shuffled channel one by one
    __global uchar *out_row0 = dst_ptr + dst_offset_first_element_in_bytes + y_pos * dst_stride_y + z_pos * dst_stride_z + batch * dst_stride_w;
    __global uchar *out_row1 = out_row0 + (uint)1 * dst_stride_y;

    __global DATA_TYPE *dst_elems0 = (__global DATA_TYPE *)out_row0;
    __global DATA_TYPE *dst_elems1 = (__global DATA_TYPE *)out_row1;

    dst_elems0[out_ch0] = u0.s0;
    dst_elems0[out_ch1] = u0.s1;
    dst_elems0[out_ch2] = u0.s2;
    dst_elems0[out_ch3] = u0.s3;

    dst_elems1[out_ch0] = u1.s0;
    dst_elems1[out_ch1] = u1.s1;
    dst_elems1[out_ch2] = u1.s2;
    dst_elems1[out_ch3] = u1.s3;
}

180

#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)

181

#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)