Blame - src/core/CL/cl_kernels/tile_helpers.h - ml/ComputeLibrary

2021-03-19 11:26:20 +0000

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

// *INDENT-OFF*

// clang-format off

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

28

#define TILE_VECTOR_SIZE1 1

29

#define TILE_VECTOR_SIZE2 2

30

#define TILE_VECTOR_SIZE3 3

31

#define TILE_VECTOR_SIZE4 4

32

#define TILE_VECTOR_SIZE5 8

33

#define TILE_VECTOR_SIZE6 8

34

#define TILE_VECTOR_SIZE7 8

35

#define TILE_VECTOR_SIZE8 8

36

#define TILE_VECTOR_SIZE9 16

37

#define TILE_VECTOR_SIZE10 16

38

#define TILE_VECTOR_SIZE11 16

39

#define TILE_VECTOR_SIZE12 16

40

#define TILE_VECTOR_SIZE13 16

41

#define TILE_VECTOR_SIZE14 16

42

#define TILE_VECTOR_SIZE15 16

43

#define TILE_VECTOR_SIZE16 16

44

45

#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1

46

#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2

47

#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3

48

#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4

49

#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8

50

#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8

51

#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8

52

#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8

53

#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16

54

#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16

55

#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16

56

#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16

57

#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16

58

#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16

59

#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16

60

#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16

61

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

62

/** Tile object

63

* A tile object is a 2D memory block and can be accessed using the following syntax:

64

* -# a[m0].v = access the vector at row "m0" (OpenCL vector)

65

* -# a[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access)

66

*

67

* @param[in] DATA_TYPE Data type of the tile

68

* @param[in] H Number of tile rows

69

* @param[in] W Number of tile columns

70

* @param[in] BASENAME Tile's name

71

*/

72

#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)

73

#define TILE_STR(DATA_TYPE, H, W, BASENAME) \

74

union { \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

75

DATA_TYPE s[TILE_VECTOR_SIZE##W]; \

76

TILE_VECTOR_TYPE##W(DATA_TYPE) v; \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

77

} BASENAME[H]

78

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

79

#define TENSOR4D_IMAGE(name) \

80

__read_only image2d_t name##_img, \

81

__global uchar *name##_ptr, \

82

uint name##_stride_x, \

83

uint name##_step_x, \

84

uint name##_stride_y, \

85

uint name##_step_y, \

86

uint name##_stride_z, \

87

uint name##_step_z, \

88

uint name##_stride_w, \

89

uint name##_step_w, \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

90

uint name##_offset_first_element_in_bytes

91

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

92

#define TENSOR4D_BUFFER(name) \

93

__global uchar *name##_ptr, \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

94

uint name##_stride_x, \

95

uint name##_step_x, \

96

uint name##_stride_y, \

97

uint name##_step_y, \

98

uint name##_stride_z, \

99

uint name##_step_z, \

100

uint name##_stride_w, \

101

uint name##_step_w, \

102

uint name##_offset_first_element_in_bytes

103

104

#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)

105

#define TENSOR4D(name, type) TENSOR4D_STR(name, type)

106

Adnan AlSinan

17975a6

2021-11-08 17:46:39 +0000

[diff] [blame]

107

#define TENSOR4D_T_IMAGE(name) \

108

__read_only image2d_t name##_img, \

109

__global uchar *name##_ptr, \

110

uint name##_stride_y, \

111

uint name##_stride_z, \

112

uint name##_stride_w, \

uint name##_c, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

118

119

#define TENSOR4D_T_BUFFER(name) \

120

__global uchar *name##_ptr, \

121

uint name##_stride_y, \

122

uint name##_stride_z, \

123

uint name##_stride_w, \

uint name##_c, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

129

130

#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)

131

#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)

132

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame]

133

#define TENSOR3D_T_IMAGE(name) \

134

__read_only image2d_t name##_img, \

135

__global uchar *name##_ptr, \

136

uint name##_stride_y, \

137

uint name##_stride_z, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

142

143

#define TENSOR3D_T_BUFFER(name) \

144

__global uchar *name##_ptr, \

145

uint name##_stride_y, \

146

uint name##_stride_z, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

151

152

#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)

153

#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)

154

Giorgio Arena

ea8d266

2021-05-20 11:36:56 +0100

[diff] [blame]

155

#if !defined(UNROLL_WITH_PRAGMA)

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

156

#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

157

158

#define LOOP_UNROLLING_1(idx, step, macro) (macro)

159

#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)

160

#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)

161

#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)

162

#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)

163

#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)

164

#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)

165

#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)

166

#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)

167

#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)

168

#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)

169

#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)

170

#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)

171

#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)

172

#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)

173

#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)

174

#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)

175

#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)

176

#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)

177

#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)

178

#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)

179

#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)

180

#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)

181

#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)

182

#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)

183

#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)

184

#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)

185

#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)

186

#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)

187

#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)

188

#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)

189

#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)

190

#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)

191

#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)

192

#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)

193

#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)

194

#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)

195

#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)

196

#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)

197

#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)

198

#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)

199

#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)

200

#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)

201

#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)

202

#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)

203

#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)

204

#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)

205

#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)

206

#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)

207

#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)

208

#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)

209

#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)

210

#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)

211

#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)

212

#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)

213

#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)

214

#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)

215

#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)

216

#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)

217

#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)

218

#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)

219

#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)

220

#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)

221

#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)

222

#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)

223

#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)

224

#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)

225

#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)

226

#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)

227

#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)

228

#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)

229

#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)

230

#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)

231

#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)

232

#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)

233

#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)

234

#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)

235

#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)

236

#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)

237

#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)

238

#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)

239

#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)

240

#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)

241

#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)

242

#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)

243

#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)

244

#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)

245

#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)

246

#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)

247

#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)

248

#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)

249

#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)

250

#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)

251

#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)

252

#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)

253

#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)

254

#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)

255

#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)

256

#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)

257

#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)

258

#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)

259

#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)

260

#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)

261

#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)

262

#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)

263

#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)

264

#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)

265

#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)

266

#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)

267

#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)

268

#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)

269

#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)

270

#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)

271

#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)

272

#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)

273

#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)

274

#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)

275

#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)

276

#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)

277

#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)

278

#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)

279

#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)

280

#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)

281

#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)

282

#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)

283

#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)

284

#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)

285

#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

286

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

287

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \

288

{ \

289

type idx = start; \

290

LOOP_UNROLLING_##num(idx, step, macro); \

291

}

Giorgio Arena

ea8d266

2021-05-20 11:36:56 +0100

[diff] [blame]

292

#else // !defined(UNROLL_WITH_PRAGMA)

293

/* Pragma-based loop unrolling (selected when UNROLL_WITH_PRAGMA is defined).
 *
 * Declares `type idx` initialised to `start` and evaluates `macro` once per
 * iteration, advancing `idx` by `step` each time.
 *
 * The upper bound is offset by `start` so that the loop runs `num` iterations
 * for any starting value, matching the manually unrolled variant, which always
 * expands `macro` exactly `num` times. All arguments are parenthesised so that
 * expression arguments (e.g. `K0 / 4`) keep their intended precedence.
 *
 * NOTE(review): the `<` comparison assumes a positive `step`; confirm no caller
 * relies on a negative step with the pragma path.
 */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        _Pragma("unroll") \
        for(type idx = (start); idx < ((start) + (num) * (step)); idx += (step)) \
        { \
            (macro); \
        } \
    }

#endif // !defined(UNROLL_WITH_PRAGMA)

302

#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

303

304

/** Get the get_global_id with partial N0. This function is useful when the dimension is not multiple of N0 and we need to use a partial N0

305

* to avoid out-of-bound read/write

306

*

307

* @note PARTIAL_N0 is used for get_global_id(n) = 0.

308

*

309

* @param[in] IDX get_global_id index (0,1 and 2 only)

310

* @param[in] N0 Number of elements read/written on the IDX direction

311

* @param[in] PARTIAL_N0 Number of elements read/written on the IDX direction for get_global_id(IDX) = 0. If zero,

312

* the Number of elements read/written on the IDX direction for get_global_id(IDX) = 0 is N0

313

*/

314

/* N0 and PARTIAL_N0 are parenthesised so that expression arguments (e.g. `K0 + 1`)
 * are not split by the surrounding `*`, `-` and `%` operators. The result is
 * clamped to 0 because the subtraction can go negative when get_global_id(IDX)
 * is 0. */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * (N0) - ((N0) - (PARTIAL_N0)) % (N0)), 0))

315

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

316

/** Dot product integer 8bit function

317

*

318

* @note Performs: c += dot(a, b)

319

*

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

320

* @param[in] A_DATA_TYPE A (lhs) data type

321

* @param[in] B_DATA_TYPE B (rhs) data type

322

* @param[in] C_DATA_TYPE C (accumulator) data type

323

* @param[in] K0 Number of accumulations

324

* @param[in] a OpenCL vector a

325

* @param[in] b OpenCL vector b

326

* @param[in] c Scalar variable c

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

327

*/

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

328

#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)

329

#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)

330

#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

331

({ \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

332

c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

333

})

334

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

335

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));

336

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));

337

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

338

#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Michalis Spyrou

c38ca38

2021-07-14 13:30:28 +0100

[diff] [blame]

339

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

340

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));

341

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

342

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

343

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

344

({ \

345

c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \

346

c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \

347

})

348

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

349

({ \

350

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \

351

c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \

352

})

353

/* Scalar 4-element dot product with accumulation: c += a.s0*b.s0 + ... + a.s3*b.s3.
 * Each lane is cast to C_DATA_TYPE before the multiply, and the four products are
 * accumulated into `c` one after another. A_DATA_TYPE and B_DATA_TYPE are accepted
 * for signature parity with the other DOT_PRODUCT<K0>_INTEGER8 macros but are not
 * used by this expansion. */
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
        c += (C_DATA_TYPE)(a).s3 * (C_DATA_TYPE)(b).s3; \
    })

360

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

361

#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

362

({ \

363

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

364

DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

365

})

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

366

#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

367

({ \

368

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

369

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

370

})

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

371

#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

372

({ \

373

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

374

DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \

375

})

376

#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

377

({ \

378

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \

379

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \

380

})

381

#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

382

({ \

383

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

384

DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \

385

})

386

#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

387

({ \

388

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

389

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \

390

})

391

#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

392

({ \

393

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

394

DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \

395

})

396

#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

397

({ \

398

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

399

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \

400

})

401

#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

402

({ \

403

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

404

DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \

405

})

406

#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

407

({ \

408

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

409

DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \

410

})

411

#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

412

({ \

413

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

414

DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \

415

})

416

#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

417

({ \

418

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \

419

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \

420

})

421

422

/** Reduce (sum of the vector elements) integer 8bit function

423

*

424

* @note Performs: c += sum(a), computed as the dot product of a with an all-ones vector of B_DATA_TYPE

425

*

426

* @param[in] A_DATA_TYPE A (lhs) data type

427

* @param[in] B_DATA_TYPE B (rhs) data type

428

* @param[in] C_DATA_TYPE C (accumulator) data type

429

* @param[in] K0 Number of accumulations

430

* @param[in] a OpenCL vector a

431

* @param[in] c Scalar variable c

432

*/

433

#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)

434

#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

435

436

/** Load a vector from global memory (tensor)
 *
 * @note V_LOAD expands its arguments and V_LOAD_STR then dispatches to V_LOAD_BUFFER or V_LOAD_IMAGE
 *       by pasting TENSOR_TYPE onto the "V_LOAD_" prefix.
 *
 * @param[in] DATA_TYPE   Data type
 * @param[in] WIDTH       Number of dst columns
 * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                        In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in] TENSOR      Tensor basename
 * @param[in] X           Starting X position
 * @param[in] Y           Starting Y position
 * @param[in] STRIDE_Y    Stride Y (in bytes). Not used by the IMAGE variant.
 */
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) \
    V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) \
    V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
/* Buffer variant: byte address = base pointer + first-element offset + X * sizeof(DATA_TYPE) + Y * STRIDE_Y */
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH)(0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
/* Image variant: the X coordinate is divided by 4 before being passed to READ_IMAGE2D */
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

453

454

/** Load a tile from global memory (tensor)
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  HEIGHT        Number of dst rows
 * @param[in]  WIDTH         Number of dst columns
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                           In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  X             Starting X position
 * @param[in]  Y             Starting Y position
 * @param[in]  YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).
 *                           In common cases should be 1 but it becomes useful when we want to load rows which are multiple of STRIDE_Y. (e.g. loading the weights of convolution layer).
 *                           In this case the address calculation is performed as: (Y + _i * YI_MULTIPLIER) * STRIDE_Y
 * @param[in]  STRIDE_Y      Stride Y (in bytes) used to load each row.
 * @param[out] dst           Output tile
 */
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        }) \
    })

477

478

/** Load a tile from global memory (tensor) using an indirect Y index tile
 *
 * @note Row _i of dst is loaded from the Y position stored in indirect_y[_i].v
 *
 * @param[in]  DATA_TYPE   Data type
 * @param[in]  HEIGHT      Number of dst rows
 * @param[in]  WIDTH       Number of dst columns
 * @param[in]  TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                         In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR      Tensor basename
 * @param[in]  X           Starting X position
 * @param[in]  STRIDE_Y    Stride Y (in bytes)
 * @param[in]  indirect_y  Indirect Y index tile
 * @param[out] dst         Output tile
 */
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })

498

Gian Marco Iodice

534b889

2021-04-01 16:17:16 +0100

[diff] [blame]

499

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
 *
 * @note The row index passed to V_LOAD is (X + _xk) + (Y + _yk) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * @note A dst row is written only when (X + _xk) lies in [0, TENSOR_WIDTH) and (Y + _yk) lies in [0, TENSOR_HEIGHT);
 *       rows that fail this check are left untouched.
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_HEIGHT   Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH    Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Extent of the tensor along the X (width) dimension, used for the bounds check and the row index
 * @param[in]  TENSOR_HEIGHT Extent of the tensor along the Y (height) dimension, used for the bounds check and the batch offset
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })

533

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

534

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments
 *
 * @note The load address is built from TENSOR##_stride_y, TENSOR##_stride_z and TENSOR##_stride_w, applied respectively
 *       to (X + _xk * DILATION_X), (Y + _yk * DILATION_Y) and B.
 * @note When BOUNDARY_CHECK is true, a dst row is written only if its X and Y coordinates lie inside
 *       [0, TENSOR_WIDTH) and [0, TENSOR_HEIGHT); other rows are left untouched.
 *
 * @param[in]  DATA_TYPE      Data type
 * @param[in]  TILE_HEIGHT    Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH     Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS  Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE    Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                            In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR         Tensor basename
 * @param[in]  B              Starting batch index
 * @param[in]  Y              Starting Y index
 * @param[in]  X              Starting X index
 * @param[in]  C              Starting C index
 * @param[in]  TENSOR_WIDTH   Extent of the tensor along the X (width) dimension, used for the bounds check
 * @param[in]  TENSOR_HEIGHT  Extent of the tensor along the Y (height) dimension, used for the bounds check
 * @param[in]  DILATION_X     Dilation for the X increment
 * @param[in]  DILATION_Y     Dilation for the Y increment
 * @param[in]  BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bound reads
 * @param[out] dst            Output tile
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                /* Load unconditionally when no boundary check is requested, otherwise only for in-bound coordinates */ \
                if(!(BOUNDARY_CHECK) || _src_valid_y) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
            }) \
        }) \
    })

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

581

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates
 *
 * @note The row index passed to V_LOAD is (X + xi[_i].v) + (Y + yi[_i].v) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * @note A dst row is written only when (X + xi[_i].v) lies in [0, TENSOR_WIDTH) and (Y + yi[_i].v) lies in [0, TENSOR_HEIGHT);
 *       rows that fail this check are left untouched.
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Extent of the tensor along the X (width) dimension, used for the bounds check and the row index
 * @param[in]  TENSOR_HEIGHT Extent of the tensor along the Y (height) dimension, used for the bounds check and the batch offset
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  xi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
 * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

613

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

614

/** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
 *
 * @note The row index passed to V_LOAD is
 *       (X + xi[_i].v) + (Y + yi[_i].v) * TENSOR_WIDTH + (Z + zi[_i].v) * TENSOR_WIDTH * TENSOR_HEIGHT + B * TENSOR_WIDTH * TENSOR_HEIGHT * TENSOR_DEPTH.
 * @note A dst row is written only when the X, Y and Z coordinates lie inside [0, TENSOR_WIDTH), [0, TENSOR_HEIGHT)
 *       and [0, TENSOR_DEPTH) respectively; rows that fail this check are left untouched.
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Z             Starting Z index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Extent of the tensor along the X (width) dimension
 * @param[in]  TENSOR_HEIGHT Extent of the tensor along the Y (height) dimension
 * @param[in]  TENSOR_DEPTH  Extent of the tensor along the Z (depth) dimension
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  xi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
 * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
 * @param[in]  zi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Z coordinate
 * @param[out] dst           Output tile
 */
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            /* TENSOR_WIDTH and TENSOR_HEIGHT are parenthesized individually so expression arguments keep their meaning */ \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * ((TENSOR_WIDTH) * (TENSOR_HEIGHT)); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

651

/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
 *
 * @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store
 * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
 * @note Each src row is converted to VEC_DATA_TYPE(DATA_TYPE, WIDTH0) before being stored
 *
 * @param[in] DATA_TYPE        Data type
 * @param[in] HEIGHT           Number of src rows
 * @param[in] WIDTH0           Store width to use if WIDTH1_CONDITION = false
 * @param[in] WIDTH1           Store width to use if WIDTH1_CONDITION = true
 * @param[in] TENSOR_TYPE      Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                             cl_image is not supported.
 * @param[in] TENSOR           Tensor basename
 * @param[in] X                Starting X position
 * @param[in] STRIDE_Y         Stride Y (in bytes)
 * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store
 * @param[in] src              Input tile
 * @param[in] indirect_y       Indirect Y index tile
 */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[(HEIGHT) - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[(HEIGHT) - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
    })

688

689

/** Offset correction for the QASYMM8 computation
 *
 * @note For every dst element (_m0, _n0) the macro adds:
 *       sum over k of lhs[_m0].s[k] * WEI_OFFSET, plus sum over k of rhs[_n0].s[k] * SRC_OFFSET
 *
 * @param[in]      ACC_DATA_TYPE Accumulator data type
 * @param[in]      M0            Number of src/dst rows
 * @param[in]      N0            Number of src/dst columns
 * @param[in]      K0            Number of src columns
 * @param[in]      SRC_OFFSET    Source quantization offset
 * @param[in]      WEI_OFFSET    Weights quantization offset
 * @param[in]      lhs           LHS tile
 * @param[in]      rhs           RHS tile
 * @param[in, out] dst           DST tile (the correction is accumulated into its current content)
 */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            /* The LHS contribution does not depend on _n0, so it is reduced once per row */ \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)(WEI_OFFSET)); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)(SRC_OFFSET)); \
                }) \
            }) \
        }) \
    })

720

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

721

/** 8-bit quantization with fixed-point scale
 *
 * @note T_QUANTIZE8 expands its arguments and T_QUANTIZE8_STR then dispatches to T_QUANTIZE8_PER_TENSOR or
 *       T_QUANTIZE8_PER_CHANNEL by pasting QUANTIZATION_TYPE onto the "T_QUANTIZE8_" prefix.
 *
 * @param[in]  SRC_DATA_TYPE     SRC data type
 * @param[in]  DST_DATA_TYPE     DST data type
 * @param[in]  QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)
 * @param[in]  M0                Number of src/dst rows
 * @param[in]  N0                Number of src/dst columns
 * @param[in]  DST_OFFSET        Quantization offset used for both the per-tensor and per-channel quantization
 * @param[in]  DST_SHIFT         Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER    Quantization multiplier for the per-tensor quantization
 * @param[in]  src               Input tile
 * @param[in]  dst_multipliers   Output multipliers tile for the per-channel quantization
 * @param[in]  dst_shifts        Output shift tile for the per-channel quantization
 * @param[out] dst               Output tile
 */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

738

739

/** 8-bit per-tensor quantization with fixed-point scale
 *
 * @note Per element the macro:
 *       -# multiplies the source by (1 << -DST_SHIFT) when DST_SHIFT is negative,
 *       -# multiplies by DST_MULTIPLIER in 64-bit, adds a rounding nudge (+2^30 for non-negative products, 1 - 2^30 otherwise) and divides by 2^31,
 *          using INT_MAX instead when both operands are INT_MIN,
 *       -# applies a rounding right shift by DST_SHIFT when DST_SHIFT is non-negative,
 *       -# adds DST_OFFSET and converts to DST_DATA_TYPE with saturation.
 *
 * @param[in]  SRC_DATA_TYPE   SRC data type
 * @param[in]  DST_DATA_TYPE   DST data type
 * @param[in]  M0              Number of src/dst rows
 * @param[in]  N0              Number of src/dst columns
 * @param[in]  DST_OFFSET      Quantization offset
 * @param[in]  DST_SHIFT       Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER  Quantization multiplier for the per-tensor quantization
 * @param[in]  src             Input tile
 * @param[in]  dst_multipliers (unused)
 * @param[in]  dst_shifts      (unused)
 * @param[out] dst             Output tile
 */
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                /* DST_SHIFT and DST_MULTIPLIER are parenthesized at every use so expression arguments keep their meaning */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-(DST_SHIFT))), ((SRC_DATA_TYPE)(DST_SHIFT) < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == (DST_MULTIPLIER) && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if((DST_SHIFT) >= 0) \
                { \
                    long mask = ((((int)1) << (DST_SHIFT)) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> (DST_SHIFT)) + (int)1 : (_tmp >> (DST_SHIFT)); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/** 8-bit per-channel quantization with fixed-point scale
 *
 * @note The multiplier and the shift of column _n0 are read from dst_multipliers[0].s[_n0] and dst_shifts[0].s[_n0].
 * @note Per element the macro:
 *       -# multiplies the source by (1 << -shift) when the shift is negative,
 *       -# multiplies by the multiplier in 64-bit, adds a rounding nudge (+2^30 for non-negative products, 1 - 2^30 otherwise) and divides by 2^31,
 *          using INT_MAX instead when both operands are INT_MIN,
 *       -# applies a rounding right shift when the shift is non-negative,
 *       -# adds DST_OFFSET and converts to DST_DATA_TYPE with saturation.
 *
 * @param[in]  SRC_DATA_TYPE   SRC data type
 * @param[in]  DST_DATA_TYPE   DST data type
 * @param[in]  M0              Number of src/dst rows
 * @param[in]  N0              Number of src/dst columns
 * @param[in]  DST_OFFSET      Quantization offset
 * @param[in]  DST_SHIFT       (unused)
 * @param[in]  DST_MULTIPLIER  (unused)
 * @param[in]  src             Input tile
 * @param[in]  dst_multipliers Output multipliers tile for the per-channel quantization
 * @param[in]  dst_shifts      Output shift tile for the per-channel quantization
 * @param[out] dst             Output tile
 */
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(_dst_shift >= 0) \
                { \
                    long mask = ((((int)1) << _dst_shift) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> _dst_shift) + (int)1 : (_tmp >> _dst_shift); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/** Quantize a tile to 8-bit with a fixed-point scale (asymmetric quantization)
 *
 * @note Per element the macro:
 *       -# multiplies the source by (1 << -DST_SHIFT) when DST_SHIFT is negative,
 *       -# multiplies by DST_MULTIPLIER in 64-bit, adds a rounding nudge (+2^30 for non-negative products, 1 - 2^30 otherwise) and divides by 2^31,
 *          using INT_MAX instead when both operands are INT_MIN,
 *       -# applies a rounding right shift by DST_SHIFT when DST_SHIFT is non-negative,
 *       -# adds DST_OFFSET and converts to DST_DATA_TYPE with saturation.
 *
 * @param[in]  SRC_DATA_TYPE  SRC data type
 * @param[in]  DST_DATA_TYPE  DST data type
 * @param[in]  M0             Number of src/dst rows
 * @param[in]  N0             Number of src/dst columns
 * @param[in]  DST_OFFSET     Quantization offset
 * @param[in]  DST_SHIFT      Quantization shift
 * @param[in]  DST_MULTIPLIER Quantization multiplier
 * @param[in]  src            Input tile
 * @param[out] dst            Output tile
 */
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                /* DST_SHIFT and DST_MULTIPLIER are parenthesized at every use so expression arguments keep their meaning */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-(DST_SHIFT))), ((SRC_DATA_TYPE)(DST_SHIFT) < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == (DST_MULTIPLIER) && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if((DST_SHIFT) >= 0) \
                { \
                    long mask = ((((int)1) << (DST_SHIFT)) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> (DST_SHIFT)) + (int)1 : (_tmp >> (DST_SHIFT)); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

873

874

/** Conditional rowset (memset by row)
 *
 * @note Set the row to VALUE_TO_SET if the corresponding mask == 0
 * @note The selection is applied element by element with select(), using mask[_m0].v for every column of row _m0
 *
 * @param[in]      DATA_TYPE    Data type
 * @param[in]      M0           Number of LHS rows
 * @param[in]      N0           Number of LHS columns
 * @param[in]      VALUE_TO_SET Value to set the row
 * @param[in, out] a            Input/output tile
 * @param[in]      mask         Mask to check for setting the row to VALUE_TO_SET
 */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })

895

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

896

/** Element-wise activation for floating point types
 *
 * @note Performs: activation(LHS) = DST
 * @note Each row is processed as a whole vector through the ACTIVATION macro
 *
 * @param[in]  DATA_TYPE       SRC/DST data type
 * @param[in]  M0              Number of SRC/DST rows
 * @param[in]  N0              Number of SRC/DST columns
 * @param[in]  ACTIVATION_TYPE Activation type
 * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  src             SRC tile
 * @param[out] dst             DST tile
 */
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })

916

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

917

// Quantized activation operators. They all share the same parameter list so that ACT_OP_QUANTIZED can
// select one by pasting the operator name onto "_op_quantized"; parameters an operator does not need are ignored.
// Every macro argument is parenthesized at its point of use so expression arguments keep their meaning.

// RELU Activation: max(ZERO_VALUE, x)
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)(ZERO_VALUE), (x)))
// Bounded RELU Activation: min(A_VAL, max(ZERO_VALUE, x))
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)(ZERO_VALUE), (x))))
// Lower Upper Bounded RELU Activation: min(max(x, B_VAL), A_VAL)
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))
// Hard Swish Activation: x * min(max(x + 3, 0), 6) * 0.166666667
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ((x) * ((min(max((DATA_TYPE)((x) + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
// Identity Activation
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

// ACTIVATION_QUANTIZED expands its arguments first; ACT_OP_QUANTIZED then pastes the operator name
#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

930

931

/** Element-wise activation for quantized types
 *
 * @note Performs: activation(LHS) = DST
 * @note Each row is processed as a whole vector through the ACTIVATION_QUANTIZED macro
 *
 * @param[in]  DATA_TYPE       SRC/DST data type
 * @param[in]  M0              Number of SRC/DST rows
 * @param[in]  N0              Number of SRC/DST columns
 * @param[in]  ACTIVATION_TYPE Activation type
 * @param[in]  ZERO_VALUE      The zero value to consider in the computation
 * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  src             SRC tile
 * @param[out] dst             DST tile
 */
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        }) \
    })

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

953

/** Element-wise addition with a constant value
 *
 * @note Performs: LHS + constant = DST
 *
 * @param[in]  DATA_TYPE    LHS/RHS/DST data type
 * @param[in]  M0           Number of LHS rows
 * @param[in]  N0           Number of LHS columns
 * @param[in]  lhs          LHS tile
 * @param[in]  rhs_constant Constant value
 * @param[out] dst          DST tile
 */
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                /* rhs_constant is parenthesized so that an expression such as (1 << 2) is added as a whole */ \
                dst[_m0].s[_n0] = lhs[_m0].s[_n0] + (rhs_constant); \
            }) \
        }) \
    })

974

975

/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)
 *
 * @note Performs: LHS + RHS[broadcasted] = DST
 * @note Only row 0 of RHS is read; it is added to every row of LHS.
 * @note Each LHS row and the RHS row are converted to a DST_DATA_TYPE vector of N0 elements before the addition.
 *
 * @param[in]  DST_DATA_TYPE DST data type
 * @param[in]  M0            Number of LHS rows
 * @param[in]  N0            Number of LHS columns
 * @param[in]  lhs           LHS tile
 * @param[in]  rhs           RHS tile
 * @param[out] dst           DST tile
 */
#define T_ADD_BROADCAST_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)); \
        }) \
    })

994

995

/** Matrix multiplication

996

*

997

* @note Performs: LHS X RHS + DST = DST

998

*

999

* @param[in] LHS_DATA_TYPE LHS tile data type

1000

* @param[in] RHS_DATA_TYPE RHS tile data type

1001

* @param[in] DST_DATA_TYPE DST tile data type

1002

* @param[in] M0 Number of LHS rows

1003

* @param[in] N0 Number of RHS columns

1004

* @param[in] K0 Number of LHS columns

1005

* @param[in] LHS_LAYOUT LHS layout (T= transposed, NT= not transposed)

1006

* @param[in] RHS_LAYOUT RHS layout (T= transposed, NT= not transposed)

1007

* @param[in] lhs LHS tile

1008

* @param[in] rhs RHS tile

1009

* @param[in, out] dst DST tile

1010

*/

1011

// Dispatches on layout: pastes LHS_LAYOUT and RHS_LAYOUT into the name T_MMUL_<LHS_LAYOUT>_<RHS_LAYOUT>
// and forwards the remaining arguments. The layout arguments are operands of ##, so they are pasted as written.
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

1012

// LHS not transposed, RHS transposed: dispatches on data types by pasting them into the name
// T_MMUL_NT_T_<LHS_DATA_TYPE>_<RHS_DATA_TYPE>_<DST_DATA_TYPE>, forwarding all arguments unchanged.
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

1013

// float LHS, float RHS, float DST: forwards all arguments unchanged to T_MMUL_NT_T_FLOAT.
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

1014

// half LHS, half RHS, float DST: forwards all arguments unchanged to T_MMUL_NT_T_FLOAT.
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Gian Marco Iodice