Blame - src/core/CL/cl_kernels/tile_helpers.h - ml/ComputeLibrary

2021-03-19 11:26:20 +0000

[diff] [blame]

1

/*

Viet-Hoa Do

b3077fb

2023-01-03 17:59:14 +0000

[diff] [blame]

2

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

Ramy Elgammal

2022-12-14 09:20:09 +0000

[diff] [blame^]

24

#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS

25

#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

26

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

// *INDENT-OFF*

// clang-format off

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

30

#define TILE_VECTOR_SIZE1 1

31

#define TILE_VECTOR_SIZE2 2

32

#define TILE_VECTOR_SIZE3 3

33

#define TILE_VECTOR_SIZE4 4

34

#define TILE_VECTOR_SIZE5 8

35

#define TILE_VECTOR_SIZE6 8

36

#define TILE_VECTOR_SIZE7 8

37

#define TILE_VECTOR_SIZE8 8

38

#define TILE_VECTOR_SIZE9 16

39

#define TILE_VECTOR_SIZE10 16

40

#define TILE_VECTOR_SIZE11 16

41

#define TILE_VECTOR_SIZE12 16

42

#define TILE_VECTOR_SIZE13 16

43

#define TILE_VECTOR_SIZE14 16

44

#define TILE_VECTOR_SIZE15 16

45

#define TILE_VECTOR_SIZE16 16

46

47

#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1

48

#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2

49

#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3

50

#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4

51

#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8

52

#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8

53

#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8

54

#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8

55

#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16

56

#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16

57

#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16

58

#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16

59

#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16

60

#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16

61

#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16

62

#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16

63

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

64

/** Tile object

65

* A tile object is a 2D memory block and can be accessed using the following syntax:

66

* -# a[m0].v = access the the vector at row "m0" (OpenCL vector)

Ramy Elgammal

404462a

2022-11-08 02:14:46 +0000

[diff] [blame]

67

* -# dst[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

68

*

69

* @param[in] DATA_TYPE Data type of the tile

70

* @param[in] H Number of tile rows

71

* @param[in] W Number of tile colums

72

* @param[in] BASENAME Tile's name

73

*/

74

#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)

75

#define TILE_STR(DATA_TYPE, H, W, BASENAME) \

76

union { \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

77

DATA_TYPE s[TILE_VECTOR_SIZE##W]; \

78

TILE_VECTOR_TYPE##W(DATA_TYPE) v; \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

79

} BASENAME[H]

80

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

81

#define TENSOR4D_IMAGE(name) \

82

__read_only image2d_t name##_img, \

83

__global uchar *name##_ptr, \

84

uint name##_stride_x, \

85

uint name##_step_x, \

86

uint name##_stride_y, \

87

uint name##_step_y, \

88

uint name##_stride_z, \

89

uint name##_step_z, \

90

uint name##_stride_w, \

91

uint name##_step_w, \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

92

uint name##_offset_first_element_in_bytes

93

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

94

#define TENSOR4D_BUFFER(name) \

95

__global uchar *name##_ptr, \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

96

uint name##_stride_x, \

97

uint name##_step_x, \

98

uint name##_stride_y, \

99

uint name##_step_y, \

100

uint name##_stride_z, \

101

uint name##_step_z, \

102

uint name##_stride_w, \

103

uint name##_step_w, \

104

uint name##_offset_first_element_in_bytes

105

106

#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)

107

#define TENSOR4D(name, type) TENSOR4D_STR(name, type)

108

Adnan AlSinan

17975a6

2021-11-08 17:46:39 +0000

[diff] [blame]

109

#define TENSOR4D_T_IMAGE(name) \

110

__read_only image2d_t name##_img, \

111

__global uchar *name##_ptr, \

112

uint name##_stride_y, \

113

uint name##_stride_z, \

114

uint name##_stride_w, \

uint name##_c, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

120

121

#define TENSOR4D_T_BUFFER(name) \

122

__global uchar *name##_ptr, \

123

uint name##_stride_y, \

124

uint name##_stride_z, \

125

uint name##_stride_w, \

uint name##_c, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

131

132

#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)

Gian Marco Iodice

2022-12-30 16:07:45 +0000

[diff] [blame]

133

134

/** Legacy tensor 4D arguments

135

*

136

* @param[in] name Tensor name. The tensor name is the prefix of the tensor components

137

* @param[in] type Tensor type (BUFFER or IMAGE)

138

*/

Adnan AlSinan

17975a6

2021-11-08 17:46:39 +0000

[diff] [blame]

139

#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)

140

Gian Marco Iodice

2022-12-30 16:07:45 +0000

[diff] [blame]

141

#define TENSOR4D_RO_T_IMAGE(name) \

142

__read_only image2d_t name##_img, \

143

TENSOR4D_T_BUFFER(name)

144

145

#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

146

147

#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)

148

149

/** Read-Only (RO) tensor 4D.

150

*

151

* @param[in] name Tensor name. The tensor name is the prefix of the tensor components

152

* @param[in] type Tensor type (BUFFER or IMAGE)

153

*/

154

#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)

155

156

#define TENSOR4D_WO_T_IMAGE(name) \

157

__write_only image2d_t name##_img, \

158

TENSOR4D_T_BUFFER(name)

159

160

#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

161

162

#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)

163

164

/** Write-Only (WO) tensor 4D.

165

*

166

* @param[in] name Tensor name. The tensor name is the prefix of the tensor components

167

* @param[in] type Tensor type (BUFFER or IMAGE)

168

*/

169

#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)

170

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame]

171

#define TENSOR3D_T_IMAGE(name) \

172

__read_only image2d_t name##_img, \

173

__global uchar *name##_ptr, \

174

uint name##_stride_y, \

175

uint name##_stride_z, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

180

181

#define TENSOR3D_T_BUFFER(name) \

182

__global uchar *name##_ptr, \

183

uint name##_stride_y, \

184

uint name##_stride_z, \

uint name##_w, \

uint name##_h, \

uint name##_n, \

uint name##_offset_first_element_in_bytes

189

190

#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)

191

#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)

192

Giorgio Arena

ea8d266

2021-05-20 11:36:56 +0100

[diff] [blame]

193

#if !defined(UNROLL_WITH_PRAGMA)

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

194

#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

195

196

#define LOOP_UNROLLING_1(idx, step, macro) (macro)

197

#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)

198

#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)

199

#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)

200

#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)

201

#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)

202

#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)

203

#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)

204

#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)

205

#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)

206

#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)

207

#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)

208

#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)

209

#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)

210

#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)

211

#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)

212

#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)

213

#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)

214

#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)

215

#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)

216

#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)

217

#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)

218

#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)

219

#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)

220

#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)

221

#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)

222

#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)

223

#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)

224

#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)

225

#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)

226

#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)

227

#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)

228

#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)

229

#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)

230

#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)

231

#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)

232

#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)

233

#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)

234

#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)

235

#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)

236

#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)

237

#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)

238

#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)

239

#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)

240

#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)

241

#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)

242

#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)

243

#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)

244

#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)

245

#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)

246

#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)

247

#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)

248

#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)

249

#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)

250

#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)

251

#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)

252

#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)

253

#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)

254

#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)

255

#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)

256

#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)

257

#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)

258

#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)

259

#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)

260

#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)

261

#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)

262

#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)

263

#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)

264

#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)

265

#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)

266

#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)

267

#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)

268

#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)

269

#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)

270

#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)

271

#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)

272

#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)

273

#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)

274

#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)

275

#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)

276

#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)

277

#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)

278

#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)

279

#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)

280

#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)

281

#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)

282

#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)

283

#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)

284

#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)

285

#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)

286

#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)

287

#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)

288

#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)

289

#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)

290

#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)

291

#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)

292

#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)

293

#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)

294

#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)

295

#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)

296

#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)

297

#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)

298

#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)

299

#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)

300

#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)

301

#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)

302

#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)

303

#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)

304

#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)

305

#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)

306

#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)

307

#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)

308

#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)

309

#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)

310

#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)

311

#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)

312

#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)

313

#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)

314

#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)

315

#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)

316

#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)

317

#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)

318

#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)

319

#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)

320

#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)

321

#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)

322

#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)

323

#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

324

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

325

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \

326

{ \

327

type idx = start; \

328

LOOP_UNROLLING_##num(idx, step, macro); \

329

}

Giorgio Arena

ea8d266

2021-05-20 11:36:56 +0100

[diff] [blame]

330

#else // !defined(UNROLL_WITH_PRAGMA)

331

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \

332

{ \

333

_Pragma("unroll") \

334

for(type idx = start; idx < (num * step); idx += step) \

{ \

(macro); \

} \

}

#endif // !defined(UNROLL_WITH_PRAGMA)

340

#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

341

342

/** Get the get_global_id with partial N0. This function is useful when the dimension is not multiple of N0 and we need to use a partial N0

343

* to avoid out-of-bound read/write

344

*

345

* @note PARTIAL_N0 is used for get_global_id(n) = 0.

346

*

347

* @param[in] IDX get_global_id index (0,1 and 2 only)

348

* @param[in] N0 Number of elements read/written on the IDX direction

349

* @param[in] PARTIAL_N0 Number of elements read/written on the IDX direction for get_global_id(IDX) = 0. If zero,

350

* the Number of elements read/written on the IDX direction for get_global_id(IDX) = 0 is N0

351

*/

352

#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))

353

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

354

/** Dot product integet 8bit function

355

*

356

* @note Performs: c += dot(a, b)

357

*

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

358

* @param[in] A_DATA_TYPE A (lhs) data type

359

* @param[in] B_DATA_TYPE B (rhs) data type

360

* @param[in] C_DATA_TYPE C (accumulator) data type

361

* @param[in] K0 Number of accumulations

362

* @param[in] a OpenCL vector a

363

* @param[in] b OpenCL vector b

364

* @param[in] c Scalar variable c

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

365

*/

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

366

#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)

367

#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)

368

#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

369

({ \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

370

c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

371

})

Viet-Hoa Do

82169b3

2022-05-26 16:50:21 +0100

[diff] [blame]

372

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)

373

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));

374

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));

375

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));

376

#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

377

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));

378

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));

379

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

380

#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Michalis Spyrou

c38ca38

2021-07-14 13:30:28 +0100

[diff] [blame]

381

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

382

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));

383

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

384

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

385

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

386

({ \

387

c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \

388

c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \

389

})

390

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

391

({ \

392

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \

393

c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \

394

})

395

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

396

({ \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

397

val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \

398

val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \

399

val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \

400

val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

401

})

402

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

403

#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

404

({ \

405

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

406

DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

407

})

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

408

#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

409

({ \

410

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

411

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

412

})

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

413

#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

414

({ \

415

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

416

DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \

417

})

418

#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

419

({ \

420

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \

421

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \

422

})

423

#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

424

({ \

425

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

426

DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \

427

})

428

#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

429

({ \

430

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

431

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \

432

})

433

#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

434

({ \

435

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

436

DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \

437

})

438

#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

439

({ \

440

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

441

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \

442

})

443

#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

444

({ \

445

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

446

DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \

447

})

448

#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

449

({ \

450

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

451

DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \

452

})

453

#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

454

({ \

455

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \

456

DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \

457

})

458

#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

459

({ \

460

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \

461

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \

462

})

463

464

/** Dot product integet 8bit function

465

*

466

* @note Performs: c += dot(a, b)

467

*

468

* @param[in] A_DATA_TYPE A (lhs) data type

469

* @param[in] B_DATA_TYPE B (rhs) data type

470

* @param[in] C_DATA_TYPE C (accumulator) data type

471

* @param[in] K0 Number of accumulations

472

* @param[in] a OpenCL vector a

473

* @param[in] c Scalar variable c

474

*/

475

#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)

476

#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

477

478

/** Load a vector from global memory (tensor)

479

*

480

* @param[in] DATA_TYPE Data type

481

* @param[in] WIDTH Number of dst columns

482

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

483

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

484

* @param[in] TENSOR Tensor basename

485

* @param[in] X Starting X position

486

* @param[in] Y Starting Y position

487

* @param[in] STRIDE_Y Stride Y (in bytes)

488

*/

489

#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)

490

#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)

491

#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \

492

VLOAD(WIDTH) \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

493

(0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

494

#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

495

Gian Marco Iodice

2022-12-30 16:07:45 +0000

[diff] [blame]

496

/** Store a vector in global memory (tensor)

497

*

498

* @param[in] DATA_TYPE Data type

499

* @param[in] WIDTH Number of dst columns

500

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

501

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

502

* @param[in] TENSOR Tensor basename

503

* @param[in] X Starting X position

504

* @param[in] Y Starting Y position

505

* @param[in] STRIDE_Y Stride Y (in bytes)

506

* @param[in] VALUES Values to store in memory

507

*/

508

#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)

509

#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)

510

#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \

511

VSTORE(WIDTH) \

512

(VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))

513

#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)

514

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

515

/** Load a tile from global memory (tensor)

516

*

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

517

* @param[in] DATA_TYPE Data type

518

* @param[in] HEIGHT Number of dst rows

519

* @param[in] WIDTH Number of dst columns

520

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

521

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

522

* @param[in] TENSOR Tensor basename

523

* @param[in] X Starting X position

524

* @param[in] Y Starting Y position

525

* @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).

526

* In common cases should be 1 but it becomes useful when we want to load rows which are multiple of STRIDE_Y. (e.g. loading the weights of convolution layer).

527

* In this case the address calculation is performed as: (Y + _i * Y_MULTIPLIER) * STRIDE_Y

528

* @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.

529

* @param[out] dst Output tile

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

530

*/

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

531

#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

532

({ \

533

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

534

{ \

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

535

dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

536

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

537

})

538

539

/** Load a tile from global memory (tensor) using an indirect Y index tile

540

*

541

* @param[in] DATA_TYPE Data type

542

* @param[in] HEIGHT Number of dst rows

543

* @param[in] WIDTH Number of dst columns

544

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

545

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

546

* @param[in] TENSOR Tensor basename

547

* @param[in] X Starting X position

548

* @param[in] STRIDE_Y Stride Y (in bytes)

549

* @param[in] indirect_y Indirect Y index tile

550

* @param[out] dst Output tile

551

*/

552

#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

553

({ \

554

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

555

{ \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

556

dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

557

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

558

})

559

Adnan AlSinan

3e155a5

2021-12-10 12:34:02 +0000

[diff] [blame]

560

/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load

561

*

562

* @note If WIDTH1_CONDITION is true, the load will use the WIDTH1 length for the store

563

* @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones

564

*

565

* @param[in] DATA_TYPE Data type

566

* @param[in] HEIGHT Number of dst rows

567

* @param[in] WIDTH0 Store width to use if WIDTH1_CONDITION = false

568

* @param[in] WIDTH1 Store width to use if WIDTH1_CONDITION = true

569

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

570

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

571

* @param[in] TENSOR Tensor basename

572

* @param[in] X Starting X position

573

* @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.

574

* @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store

575

* @param[out] dst Output tile

576

* @param[out] indirect_y Indirect Y index tile

577

*/

578

#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \

579

({ \

580

if(WIDTH1_CONDITION) \

581

{ \

582

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

583

{ \

584

VLOAD_PARTIAL(WIDTH0, WIDTH1) \

585

(dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \

}) \

} \

else \

{ \

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

591

{ \

592

dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \

593

}) \

594

} \

595

})

Gian Marco Iodice

534b889

2021-04-01 16:17:16 +0100

[diff] [blame]

596

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout

597

*

598

* @param[in] DATA_TYPE Data type

599

* @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension

600

* @param[in] TILE_WIDTH Number of elements to load from X (width) dimension

601

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

602

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

603

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

604

* @param[in] TENSOR Tensor basename

605

* @param[in] B Starting batch index

606

* @param[in] Y Starting Y index

607

* @param[in] X Starting X index

608

* @param[in] C Starting C index

609

* @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension

610

* @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension

611

* @param[in] STRIDE_Y Stride Y (in bytes)

612

* @param[out] dst Output tile

613

*/

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

614

#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

615

({ \

616

LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \

617

{ \

618

LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \

619

{ \

620

int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \

621

_src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \

Gian Marco Iodice

534b889

2021-04-01 16:17:16 +0100

[diff] [blame]

622

int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

623

if(_src_valid_y != 0) \

624

{ \

625

dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

626

} \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

627

}) \

628

}) \

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

629

})

630

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

631

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments

632

*

633

* @param[in] DATA_TYPE Data type

634

* @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension

635

* @param[in] TILE_WIDTH Number of elements to load from X (width) dimension

636

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

637

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

638

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

639

* @param[in] TENSOR Tensor basename

640

* @param[in] B Starting batch index

641

* @param[in] Y Starting Y index

642

* @param[in] X Starting X index

643

* @param[in] C Starting C index

644

* @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension

645

* @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension

646

* @param[in] DILATION_X Dilation for the X increment

647

* @param[in] DILATION_Y Dilation for the Y increment

648

* @param[in] BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bound reads

649

* @param[out] dst Output tile

650

*/

651

#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \

652

({ \

653

LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \

654

{ \

655

LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \

656

{ \

657

int _src_y = (X) + _xk * (DILATION_X); \

658

int _src_z = ((Y) + _yk * (DILATION_Y)); \

659

int _src_w = (B); \

660

bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \

661

if(!(BOUNDARY_CHECK)) \

662

{ \

663

dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \

664

(0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \

} \

else \

{ \

if(_src_valid_y) \

{ \

dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \

671

(0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \

} \

} \

}) \

}) \

})

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

678

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates

679

*

680

* @param[in] DATA_TYPE Data type

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

681

* @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

682

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

683

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

684

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

685

* @param[in] TENSOR Tensor basename

686

* @param[in] B Starting batch index

687

* @param[in] Y Starting Y index

688

* @param[in] X Starting X index

689

* @param[in] C Starting C index

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

690

* @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

691

* @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

692

* @param[in] STRIDE_Y Stride Y (in bytes)

693

* @param[out] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate

694

* @param[out] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate

695

* @param[out] dst Output tile

696

*/

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

697

#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \

698

({ \

699

LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \

700

{ \

701

int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \

702

_src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

703

int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

704

if(_src_valid_y != 0) \

705

{ \

706

dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

707

} \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

708

}) \

Gian Marco Iodice

534b889

2021-04-01 16:17:16 +0100

[diff] [blame]

709

})

710

Gian Marco Iodice

76335eb

2022-11-17 11:03:39 +0000

[diff] [blame]

711

/** Load a tile from global memory (tensor) using an indirect buffer for the Y coordinates

712

*

713

* @param[in] DATA_TYPE Data type

714

* @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension

715

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

Gian Marco Iodice

2022-12-30 16:07:45 +0000

[diff] [blame]

716

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

717

* When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped

Gian Marco Iodice

76335eb

2022-11-17 11:03:39 +0000

[diff] [blame]

718

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

719

* @param[in] TENSOR Tensor basename

720

* @param[in] C Starting C index

721

* @param[in] STRIDE_Y Stride Y (in bytes)

722

* @param[out] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate

723

* 16 is the maximum indirect buffer size.

724

* @param[out] dst Output tile

725

*/

Gian Marco Iodice

2022-12-30 16:07:45 +0000

[diff] [blame]

726

#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)

727

#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)

728

#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \

729

({ \

730

LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \

731

{ \

732

if(yi[0].s[_i] >= 0) \

733

{ \

734

dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \

} \

}) \

})

#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \

740

({ \

741

LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \

742

{ \

743

dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \

744

}) \

Gian Marco Iodice

3394f3e

2022-09-16 14:14:21 +0100

[diff] [blame]

745

})

746

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

747

/** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates

748

*

749

* @param[in] DATA_TYPE Data type

750

* @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension

751

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

752

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

753

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

754

* @param[in] TENSOR Tensor basename

755

* @param[in] B Starting batch index

756

* @param[in] Z Starting Z index

757

* @param[in] Y Starting Y index

758

* @param[in] X Starting X index

759

* @param[in] C Starting C index

760

* @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension

761

* @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension

762

* @param[in] TENSOR_DEPTH Number of elements to load from Z (depth) dimension

763

* @param[in] STRIDE_Y Stride Y (in bytes)

764

* @param[out] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate

765

* @param[out] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate

766

* @param[out] zi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Z coordinate

767

* @param[out] dst Output tile

768

*/

769

#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \

770

({ \

771

LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \

772

{ \

773

int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \

774

_src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \

775

int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \

776

&& ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \

777

if(_src_valid_y != 0) \

778

{ \

779

dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \

} \

}) \

})

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

784

/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store

785

*

786

* @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store

787

* @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones

788

*

789

* @param[in] DATA_TYPE Data type

790

* @param[in] HEIGHT Number of src rows

791

* @param[in] WIDTH0 Store width to use if WIDTH1_CONDITION = false

792

* @param[in] WIDTH1 Store width to use if WIDTH1_CONDITION = true

793

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

794

* cl_image is not supported.

795

* @param[in] TENSOR Tensor basename

796

* @param[in] X Starting X position

797

* @param[in] STRIDE_Y Stride Y (in bytes)

798

* @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store

799

* @param[in] src Input tile

800

* @param[in] indirect_y Indirect Y index tile

801

*/

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

802

#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \

803

({ \

804

if(WIDTH1_CONDITION) \

805

{ \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

806

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

807

{ \

808

VSTORE_PARTIAL(WIDTH0, WIDTH1) \

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

809

(CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

810

}) \

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

811

} \

812

else \

813

{ \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

814

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

815

{ \

816

VSTORE(WIDTH0) \

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

817

(CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

818

}) \

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

819

} \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

820

})

821

822

/** Offset correction for the QASYMM8 computation

823

*

824

* @param[in] ACC_DATA_TYPE Accumulator data type

825

* @param[in] M0 Number of src/dst rows

826

* @param[in] N0 Number of src/dst columns

827

* @param[in] K0 Number of src columns

828

* @param[in] SRC_OFFSET Source quantization offset

829

* @param[in] WEI_OFFSET Weights quantization shift

830

* @param[in] lhs LHS tile

831

* @param[in] rhs RHS tile

832

* @param[out] dst DST tile

833

*/

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

834

#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \

835

({ \

836

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

837

{ \

838

ACC_DATA_TYPE _tm = 0; \

839

LOOP_UNROLLING(int, _k0, 0, 1, K0, \

840

{ \

841

_tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \

842

}) \

843

LOOP_UNROLLING(int, _n0, 0, 1, N0, \

844

{ \

845

dst[_m0].s[_n0] += _tm; \

846

LOOP_UNROLLING(int, _k0, 0, 1, K0, \

847

{ \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

848

dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

849

}) \

850

}) \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

851

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

852

})

853

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

854

/** 8-bit quantization with fixed-point scale

855

*

856

* @param[in] SRC_DATA_TYPE SRC data type

857

* @param[in] DST_DATA_TYPE DST data type

858

* @param[in] QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)

859

* @param[in] M0 Number of src/dst rows

860

* @param[in] N0 Number of src/dst columns

861

* @param[in] DST_OFFSET Quantization offset used for both the per-tensor and per-channel quantization

862

* @param[in] DST_SHIFT Quantization shift for the per-tensor quantization

863

* @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization

864

* @param[in] src Input tile

865

* @param[in] dst_multipliers Output multipliers tile for the per-channel quantization

866

* @param[in] dst_shifts Output shift tile for the per-channel quantization

867

* @param[out] dst Output tile

868

*/

869

#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

870

#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

871

872

/** 8-bit per-tensor quantization with fixed-point scale

873

*

874

* @param[in] SRC_DATA_TYPE SRC data type

875

* @param[in] DST_DATA_TYPE DST data type

876

* @param[in] M0 Number of src/dst rows

877

* @param[in] N0 Number of src/dst columns

878

* @param[in] DST_OFFSET Quantization offset

879

* @param[in] DST_SHIFT Quantization shift for the per-tensor quantization

880

* @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization

881

* @param[in] src Input tile

882

* @param[in] dst_multipliers (unused)

883

* @param[in] dst_shifts (unused)

884

* @param[out] dst Output tile

885

*/

886

#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \

887

({ \

888

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

889

{ \

890

LOOP_UNROLLING(int, _n0, 0, 1, N0, \

891

{ \

892

SRC_DATA_TYPE _tmp = 0; \

893

SRC_DATA_TYPE _src = src[_m0].s[_n0]; \

894

_src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \

895

SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \

896

long a_64 = (long)(_src); \

897

long b_64 = (long)(DST_MULTIPLIER); \

898

long ab_64 = a_64 * b_64; \

899

long mask1 = 1 << 30; \

900

long mask2 = 1 - (1 << 30); \

901

long is_positive_or_zero = ab_64 >= 0; \

902

long nudge = select(mask2, mask1, is_positive_or_zero); \

903

SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \

904

_tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \

905

if(DST_SHIFT >= 0) \

906

{ \

Freddie Liardet

767dbf9

2021-07-21 16:20:41 +0100

[diff] [blame]

907

long mask = ((((int)1) << DST_SHIFT) - (long)1); \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

908

long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \

909

_tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \

910

} \

911

_tmp += DST_OFFSET; \

912

dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \

}) \

}) \

})

/** 8-bit per-channel quantization with fixed-point scale

918

*

919

* @param[in] SRC_DATA_TYPE SRC data type

920

* @param[in] DST_DATA_TYPE DST data type

921

* @param[in] M0 Number of src/dst rows

922

* @param[in] N0 Number of src/dst columns

923

* @param[in] DST_OFFSET Quantization offset

924

* @param[in] DST_SHIFT (unused)

925

* @param[in] DST_MULTIPLIER (unused)

926

* @param[in] src Input tile

927

* @param[in] dst_multipliers Output multipliers tile for the per-channel quantization

928

* @param[in] dst_shifts Output shift tile for the per-channel quantization

929

* @param[out] dst Output tile

930

*/

931

#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \

932

({ \

933

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

934

{ \

935

LOOP_UNROLLING(int, _n0, 0, 1, N0, \

936

{ \

937

SRC_DATA_TYPE _tmp = 0; \

Pablo Marquez Tello

96fb194

2022-11-01 09:32:20 +0000

[diff] [blame]

938

SRC_DATA_TYPE _tmp2 = 0; \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

939

SRC_DATA_TYPE _src = src[_m0].s[_n0]; \

940

SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \

941

SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \

942

_src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \

943

SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \

944

long a_64 = (long)(_src); \

945

long b_64 = (long)(_dst_multiplier); \

946

long ab_64 = a_64 * b_64; \

947

long mask1 = 1 << 30; \

948

long mask2 = 1 - (1 << 30); \

949

long is_positive_or_zero = ab_64 >= 0; \

950

long nudge = select(mask2, mask1, is_positive_or_zero); \

951

SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \

952

_tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \

Pablo Marquez Tello

96fb194

2022-11-01 09:32:20 +0000

[diff] [blame]

953

long mask = ((((int)1) << _dst_shift) - (int)1); \

954

long threshold = (mask >> 1) + any(_tmp); \

955

_tmp2 = _tmp >> _dst_shift; \

956

_tmp2 += select(0, 1, (_tmp & mask) > threshold); \

957

_tmp = select(_tmp, _tmp2, _dst_shift >= 0); \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

958

_tmp += DST_OFFSET; \

959

dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \

}) \

}) \

})

/** Quantized the 8-bit tile with fixed-point scale for asymmetric

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

965

*

966

* @param[in] SRC_DATA_TYPE SRC data type

967

* @param[in] DST_DATA_TYPE DST data type

968

* @param[in] M0 Number of src/dst rows

969

* @param[in] N0 Number of src/dst columns

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

970

* @param[in] DST_OFFSET Quantization offset used for both the per-tensor and per-channel quantization

971

* @param[in] DST_SHIFT Quantization shift for the per-tensor quantization

972

* @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

973

* @param[in] src Input tile

974

* @param[out] dst Output tile

975

*/

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

976

#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \

977

({ \

978

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

979

{ \

980

LOOP_UNROLLING(int, _n0, 0, 1, N0, \

981

{ \

982

SRC_DATA_TYPE _tmp = 0; \

983

SRC_DATA_TYPE _src = src[_m0].s[_n0]; \

984

_src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \

985

SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \

986

long a_64 = (long)(_src); \

987

long b_64 = (long)(DST_MULTIPLIER); \

988

long ab_64 = a_64 * b_64; \

989

long mask1 = 1 << 30; \

990

long mask2 = 1 - (1 << 30); \

991

long is_positive_or_zero = ab_64 >= 0; \

992

long nudge = select(mask2, mask1, is_positive_or_zero); \

993

SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \

994

_tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \

995

if(DST_SHIFT >= 0) \

996

{ \

997

long mask = ((((int)1) << DST_SHIFT) - (int)1); \

998

long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \

999

_tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \

1000

} \

1001

_tmp += DST_OFFSET; \

1002

dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \

1003

}) \

1004

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1005

})

1006

1007

/** Conditional rowset (memset by row)

1008

*

1009

* @note Set the row to VALUE_TO_SET if the corresponding mask == 0

1010

*

1011

* @param[in] DATA_TYPE Data type

1012

* @param[in] M0 Number of LHS rows

1013

* @param[in] N0 Number of LHS columns

1014

* @param[in] VALUE_TO_SET Value to set the row

1015

* @param[in, out] a Input/output tile

1016

* @param[out] mask Mask to check for setting the row to VALUE_TO_SET

1017

*/

1018

#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1019

({ \

1020

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1021

{ \

1022

LOOP_UNROLLING(int, _n0, 0, 1, N0, \

1023

{ \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1024

a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1025

}) \

1026

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1027

})

1028

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

1029

/** Element-wise activation for floating point types

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

1030

*

1031

* @note Performs: activation(LHS) = DST

1032

*

1033

* @param[in] DATA_TYPE SRC/DST data type

1034

* @param[in] M0 Number of SRC/DST rows

1035

* @param[in] N0 Number of SRC/DST columns

1036

* @param[in] ACTIVATION_TYPE Activation type

1037

* @param[in] A_VAL A value used for the activation (e.g. tanh_op, brelu,..)

1038

* @param[in] B_VAL B value used for the activation (e.g. tanh_op, brelu,..)

1039

* @param[out] src SRC tile

1040

* @param[out] dst DST tile

1041

*/

1042

#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1043

({ \

1044

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1045

{ \

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

1046

dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1047

}) \

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

1048

})

1049

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

1050

// RELU Activation

1051

#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))

1052

// Bounded RELU Activation

1053

#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))

1054

// Lower Upper Bounded RELU Activation

1055

#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

1056

// Hard Swish Activation

1057

#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

1058

// Identity Activation

1059

#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

1060

1061

#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

1062

#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

1063

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1064

#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))

Ramy Elgammal

2022-12-14 09:20:09 +0000

[diff] [blame^]

1065

#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1066

#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))

Jakub Sujak

7359a87

2023-01-05 14:24:13 +0000

[diff] [blame]

1067

#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1068

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

1069

/** Element-wise activation for quantized types

1070

*

1071

* @note Performs: activation(LHS) = DST

1072

*

1073

* @param[in] DATA_TYPE SRC/DST data type

1074

* @param[in] M0 Number of SRC/DST rows

1075

* @param[in] N0 Number of SRC/DST columns

1076

* @param[in] ACTIVATION_TYPE Activation type

1077

* @param[in] ZERO_VALUE The zero value to consider in the computation

1078

* @param[in] A_VAL A value used for the activation (e.g. tanh_op, brelu,..)

1079

* @param[in] B_VAL B value used for the activation (e.g. tanh_op, brelu,..)

1080

* @param[out] src SRC tile

1081

* @param[out] dst DST tile

1082

*/

1083

#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \

1084

({ \

1085

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1086

{ \

1087

dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \

}) \

})

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1091

/** Element-wise addition between two tiles

1092

*

1093

* @note Performs: LHS + RHS = DST

1094

*

1095

* @param[in] DATA_TYPE LHS/RHS/DST data type

1096

* @param[in] M0 Number of LHS rows

1097

* @param[in] N0 Number of LHS columns

1098

* @param[in] lhs LHS tile

1099

* @param[in] rhs Constant RHS tile

1100

* @param[out] dst DST tile

1101

*/

1102

#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \

1103

({ \

1104

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1105

{ \

1106

dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \

}) \

})

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1110

/** Element-wise addition with a constant value

1111

*

1112

* @note Performs: LHS + constant = DST

1113

*

1114

* @param[in] DATA_TYPE LHS/RHS/DST data type

1115

* @param[in] M0 Number of LHS rows

1116

* @param[in] N0 Number of LHS columns

1117

* @param[in] lhs LHS tile

1118

* @param[in] rhs_constant Constant value

1119

* @param[out] dst DST tile

1120

*/

1121

#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1122

({ \

1123

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1124

{ \

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1125

dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1126

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1127

})

1128

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1129

#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

Viet-Hoa Do

b3077fb

2023-01-03 17:59:14 +0000

[diff] [blame]

1130

#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1131

#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1132

Ramy Elgammal

2022-12-14 09:20:09 +0000

[diff] [blame^]

1133

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1134

#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1135

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1136

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1137

Jakub Sujak

7359a87

2023-01-05 14:24:13 +0000

[diff] [blame]

1138

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1139

#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

1140

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1141

/** Element-wise scale with a constant value

1142

*

1143

* @note Performs: LHS * constant = DST

1144

*

1145

* @param[in] DATA_TYPE LHS/RHS/DST data type

1146

* @param[in] M0 Number of LHS rows

1147

* @param[in] N0 Number of LHS columns

1148

* @param[in] lhs LHS tile

1149

* @param[in] rhs_constant Constant value

1150

* @param[out] dst DST tile

1151

*/

1152

#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \

1153

({ \

1154

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1155

{ \

1156

dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \

1157

}) \

1158

})

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1159

1160

/** Element-wise operation with RHS broadcasted (RHS has the X dimension only)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1161

*

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1162

* @note Performs: LHS OP RHS[broadcasted] = DST

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1163

* @note Both tiles must have same data type

1164

*

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1165

* @param[in] T_ELWISE_OP Elementwise operator to perform

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

1166

* @param[in] DST_DATA_TYPE DST data type

1167

* @param[in] M0 Number of LHS rows

1168

* @param[in] N0 Number of LHS columns

1169

* @param[in] lhs LHS tile

1170

* @param[in] rhs RHS tile

1171

* @param[out] dst DST tile

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1172

*/

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1173

#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1174

({ \

1175

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1176

{ \

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1177

dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1178

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1179

})

1180

Viet-Hoa Do

b3077fb

2023-01-03 17:59:14 +0000

[diff] [blame]

1181

/** Element-wise operation with LHS broadcasted (LHS has the X dimension only)

1182

*

1183

* @note Performs: LHS[broadcasted] OP RHS = DST

1184

* @note Both tiles must have same data type

1185

*

1186

* @param[in] T_ELWISE_OP Elementwise operator to perform

1187

* @param[in] DST_DATA_TYPE DST data type

1188

* @param[in] M0 Number of RHS rows

1189

* @param[in] N0 Number of RHS columns

1190

* @param[in] lhs LHS tile

1191

* @param[in] rhs RHS tile

1192

* @param[out] dst DST tile

1193

*/

1194

#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \

1195

({ \

1196

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1197

{ \

1198

dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \

}) \

})

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1202

#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

Ramy Elgammal

2022-12-14 09:20:09 +0000

[diff] [blame^]

1203

#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

Gunes Bayir

2021-12-10 16:17:56 +0000

[diff] [blame]

1204

#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

Jakub Sujak

7359a87

2023-01-05 14:24:13 +0000

[diff] [blame]

1205

#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1206

1207

/** Element-wise operation between two tiles (LHS and RHS)

Michalis Spyrou

2022-05-06 17:06:21 +0100

[diff] [blame]

1208

*

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1209

* @note Performs: LHS OP RHS = DST

Michalis Spyrou

2022-05-06 17:06:21 +0100

[diff] [blame]

1210

* @note Both tiles must have same data type

1211

*

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1212

* @param[in] T_ELWISE_OP Elementwise operator to perform

Michalis Spyrou

2022-05-06 17:06:21 +0100

[diff] [blame]

1213

* @param[in] DST_DATA_TYPE DST data type

1214

* @param[in] M0 Number of LHS rows

1215

* @param[in] N0 Number of LHS columns

1216

* @param[in] lhs LHS tile

1217

* @param[in] rhs RHS tile

1218

* @param[out] dst DST tile

1219

*/

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1220

#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \

Michalis Spyrou

2022-05-06 17:06:21 +0100

[diff] [blame]

1221

({ \

1222

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1223

{ \

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

1224

dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \

}) \

})

/** Floor operation on a tile

1229

*

1230

* @note Performs: floor(SRC) = DST

1231

* @note Both tiles must have same data type

1232

*

1233

* @param[in] DST_DATA_TYPE DST data type

1234

* @param[in] M0 Number of SRC rows

1235

* @param[in] N0 Number of SRC columns

1236

* @param[in] src LHS tile

1237

* @param[out] dst DST tile

1238

*/

1239

#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \

1240

({ \

1241

LOOP_UNROLLING(int, _m0, 0, 1, M0, \

1242

{ \

1243

dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \

Michalis Spyrou

2022-05-06 17:06:21 +0100

[diff] [blame]

}) \

})

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

1247

/** Matrix multiplication

1248

*

1249

* @note Performs: LHS X RHS + DST = DST

1250

*

1251

* @param[in] LHS_DATA_TYPE LHS tile data type

1252

* @param[in] RHS_DATA_TYPE RHS tile data type

1253

* @param[in] DST_DATA_TYPE RHS tile data type

1254

* @param[in] M0 Number of LHS rows

1255

* @param[in] N0 Number of RHS columns

1256

* @param[in] K0 Number of LHS columns

1257

* @param[in] LHS_LAYOUT LHS layout (T= transposed, NT= not transposed)

1258

* @param[in] RHS_LAYOUT RHS layout (T= transposed, NT= not transposed)

1259

* @param[in] lhs LHS tile

1260

* @param[in] rhs RHS tile

1261

* @param[in, out] dst DST tile

1262

*/

1263

#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

1264

#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

1265

#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Giorgio Arena

2021-10-13 11:13:04 +0100

[diff] [blame]

1266

#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Gian Marco Iodice