Blame - src/core/CL/cl_kernels/tile_helpers.h - ml/ComputeLibrary

2021-03-19 11:26:20 +0000

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

// *INDENT-OFF*

// clang-format off

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

28

#define TILE_VECTOR_SIZE1 1

29

#define TILE_VECTOR_SIZE2 2

30

#define TILE_VECTOR_SIZE3 3

31

#define TILE_VECTOR_SIZE4 4

32

#define TILE_VECTOR_SIZE5 8

33

#define TILE_VECTOR_SIZE6 8

34

#define TILE_VECTOR_SIZE7 8

35

#define TILE_VECTOR_SIZE8 8

36

#define TILE_VECTOR_SIZE9 16

37

#define TILE_VECTOR_SIZE10 16

38

#define TILE_VECTOR_SIZE11 16

39

#define TILE_VECTOR_SIZE12 16

40

#define TILE_VECTOR_SIZE13 16

41

#define TILE_VECTOR_SIZE14 16

42

#define TILE_VECTOR_SIZE15 16

43

#define TILE_VECTOR_SIZE16 16

44

45

#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1

46

#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2

47

#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3

48

#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4

49

#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8

50

#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8

51

#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8

52

#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8

53

#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16

54

#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16

55

#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16

56

#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16

57

#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16

58

#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16

59

#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16

60

#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16

61

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

62

/** Tile object

63

* A tile object is a 2D memory block and can be accessed using the following syntax:

64

* -# a[m0].v = access the vector at row "m0" (OpenCL vector)

65

* -# a[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access)

66

*

67

* @param[in] DATA_TYPE Data type of the tile

68

* @param[in] H Number of tile rows

69

* @param[in] W Number of tile columns

70

* @param[in] BASENAME Tile's name

71

*/

72

#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)

73

#define TILE_STR(DATA_TYPE, H, W, BASENAME) \

74

union { \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

75

DATA_TYPE s[TILE_VECTOR_SIZE##W]; \

76

TILE_VECTOR_TYPE##W(DATA_TYPE) v; \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

77

} BASENAME[H]

78

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

79

#define TENSOR4D_IMAGE(name) \

80

__read_only image2d_t name##_img, \

81

__global uchar *name##_ptr, \

82

uint name##_stride_x, \

83

uint name##_step_x, \

84

uint name##_stride_y, \

85

uint name##_step_y, \

86

uint name##_stride_z, \

87

uint name##_step_z, \

88

uint name##_stride_w, \

89

uint name##_step_w, \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

90

uint name##_offset_first_element_in_bytes

91

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

92

#define TENSOR4D_BUFFER(name) \

93

__global uchar *name##_ptr, \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

94

uint name##_stride_x, \

95

uint name##_step_x, \

96

uint name##_stride_y, \

97

uint name##_step_y, \

98

uint name##_stride_z, \

99

uint name##_step_z, \

100

uint name##_stride_w, \

101

uint name##_step_w, \

102

uint name##_offset_first_element_in_bytes

103

104

#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)

105

#define TENSOR4D(name, type) TENSOR4D_STR(name, type)

106

Giorgio Arena

ea8d266

2021-05-20 11:36:56 +0100

[diff] [blame]

107

#if !defined(UNROLL_WITH_PRAGMA)

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

108

#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

109

110

#define LOOP_UNROLLING_1(idx, step, macro) (macro)

111

#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)

112

#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)

113

#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)

114

#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)

115

#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)

116

#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)

117

#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)

118

#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)

119

#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)

120

#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)

121

#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)

122

#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)

123

#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)

124

#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)

125

#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)

126

#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)

127

#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)

128

#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)

129

#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)

130

#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)

131

#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)

132

#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)

133

#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)

134

#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)

135

#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)

136

#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)

137

#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)

138

#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)

139

#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)

140

#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)

141

#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)

142

#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)

143

#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)

144

#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)

145

#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)

146

#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)

147

#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)

148

#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)

149

#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)

150

#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)

151

#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)

152

#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)

153

#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)

154

#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)

155

#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)

156

#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)

157

#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)

158

#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)

159

#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)

160

#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)

161

#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)

162

#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)

163

#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)

164

#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)

165

#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)

166

#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)

167

#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)

168

#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)

169

#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)

170

#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)

171

#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)

172

#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)

173

#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)

174

#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)

175

#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)

176

#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)

177

#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)

178

#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)

179

#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)

180

#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)

181

#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)

182

#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)

183

#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)

184

#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)

185

#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)

186

#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)

187

#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)

188

#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)

189

#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)

190

#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)

191

#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)

192

#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)

193

#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)

194

#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)

195

#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)

196

#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)

197

#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)

198

#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)

199

#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)

200

#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)

201

#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)

202

#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)

203

#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)

204

#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)

205

#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)

206

#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)

207

#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)

208

#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)

209

#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)

210

#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)

211

#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)

212

#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)

213

#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)

214

#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)

215

#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)

216

#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)

217

#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)

218

#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)

219

#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)

220

#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)

221

#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)

222

#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)

223

#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)

224

#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)

225

#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)

226

#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)

227

#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)

228

#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)

229

#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)

230

#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)

231

#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)

232

#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)

233

#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)

234

#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)

235

#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)

236

#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)

237

#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

238

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

239

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \

240

{ \

241

type idx = start; \

242

LOOP_UNROLLING_##num(idx, step, macro); \

243

}

Giorgio Arena

ea8d266

2021-05-20 11:36:56 +0100

[diff] [blame]

244

#else // !defined(UNROLL_WITH_PRAGMA)

245

/* Pragma-based loop unrolling: emits a regular for-loop and asks the compiler to unroll it.
 *
 * type  - type of the loop index
 * idx   - name of the loop index (visible inside macro)
 * start - first value taken by idx
 * step  - increment applied to idx after each iteration (the "<" bound assumes it is positive)
 * num   - number of iterations
 * macro - expression/statement-expression evaluated at each iteration
 *
 * The upper bound is computed relative to start, so exactly num iterations are executed
 * for any start value, like the manually expanded LOOP_UNROLLING_<num> variant does.
 */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro)                   \
    {                                                                            \
        _Pragma("unroll")                                                        \
        for(type idx = (start); idx < ((start) + (num) * (step)); idx += (step)) \
        {                                                                        \
            (macro);                                                             \
        }                                                                        \
    }

#endif // !defined(UNROLL_WITH_PRAGMA)

254

#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

255

256

/** Get the get_global_id with partial N0. This function is useful when the dimension is not multiple of N0 and we need to use a partial N0

257

* to avoid out-of-bound read/write

258

*

259

* @note PARTIAL_N0 is used for get_global_id(n) = 0.

260

*

261

* @param[in] IDX get_global_id index (0,1 and 2 only)

262

* @param[in] N0 Number of elements read/written on the IDX direction

263

* @param[in] PARTIAL_N0 Number of elements read/written on the IDX direction for get_global_id(IDX) = 0. If zero,

264

* the Number of elements read/written on the IDX direction for get_global_id(IDX) = 0 is N0

265

*/

266

/* Computes get_global_id(IDX) * N0 shifted back by (N0 - PARTIAL_N0) % N0 and clamped to 0.
 * N0 and PARTIAL_N0 are parenthesized so that expression arguments (e.g. N0 given as "A + B")
 * expand with the intended precedence. */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * (N0) - ((N0) - (PARTIAL_N0)) % (N0)), 0))

267

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

268

/** Dot product integer 8bit function

269

*

270

* @note Performs: c += dot(a, b)

271

*

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

272

* @param[in] A_DATA_TYPE A (lhs) data type

273

* @param[in] B_DATA_TYPE B (rhs) data type

274

* @param[in] C_DATA_TYPE C (accumulator) data type

275

* @param[in] K0 Number of accumulations

276

* @param[in] a OpenCL vector a

277

* @param[in] b OpenCL vector b

278

* @param[in] c Scalar variable c

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

279

*/

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

280

#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)

281

#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)

282

#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

283

({ \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

284

c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

285

})

286

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

287

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));

288

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));

289

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

290

#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

291

/* Variants built on arm_dot(a, b): the result is accumulated into c with "+=".
 * For 2 and 3 elements the operands are widened to 4-element vectors whose extra lanes are zero,
 * so the padding does not contribute to the dot product. */
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

294

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

295

#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

296

({ \

297

c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \

298

c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \

299

})

300

#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

301

({ \

302

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \

303

c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \

304

})

305

#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

306

({ \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

307

val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \

308

val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \

309

val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \

310

val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

311

})

312

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

313

#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

314

({ \

315

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

316

DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

317

})

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

318

#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

319

({ \

320

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

321

DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

322

})

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

323

#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

324

({ \

325

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \

326

DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \

327

})

328

#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

329

({ \

330

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \

331

DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \

332

})

333

#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \

334

({ \

335

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \

336

DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \

337

})

338

339

/** Reduction integer 8bit function

340

*

341

* @note Performs: c += dot(a, 1), i.e. the elements of a are summed into c

342

*

343

* @param[in] A_DATA_TYPE A (lhs) data type

344

* @param[in] B_DATA_TYPE B (rhs) data type

345

* @param[in] C_DATA_TYPE C (accumulator) data type

346

* @param[in] K0 Number of accumulations

347

* @param[in] a OpenCL vector a

348

* @param[in] c Scalar variable c

349

*/

350

#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)

351

#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

352

353

/** Load a vector from global memory (tensor)

354

*

355

* @param[in] DATA_TYPE Data type

356

* @param[in] WIDTH Number of dst columns

357

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

358

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

359

* @param[in] TENSOR Tensor basename

360

* @param[in] X Starting X position

361

* @param[in] Y Starting Y position

362

* @param[in] STRIDE_Y Stride Y (in bytes)

363

*/

364

#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)

365

#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)

366

#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \

367

VLOAD(WIDTH) \

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

368

(0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

369

#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

370

371

/** Load a tile from global memory (tensor)

372

*

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

373

* @param[in] DATA_TYPE Data type

374

* @param[in] HEIGHT Number of dst rows

375

* @param[in] WIDTH Number of dst columns

376

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

377

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

378

* @param[in] TENSOR Tensor basename

379

* @param[in] X Starting X position

380

* @param[in] Y Starting Y position

381

* @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).

382

* In common cases should be 1 but it becomes useful when we want to load rows which are multiple of STRIDE_Y. (e.g. loading the weights of convolution layer).

383

* In this case the address calculation is performed as: (Y + _i * YI_MULTIPLIER) * STRIDE_Y

384

* @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.

385

* @param[out] dst Output tile

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

386

*/

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

387

#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

388

({ \

389

LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \

390

{ \

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

391

dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

392

}) \

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

393

})

394

395

/** Load a tile from global memory (tensor) using an indirect Y index tile
 *
 * @note Row _i of dst is loaded from the tensor row stored in indirect_y[_i].v
 *
 * @param[in]  DATA_TYPE   Data type
 * @param[in]  HEIGHT      Number of dst rows
 * @param[in]  WIDTH       Number of dst columns
 * @param[in]  TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                         In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR      Tensor basename
 * @param[in]  X           Starting X position
 * @param[in]  STRIDE_Y    Stride Y (in bytes)
 * @param[in]  indirect_y  Indirect Y index tile
 * @param[out] dst         Output tile
 */
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })

415

Gian Marco Iodice

534b889

2021-04-01 16:17:16 +0100

[diff] [blame]

416

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
 *
 * @note The output tile is flattened: the element at tile position (_xk, _yk) is written to dst[_xk + _yk * TILE_WIDTH]
 * @note Elements whose (X + _xk, Y + _yk) coordinate falls outside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are not loaded
 *       and the corresponding dst rows are left unmodified
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_HEIGHT   Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH    Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Size of the tensor along the X (width) dimension
 * @param[in]  TENSOR_HEIGHT Size of the tensor along the Y (height) dimension
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                /* Width, height and batch are collapsed into one row index: x + y * W + b * W * H */ \
                int _src_y = (X) + _xk + ((Y) + _yk) * (int)(TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })

450

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

451

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments
 *
 * @note The output tile is flattened: the element at tile position (_xk, _yk) is written to dst[_xk + _yk * TILE_WIDTH]
 * @note When BOUNDARY_CHECK is true, elements whose (X + _xk * DILATION_X, Y + _yk * DILATION_Y) coordinate falls outside
 *       [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are not loaded and the corresponding dst rows are left unmodified
 *
 * @param[in]  DATA_TYPE      Data type
 * @param[in]  TILE_HEIGHT    Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH     Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS  Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE    Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                            In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR         Tensor basename
 * @param[in]  B              Starting batch index
 * @param[in]  Y              Starting Y index
 * @param[in]  X              Starting X index
 * @param[in]  C              Starting C index
 * @param[in]  TENSOR_WIDTH   Size of the tensor along the X (width) dimension
 * @param[in]  TENSOR_HEIGHT  Size of the tensor along the Y (height) dimension
 * @param[in]  DILATION_X     Dilation for the X increment
 * @param[in]  DILATION_Y     Dilation for the Y increment
 * @param[in]  STRIDE_Y       Stride Y (in bytes)
 * @param[in]  BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bound reads
 * @param[out] dst            Output tile
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, STRIDE_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                /* Width, height and batch are collapsed into one row index: x + y * W + b * W * H */ \
                int _src_y = (X) + _xk * (DILATION_X) + ((Y) + _yk * (DILATION_Y)) * (int)(TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                /* Without boundary check every element is loaded, otherwise only the in-bound ones */ \
                if(!(BOUNDARY_CHECK) || _src_valid_y) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })

Gian Marco Iodice

2021-04-08 17:20:00 +0100

[diff] [blame]

496

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates
 *
 * @note Row _i of dst is loaded from coordinate (X + xi[_i].v, Y + yi[_i].v)
 * @note Elements whose coordinate falls outside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are not loaded and the
 *       corresponding dst rows are left unmodified
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                           In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Size of the tensor along the X (width) dimension
 * @param[in]  TENSOR_HEIGHT Size of the tensor along the Y (height) dimension
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  xi            A tile with TILE_AREA values with the indirect X coordinate
 * @param[in]  yi            A tile with TILE_AREA values with the indirect Y coordinate
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            /* Width, height and batch are collapsed into one row index: x + y * W + b * W * H */ \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (int)(TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

528

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

529

/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
 *
 * @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store
 * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
 *
 * @param[in] DATA_TYPE        Data type
 * @param[in] HEIGHT           Number of src rows
 * @param[in] WIDTH0           Store width to use if WIDTH1_CONDITION = false
 * @param[in] WIDTH1           Store width to use if WIDTH1_CONDITION = true
 * @param[in] TENSOR_TYPE      Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
 *                             cl_image is not supported.
 * @param[in] TENSOR           Tensor basename
 * @param[in] X                Starting X position
 * @param[in] STRIDE_Y         Stride Y (in bytes)
 * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store
 * @param[in] src              Input tile
 * @param[in] indirect_y       Indirect Y index tile
 */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            /* Partial store: WIDTH1 elements out of a WIDTH0-wide row, last row first */ \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (src[(HEIGHT) - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            /* Full store: WIDTH0 elements per row, last row first */ \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (src[(HEIGHT) - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
    })

566

567

/** Offset correction for the QASYMM8 computation
 *
 * @note Performs: dst[m][n] += WEI_OFFSET * sum_k(lhs[m][k]) + SRC_OFFSET * sum_k(rhs[n][k])
 *
 * @param[in]      ACC_DATA_TYPE Accumulator data type
 * @param[in]      M0            Number of src/dst rows
 * @param[in]      N0            Number of src/dst columns
 * @param[in]      K0            Number of src columns
 * @param[in]      SRC_OFFSET    Source quantization offset
 * @param[in]      WEI_OFFSET    Weights quantization offset
 * @param[in]      lhs           LHS tile
 * @param[in]      rhs           RHS tile
 * @param[in, out] dst           DST tile. The correction is accumulated onto its current content
 */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            /* LHS row sum scaled by the weights offset: shared by every column of this row */ \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)(WEI_OFFSET)); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)(SRC_OFFSET)); \
                }) \
            }) \
        }) \
    })

598

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

599

/** 8-bit quantization with fixed-point scale
 *
 * @note Dispatches to T_QUANTIZE8_<QUANTIZATION_TYPE>. The extra T_QUANTIZE8_STR level lets QUANTIZATION_TYPE be
 *       macro-expanded before it is pasted into the name of the macro to call
 *
 * @param[in]  SRC_DATA_TYPE     SRC data type
 * @param[in]  DST_DATA_TYPE     DST data type
 * @param[in]  QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)
 * @param[in]  M0                Number of src/dst rows
 * @param[in]  N0                Number of src/dst columns
 * @param[in]  DST_OFFSET        Quantization offset used for both the per-tensor and per-channel quantization
 * @param[in]  DST_SHIFT         Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER    Quantization multiplier for the per-tensor quantization
 * @param[in]  src               Input tile
 * @param[in]  dst_multipliers   Output multipliers tile for the per-channel quantization
 * @param[in]  dst_shifts        Output shift tile for the per-channel quantization
 * @param[out] dst               Output tile
 */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

616

617

/** 8-bit per-tensor quantization with fixed-point scale
 *
 * @note For each element: a negative DST_SHIFT is applied first as a left shift by -DST_SHIFT, the value is then
 *       multiplied by DST_MULTIPLIER and divided by 2^31 with rounding, a non-negative DST_SHIFT is applied as a
 *       rounding right shift, DST_OFFSET is added and the result is converted with CONVERT_SAT
 *
 * @param[in]  SRC_DATA_TYPE   SRC data type
 * @param[in]  DST_DATA_TYPE   DST data type
 * @param[in]  M0              Number of src/dst rows
 * @param[in]  N0              Number of src/dst columns
 * @param[in]  DST_OFFSET      Quantization offset
 * @param[in]  DST_SHIFT       Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER  Quantization multiplier for the per-tensor quantization
 * @param[in]  src             Input tile
 * @param[in]  dst_multipliers (unused)
 * @param[in]  dst_shifts      (unused)
 * @param[out] dst             Output tile
 */
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                /* A negative shift is a left shift applied before the fixed-point multiplication */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-(DST_SHIFT))), ((SRC_DATA_TYPE)(DST_SHIFT) < (SRC_DATA_TYPE)0)); \
                /* INT_MIN * INT_MIN is the only product that does not fit after the doubling: it is clamped to INT_MAX below */ \
                SRC_DATA_TYPE overflow = _src == (DST_MULTIPLIER) && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                /* Rounding term for the division by 2^31: +2^30 for non-negative products, 1 - 2^30 otherwise */ \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if((DST_SHIFT) >= 0) \
                { \
                    /* Rounding right shift: add one when the discarded bits exceed the threshold */ \
                    long mask = ((((int)1) << (DST_SHIFT)) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> (DST_SHIFT)) + (int)1 : (_tmp >> (DST_SHIFT)); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/** 8-bit per-channel quantization with fixed-point scale
 *
 * @note Same computation as the per-tensor quantization, but the multiplier and the shift of column _n0 are read
 *       from dst_multipliers[0].s[_n0] and dst_shifts[0].s[_n0]
 *
 * @param[in]  SRC_DATA_TYPE   SRC data type
 * @param[in]  DST_DATA_TYPE   DST data type
 * @param[in]  M0              Number of src/dst rows
 * @param[in]  N0              Number of src/dst columns
 * @param[in]  DST_OFFSET      Quantization offset
 * @param[in]  DST_SHIFT       (unused)
 * @param[in]  DST_MULTIPLIER  (unused)
 * @param[in]  src             Input tile
 * @param[in]  dst_multipliers Output multipliers tile for the per-channel quantization
 * @param[in]  dst_shifts      Output shift tile for the per-channel quantization
 * @param[out] dst             Output tile
 */
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                /* A negative shift is a left shift applied before the fixed-point multiplication */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                /* INT_MIN * INT_MIN is the only product that does not fit after the doubling: it is clamped to INT_MAX below */ \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                /* Rounding term for the division by 2^31: +2^30 for non-negative products, 1 - 2^30 otherwise */ \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(_dst_shift >= 0) \
                { \
                    /* Rounding right shift: add one when the discarded bits exceed the threshold */ \
                    long mask = ((((int)1) << _dst_shift) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> _dst_shift) + (int)1 : (_tmp >> _dst_shift); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/** Quantize a tile to 8-bit with fixed-point scale (asymmetric quantization)
 *
 * @note For each element: a negative DST_SHIFT is applied first as a left shift by -DST_SHIFT, the value is then
 *       multiplied by DST_MULTIPLIER and divided by 2^31 with rounding, a non-negative DST_SHIFT is applied as a
 *       rounding right shift, DST_OFFSET is added and the result is converted with CONVERT_SAT
 *
 * @param[in]  SRC_DATA_TYPE  SRC data type
 * @param[in]  DST_DATA_TYPE  DST data type
 * @param[in]  M0             Number of src/dst rows
 * @param[in]  N0             Number of src/dst columns
 * @param[in]  DST_OFFSET     Quantization offset
 * @param[in]  DST_SHIFT      Quantization shift
 * @param[in]  DST_MULTIPLIER Quantization multiplier
 * @param[in]  src            Input tile
 * @param[out] dst            Output tile
 */
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                /* A negative shift is a left shift applied before the fixed-point multiplication */ \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-(DST_SHIFT))), ((SRC_DATA_TYPE)(DST_SHIFT) < (SRC_DATA_TYPE)0)); \
                /* INT_MIN * INT_MIN is the only product that does not fit after the doubling: it is clamped to INT_MAX below */ \
                SRC_DATA_TYPE overflow = _src == (DST_MULTIPLIER) && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                /* Rounding term for the division by 2^31: +2^30 for non-negative products, 1 - 2^30 otherwise */ \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if((DST_SHIFT) >= 0) \
                { \
                    /* Rounding right shift: add one when the discarded bits exceed the threshold */ \
                    long mask = ((((int)1) << (DST_SHIFT)) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> (DST_SHIFT)) + (int)1 : (_tmp >> (DST_SHIFT)); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

751

752

/** Conditional rowset (memset by row)
 *
 * @note Set the row to VALUE_TO_SET if the corresponding mask == 0
 *
 * @param[in]      DATA_TYPE    Data type
 * @param[in]      M0           Number of LHS rows
 * @param[in]      N0           Number of LHS columns
 * @param[in]      VALUE_TO_SET Value to set the row
 * @param[in, out] a            Input/output tile
 * @param[in]      mask         Mask to check for setting the row to VALUE_TO_SET. Row _m0 is driven by mask[_m0].v
 */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                /* Keep the current element unless the mask of this row is zero */ \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })

773

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

774

/** Element-wise activation for floating point types
 *
 * @note Performs: activation(SRC) = DST, one tile row at a time
 *
 * @param[in]  DATA_TYPE       SRC/DST data type
 * @param[in]  M0              Number of SRC/DST rows
 * @param[in]  N0              Number of SRC/DST columns
 * @param[in]  ACTIVATION_TYPE Activation type
 * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  src             SRC tile
 * @param[out] dst             DST tile
 */
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })

794

Gian Marco Iodice

2021-04-16 15:08:59 +0100

[diff] [blame]

795

// Quantized activation operators. Every macro argument is parenthesized so that expression arguments
// (e.g. "a + b") are cast and compared as a whole.

// RELU Activation: max(ZERO_VALUE, x)
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)(ZERO_VALUE), (x)))
// Bounded RELU Activation: min(A_VAL, max(ZERO_VALUE, x))
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)(ZERO_VALUE), (x))))
// Lower Upper Bounded RELU Activation: min(max(x, B_VAL), A_VAL)
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))
// Hard Swish Activation: x * min(max(x + 3, 0), 6) / 6
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ((x) * ((min(max((DATA_TYPE)((x) + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
// Identity Activation
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

// Two-level dispatch: ACTIVATION_QUANTIZED lets "op" be macro-expanded before ACT_OP_QUANTIZED pastes it
// into the <op>_op_quantized operator name
#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

808

809

/** Element-wise activation for quantized types
 *
 * @note Performs: activation(SRC) = DST, one tile row at a time
 *
 * @param[in]  DATA_TYPE       SRC/DST data type
 * @param[in]  M0              Number of SRC/DST rows
 * @param[in]  N0              Number of SRC/DST columns
 * @param[in]  ACTIVATION_TYPE Activation type
 * @param[in]  ZERO_VALUE      The zero value to consider in the computation
 * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu,..)
 * @param[in]  src             SRC tile
 * @param[out] dst             DST tile
 */
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        }) \
    })

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

831

/** Element-wise addition with a constant value
 *
 * @note Performs: LHS + constant = DST
 *
 * @param[in]  DATA_TYPE    LHS/RHS/DST data type
 * @param[in]  M0           Number of LHS rows
 * @param[in]  N0           Number of LHS columns
 * @param[in]  lhs          LHS tile
 * @param[in]  rhs_constant Constant value
 * @param[out] dst          DST tile
 */
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                /* The constant is parenthesized so that an expression argument is added as a whole */ \
                dst[_m0].s[_n0] = lhs[_m0].s[_n0] + (rhs_constant); \
            }) \
        }) \
    })

852

853

/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)
 *
 * @note Performs: LHS + RHS[broadcasted] = DST
 * @note Both tiles must have same data type
 * @note Only row 0 of RHS is read and it is added to every row of LHS
 *
 * @param[in]  DATA_TYPE LHS/RHS/DST data type
 * @param[in]  M0        Number of LHS rows
 * @param[in]  N0        Number of LHS columns
 * @param[in]  lhs       LHS tile
 * @param[in]  rhs       RHS tile
 * @param[out] dst       DST tile
 */
#define T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + rhs[0].v; \
        }) \
    })

872

873

/** Matrix multiplication
 *
 * @note Performs: LHS X RHS + DST = DST
 * @note Dispatches to T_MMUL_<LHS_LAYOUT>_<RHS_LAYOUT>. The layout arguments are pasted as written into the macro name
 *
 * @param[in]      LHS_DATA_TYPE LHS tile data type
 * @param[in]      RHS_DATA_TYPE RHS tile data type
 * @param[in]      DST_DATA_TYPE DST tile data type
 * @param[in]      M0            Number of LHS rows
 * @param[in]      N0            Number of RHS columns
 * @param[in]      K0            Number of LHS columns
 * @param[in]      LHS_LAYOUT    LHS layout (T= transposed, NT= not transposed)
 * @param[in]      RHS_LAYOUT    RHS layout (T= transposed, NT= not transposed)
 * @param[in]      lhs           LHS tile
 * @param[in]      rhs           RHS tile
 * @param[in, out] dst           DST tile
 */
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

Gian Marco Iodice