Blame - src/core/CL/cl_kernels/common/mat_mul_mmul.cl - ml/ComputeLibrary

* @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).

34

* @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

35

* @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).

36

* @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

37

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=4). K must be a multiple of MMUL_K0

38

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)

39

* @note Only the following configurations of M0, N0 and K0 are currently supported:

40

* - M0 > 0

41

* - N0 = 1, 2, 3, 4, 8, 16

42

* - K0 = 1

43

* @note Values > 8 for M0 are not expected to be efficient

44

*

45

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

46

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

47

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

48

* @param[in] lhs_w The width of the lhs tensor

49

* @param[in] lhs_h The height of the lhs tensor

50

* @param[in] lhs_n Number of the matrices (buffers) in the batch

51

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

52

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

53

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

54

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

55

* @param[in] rhs_w The width of the rhs tensor

56

* @param[in] rhs_h The height of the rhs tensor

57

* @param[in] rhs_n Number of the matrices (buffers) in the batch

58

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

59

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

60

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

61

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

62

* @param[in] dst_w The width of the dst tensor

63

* @param[in] dst_h The height of the dst tensor

64

* @param[in] dst_n Number of the matrices (buffers) in the batch

65

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

66

* @param[in] M Number of rows in LHS matrix

67

* @param[in] N Number of columns in RHS matrix

68

*/

69

__kernel void mat_mul_native_mmul_nt_nt(

70

TENSOR3D_T(lhs, BUFFER),

71

TENSOR3D_T(rhs, BUFFER),

72

TENSOR3D_T(dst, BUFFER),

const int M,

const int N)

{

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)

77

78

const uint x0 = get_global_id(0); // (N / N0) * MMUL_M0

79

const uint y0 = get_global_id(1); // (M / M0) / MMUL_M0

80

const uint z = get_global_id(2); // Batch

81

82

// Get block coordinates

83

const uint block_x = (x0 / MMUL_BLOCK_SIZE);

84

const uint block_y = y0;

85

86

// Get thread coordinates within a block

87

const uint thread_id = (x0 % MMUL_BLOCK_SIZE);

88

const uint thread_x = thread_id % MMUL_N0;

89

const uint thread_y = (thread_id / MMUL_N0);

90

91

// Starting destination coordinates

92

// Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication

93

// part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results

94

// Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.

95

const uint dst_x_unclamped = thread_x * N0 + block_x * N0 * MMUL_N0;

96

const uint dst_y_unclamped = thread_y * M0 + block_y * M0 * MMUL_M0;

97

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

98

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

99

100

// Starting LHS coordinates

101

const uint lhs_x = thread_x;

102

const uint lhs_y = dst_y;

103

104

// Starting RHS coordinates

105

const uint rhs_x = dst_x;

106

const uint rhs_y = thread_y;

107

108

// Compute LHS/RHS/DST matrix address

109

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

110

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

111

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

112

113

// Initialize the accumulators

114

// The MMUL extension accumulates the result in F32 for both F32 and F16

115

TILE(float, M0, N0, c_f32);

116

117

LOOP_UNROLLING(int, i, 0, 1, M0,

{

c_f32[i].v = 0;

})

for(int k = 0; k < K; k += MMUL_K0)

123

{

124

// A tile of M0xK0 but K0 must be set to 1

125

TILE(DATA_TYPE, M0, 1, a);

126

// A tile of K0xN0 but K0 must be set to 1

127

TILE(DATA_TYPE, 1, N0, b);

128

129

// Load tile from the lhs/rhs tensors

130

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

131

T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

132

133

LOOP_UNROLLING(int, m0, 0, 1, M0,

134

{

135

LOOP_UNROLLING(int, n0, 0, 1, N0,

136

{

137

c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);

})

})

lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);

142

rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;

143

}

144

145

// For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply

146

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if defined(HALF_PRECISION)

152