Blame - arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h - ml/ComputeLibrary

MatrixMultiplyWorkload(unsigned int offset_transformed_b, unsigned int x0, unsigned int xmax, unsigned int k0, unsigned int kmax, unsigned int multi, int kern_k, int bblocks)

56

: _offset_transformed_b(offset_transformed_b), _x0(x0), _xmax(xmax), _k0(k0), _kmax(kmax), _multi(multi), _kern_k(kern_k), _bblocks(bblocks)

57

{

58

}

59

unsigned int _offset_transformed_b; /**< Offset from the start of transformed_b's allocation.*/

60

unsigned int _x0; /**< First value to process along the X dimension (N). */

61

unsigned int _xmax; /**< Last value to process along the X dimension (N). */

62

unsigned int _k0; /**< First value to process along the K dimension. */

63

unsigned int _kmax; /**< Last value to process along the K dimension. */

64

unsigned int _multi; /**< Multi index. */

65

int _kern_k; /**< Number of elements along K actually processed by the kernel. */

66

int _bblocks; /**< Number of x_block processed by the kernel. */

67

};

68

69

/** Common interface for the templated wrappers around the matrix multiply NEON assembly implementations */

70

class NEGEMMInterleavedMatrixMultiplyWrapper

71

{

72

public:

73

/** Transform the block at the given coordinates

74

*

75

* @param[in] wl Workload to process.

76

* @param[in] info Information about the current thread.

77

* @param[in] batch_window Window containing iteration information for the M and batch dimensions.

78

* @param[in] start_offset Offset relative to the beginning of batch_window to start the processing from.

79

* @param[in] end_offset Offset relative to the beginning of batch_window to stop the processing.

80

*/

81

virtual void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) = 0;

82

/** Generate an array of workloads

83

*

84

* @param[out] workloads Container to store the generated workloads.

85

*/

86

virtual void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) = 0;

87

/** Default destructor */

88

virtual ~NEGEMMInterleavedMatrixMultiplyWrapper() = default;

89

};

90

91

/** Equivalent to arm_gemm::GemmInterleaved's strategy::kernel() but using Compute Library types. */

Georgios Pinitas

7cd26d4

2019-01-09 18:35:17 +0000

[diff] [blame]

92

template <typename strategy>

Anthony Barbier

3d677cc

2018-07-23 16:42:59 +0100

[diff] [blame]

93

class NEGEMMInterleavedMatrixMultiplyWrapperTemplate : public NEGEMMInterleavedMatrixMultiplyWrapper

94

{

95

public:

96

/** Configure the matrix multiplication: C = alpha * A * B + beta * C

97

*

98

* @param[in] prepared_a Already reshaped matrix A.

99

* @param[in] transformed_b Already reshaped matrix B.

100

* @param[out] tmp_c Temporary buffer to be used to store intermediate results.

101

* @param[in,out] c Result matrix C.

Georgios Pinitas

7cd26d4

2019-01-09 18:35:17 +0000

[diff] [blame]

102

* @param[in] block_walker Window containing iteration information for the M and batch dimensions.

Anthony Barbier

3d677cc

2018-07-23 16:42:59 +0100

[diff] [blame]

103

* @param[in] block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).

104

* @param[in] params M, N, K sizes.

105

* @param[in] b_is_pretransposed Is B also pretransposed?

106

* @param[in] alpha Alpha value

107

* @param[in] beta Beta value

108

* @param[in] max_num_threads Maximum number of threads that might be used for the calculations.

109

*/

Georgios Pinitas

7cd26d4

2019-01-09 18:35:17 +0000

[diff] [blame]

110

void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, const BlockSizes &block_sizes,

111

const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)

112

{

113

_prepared_a = prepared_a;

114

_transformed_b = transformed_b;

115

_tmp_c = tmp_c;

116

_c = c;

117

_block_walker = block_walker;

118

_block_sizes = block_sizes;

119

_params = params;

120

_b_is_pretransposed = b_is_pretransposed;

_alpha = alpha;

_beta = beta;

auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));

125

}

Anthony Barbier

3d677cc

2018-07-23 16:42:59 +0100

[diff] [blame]

126

127

// Inherited methods overridden:

Georgios Pinitas

7cd26d4

2019-01-09 18:35:17 +0000

[diff] [blame]

128

void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override

129

{

130

strategy strat(info.cpu_info);

131

TensorAccessor<typename strategy::operand_type> prepared_a(*_prepared_a);

132

TensorAccessor<typename strategy::operand_type> transformed_b(*_transformed_b);

133

TensorAccessor<typename strategy::result_type> c(*_c);

134

TensorAccessor<typename strategy::result_type> tmp_c(*_tmp_c);

135

136

int prev_batch = -1;

137

typename strategy::operand_type *a_ptr = nullptr;

138

auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)

139

{

140

const unsigned int y = id.x();

141

const unsigned int batch = id.y();

142

const unsigned int ymax = std::min(_params.M, y + strategy::out_height());

143

144

// If it's the first block of a new batch then reset the pointer to A.

145

if(prev_batch != static_cast<int>(batch))

146

{

147

const unsigned int first_m = id.x();

148

a_ptr = prepared_a(0, first_m, batch);

prev_batch = batch;

}

// Call matrix multiply assembly routine to process the block:

153

strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);

154

a_ptr += strategy::out_height() * wl._kern_k;

155

156

// Merge the result with the other blocks' results:

157

strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<typename strategy::result_type>(1)));

158

});

159