Blame - src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp - ml/ComputeLibrary

2018-04-26 10:24:30 +0100

[diff] [blame]

55

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);

56

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);

57

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

58

59

if(!is_interleaved_transposed)

60

{

Michele Di Giorgio

2018-04-26 10:24:30 +0100

[diff] [blame]

61

ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

62

63

if(output->total_size() != 0)

64

{

65

ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));

66

ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));

67

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);

}

}

else

{

const int m = reshape_info.m();

73

const int n = reshape_info.n();

74

const int k = reshape_info.k();

75

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

76

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

77

78

TensorShape tensor_shape0{ input0->tensor_shape() };

79

tensor_shape0.set(0, k);

80

tensor_shape0.set(1, m);

81

82

TensorShape tensor_shape1{ input1->tensor_shape() };

83

tensor_shape1.set(0, n);

84

tensor_shape1.set(1, k);

85

86

const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);

87

const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);

88

89

const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));

90

const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));

91

92

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);

93

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);

94

95

if(output->total_size() != 0)

96

{

97

ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));

98

ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));

99

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

}

}

return Status{};

}

inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,

107

bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,

108

GPUTarget gpu_target, ElementsProcessed &num_elements_processed)

109

{

110

ARM_COMPUTE_UNUSED(gpu_target);

111

112

// Output tensor auto inizialitation if not yet initialized

113

TensorShape tensor_shape{ input0->tensor_shape() };

114

tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0));

115

tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1));

116

117

auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape));

118

119

bool window_changed = false;

120

Window win{};

121

122

const DataType data_type = input0->data_type();

123

unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];

124

unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];

125

126

if(is_interleaved_transposed)

127

{

128

// Configure window kernel

129

num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);

130

num_elems_processed_per_iteration_y = 4;

131

132

win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

133

134

AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);

135

AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);

136

AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

137

138

update_window_and_padding(win, input0_access, input1_access, output_access);

139

140

output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

141

}

142

else // The input tensors have not been reshaped

143

{

Michele Di Giorgio

2018-04-26 10:24:30 +0100

[diff] [blame]

144

// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.

145

num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

switch(data_type)

{

case DataType::F16:

num_elems_processed_per_iteration_x = 4;

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

break;

case DataType::F32:

num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

break;

default:

ARM_COMPUTE_ERROR("Current data type is not supported");

break;

}

win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

163

164

AccessWindowStatic input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));

165

AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));

166

AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

167

168

update_window_and_padding(win, input0_access, input1_access, output_access);

169

170

Coordinates coord;

171

coord.set_num_dimensions(output->num_dimensions());

172

output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));

173

}

174

175

Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};

176

return std::make_pair(err, win);

177

}

178

} // namespace

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

179

180

/** Default constructor: the kernel starts with no input or output tensors attached (all pointers are null). */
GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
    : _input0(nullptr),
      _input1(nullptr),
      _output(nullptr)
{
}

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

185

void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

186

{

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

187

ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

188

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

189

// Perform validate step

190

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

_input0 = input0;

_input1 = input1;

_output = output;

Michele Di Giorgio

2018-04-25 11:58:07 +0100

[diff] [blame]

196

// Get target architecture

197

GPUTarget gpu_target = get_target();

198

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

199

ElementsProcessed num_elements_processed{};

200

201

// Configure kernel window

Michele Di Giorgio

b8fc60f

2018-04-25 11:58:07 +0100

[diff] [blame]

202

auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

203

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

204

IGCKernel::configure(win_config.second);

205

206

// Create build options

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

207

std::set<std::string> build_opts;

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

208

std::string kernel_name;

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

209

210

build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));

211

build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));

212

build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));

213

build_opts.emplace("#define COLS_A " + support::cpp11::to_string(input0->info()->dimension(0)));

214

build_opts.emplace("#define COLS_B " + support::cpp11::to_string(input1->info()->dimension(0)));

215

build_opts.emplace("#define ALPHA " + float_to_string_with_full_precision(alpha));

216

217

// Select the kernel variant: inputs already reshaped (interleaved/transposed) or plain, non-reshaped inputs

218

if(is_interleaved_transposed)

219

{

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

220

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

221

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

222

223

build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width));

224

build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height));

225

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

226

switch(input0->info()->data_type())

227

{

228

case DataType::F16:

229

build_opts.emplace("#define DATA_TYPE_FP16");

break;

case DataType::F32:

build_opts.emplace("#define DATA_TYPE_FP32");

break;

default:

ARM_COMPUTE_ERROR("Current data type is not supported");

break;

}

build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");

242

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

243

kernel_name = "gemm_mm_interleaved_transposed";

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

244

}

245

else

246

{

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

247

// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

248

Michele Di Giorgio

2018-04-26 10:24:30 +0100

[diff] [blame]

249

GPUTarget arch_target = get_arch_from_target(gpu_target);

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

250

switch(input0->info()->data_type())

251

{

252

case DataType::F16:

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

253

build_opts.emplace("#define DATA_TYPE_FP16");

Frank Lei

b9d38ee

2017-12-05 10:43:33 +0800

[diff] [blame]

254

build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");

Michele Di Giorgio

2018-04-26 10:24:30 +0100

[diff] [blame]

255

build_opts.emplace("#define GEMM_MM_FLOATING_POINT");

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

256

break;

257

258

case DataType::F32:

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

259

build_opts.emplace("#define DATA_TYPE_FP32");

Michele Di Giorgio

2018-04-26 10:24:30 +0100

[diff] [blame]

260

261

if(arch_target == GPUTarget::BIFROST && input0->info()->num_dimensions() != 1)

262

{

263

build_opts.emplace("#define GEMM_MM_FLOATING_POINT_BIFROST");

}

else

{

build_opts.emplace("#define GEMM_MM_FLOATING_POINT");

268

}

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

break;

default:

ARM_COMPUTE_ERROR("Current data type is not supported");

break;

}

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

276

build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));

277

build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

278

Michele Di Giorgio

2018-04-13 14:28:08 +0100

[diff] [blame]

279

kernel_name = "gemm_mm_floating_point";

Anthony Barbier

2017-10-26 15:23:08 +0100

[diff] [blame]

280

}

281

Michele Di Giorgio