blob: 94b318c93e629c8cbd03f5d7ee2eb3d404de88ca [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
giuros011c9efeb2019-01-11 14:04:43 +00002 * Copyright (c) 2017-2019 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/CL/functions/CLGEMM.h"
25
Gian Marco Iodice926afe12019-03-19 11:44:13 +000026#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/CL/ICLTensor.h"
Gian Marco Iodice926afe12019-03-19 11:44:13 +000028#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
29#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010030#include "arm_compute/core/Error.h"
Gian Marco Iodice750641d2018-05-08 12:01:57 +010031#include "arm_compute/core/GPUTarget.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/Helpers.h"
33#include "arm_compute/core/TensorInfo.h"
34#include "arm_compute/core/Types.h"
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +010035#include "arm_compute/core/Utils.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036#include "arm_compute/core/Validate.h"
Gian Marco Iodicee16c8902019-06-14 16:11:10 +010037#include "arm_compute/core/utils/helpers/float_ops.h"
Gian Marco Iodice750641d2018-05-08 12:01:57 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039#include "arm_compute/runtime/CL/CLScheduler.h"
40#include "arm_compute/runtime/ITensorAllocator.h"
41
giuros011c9efeb2019-01-11 14:04:43 +000042namespace arm_compute
43{
Gian Marco Iodice750641d2018-05-08 12:01:57 +010044using namespace arm_compute::misc::shape_calculator;
Gian Marco Iodice90313eb2019-01-16 15:40:25 +000045using namespace arm_compute::cl_gemm;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +010047CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010048 : _memory_group(std::move(memory_manager)),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010049 _mm_kernel(),
50 _ma_kernel(),
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +000051 _reshape_lhs_kernel(),
52 _reshape_rhs_kernel(),
53 _mm_reshaped_kernel(),
Gian Marco Iodice926afe12019-03-19 11:44:13 +000054 _mm_reshaped_only_rhs_kernel(),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010055 _tmp_a(),
56 _tmp_b(),
57 _original_b(nullptr),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010058 _run_addition(false),
59 _reshape_b_only_on_first_run(false),
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +000060 _is_prepared(false),
Gian Marco Iodice926afe12019-03-19 11:44:13 +000061 _gemm_type(GEMMType::NATIVE)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010062{
63}
64
Gian Marco Iodice926afe12019-03-19 11:44:13 +000065CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010066{
Gian Marco Iodice926afe12019-03-19 11:44:13 +000067 GEMMType gemm_type = GEMMType::RESHAPED_V1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010068
Gian Marco Iodice926afe12019-03-19 11:44:13 +000069 if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
70 {
71 if((m > 1) && (n < 16))
72 {
73 gemm_type = GEMMType::RESHAPED_V1;
74 }
75 else if((m == 1) && (data_type == DataType::F32))
76 {
77 gemm_type = GEMMType::RESHAPED_ONLY_RHS;
78 }
79 else
80 {
81 // COMPMID-852
82 if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
83 {
84 constexpr float alpha = 3.2f;
85 constexpr float fact0 = 1.51f;
86 constexpr float fact1 = 1.66f;
87 constexpr float ops = 12.0f;
88 const float scale = k > 1024 ? 1.07f : 1.0f;
89 gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
90 }
91 else
92 {
93 gemm_type = GEMMType::NATIVE;
94 }
95 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010096
Gian Marco Iodice926afe12019-03-19 11:44:13 +000097 const auto workload = static_cast<float>((m * n) / 20.0f);
Gian Marco Iodice1246b632017-08-16 18:38:32 +010098
Gian Marco Iodice926afe12019-03-19 11:44:13 +000099 gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
100 }
101 else
102 {
103 // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
104 gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
105 }
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100106
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000107 return gemm_type;
108}
109
110void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
111{
112 const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
113 const unsigned int n = b->info()->dimension(0);
114 const unsigned int k = a->info()->dimension(0);
115 const GPUTarget gpu_target = CLScheduler::get().target();
Gian Marco36a0a462018-01-12 10:21:40 +0000116
117 // Set the target for the kernels
Gian Marco36a0a462018-01-12 10:21:40 +0000118 _mm_kernel.set_target(gpu_target);
119
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000120 GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
121
122 // Configure and tune matrix multiply kernel
123 _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
124
125 // Tune kernel statically
126 CLScheduler::get().tune_kernel_static(_mm_kernel);
127}
128
129void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
130{
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000131 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
132 const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
133 const unsigned int n = b->info()->dimension(0);
134 const unsigned int k = a->info()->dimension(0);
135 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000136 const GPUTarget gpu_target = CLScheduler::get().target();
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000137 int mult_transpose1xW_width = 1;
138 int mult_interleave4x4_height = 1;
Gian Marco36a0a462018-01-12 10:21:40 +0000139
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000140 // Set the target for the kernels
141 _reshape_lhs_kernel.set_target(gpu_target);
142 _mm_kernel.set_target(gpu_target);
143
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100144 if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
Gian Marco36a0a462018-01-12 10:21:40 +0000145 {
146 mult_transpose1xW_width = 4;
147 mult_interleave4x4_height = 2;
148 }
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000149
giuros018b6b4a92018-12-18 19:01:33 +0000150 GEMMRHSMatrixInfo rhs_info;
151 rhs_info.n0 = 16 / b->info()->element_size();
152 rhs_info.k0 = 1;
153 rhs_info.h0 = mult_transpose1xW_width;
154 rhs_info.interleave = false;
155 rhs_info.transpose = false;
Gian Marco36a0a462018-01-12 10:21:40 +0000156
giuros011c9efeb2019-01-11 14:04:43 +0000157 GEMMLHSMatrixInfo lhs_info;
158 lhs_info.m0 = 4;
159 lhs_info.k0 = 4;
160 lhs_info.v0 = mult_interleave4x4_height;
161 lhs_info.interleave = true;
162 lhs_info.transpose = true;
163
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000164 GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
Gian Marcob5311a62017-12-13 12:48:03 +0000165
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000166 _memory_group.manage(&_tmp_a);
167 if(!_reshape_b_only_on_first_run)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +0100168 {
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000169 _memory_group.manage(&_tmp_b);
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100170 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100171
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000172 // Configure interleave kernel
173 _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100174
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000175 // Configure transpose kernel
176 _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100177
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000178 // Configure and tune matrix multiply kernel
179 _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
180
181 CLScheduler::get().tune_kernel_static(_mm_kernel);
182
183 // Allocate intermediate tensors
184 _tmp_a.allocator()->allocate();
185 if(!_reshape_b_only_on_first_run)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100186 {
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000187 _tmp_b.allocator()->allocate();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100188 }
189}
190
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000191void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
192{
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000193 DataType data_type = a->info()->data_type();
194 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
195 const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
196 const unsigned int n = b->info()->dimension(0);
197 const unsigned int k = a->info()->dimension(0);
198 const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
199 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
200 const GPUTarget gpu_target = CLScheduler::get().target();
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100201 bool broadcast_bias = gemm_info.broadcast_bias();
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000202
203 // Set the target for the kernels
204 _reshape_lhs_kernel.set_target(gpu_target);
205 _mm_kernel.set_target(gpu_target);
206
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100207 GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false, broadcast_bias);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000208
209 // Manage intermediate buffers
210 _memory_group.manage(&_tmp_a);
211 if(!_reshape_b_only_on_first_run)
212 {
213 _memory_group.manage(&_tmp_b);
214 }
215 // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
216
217 GEMMLHSMatrixInfo lhs_info{};
218 GEMMRHSMatrixInfo rhs_info{};
219
220 // Pick up the GEMM configuration
221 std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
222 ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
223
224 // Configure lhs_info and rhs_info
225 std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
226
227 _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
228 _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
229
230 // Configure and tune matrix multiply kernel
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100231 _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, reshape_info);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000232
233 // Allocate intermediate tensors
234 _tmp_a.allocator()->allocate();
235 if(!_reshape_b_only_on_first_run)
236 {
237 _tmp_b.allocator()->allocate();
238 }
239}
240
241void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
242{
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000243 DataType data_type = a->info()->data_type();
244 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
245 const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
246 const unsigned int n = b->info()->dimension(0);
247 const unsigned int k = a->info()->dimension(0);
248 const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
249 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
250 const GPUTarget gpu_target = CLScheduler::get().target();
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100251 bool broadcast_bias = gemm_info.broadcast_bias();
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000252
253 // Set the target for the kernels
254 _mm_kernel.set_target(gpu_target);
255
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100256 GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, broadcast_bias);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000257
258 // Manage intermediate buffers
259 if(!_reshape_b_only_on_first_run)
260 {
261 _memory_group.manage(&_tmp_b);
262 }
263
264 GEMMLHSMatrixInfo lhs_info{};
265 GEMMRHSMatrixInfo rhs_info{};
266
267 // Pick up the GEMM configuration
268 std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
269 ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
270
271 // Configure lhs_info and rhs_info
272 std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
273
274 _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
275
276 // Configure and tune matrix multiply kernel
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100277 _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, reshape_info);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000278
279 if(!_reshape_b_only_on_first_run)
280 {
281 _tmp_b.allocator()->allocate();
282 }
283}
284
285Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
Georgios Pinitas78c00902018-01-09 17:33:11 +0000286{
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100287 ARM_COMPUTE_UNUSED(alpha);
Gian Marco Iodice215b4ea2018-06-28 16:29:29 +0100288 ARM_COMPUTE_UNUSED(output);
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100289
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000290 // Get the GPU target
291 const GPUTarget gpu_target = CLScheduler::get().target();
292 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
293 const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
294 const unsigned int n = b->dimension(0);
295 const unsigned int k = a->dimension(0);
296 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
297 const bool add_c = (beta != 0.f && c != nullptr);
298 const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
299 const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100300
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000301 const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
302
303 // Validate matrix multiply
304 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
305 false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
306
307 if(add_c && !fuse_add)
308 {
309 // Validate matrix addition kernel
310 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
311 }
312
313 return Status{};
314}
315
316Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
317{
318 ARM_COMPUTE_UNUSED(alpha);
319 ARM_COMPUTE_UNUSED(output);
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100320
321 TensorInfo tmp_a_info{};
322 TensorInfo tmp_b_info{};
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100323
324 // Get the GPU target
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000325 const GPUTarget gpu_target = CLScheduler::get().target();
326 const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000327 const unsigned int n = b->dimension(0);
328 const unsigned int k = a->dimension(0);
329 int mult_transpose1xW_width = 1;
330 int mult_interleave4x4_height = 1;
331 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000332 const bool add_c = (beta != 0.f && c != nullptr);
333 const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
334 const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100335
336 if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
337 {
338 mult_transpose1xW_width = 4;
339 mult_interleave4x4_height = 2;
340 }
341
giuros018b6b4a92018-12-18 19:01:33 +0000342 GEMMRHSMatrixInfo rhs_info;
343 rhs_info.n0 = 16 / b->element_size();
344 rhs_info.k0 = 1;
345 rhs_info.h0 = mult_transpose1xW_width;
346 rhs_info.interleave = false;
347 rhs_info.transpose = false;
348
giuros011c9efeb2019-01-11 14:04:43 +0000349 GEMMLHSMatrixInfo lhs_info;
350 lhs_info.m0 = 4;
351 lhs_info.k0 = 4;
352 lhs_info.v0 = mult_interleave4x4_height;
353 lhs_info.interleave = true;
354 lhs_info.transpose = true;
355
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000356 const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100357
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000358 // Validate interleave kernel
359 auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
360 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000361
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000362 // Validate transpose kernel
363 auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
364 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
Michele Di Giorgioebc3a902018-11-16 16:04:25 +0000365
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000366 // Validate matrix multiply
367 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
368 true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +0100369
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000370 if(add_c && !fuse_add)
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100371 {
372 // Validate matrix addition kernel
Giorgio Arena0f170392018-07-18 16:13:12 +0100373 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
Gian Marco Iodice750641d2018-05-08 12:01:57 +0100374 }
375
Georgios Pinitas78c00902018-01-09 17:33:11 +0000376 return Status{};
377}
378
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000379Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
380{
381 ARM_COMPUTE_UNUSED(alpha);
382 ARM_COMPUTE_UNUSED(output);
383
384 TensorInfo tmp_a_info{};
385 TensorInfo tmp_b_info{};
386
387 // Get the GPU target
388 const GPUTarget gpu_target = CLScheduler::get().target();
389 DataType data_type = a->data_type();
390 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
391 const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
392 const unsigned int n = b->dimension(0);
393 const unsigned int k = a->dimension(0);
394 const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
395 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100396 const bool broadcast_bias = gemm_info.broadcast_bias();
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000397
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100398 const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false, broadcast_bias);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000399
400 GEMMLHSMatrixInfo lhs_info;
401 GEMMRHSMatrixInfo rhs_info;
402
403 // Pick up the GEMM configuration
404 std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
405 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
406
407 // Configure lhs_info and rhs_info
408 std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
409
410 auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
411 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
412
413 auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
414 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
415
416 // Validate matrix multiply
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100417 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, reshape_info));
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000418
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000419 return Status{};
420}
421
422Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
423{
424 ARM_COMPUTE_UNUSED(alpha);
425 ARM_COMPUTE_UNUSED(output);
426
427 TensorInfo tmp_b_info{};
428
429 // Get the GPU target
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100430 const GPUTarget gpu_target = CLScheduler::get().target();
431 const DataType data_type = a->data_type();
432 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
433 const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
434 const unsigned int n = b->dimension(0);
435 const unsigned int k = a->dimension(0);
436 const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
437 const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
438 const bool broadcast_bias = gemm_info.broadcast_bias();
439 const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, broadcast_bias);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000440
441 GEMMLHSMatrixInfo lhs_info;
442 GEMMRHSMatrixInfo rhs_info;
443
444 // Pick up the GEMM configuration
445 std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
446 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
447
448 // Configure lhs_info and rhs_info
449 std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
450
451 auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
452 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
453
454 // Validate matrix multiply
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100455 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, reshape_info));
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000456
457 return Status{};
458}
459
460void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
461{
462 ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
463
464 // Perform validation step
465 ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
466
467 // Check if we need to reshape the matrix B only on the first run
468 _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
469 _is_prepared = gemm_info.retain_internal_weights();
470 _original_b = b;
471
472 // Get the GPU target
473 const GPUTarget gpu_target = CLScheduler::get().target();
474 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
475 const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
476 const unsigned int n = b->info()->dimension(0);
477 const unsigned int k = a->info()->dimension(0);
478
479 // Select GEMMType
480 _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
481
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100482 const bool is_fuse_add_c_supported = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
483 const bool add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
484 const bool fuse_add_c = add_c && is_fuse_add_c_supported;
485
486 const ICLTensor *c_to_use = fuse_add_c ? c : nullptr;
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000487
488 switch(_gemm_type)
489 {
490 case GEMMType::NATIVE:
491 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100492 configure_native(a, b, c_to_use, output, alpha, beta, gemm_info);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000493 break;
494 }
495 case GEMMType::RESHAPED_V1:
496 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100497 configure_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000498 break;
499 }
500 case GEMMType::RESHAPED_V2:
501 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100502 configure_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000503 break;
504 }
505 case GEMMType::RESHAPED_ONLY_RHS:
506 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100507 configure_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info);
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000508 break;
509 }
510 default:
511 {
512 ARM_COMPUTE_ERROR("GEMMType not supported");
513 }
514 }
515
516 // Configure matrix addition kernel
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100517 if(add_c && !fuse_add_c)
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000518 {
519 _ma_kernel.configure(c, output, beta);
520 _run_addition = true;
521 }
522}
523
524Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
525{
526 // Get the GPU target
527 const GPUTarget gpu_target = CLScheduler::get().target();
528 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
529 const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
530 const unsigned int n = b->dimension(0);
531 const unsigned int k = a->dimension(0);
532
533 // Select GEMMType
534 GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
535
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100536 const bool is_fuse_add_c_supported = (gemm_type == GEMMType::RESHAPED_V2) || (gemm_type == GEMMType::RESHAPED_ONLY_RHS);
537 const bool add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
538 const bool fuse_add_c = add_c && is_fuse_add_c_supported;
539
540 const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
541
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000542 switch(gemm_type)
543 {
544 case GEMMType::NATIVE:
545 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100546 ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info));
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000547 break;
548 }
549 case GEMMType::RESHAPED_V1:
550 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100551 ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000552 break;
553 }
554 case GEMMType::RESHAPED_V2:
555 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100556 ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info));
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000557 break;
558 }
559 case GEMMType::RESHAPED_ONLY_RHS:
560 {
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100561 ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000562 break;
563 }
564 default:
565 {
566 ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
567 }
568 }
569
Gian Marco Iodicee16c8902019-06-14 16:11:10 +0100570 // Validate matrix addition kernel
571 if(add_c && !fuse_add_c)
572 {
573 ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
574 }
575
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000576 return Status{};
577}
578
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100579void CLGEMM::run()
580{
Georgios Pinitase0437672018-05-02 14:07:55 +0100581 prepare();
582
Georgios Pinitasda953f22019-04-02 17:27:03 +0100583 MemoryGroupResourceScope scope_mg(_memory_group);
Georgios Pinitas8a94e7c2017-09-15 19:06:47 +0100584
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100585 // Run matrix multiply kernel
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000586 switch(_gemm_type)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000587 {
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000588 case GEMMType::NATIVE:
589 {
590 CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
591 break;
592 }
593 case GEMMType::RESHAPED_V1:
594 {
595 // Run interleave kernel
596 CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
597
598 if(!_reshape_b_only_on_first_run)
599 {
600 // Run transpose kernel
601 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
602 }
603
604 CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
605 break;
606 }
607 case GEMMType::RESHAPED_V2:
608 {
609 // Run interleave kernel
610 CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
611
612 if(!_reshape_b_only_on_first_run)
613 {
614 // Run transpose kernel
615 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
616 }
617
618 CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
619 break;
620 }
621 case GEMMType::RESHAPED_ONLY_RHS:
622 {
623 if(!_reshape_b_only_on_first_run)
624 {
625 // Run transpose kernel
626 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
627 }
628
629 CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
630 break;
631 }
632 default:
633 {
634 ARM_COMPUTE_ERROR("GEMMType not supported");
635 }
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000636 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100637
638 // Run matrix addition kernel
639 if(_run_addition)
640 {
641 CLScheduler::get().enqueue(_ma_kernel);
642 }
Georgios Pinitase0437672018-05-02 14:07:55 +0100643}
Georgios Pinitas82b51482018-04-24 15:14:12 +0100644
Georgios Pinitase0437672018-05-02 14:07:55 +0100645void CLGEMM::prepare()
646{
647 if(!_is_prepared)
648 {
Gian Marco Iodice926afe12019-03-19 11:44:13 +0000649 if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
Georgios Pinitase0437672018-05-02 14:07:55 +0100650 {
Georgios Pinitas72219332018-06-05 14:56:06 +0100651 // Run transpose kernel and mark original weights tensor as unused
Georgios Pinitase0437672018-05-02 14:07:55 +0100652 _tmp_b.allocator()->allocate();
giuros018b6b4a92018-12-18 19:01:33 +0000653 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
Georgios Pinitase0437672018-05-02 14:07:55 +0100654 _original_b->mark_as_unused();
655 }
656 CLScheduler::get().queue().finish();
657 _is_prepared = true;
658 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100659}
giuros011c9efeb2019-01-11 14:04:43 +0000660} // namespace arm_compute