/*
 * Copyright (c) 2017-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <algorithm>
#include <cassert>

#include "arm_gemm.hpp"
#include "bfloat.hpp"
#include "convolver.hpp"
#include "kernel_weight_format.hpp"
#include "mergeresults.hpp"
#include "performance_parameters.hpp"
#include "quantized.hpp"
#include "transform.hpp"
#include "utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND    64
#define ROUND_UP(x)    ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
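// For example, with ALLOC_ROUND==64: ROUND_UP(1)==64, ROUND_UP(64)==64 and ROUND_UP(65)==128.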

// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.

namespace arm_gemm {

namespace {

// Some kernels output to a linear buffer and require a separate merge step.
// Others output directly to the matrix result. This helper class calls the
// appropriate functions, using templating to avoid calling non-existent
// functions.
template<bool MergeStep, bool FixedFormat, typename OutputStage>
class kernel_and_merge {
public:
    template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
    static void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
        Tab *acc_buff);
};

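// The specialisations of run() below cover the supported combinations of the
// template parameters:
//   <true,  false, Nothing>      - interleaved kernel followed by a separate merge step
//   <true,  true,  Nothing>      - fixed-format (non-pretransposed B) kernel plus merge step
//   <false, false, Nothing>      - kernel with the merge integrated
//   <false, false, Requantize32> - kernel with integrated merge and requantization
//   <true,  false, Requantize32> - kernel followed by a separate requantize step
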
// Run a kernel and call the separate merge step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, Nothing>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
    unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
    const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
    }
}

// Run a fixed-format kernel and call the separate merge step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, true, Nothing>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
    unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
    const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
{
    {
#ifdef CYCLE_PROFILING
        const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, b_stride, c_panel, 1, (n_max - n_0), kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
    }
}

// Run a kernel with integrated merge
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, Nothing>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
    unsigned int n_0, unsigned int n_max, const Tr *biasptr,
    const Activation &act, bool accumulate, const Nothing &, const int32_t *,
    Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    // We need to offset the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need
    // to be careful not to offset a null pointer.
    Tri *offset_c_ptr;

    if (c_ptr == nullptr) {
        offset_c_ptr = nullptr;
    } else {
        offset_c_ptr = c_ptr + m_0 * ldc + n_0;
    }

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant part of output array and row stride.
                 offset_c_ptr, ldc,
                 // M, N, K sizes
                 m_max-m_0, n_max - n_0, kern_k,
                 // Bias, activation, accumulation. Need to offset the bias as needed.
                 biasptr ? biasptr + n_0 : nullptr, act, accumulate,
                 // Accumulation buffer.
                 acc_buff );
}

// Run a kernel with integrated merge, quantizing
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
    unsigned int n_0, unsigned int n_max, const Tr *,
    const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
    Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant part of output array and row stride.
                 c_ptr + m_0 * ldc + n_0, ldc,
                 // M, N, K sizes
                 m_max-m_0, n_max - n_0, kern_k,
                 // Bias, activation, accumulation. Need to offset the bias as needed.
                 col_bias + n_0, qp, n_0, accumulate, acc_buff);
}

// Run a kernel and call the separate quantize step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
    unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
    const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
    Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        // The interleaved kernel outputs in blocks - each block is a
        // row-major matrix of size out_width * out_height. The merge
        // kernels are designed to deal with this but the requantizer is
        // not, so we need to requantize one block at a time.
        for (int i=0; i<bblocks; i++) {
            unsigned int n_start = n_0 + (strategy::out_width() * i);
            unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);

            // The row bias is interleaved with the transposed A data, get a pointer to it here.
            const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);

            requantize_block_32(qp, (n_end - n_start), (m_max-m_0),
                                c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
                                c_ptr + m_0 * ldc + n_start, ldc,
                                row_bias, col_bias + n_start, n_start);
        }
    }
}

// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
// "requantizing" context where the output will be requantized.
//
// These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and
// if we are not we don't.
//
// This helper class allows the appropriate transforms to be found, without requiring kernels that don't support
// quantization to define useless "quantized" transforms.
template<typename strategy, bool quantized>
class transform_type {
public:
    typedef decltype(strategy::transforms) type;
};

template<typename strategy>
class transform_type<strategy, true> {
public:
    typedef decltype(strategy::transforms_quantized) type;
};

// We need a similar trick here to figure out what type the accumulator buffer should be.
template<typename strategy, typename OutputStage>
class accumulate_buffer_type {
public:
    typedef typename strategy::result_type type;
};

template<typename strategy>
class accumulate_buffer_type<strategy, Requantize32> {
public:
    typedef int32_t type;
};

// Stripe width is a concept only needed for FixedFormat kernels. Use an accessor to avoid issues in other scenarios.
template<typename strategy, bool FixedFormat>
struct get_stripe_width {
    static unsigned int get() {
        return 0;
    }
};

template<typename strategy>
struct get_stripe_width<strategy, true> {
    static unsigned int get() {
        return strategy::stripe_width();
    }
};

// KernelWeightFormat is a similar story.
template<typename strategy, bool FixedFormat, typename To>
struct get_kernel_weight_format {
    static KernelWeightFormat get() {
        return KernelWeightFormat::NON_FIXED;
    }
};

template<typename strategy, typename To>
struct get_kernel_weight_format<strategy, true, To> {
    static KernelWeightFormat get() {
        KernelWeightFormat kwf = strategy::kernel_weight_format();

        // If we are using a BF16 kernel to do an FP32 problem (fast mode) then we need to set the BF16 flag on the
        // weight format.
        if (std::is_same<To, float>::value && std::is_same<typename strategy::operand_type, bfloat16>::value) {
            uint32_t kwf_i = static_cast<uint32_t>(kwf);
            kwf_i |= 0x10;
            kwf = static_cast<KernelWeightFormat>(kwf_i);
        }

        return kwf;
    }
};

} // anonymous namespace

template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;
    typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;
    const unsigned int _Ksections;
    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _thread_columns;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space and pretransposed buffer */
    const Toi *_B_transposed=nullptr;
    void *_working_space=nullptr;

    Tab *_accumulation_buffer=nullptr;

    /* Output stage */
    OutputStage _os;

    /* Quantized support (in addition to 'output stage' above) */
    int32_t *col_bias = nullptr;

    /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _Nsize * _nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }
379
Pablo Telloeb82fd22018-02-23 13:43:50 +0000380 /* We will need to walk through the blocks of B in a few contexts, so
381 * factor that out. */
Anthony Barbier5f707732018-07-03 16:22:02 +0100382 class blockwalker {
Pablo Telloeb82fd22018-02-23 13:43:50 +0000383 private:
Michalis Spyroue7e96e02018-04-13 13:44:10 +0100384 /* Size loops, etc. based on our parent's configuration */
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000385 const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &_parent;
Pablo Telloeb82fd22018-02-23 13:43:50 +0000386
Anthony Barbier5f707732018-07-03 16:22:02 +0100387 /* K, X and multi parameters for current iteration. */
388 unsigned int _k0=0, _x0=0, _multi=0;
Pablo Telloeb82fd22018-02-23 13:43:50 +0000389
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000390 /* Range of X to iterate over - used in "ForceThreadColumns" cases */
391 unsigned int _x_start=0;
392 unsigned int _x_end=_parent._Nsize;
393
Anthony Barbier5f707732018-07-03 16:22:02 +0100394 unsigned int _index=0;
395 bool _done=false;
396 bool _newkblock=true;
397 bool _newmulti=true;
Pablo Telloeb82fd22018-02-23 13:43:50 +0000398
399 public:
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000400 blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &parent) : _parent(parent) { }
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000401
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000402 blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &parent,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000403 unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (_x_start), _x_start(x_start), _x_end(x_end) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _x_end);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ktotal);
        }

        /* Advance to the next block, return false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _x_end) {
                _x0=_x_start;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ktotal) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                    _newmulti=true;
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };
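    // Iteration order is therefore: all X blocks for the current K block, then the
    // next K block, and only then the next multi; newkblock() is true on the first
    // step of each new K block so that the A panel can be re-prepared at that point.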

    // "k block" has two distinct uses: figuring out which iterations of K
    // to actually process, but also various size/pointer computations. The
    // latter needs to take account of the extra space needed for the row
    // sums, if appropriate.
    unsigned int get_total_k_depth() const {
        unsigned int k_depth = _k_block;

        if (std::is_same<OutputStage, Requantize32>::value) {
            k_depth += sizeof(int32_t) / sizeof(Toi);
        }

        return k_depth;
    }
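    // For example, for 8-bit requantizing GEMMs (1-byte Toi) this adds
    // sizeof(int32_t)/sizeof(Toi) == 4 extra elements per row, which is where the
    // interleaved per-row sums live.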

    // A working size.
    size_t get_a_working_size() const {
        if (_thread_columns) {
            // For 2D threading: allocate a buffer of one block of rows per thread
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
        } else {
            // For 1D threaded: one of these needed, regardless of thread count. Divided according to window.
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
        }
    }

    // C working size: One needed per thread. Not needed if there is no merge step.
    size_t get_c_working_size() const {
        if (MergeStep) {
            return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
        } else {
            return 0;
        }
    }

    // Accumulation buffer size
    size_t get_accumulation_buffer_size() const {
        // We only support an accumulation buffer for non-merge cases.
        if (MergeStep) {
            return 0;
        }

        // Check if we are actually blocking
        if (_k_block == _Ktotal) {
            return 0;
        }

        // We are no-merge, non-quantized with active blocking: accumulation buffer needed.
        size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width();
        size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;

        return num_buffers * size_per_buffer;
    }

    // Get pointer into accumulation buffer
    Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
        // Don't do anything if there's no buffer.
        if (_accumulation_buffer == nullptr) {
            return nullptr;
        }

        // Here we are indexing an appropriately sized pointer, so no sizeof() needed to convert to bytes.
        size_t size_per_buffer = strategy::out_height() * strategy::out_width();

        size_t buffer_rows = iceildiv(_Msize, strategy::out_height());
        size_t buffer_cols = iceildiv(_Nsize, strategy::out_width());
        size_t buffers_per_batch = (buffer_rows * buffer_cols);
        size_t buffers_per_multi = buffers_per_batch * _nbatches;

        // M/N must reference the top-left corner of a block.
        size_t row = M / strategy::out_height();
        assert(M % strategy::out_height() == 0);
        size_t col = N / strategy::out_width();
        assert(N % strategy::out_width() == 0);

        size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col;

        return _accumulation_buffer + (buffer_index * size_per_buffer);
    }
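    // Illustrative example (assuming out_height()==8 and out_width()==12): for a
    // single batch and multi, the block with top-left corner (M=16, N=24) lands at
    // buffer_index = 2*buffer_cols + 2, with each block occupying 8*12 == 96 elements.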

    int32_t row_sum_multiplier() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os);

            return -qp->b_offset;
        }

        return 0;
    }
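    // The A transforms multiply the computed per-row sums by this value, so the
    // packed row sums already carry the -b_offset * sum_k(A[row][k]) correction term
    // needed by asymmetric requantization; a multiplier of 0 means no correction is required.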

    // Heuristics to decide whether to use the 'thread columns' regime
    static bool is_thread_columns(const GemmArgs &args) {
        // For now, there is a template parameter to force it.
        if (ForceThreadColumns) {
            return true;
        }

        // Never do this for single threaded cases.
        if (args._maxthreads == 1) {
            return false;
        }

        // How many blocks of work are available for threading on M?
        int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;

        // If we just can't share the work across threads with the row threading regime.
        if (args._maxthreads > m_blocks) {
            return true;
        }

        // If the row threading regime is too wasteful (20% threshold)
        if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) {
            return true;
        }

        return false;
    }
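    // Worked example (illustrative, assuming out_height()==8): M=100 with one batch
    // gives 13 row blocks, so 16 threads trips the first test (16 > 13); with 12
    // threads, roundup(13,12)==24 and (24*100)/13 == 184 > 120, so the waste test
    // also selects the 2D regime.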

    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }

    static unsigned int get_k_block_size(const GemmArgs &args) {
        if (args._cfg && args._cfg->inner_block_size) {
            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
        }

        // K blocking not supported if we are requantizing.
        if (std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        const unsigned int L1_size = args._ci->get_L1_cache_size();
        unsigned int k_block;

        // k_block: Find out how much of the larger array can be loaded into half the cache.
        // This should account for associative caches.
        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

        // Needs to be (at least a single) multiple of the K unroll level.
        k_block /= strategy::k_unroll();
        k_block = std::max(k_block, 1U) * strategy::k_unroll();

        // Now tune to presented problem size; this is how many blocks we need.
        unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);

        // So divide the space equally into that many blocks.
        k_block = iceildiv(get_ktotal(args), num_k_blocks);

        // And round UP to the K unroll level required.
        k_block = roundup(k_block, strategy::k_unroll());

        assert(k_block > 0);

        return k_block;
    }
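    // Illustrative sizing (assuming a 32kB L1, 4-byte Toi, out_width()==12,
    // out_height()==8, k_unroll()==1 and Ktotal==1000): half the L1 gives
    // 16384/(4*12) == 341; that needs iceildiv(1000,341) == 3 blocks, so the depth
    // is rebalanced to iceildiv(1000,3) == 334 per block.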

    static unsigned int get_x_block_size(const GemmArgs &args) {
        if (is_thread_columns(args)) {
            // In 2D mode, override X block, because we will process width first.
            return roundup(args._Nsize, strategy::out_width());
        }

        if (args._cfg && args._cfg->outer_block_size) {
            return roundup(args._cfg->outer_block_size, strategy::out_width());
        }

        unsigned int x_block;
        const unsigned int L2_size = args._ci->get_L2_cache_size();
        const unsigned int k_block = get_k_block_size(args);

        // x_block: Work out how many rows (of length k_block) will fit in the L2
        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());

        // .. if the L1 contents is bigger than the L2, just return a minimal size block.
        if (k_block_area > scaled_l2_size) {
            return strategy::out_width();
        }

        x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);

        // Needs to be (at least a single) multiple of the kernel output width.
        x_block /= strategy::out_width();
        x_block = std::max(x_block, 1u) * strategy::out_width();

        // And tune to the presented problem size.
        unsigned int num_x_blocks = iceildiv(args._Nsize, x_block);
        x_block = iceildiv(args._Nsize, num_x_blocks);

        x_block = roundup(x_block, strategy::out_width());

        assert(x_block > 0);

        return x_block;
    }
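    // Continuing the example above with a 512kB L2 and Nsize==1000: roughly 460kB
    // remains after the k_block-deep A and B panel area, giving an initial x_block
    // of 333 columns, which floors to 324, rebalances over 4 blocks to 250, and
    // rounds back up to 252.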

public:
    GemmInterleaved(GemmInterleaved &) = delete;
    GemmInterleaved & operator= (GemmInterleaved &) = delete;

    /* Constructor */
    GemmInterleaved(const GemmArgs &args, const OutputStage &os)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
          _os(os) { }

    /* Constructor without OutputStage */
    GemmInterleaved(const GemmArgs &args)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
          _os() { }

    // Interface implementation - Compulsory functions

    // Window size: Only the last thread should do a ragged block, so dole
    // out work in units of out_height. Factor batches into the window, but
    // not multi for now (as this would cause problems with the buffer
    // manager).
    ndrange_t get_window_size() const override {
        unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;

        if (_thread_columns) {
            return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
        } else {
            // _Mround is a multiple of out_height by definition.
            return { row_blocks };
        }
    }
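    // e.g. with M==100, out_height()==8 and two batches the 1D window is 26 units;
    // in thread-columns mode a second dimension of iceildiv(N, out_width()) is added
    // so work can also be divided across the width of the output.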

    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif

        /* Make sure we've been set up correctly. */
        assert(FixedFormat || _B_transposed);
        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        /* Align if needed */
        intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
        if (working_space_v & 0x3f) {
            intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
            working_space_bytes += alignment_offset;
        }

        strategy strat(_ci);

        const auto start = work_range.get_position(0);
        const auto end   = work_range.get_position_end(0);

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = start / window_per_batch;
        unsigned int batch_end = end   / window_per_batch;

        // In ThreadColumns mode, process work one horizontal strip at a time.
        // Transpose the block of needed rows at the start, then do all the work on that block.
        if (_thread_columns) {
            const auto start_x = work_range.get_position(1) * strategy::out_width();
            const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);

            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
                                                          (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));

            for (unsigned int multi=0; multi<_nmulti; multi++) {
                for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                    unsigned int kmax=std::min(k0+_k_block, _Ktotal);

                    unsigned int rounded_width = roundup(_Nsize, strategy::out_width());

                    const bool first_pass = (k0==0);
                    const bool last_pass = (kmax==_Ktotal);

                    // Figure out how many "K" the kernel will actually process.
                    unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());

                    const Toi *b_ptr = FixedFormat ?
                        reinterpret_cast<const Toi *>(this->_Bptr) + (multi * this->_B_multi_stride) +
                            ((start_x / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
                            (k0 * get_stripe_width<strategy, FixedFormat>::get()) :
                        _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);

                    unsigned int batch = batch_0;
                    unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();

                    for (unsigned int p=start; p<end; p++) {
                        unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);

                        // Set up transposed 'A' block
                        {
#ifdef CYCLE_PROFILING
                            auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
#endif
                            // See comment above on transform_type<> class: this extracts either 'transforms' or
                            // 'transforms_quantized' as appropriate.
                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                            if (_indirect_buf != nullptr) {
                                transforms.PrepareA_indirect(a_panel,
                                                             _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                                             _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else if (_convolver) {
                                transforms.PrepareA_convolution(a_panel,
                                                                this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                                                this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else {
                                transforms.PrepareA(a_panel,
                                                    this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                                    this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
                            }
                        }

                        // Perform the kernel and merge step, either separately or together as required.
                        kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
                        #ifdef CYCLE_PROFILING
                            prof,
                        #endif
                            // Strategy and panel pointers
                            strat, a_panel, b_ptr, this->_ldb, c_panel,
                            // Result buffer pointers
                            this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
                            // K size, and M/N ranges
                            kern_k, start_row, end_row, start_x, end_x,
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in quantization parameters for requantizing kernels (others will ignore)
                            _os, col_bias + (multi * _Nsize),
                            // Accumulation buffer (not yet implemented on this path)
                            static_cast<Tab *>(nullptr));

                        /* Increment to the next block */
                        start_row += strategy::out_height();
                        if (start_row >= _Msize) {
                            start_row = 0;
                            batch++;
                        }
                    }
                }
            }
        } else {
            blockwalker current(*this);

            /* Compute the M values to operate on */
            unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
            unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();

            // Private buffers. Treat working_space as an array of C buffers
            // (one per thread) first, followed by the (window-divided) A
            // buffer.
            // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

            const Toi *b_panel;
            b_panel = _B_transposed;

            // newkblock() is always true on the first iteration, so these will be set properly on the first loop.

            // kern_k tracks the accumulation depth for the CURRENT K block; a_panel_stride similarly tracks the total
            // stride of the A panel (i.e. with 4 added for cases with embedded row sums).

            // These are distinct from k_block and get_total_k_depth(), which are based on the target K block size and
            // used for addressing inside a_panel.

            // In cases where K blocking is in use and the blocks are not all the same size, the (smaller) final block
            // won't use all the memory allocated.
            unsigned int kern_k = 0;
            unsigned int a_panel_stride = 0;

            for (;!current.done();current.advance()) {
                if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                    // See comment above on transform_type<> class: this extracts either 'transforms' or
                    // 'transforms_quantized' as appropriate.
                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                    for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                        unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                        unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                        if (first_m >= last_m)
                            continue;

                        if (_indirect_buf != nullptr) {
                            transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                         _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                                         _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else if (_convolver) {
                            transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                                            this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else {
                            transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                                this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
                        }
                    }

                    // Figure out how many "K" the kernel will actually process.
                    kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());

                    // Requantizing GEMMs have the row sums built in to the
                    // transposed data, so the stride between rows is 4 bytes
                    // larger than the (rounded) K value.

                    if(std::is_same<OutputStage, Requantize32>::value) {
                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
                    } else {
                        a_panel_stride = kern_k;
                    }
                }

                // For FixedFormat cases, figure out the B pointer. The loop below moves through batches and vertically through the output so this will be the same throughout.
                if (FixedFormat) {
                    b_panel = reinterpret_cast<const Toi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) +
                              ((current.x0() / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
                              (current.k0() * get_stripe_width<strategy, FixedFormat>::get());
                }

                /* Do the actual work. */
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();

                    if (first_m >= last_m)
                        continue;

                    // For the merge case we need to do this out_height() rows
                    // at a time, as that is the size of our intermediate
                    // buffer. If we are not doing that, we can do all the
                    // relevant rows in one go.
                    unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);

                    // But in the case where we have an accumulation buffer, we can't do that after all, unless
                    // there is no N blocking.
                    if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
                        m_step = strategy::out_height();
                    }

                    for (unsigned int y=first_m; y<last_m; y+=m_step) {
                        unsigned int ymax = std::min(_Msize, y + m_step);

                        const bool first_pass = (current.k0() == 0);
                        const bool last_pass  = (current.kmax() == _Ktotal);

                        // Pointer to appropriate part of result array.
                        Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);

                        // If we are using an accumulation buffer, pass a null result pointer so the kernel
                        // writes into the accumulation buffer instead, except on the last pass.
                        if (_accumulation_buffer && !last_pass) {
                            result_ptr = nullptr;
                        }

                        // Perform the kernel and merge step, either separately or together as required.
                        kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
                        #ifdef CYCLE_PROFILING
                            prof,
                        #endif
                            // Strategy and panel pointers
                            strat, a_ptr, b_panel, this->_ldb, c_panel,
                            // Result buffer pointers
                            result_ptr, this->_ldc,
                            // K size, and M/N ranges
                            kern_k, y, ymax, current.x0(), current.xmax(),
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in quantization parameters for requantizing kernels (others will ignore)
                            _os, col_bias + (current.multi() * _Nsize),
                            // Accumulation buffer
                            get_accumulation_buffer(y, current.x0(), batch, current.multi()) );

                        a_ptr += (strategy::out_height() * a_panel_stride);
                    }
                }

                if (FixedFormat == false) {
                    b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
                }
            }
        }
    }

    // Interface implementation - working space
    size_t get_working_size() const override {
        // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();

        size += 128; // Add on two cache lines extra for alignment.

        return size;
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);

        size_t diff=0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }

        working_space_bytes += diff;
        working_space_int += diff;

        // Pretransposed case: just set internal pointer to parameter value.
        _working_space = reinterpret_cast<void *>(working_space_bytes);

        // Set up accumulation buffer
        if (get_accumulation_buffer_size() > 0) {
            intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
            // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length)
            if (acc_buff_int & 0x3F) {
                acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
            }
            _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
        } else {
            _accumulation_buffer = nullptr;
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return (FixedFormat == false);
    }

    bool B_pretranspose_required() const override {
        return (FixedFormat == false) && (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        if (FixedFormat) {
            return 0;
        }

        unsigned int x_size = roundup(_Nsize, strategy::out_width());

        return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
    }

    void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_nmulti; i++) {
                // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
                compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
            }
        }
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        requantize_bias(in_buffer, B, ldb, B_multi_stride);

        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        blockwalker current(*this);
        strategy strat(_ci);

        do {
            /* Figure out the size of each block. */
            unsigned int k_size = (current.kmax() - current.k0());

            if (_Ksections > 1) {
                // We need to insert padding at the end of each K section.
                // The computation needed is a little delicate - the coordinates from the block walker are expressed in
                // terms of the full, padded, _Ktotal.
                // But we need to transform each section with reference to the original, unpadded, input, letting the
                // transform pad each section as needed.

                // This is needed for computations below.
                const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());

                // The expected output format is also an entire <out_width> columns interleaved, then the next set of
                // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
                // a time.
                for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) {
                    unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());

                    // Track where we are and how much work is left.
                    unsigned int kpos  = current.k0();
                    unsigned int kleft = k_size;

                    while (kleft) {
                        // Which section are we in? Based on the rounded-up section size.
                        unsigned int k_section_base = kpos / rounded_section_size;
                        // How far into the section are we?
                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                        // We will either copy the rest of this section, or to the end of the requested length.
                        unsigned int k_length = std::min(_Ksize - k_offset, kleft);

                        strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                                  x0, xmax,
                                                  (k_section_base * _Ksize) + k_offset,               // K starting point - compute row to read based on our section and the true section length.
                                                  (k_section_base * _Ksize) + k_offset + k_length);   // K end point - starting point plus length computed above.

                        // We need to modify our position based on the ROUNDED version of what we just did.
                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                        buffer += strategy::out_width() * padded_length;

                        kpos  += padded_length;
                        kleft -= padded_length;
                    }
                }
            } else {
                // In the single K section case, can process the whole lot in one go.
                // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
                strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                          current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
                buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
            }
        } while (current.advance());
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }

    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        assert(parms.input_channels == _Ksize);
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }

    // Estimate cycles for given problem given provided parameters
    template<typename perf_type>
    static uint64_t estimate_cycles(const GemmArgs &args) {
        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));

        const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);

        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);

        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;

        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;

        // We can't thread over multis or width, which makes this a poor
        // choice in many threaded cases. Penalize that here.
        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;

        if (parallelism_available < args._maxthreads) {
            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
        }

        return static_cast<uint64_t>(total_cycles);
    }
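    // The returned value is a nominal cycle estimate intended for ranking this method
    // against other candidate GEMM implementations for the same problem, rather than
    // as an absolute wall-clock prediction.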

    GemmConfig get_config() override {
        GemmConfig c;

        c.method = GemmMethod::GEMM_INTERLEAVED;
        c.inner_block_size = _k_block;
        c.outer_block_size = _x_block;
        c.filter = get_type_name<strategy>();
        c.weight_format = get_weight_format(get_kernel_weight_format<strategy, FixedFormat, To>::get(), sizeof(To));

        return c;
    }
};

// Aliases for the variations
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;

template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmInterleavedFixedFormat = GemmInterleaved<strategy, To, Tr, OutputStage, true, true>;

template<typename strategy, typename To, typename Tr>
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;

template<typename strategy, typename To, typename Tr>
using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;

} // namespace arm_gemm