/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEDMATRIXMULTIPLYWRAPPER_H__
#define __ARM_COMPUTE_NEGEMMINTERLEAVEDMATRIXMULTIPLYWRAPPER_H__

#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/WindowIterator.h"

#include <vector>

namespace arm_compute
{
class ITensor;

/** Unit of work for @ref NEGEMMInterleavedMatrixMultiplyWrapper to process */
struct MatrixMultiplyWorkload
{
    /** Constructor
     *
     * @param[in] offset_transformed_b Offset from the start of transformed_b's allocation.
     * @param[in] x0                   First value to process along the X dimension (N).
     * @param[in] xmax                 Last value to process along the X dimension (N).
     * @param[in] k0                   First value to process along the K dimension.
     * @param[in] kmax                 Last value to process along the K dimension.
     * @param[in] multi                Multi index.
     * @param[in] kern_k               Number of elements along K actually processed by the kernel.
     * @param[in] bblocks              Number of x_blocks processed by the kernel.
     */
    MatrixMultiplyWorkload(unsigned int offset_transformed_b, unsigned int x0, unsigned int xmax, unsigned int k0, unsigned int kmax, unsigned int multi, int kern_k, int bblocks)
        : _offset_transformed_b(offset_transformed_b), _x0(x0), _xmax(xmax), _k0(k0), _kmax(kmax), _multi(multi), _kern_k(kern_k), _bblocks(bblocks)
    {
    }
    unsigned int _offset_transformed_b; /**< Offset from the start of transformed_b's allocation. */
    unsigned int _x0;                   /**< First value to process along the X dimension (N). */
    unsigned int _xmax;                 /**< Last value to process along the X dimension (N). */
    unsigned int _k0;                   /**< First value to process along the K dimension. */
    unsigned int _kmax;                 /**< Last value to process along the K dimension. */
    unsigned int _multi;                /**< Multi index. */
    int          _kern_k;               /**< Number of elements along K actually processed by the kernel. */
    int          _bblocks;              /**< Number of x_blocks processed by the kernel. */
};

/** Common interface for the templated wrappers around the matrix multiply NEON assembly implementations */
class NEGEMMInterleavedMatrixMultiplyWrapper
{
public:
    /** Transform the block at the given coordinates
     *
     * @param[in] wl           Workload to process.
     * @param[in] info         Information about the current thread.
     * @param[in] batch_window Window containing iteration information for the M and batch dimensions.
     * @param[in] start_offset Offset relative to the beginning of batch_window to start the processing from.
     * @param[in] end_offset   Offset relative to the beginning of batch_window to stop the processing.
     */
    virtual void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) = 0;
    /** Generate an array of workloads
     *
     * @param[out] workloads Container to store the generated workloads.
     */
    virtual void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) = 0;
    /** Default destructor */
    virtual ~NEGEMMInterleavedMatrixMultiplyWrapper() = default;
};
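
// A minimal usage sketch (illustrative only, not the library's scheduler): assuming `mm` points to a
// configured wrapper and that `info`, `batch_window`, `start` and `end` have been prepared by the
// caller, the generated workloads could be processed one by one as follows:
//
//     std::vector<MatrixMultiplyWorkload> workloads;
//     mm->create_workloads(workloads);
//     for(const MatrixMultiplyWorkload &wl : workloads)
//     {
//         mm->transform(wl, info, batch_window, start, end);
//     }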

/** Equivalent to arm_gemm::GemmInterleaved's strategy::kernel() but using Compute Library types. */
template <typename strategy>
class NEGEMMInterleavedMatrixMultiplyWrapperTemplate : public NEGEMMInterleavedMatrixMultiplyWrapper
{
public:
    /** Configure the matrix multiplication: C = alpha * A * B + beta * C
     *
     * @param[in]     prepared_a          Already reshaped matrix A.
     * @param[in]     transformed_b       Already reshaped matrix B.
     * @param[out]    tmp_c               Temporary buffer to be used to store intermediate results.
     * @param[in,out] c                   Result matrix C.
     * @param[in]     block_walker        Window containing iteration information for the N, K and multi dimensions.
     * @param[in]     block_sizes         Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
     * @param[in]     params              M, N, K sizes.
     * @param[in]     b_is_pretransposed  Is B also pretransposed?
     * @param[in]     alpha               Alpha value.
     * @param[in]     beta                Beta value.
     * @param[in]     max_num_threads     Maximum number of threads that might be used for the calculations.
     */
    void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, const BlockSizes &block_sizes,
                   const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
    {
        _prepared_a         = prepared_a;
        _transformed_b      = transformed_b;
        _tmp_c              = tmp_c;
        _c                  = c;
        _block_walker       = block_walker;
        _block_sizes        = block_sizes;
        _params             = params;
        _b_is_pretransposed = b_is_pretransposed;
        _alpha              = alpha;
        _beta               = beta;

        auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));
    }
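
    // A hedged usage sketch (all names below are illustrative, not part of the library): the caller is
    // assumed to have already reshaped A and B, computed the block sizes and block walker window, and
    // filled in the GEMM parameters and thread count before calling configure():
    //
    //     NEGEMMInterleavedMatrixMultiplyWrapperTemplate<strategy_t> mm;
    //     mm.configure(&prepared_a, &transformed_b, &tmp_c, &c, block_walker, block_sizes,
    //                  params, /* b_is_pretransposed = */ true, 1.f, 0.f, max_threads);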

    // Inherited methods overridden:
    void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override
    {
        strategy strat(info.cpu_info);
        TensorAccessor<typename strategy::operand_type> prepared_a(*_prepared_a);
        TensorAccessor<typename strategy::operand_type> transformed_b(*_transformed_b);
        TensorAccessor<typename strategy::result_type>  c(*_c);
        TensorAccessor<typename strategy::result_type>  tmp_c(*_tmp_c);

        int                              prev_batch = -1;
        typename strategy::operand_type *a_ptr      = nullptr;
        auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
        {
            const unsigned int y     = id.x();
            const unsigned int batch = id.y();
            const unsigned int ymax  = std::min(_params.M, y + strategy::out_height());

            // If it's the first block of a new batch then reset the pointer to A.
            if(prev_batch != static_cast<int>(batch))
            {
                const unsigned int first_m = id.x();
                a_ptr                      = prepared_a(0, first_m, batch);
                prev_batch                 = batch;
            }

            // Call matrix multiply assembly routine to process the block:
            strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);
            a_ptr += strategy::out_height() * wl._kern_k;

            // Merge the result with the other blocks' results:
            strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<typename strategy::result_type>(1)));
        });
        auto on_new_row_size = [&](unsigned int start, unsigned int end)
        {
            // Nothing to do
        };
        window_iterator.iterate_2D(on_new_row_size);
    }
    void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) override
    {
        unsigned int offset_transformed_b = 0;
        unsigned int wl_index             = 0;
        unsigned int num_buffers = 0, reshaped_block_size = 0;

        if(!_b_is_pretransposed)
        {
            num_buffers         = _transformed_b->info()->tensor_shape()[1];
            reshaped_block_size = _transformed_b->info()->tensor_shape()[0];
        }
        execute_window_loop(_block_walker, [&](const Coordinates & id)
        {
            const unsigned int x0    = id.x();
            const unsigned int k0    = id.y();
            const unsigned int multi = id.z();

            const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N);
            const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K);

            // Figure out how many "K" the kernel will actually process.
            const int kern_k  = ceil_to_multiple(kmax - k0, strategy::k_unroll());
            const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width());

            workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks));

            if(_b_is_pretransposed)
            {
                offset_transformed_b += bblocks * strategy::out_width() * kern_k;
            }
            else
            {
                // Rotate through the BufferManager's buffers:
                wl_index++;
                offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size;
            }
        });
    }
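
    // Worked example for the kern_k / bblocks computation above (illustrative numbers only): assuming
    // strategy::out_width() == 12 and strategy::k_unroll() == 4, a block with xmax - x0 = 50 and
    // kmax - k0 = 58 yields kern_k = ceil_to_multiple(58, 4) = 60 and bblocks = DIV_CEIL(50, 12) = 5,
    // i.e. the kernel walks 5 blocks of 12 columns with the K extent padded up to the unroll factor.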

private:
    const ITensor                 *_prepared_a{ nullptr };
    const ITensor                 *_transformed_b{ nullptr };
    ITensor                       *_tmp_c{ nullptr };
    ITensor                       *_c{ nullptr };
    unsigned int                   _Nsize{ 0 };
    unsigned int                   _Ksize{ 0 };
    bool                           _transpose_b{ false };
    BlockSizes                     _block_sizes{};
    INEGEMMWrapperKernel::Params   _params{};
    Window                         _block_walker{};
    bool                           _b_is_pretransposed{ false };
    typename strategy::result_type _alpha{};
    typename strategy::result_type _beta{};
};

} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDMATRIXMULTIPLYWRAPPER_H__ */