/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

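// Minimal usage sketch. All shapes, scales and offsets below are illustrative
// assumptions, and the exact TensorInfo/QuantizationInfo constructors may vary
// between library releases:
//
//   CLScheduler::get().default_init();
//
//   CLTensor a, b, output; // A: M x K, B: K x N, output: M x N (shapes are [width, height])
//   a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 16)));
//   b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
//   output.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32));
//
//   CLGEMMLowpMatrixMultiplyCore gemmlowp;
//   gemmlowp.configure(&a, &b, &output); // configure before allocating
//
//   a.allocator()->allocate();
//   b.allocator()->allocate();
//   output.allocator()->allocate();
//   // ... fill a and b with quantized data, then:
//   gemmlowp.run();
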
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
      _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true)
{
}

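// GEMMLowp pipeline: optionally reshape A (interleave 4x4) and B (transpose 1x16), multiply the
// two QASYMM8 matrices with S32 accumulation, compute the row sums of A and the column sums of B,
// and finally fold the quantization offset corrections into the S32 output (see the decomposition
// next to the offset contribution kernel below).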
void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");

    _a_offset = a->info()->quantization_info().offset;
    _b_offset = b->info()->quantization_info().offset;

    // If matrix A has 16 rows or fewer, run a special version of GEMMLowp without reshaping the
    // input tensors: with so few rows, the cost of interleaving/transposing would not be amortized
    _is_interleaved_transposed = a->info()->dimension(1) > 16;

    const ICLTensor *matrix_a = a;
    const ICLTensor *matrix_b = b;

    if(_is_interleaved_transposed)
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
        TensorShape shape_tmp_a = a->info()->tensor_shape();
        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));

        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        TensorShape shape_tmp_b = b->info()->tensor_shape();
        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));

        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
        _tmp_a.allocator()->init(info_a);
        _tmp_b.allocator()->init(info_b);
        _memory_group.manage(&_tmp_a);
        _memory_group.manage(&_tmp_b);

        // Configure interleave kernel
        _mtx_a_reshape_kernel.configure(a, &_tmp_a);

        // Configure transpose kernel
        _mtx_b_reshape_kernel.configure(b, &_tmp_b);
    }
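
    // For reference, the interleave kernel above lays each 4x4 block of A out on a single row
    // (diagram following the CLGEMMInterleave4x4Kernel documentation):
    //
    //   | a00 a01 a02 a03 |
    //   | a10 a11 a12 a13 |  ==>  [ a00 a10 a20 a30 a01 a11 a21 a31 a02 a12 a22 a32 a03 a13 a23 a33 ]
    //   | a20 a21 a22 a23 |
    //   | a30 a31 a32 a33 |
    //
    // and the transpose1xW kernel transposes B in 1x16 blocks, so that the matrix multiply kernel
    // can stream 16 consecutive values of both operands with linear accesses.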

    // Configure matrix multiply kernel
    _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed);

    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
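        // vector_sum_col[j] will hold the sum over k of B(k, j): one S32 value per output column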
        TensorShape shape_vector_sum_col = b->info()->tensor_shape();
        if(b->info()->num_dimensions() > 1)
        {
            shape_vector_sum_col.remove_dimension(1);
        }
        TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
        _vector_sum_col.allocator()->init(info_vector_sum_col);
        _memory_group.manage(&_vector_sum_col);

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
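        // vector_sum_row[i] will hold the sum over k of A(i, k): one S32 value per output row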
        TensorShape shape_vector_sum_row = a->info()->tensor_shape();
        shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
        if(a->info()->num_dimensions() > 1)
        {
            shape_vector_sum_row.remove_dimension(1);
        }
        TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
    }

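    // The reductions feed the offset contribution step, following the standard gemmlowp
    // decomposition of a quantized matrix product, with K = a->info()->dimension(0):
    //
    //   sum_k (A(i,k) - a_offset) * (B(k,j) - b_offset)
    //     = sum_k A(i,k) * B(k,j)       <- matrix multiply kernel
    //     - b_offset * sum_k A(i,k)     <- needs vector_sum_row, skipped when b_offset == 0
    //     - a_offset * sum_k B(k,j)     <- needs vector_sum_col, skipped when a_offset == 0
    //     + K * a_offset * b_offset     <- constant term
    //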
    // Configure offset contribution kernel
    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);

    // Allocate tensors
    if(_is_interleaved_transposed)
    {
        _tmp_a.allocator()->allocate();
        _tmp_b.allocator()->allocate();
    }

    if(_a_offset != 0)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}

void CLGEMMLowpMatrixMultiplyCore::run()
{
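    // All kernels are enqueued without flushing (flush = false); only the final enqueue flushes
    // the CL command queue. When a memory manager is provided, the managed intermediate tensors
    // are only guaranteed to be backed by memory between acquire() and release().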
    _memory_group.acquire();

    if(_is_interleaved_transposed)
    {
        // Run reshape matrix A
        CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);

        // Run reshape matrix B
        CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
    }

    // Run matrix multiply
    CLScheduler::get().enqueue(_mm_kernel, false);

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
    }

    // Run offset contribution kernel
    CLScheduler::get().enqueue(_offset_contribution_kernel, true);

    _memory_group.release();
}