/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <set>
#include <string>

using namespace arm_compute;

namespace
{
using ElementsProcessed = Steps;

inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
    if(!is_interleaved_transposed)
    {
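        // Without reshaping, the GEMM inner dimensions must agree: the number of
        // columns of A (dimension 0) has to match the number of rows of B (dimension 1).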
        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
    }

    return Status{};
}

inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
                                                               bool is_interleaved_transposed, GPUTarget gpu_target,
                                                               ElementsProcessed &num_elements_processed)
{
    bool   window_changed = false;
    Window win{};

    const DataType data_type                           = input0->data_type();
    unsigned int  &num_elems_processed_per_iteration_x = num_elements_processed[0];
    unsigned int  &num_elems_processed_per_iteration_y = num_elements_processed[1];

    if(is_interleaved_transposed)
    {
        // Configure kernel window
        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
        num_elems_processed_per_iteration_y = 4;

        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

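        // Note on the access windows below: in this branch the inputs have already been
        // reshaped (4x4 interleave for A, 1xW transpose for B), so the windows are scaled
        // to map output coordinates onto the reshaped layouts; the 0.25f factors are
        // understood to reflect that packing (based on the companion reshape kernels).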
        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);

        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
    }
    else // The input tensors have not been reshaped
    {
        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);

        // Create kernels according to the architecture, data type and input size.
        if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
        {
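            // Mirror the kernel choice made in configure(): the narrower two-element
            // vector width pairs with the "bifrost_1000" kernel selected there for
            // vector-by-matrix cases with at most 1000 columns in B.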
            num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
        }

        // Configure window
        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

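        // Statically pad both inputs up to a multiple of the processing block so that
        // vector loads at the right/bottom borders stay inside the allocated tensors.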
        AccessWindowStatic    input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
        AccessWindowStatic    input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);

        Coordinates coord;
        coord.set_num_dimensions(output->num_dimensions());
        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
    }

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
    : _input0(nullptr), _input1(nullptr), _output(nullptr)
{
}

void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

    // Perform validate step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));

    _input0 = input0;
    _input1 = input1;
    _output = output;

    const DataType data_type = input0->info()->data_type();
    const int      fp_pos    = input0->info()->fixed_point_position();

    // Get target architecture
    GPUTarget arch_target = get_arch_from_target(get_target());

    // Configure LWS hint
    if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
    {
        // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
        _lws_hint = cl::NDRange(2, 2);
    }
    else if(output->info()->dimension(1) == 196)
    {
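        // 196 matches a 14x14 output feature map in GEMM-based convolution (a shape
        // common to several popular networks); the 1x7 local size is assumed to have
        // been picked by benchmarking such cases.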
        _lws_hint = cl::NDRange(1, 7);
    }
    else
    {
        _lws_hint = cl::NDRange(8, 8);
    }

    ElementsProcessed num_elements_processed{};

    // Configure kernel window
    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);

    // Create build options
    CLBuildOptions build_opts;
    build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));

    // Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
    if(std::abs(1.0f - alpha) > 0.00001f)
    {
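        // For fixed-point data types, alpha has to be converted to the matching QS8/QS16
        // fixed-point representation before it is baked into the kernel build options.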
        build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
                                      "-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
                                      "-DALPHA=" + float_to_string_with_full_precision(alpha));
    }

    std::string kernel_name;
    if(is_interleaved_transposed)
    {
        build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
        if(data_type == DataType::F32)
        {
            kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
        }
        else
        {
            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
        }
    }
    else // The input tensors have not been reshaped
    {
        build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));

        // Create kernels according to the architecture, data type and input size.
        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
        {
            // The first kernel is optimized for the case of 1000 or fewer output elements (e.g. FC8 of AlexNet and VGG-16, and
            // FC1 of Inception v3). The second kernel is optimized for the case of more than 1000 output elements (e.g.
            // FC6 and FC7 of AlexNet and VGG-16).
            kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";

            // A work-group size equal to the Bifrost quad size has proven to be optimal for these kernels
            // via exhaustive autotuning over a range of representative layer configurations.
            _lws_hint = cl::NDRange(4);
        }
        else if(is_data_type_fixed_point(data_type))
        {
            kernel_name = "gemm_mm_" + lower_string(string_from_data_type(data_type));
        }
        else // (MIDGARD and F32) or (F16)
        {
            build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
            kernel_name = "gemm_mm_floating_point";
        }
        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
    }

    // Create kernel
    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set config_id for enabling LWS tuning
    _config_id = "gemm_";
    _config_id += (is_interleaved_transposed ? "reshaped_" : "");
    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(1));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(0));
    _config_id += "_";
    _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}

Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
{
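    // Mirror configure() without touching the caller's objects: clones of the tensor
    // infos are passed so the window computation can adjust padding on copies.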
    ElementsProcessed num_elements_processed{};
    ARM_COMPUTE_UNUSED(alpha);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                              input1->clone().get(),
                                                              output->clone().get(),
                                                              is_interleaved_transposed,
                                                              gpu_target,
                                                              num_elements_processed)
                                .first);

    return Status{};
}

void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

    Window slice          = window.first_slice_window_2D();
    Window slice_matrix_b = slice;

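    // Matrix B does not advance with the output slices: collapse its XY window to a
    // single step at the origin so the loop below can rebind the same 2D view of B.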
    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    do
    {
        Window slice_b = slice;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2.
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation.
        if(_input1->info()->num_dimensions() < 3)
        {
            slice_b = slice_matrix_b;
        }

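        // Bind the tensor arguments for this slice; each add_2D_tensor_argument call
        // advances idx past the arguments it sets.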
        unsigned int idx = 0;
        add_2D_tensor_argument(idx, _input0, slice);
        add_2D_tensor_argument(idx, _input1, slice_b);
        add_2D_tensor_argument(idx, _output, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window.slide_window_slice_2D(slice));
}