blob: 80606dcc07ce43335905e3289557c08d1b2982d0 [file] [log] [blame]
Pablo Tello27066c22017-11-23 11:01:10 +00001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/IAccessWindow.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/TensorInfo.h"
32#include "arm_compute/core/Types.h"
33#include "arm_compute/core/Utils.h"
34#include "arm_compute/core/Validate.h"
35#include "arm_compute/core/Window.h"
36#include "support/ToolchainSupport.h"
37
38namespace arm_compute
39{
40#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
41#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
Michalis Spyrou564ed392017-11-24 17:06:25 +000042#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
Pablo Tello27066c22017-11-23 11:01:10 +000043} // namespace arm_compute
44
45#include <arm_neon.h>
46#include <cstddef>
47#include <cstdint>
48
49// Enable only if compiled for AArch64-V8A targets
50#ifdef ARM_COMPUTE_AARCH64_V8A
51
52namespace arm_compute
53{
Michalis Spyrou564ed392017-11-24 17:06:25 +000054NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel()
55 : _func(nullptr)
56{
57}
58
Georgios Pinitas08c5a062017-12-14 17:53:39 +000059void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
60 const Window &window,
Michalis Spyrou564ed392017-11-24 17:06:25 +000061 const ThreadInfo &info)
62{
63 const int lda = input0->info()->strides_in_bytes().y();
64 const int ldb = input1->info()->strides_in_bytes().y();
65 const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
66
67 const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
68
69 const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
70 const int N = output->info()->tensor_shape().x();
71 const int K = input0->info()->tensor_shape().x();
72
73 // Only iterate over batches
74 Window win(window);
75 win.set(0, Window::Dimension(0, 1, 1));
76 win.set(1, Window::Dimension(0, 1, 1));
77
78 Iterator in0(input0, window);
79 Iterator out(output, window);
80
Georgios Pinitas08c5a062017-12-14 17:53:39 +000081 GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
Michalis Spyrou564ed392017-11-24 17:06:25 +000082
83 constexpr size_t alignment = 4096;
84 const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
85 void *_workspace = workspace->buffer() + offset;
86 size_t workspace_size = workspace->info()->total_size();
87
88 if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
89 {
90 ARM_COMPUTE_ERROR("Not enough space to align buffer!");
91 }
92
93 execute_window_loop(win, [&](const Coordinates & id)
94 {
95 gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
96 reinterpret_cast<const int8_t *>(in1_ptr), ldb,
97 reinterpret_cast<int32_t *>(out.ptr()), ldc,
98 alpha, beta, _workspace);
99 },
100 in0, out);
101}
102
Georgios Pinitas08c5a062017-12-14 17:53:39 +0000103void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
104 const Window &window,
Michalis Spyrou564ed392017-11-24 17:06:25 +0000105 const ThreadInfo &info)
106{
107 const int lda = input0->info()->strides_in_bytes().y();
108 const int ldb = input1->info()->strides_in_bytes().y();
109 const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
110
111 const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
112
113 const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
114 const int N = output->info()->tensor_shape().x();
115 const int K = input0->info()->tensor_shape().x();
116
117 // Only iterate over batches
118 Window win(window);
119 win.set(0, Window::Dimension(0, 1, 1));
120 win.set(1, Window::Dimension(0, 1, 1));
121
122 Iterator in0(input0, window);
123 Iterator out(output, window);
124
Georgios Pinitas08c5a062017-12-14 17:53:39 +0000125 GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
Michalis Spyrou564ed392017-11-24 17:06:25 +0000126
127 constexpr size_t alignment = 4096;
128 const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
129 void *_workspace = workspace->buffer() + offset;
130 size_t workspace_size = workspace->info()->total_size();
131
132 if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
133 {
134 ARM_COMPUTE_ERROR("Not enough space to align buffer!");
135 }
136
137 execute_window_loop(win, [&](const Coordinates & id)
138 {
139 gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
140 reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
141 reinterpret_cast<uint32_t *>(out.ptr()), ldc,
142 alpha, beta, _workspace);
143 },
144 in0, out);
145}
146
Georgios Pinitas08c5a062017-12-14 17:53:39 +0000147void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
148 bool is_transposed_1)
Pablo Tello27066c22017-11-23 11:01:10 +0000149{
Michalis Spyrou564ed392017-11-24 17:06:25 +0000150 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
Pablo Tello27066c22017-11-23 11:01:10 +0000151 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
152 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
153
Georgios Pinitas08c5a062017-12-14 17:53:39 +0000154 _input0 = input0;
155 _input1 = input1;
156 _output = output;
157 _workspace = workspace;
158 _alpha = alpha;
159 _beta = beta;
160 _is_transposed_0 = is_transposed_0;
161 _is_transposed_1 = is_transposed_1;
Pablo Tello27066c22017-11-23 11:01:10 +0000162
Michalis Spyrou564ed392017-11-24 17:06:25 +0000163 switch(input0->info()->data_type())
164 {
165 case DataType::S8:
166 _func = &gemm_interleaved_s16_12x8;
167 break;
168 case DataType::U8:
169 _func = &gemm_interleaved_u16_12x8;
170 break;
171 default:
172 ARM_COMPUTE_ERROR("Element size not supported");
173 break;
174 }
175
Pablo Tello27066c22017-11-23 11:01:10 +0000176 // Configure kernel window
177 Window win = calculate_max_window(*output->info());
178
179 AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
180
181 const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
182 const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
183
184 update_window_and_padding(win,
185 AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
186 AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
187 output_access);
188
189 INEKernel::configure(win);
190}
191
192void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &info)
193{
194 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
195 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Michalis Spyrou564ed392017-11-24 17:06:25 +0000196 ARM_COMPUTE_ERROR_ON(_func == nullptr);
Pablo Tello27066c22017-11-23 11:01:10 +0000197
Georgios Pinitas08c5a062017-12-14 17:53:39 +0000198 (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
Pablo Tello27066c22017-11-23 11:01:10 +0000199}
200} // namespace arm_compute
201#endif /* ARM_COMPUTE_AARCH64_V8A */