blob: 2a4a46e76c44af4df001e60fa6d3ce167062beb2 [file] [log] [blame]
Pablo Tellobf2fb952017-09-29 16:43:25 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
25
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/NEON/INEKernel.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/core/Window.h"
33
34#include <arm_neon.h>
35#include <cstddef>
36#include <cstdint>
37#include <tuple>
38
39using namespace arm_compute;
40
41namespace
42{
Pablo Telloeae4ce02017-10-13 15:24:49 +010043inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
Pablo Tellobf2fb952017-09-29 16:43:25 +010044{
Pablo Telloeae4ce02017-10-13 15:24:49 +010045 const size_t in_stride = input->info()->strides_in_bytes()[1];
46
47 const unsigned int in_height = input->info()->dimension(1);
48 const unsigned int in_width = input->info()->dimension(0);
49
50 const float scale_y_factor = 1.f / float(block_height);
Pablo Tellobf2fb952017-09-29 16:43:25 +010051
52 // Set window for output tensor
53 Window win_out(window);
54 win_out.scale(Window::DimY, scale_y_factor);
55 Iterator in(input, window);
56
57 win_out.set_dimension_step(Window::DimX, block_width * block_height);
58 Iterator out(output, win_out);
Pablo Telloeae4ce02017-10-13 15:24:49 +010059
60 execute_window_loop(window, [&](const Coordinates &)
61 {
62 std::fill_n(out.ptr(), block_width * block_height, 0);
63 },
64 out);
65
Pablo Tellobf2fb952017-09-29 16:43:25 +010066 execute_window_loop(window, [&](const Coordinates & id)
67 {
Pablo Telloeae4ce02017-10-13 15:24:49 +010068 for(unsigned int z = id.y(); (z < in_width) && z < (id.y() + block_height); ++z)
Pablo Tellobf2fb952017-09-29 16:43:25 +010069 {
Pablo Telloeae4ce02017-10-13 15:24:49 +010070 int j = (z - id.y()) * block_width;
71 for(unsigned int b = id.x(); (b < in_height) && (b < (id.x() + block_width)); ++b)
Pablo Tellobf2fb952017-09-29 16:43:25 +010072 {
Pablo Telloeae4ce02017-10-13 15:24:49 +010073 *(out.ptr() + j++) = *(input->buffer() + b * in_stride + z);
Pablo Tellobf2fb952017-09-29 16:43:25 +010074 }
75 }
76 },
77 in, out);
78}
79
Pablo Telloeae4ce02017-10-13 15:24:49 +010080inline void gemm_interleave_blocked_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
81{
82 const size_t in_stride = input->info()->strides_in_bytes()[1];
83
84 const unsigned int in_height = input->info()->dimension(1);
85 const unsigned int in_width = input->info()->dimension(0);
86
87 const float scale_y_factor = 1.f / float(block_height);
88
89 // Set window for output tensor
90 Window win_out(window);
91 win_out.scale(Window::DimY, scale_y_factor);
92 Iterator in(input, window);
93
94 win_out.set_dimension_step(Window::DimX, block_width * block_height);
95 Iterator out(output, win_out);
96
97 execute_window_loop(window, [&](const Coordinates &)
98 {
99 std::fill_n(out.ptr(), block_width * block_height, 0);
100 },
101 out);
102
103 execute_window_loop(window, [&](const Coordinates & id)
104 {
105 for(unsigned int z = id.y(); (z < in_height) && z < (id.y() + block_height); ++z)
106 {
107 int j = (z - id.y()) * block_width;
108 for(unsigned int b = id.x(); (b < in_width) && (b < (id.x() + block_width)); ++b)
109 {
110 *(out.ptr() + j++) = *(input->buffer() + z * in_stride + b);
111 }
112 }
113 },
114 in, out);
115}
Pablo Tellobf2fb952017-09-29 16:43:25 +0100116} // namespace
117
118NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
119 : _block_height(0), _block_width(0), _transpose(false)
120{
121}
122
123void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
124{
125 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
126 ARM_COMPUTE_ERROR_ON_NULLPTR(output);
127 ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
128 ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
129
130 TensorShape output_shape = input->info()->tensor_shape();
131 const float interleave_by_f32 = block_height;
132 output_shape.set(0, input->info()->dimension(0) * interleave_by_f32);
133 output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32));
134 // Output auto inizialitation if not yet initialized
135 auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
136 ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
137 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
138 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
139
140 _input = input;
141 _output = output;
142 _block_height = block_height;
143 _block_width = block_width;
144 _transpose = transpose;
145
146 const unsigned int num_elems_processed_per_iteration_x = block_width;
147 const unsigned int num_elems_processed_per_iteration_y = block_height;
148
149 // Configure kernel window
150 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
151 const float scaley_factor = 1.f / interleave_by_f32;
152
153 AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor);
154 AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
155 update_window_and_padding(win, output_access, input_access);
156
157 output_access.set_valid_region(win, input->info()->valid_region());
158
159 INEKernel::configure(win);
160}
161
162void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
163{
164 ARM_COMPUTE_UNUSED(info);
165 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
166 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Pablo Telloeae4ce02017-10-13 15:24:49 +0100167 if(_transpose)
168 {
169 gemm_interleave_blocked_transposed_8bit(_input, _output, window, _block_width, _block_height);
170 }
171 else
172 {
173 gemm_interleave_blocked_8bit(_input, _output, window, _block_width, _block_height);
174 }
Pablo Tellobf2fb952017-09-29 16:43:25 +0100175}