/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
25#include "arm_compute/core/Error.h"
26#include "arm_compute/core/Utils.h"
27#include "arm_compute/core/Validate.h"
28#include "arm_compute/core/utils/misc/ShapeCalculator.h"
29#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
30#include "arm_compute/runtime/FunctionDescriptors.h"
31#include "arm_compute/runtime/NEON/NEScheduler.h"
32#include "src/core/CPP/Validate.h"
33#include "src/core/NEON/kernels/convolution/common/utils.hpp"
34#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
35#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h"
36#include "src/core/helpers/MemoryHelpers.h"
37#include "src/runtime/cpu/operators/CpuActivation.h"
38#include "src/runtime/cpu/operators/CpuPermute.h"
39#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
40#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
41
42#include "support/Cast.h"
43
44#include <set>
45
46namespace arm_compute
47{
48namespace cpu
49{
50using namespace arm_compute::experimental;
51using namespace arm_compute::utils::cast;
52
53namespace
54{
55arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
56{
57 switch(act_info.activation())
58 {
59 case ActivationLayerInfo::ActivationFunction::RELU:
60 {
61 return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
62 }
63 case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
64 {
65 return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
66 }
67 default:
68 {
69 return arm_gemm::Activation(arm_gemm::Activation::Type::None);
70 }
71 }
72}
73
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010074inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
75 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +010076{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010077 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
78 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
Michalis Spyrou96f977e2021-07-01 12:20:56 +010079
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010080 if(src->data_type() == DataType::F32)
Michalis Spyrou96f977e2021-07-01 12:20:56 +010081 {
82 if(input_dims.width > 4 && input_dims.height > 4)
83 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010084 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 4, 4, 3, 3>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +010085 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010086 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +010087 }
88 else
89 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010090 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 2, 3, 3>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +010091 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010092 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +010093 }
94 }
95#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010096 else if(src->data_type() == DataType::F16)
Michalis Spyrou96f977e2021-07-01 12:20:56 +010097 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +010098 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +010099 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100100 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100101 }
102#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
103
104 if(act_info.enabled())
105 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100106 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100107 }
108 return Status{};
109}
110
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100111inline Status validate_kernel_5x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
112 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100113{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100114 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 2, 5, 5>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100115 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100116 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100117 if(act_info.enabled())
118 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100119 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100120 }
121 return Status{};
122}
123
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100124inline Status validate_kernel_3x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
125 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100126{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100127 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
128 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 6, 1, 3>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100129 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100130 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100131 if(act_info.enabled())
132 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100133 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100134 }
135 return Status{};
136}
137
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100138inline Status validate_kernel_1x3(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
139 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100140{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100141 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
142 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 6, 1, 3, 1>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100143 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100144 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100145
146 if(act_info.enabled())
147 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100148 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100149 }
150 return Status{};
151}
152
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100153inline Status validate_kernel_5x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
154 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100155{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100156 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
157 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 4, 1, 5>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100158 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100159 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100160 if(act_info.enabled())
161 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100162 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100163 }
164 return Status{};
165}
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100166inline Status validate_kernel_1x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
167 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100168{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100169 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
170 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 4, 1, 5, 1>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100171 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100172 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100173 if(act_info.enabled())
174 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100175 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100176 }
177 return Status{};
178}
179
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100180inline Status validate_kernel_7x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
181 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100182{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100183 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
184 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 2, 1, 7>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100185 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100186 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100187 if(act_info.enabled())
188 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100189 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100190 }
191 return Status{};
192}
193
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100194inline Status validate_kernel_1x7(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
195 const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100196{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100197 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
198 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 1, 7, 1>::validate(src, input0, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100199 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100200 ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100201
202 if(act_info.enabled())
203 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100204 CpuActivation::validate(dst, nullptr, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100205 }
206 return Status{};
207}
208
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100209inline Tensor4DShape internal_get_input_shape(const ITensorInfo *src)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100210{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100211 const DataLayout data_layout = src->data_layout();
212 const int in_width = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
213 const int in_height = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
214 const int in_channels = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
215 const int in_batches = src->dimension(3);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100216
217 return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
218}
219
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100220Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100221{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100222 ARM_COMPUTE_UNUSED(dst);
223 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100224
225 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
226 if(biases != nullptr)
227 {
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100228 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100229 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
230 }
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100231 return ICpuWinogradConv2dTransformWeightsKernel::validate(src, weights);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100232}
233Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
234{
235 Size2D output_tile = Size2D{};
236 if(kernel_dims == Size2D(3U, 3U))
237 {
238 output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
239 if(data_type == DataType::F16)
240 {
241 output_tile = Size2D(4U, 4U);
242 }
243 }
244 else if(kernel_dims == Size2D(5U, 5U))
245 {
246 output_tile = Size2D(2U, 2U);
247 }
248 else if(kernel_dims == Size2D(1U, 3U))
249 {
250 output_tile = Size2D(1U, 6U);
251 }
252 else if(kernel_dims == Size2D(3U, 1U))
253 {
254 output_tile = Size2D(6U, 1U);
255 }
256 else if(kernel_dims == Size2D(1U, 5U))
257 {
258 output_tile = Size2D(1U, 4U);
259 }
260 else if(kernel_dims == Size2D(5U, 1U))
261 {
262 output_tile = Size2D(4U, 1U);
263 }
264 else if(kernel_dims == Size2D(7U, 1U))
265 {
266 output_tile = Size2D(2U, 1U);
267 }
268 else if(kernel_dims == Size2D(1U, 7U))
269 {
270 output_tile = Size2D(1U, 2U);
271 }
272 return output_tile;
273}
274
275bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
276{
277 // Check if we want to configure a Winograd configuration which requires fast math
278 using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
279
280 const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
281 {
282 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
283 };
284
285 const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
286 {
287 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
288 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
289 };
290
291 auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
292 std::pair<int, int>(kernel_size.width, kernel_size.height));
293
294 switch(data_type)
295 {
296 case DataType::F16:
297 return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
298 case DataType::F32:
299 return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
300 default:
301 return false;
302 }
303}
304
305inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
306{
307 return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
308}
309
310} // namespace
311
// Default-construct every sub-operator up front; the transform kernels are left
// null because the concrete kernel type is only known once configure() sees the
// kernel size and data type. All tensor infos start empty and are filled in
// configure().
CpuWinogradConv2d::CpuWinogradConv2d()
    : _gemm_function(std::make_unique<CpuGemm>()),
      _activation_func(std::make_unique<CpuActivation>()),
      _permute_input(std::make_unique<CpuPermute>()),
      _permute_output(std::make_unique<CpuPermute>()),
      _permute_weights(std::make_unique<CpuPermute>()),
      _transform_input_kernel(nullptr),   // selected in configure()
      _transform_weights_kernel(nullptr), // selected in configure()
      _transform_output_kernel(nullptr),  // selected in configure()
      _data_layout(),
      _aux_mem(AuxTensorIdx::Count),
      _input_nhwc(),
      _output_nhwc(),
      _input_workspace(),
      _kernel_storage(),
      _output_workspace(),
      _input_transformed(),
      _output_transformed(),
      _weights_hwio(),
      _run_activation(false),
      _is_prepared(false)
{
}
335
336CpuWinogradConv2d::~CpuWinogradConv2d() = default;
337
338void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
339 const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
340{
341 ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
342 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info));
343
344 // Get indices for the width and height
345 _data_layout = src->data_layout();
346 const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
347 const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
348 const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
349
350 const Size2D input_dims = Size2D(src->dimension(width_idx), src->dimension(height_idx));
351 const Size2D kernel_size = Size2D(weights->dimension(width_idx), weights->dimension(height_idx));
352 const DataType data_type = src->data_type();
353 const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
354
355 // Check if the Winograd configuration requires fast math
356 if(!enable_fast_math)
357 {
358 ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
359 "This Winograd configuration requires enable_fast_math=true");
360 }
361
362 _is_prepared = false;
363
364 std::unique_ptr<ICpuWinogradConv2dTransformInputKernel> transform_input_kernel;
365 std::unique_ptr<ICpuWinogradConv2dTransformWeightsKernel> transform_weights_kernel;
366 std::unique_ptr<ICpuWinogradConv2dTransformOutputKernel> transform_output_kernel;
367
368 int n_gemms = 1;
369 int N_BLOCK = 1; // Size of block used by GEMM.
370 if(data_type == DataType::F32)
371 {
372 if(kernel_size == Size2D(3, 3))
373 {
374 if(src->dimension(width_idx) > 4 && src->dimension(height_idx) > 4)
375 {
376 using config = CpuWinogradConv2dConfiguration<float, float, 4, 4, 3, 3>;
377 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
378 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
379 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
380 n_gemms = config::WinogradBase::N_GEMMS;
381 N_BLOCK = config::WinogradConv::N_BLOCK;
382 }
383 else
384 {
385 using config = CpuWinogradConv2dConfiguration<float, float, 2, 2, 3, 3>;
386 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
387 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
388 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
389 n_gemms = config::WinogradBase::N_GEMMS;
390 N_BLOCK = config::WinogradConv::N_BLOCK;
391 }
392 }
393 else if(kernel_size == Size2D(5, 5))
394 {
395 using config = CpuWinogradConv2dConfiguration<float, float, 2, 2, 5, 5>;
396 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
397 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
398 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
399 n_gemms = config::WinogradBase::N_GEMMS;
400 N_BLOCK = config::WinogradConv::N_BLOCK;
401 }
402 else if(kernel_size == Size2D(1, 3))
403 {
404 using config = CpuWinogradConv2dConfiguration<float, float, 6, 1, 3, 1>;
405 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
406 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
407 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
408 n_gemms = config::WinogradBase::N_GEMMS;
409 N_BLOCK = config::WinogradConv::N_BLOCK;
410 }
411 else if(kernel_size == Size2D(3, 1))
412 {
413 using config = CpuWinogradConv2dConfiguration<float, float, 1, 6, 1, 3>;
414 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
415 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
416 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
417 n_gemms = config::WinogradBase::N_GEMMS;
418 N_BLOCK = config::WinogradConv::N_BLOCK;
419 }
420 else if(kernel_size == Size2D(1, 5))
421 {
422 using config = CpuWinogradConv2dConfiguration<float, float, 4, 1, 5, 1>;
423 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
424 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
425 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
426 n_gemms = config::WinogradBase::N_GEMMS;
427 N_BLOCK = config::WinogradConv::N_BLOCK;
428 }
429 else if(kernel_size == Size2D(5, 1))
430 {
431 using config = CpuWinogradConv2dConfiguration<float, float, 1, 4, 1, 5>;
432 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
433 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
434 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
435 n_gemms = config::WinogradBase::N_GEMMS;
436 N_BLOCK = config::WinogradConv::N_BLOCK;
437 }
438 else if(kernel_size == Size2D(1, 7))
439 {
440 using config = CpuWinogradConv2dConfiguration<float, float, 2, 1, 7, 1>;
441 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
442 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
443 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
444 n_gemms = config::WinogradBase::N_GEMMS;
445 N_BLOCK = config::WinogradConv::N_BLOCK;
446 }
447 else if(kernel_size == Size2D(7, 1))
448 {
449 using config = CpuWinogradConv2dConfiguration<float, float, 1, 2, 1, 7>;
450 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
451 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
452 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
453 n_gemms = config::WinogradBase::N_GEMMS;
454 N_BLOCK = config::WinogradConv::N_BLOCK;
455 }
456 else
457 {
458 ARM_COMPUTE_ERROR("Not supported.");
459 }
460 }
461#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
462 else if(data_type == DataType::F16)
463 {
464 if(kernel_size == Size2D(3, 3))
465 {
466 using config = CpuWinogradConv2dConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
467 transform_input_kernel = std::make_unique<config::TransformInputKernel>();
468 transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
469 transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
470 n_gemms = config::WinogradBase::N_GEMMS;
471 N_BLOCK = config::WinogradConv::N_BLOCK;
472 }
473 else
474 {
475 ARM_COMPUTE_ERROR("Not supported.");
476 }
477 }
478#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
479 else
480 {
481 ARM_COMPUTE_ERROR("Not supported.");
482 }
483
484 const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
485 const bool use_same_padding = use_padding_type == PADDING_SAME;
486
487 // Get convolved dimensions
488 const int in_channels = src->dimension(channel_idx);
489 const int out_channels = dst->dimension(channel_idx);
490
491 const Tensor4DShape in_shape(internal_get_input_shape(src));
492 const size_t data_type_size = src->element_size();
493 // Get the memory required to instantiate a new Winograd operator.
494 constexpr size_t storage_alignment = 64;
495
496 // Kernel Storage
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100497 const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100498
499 // Input storage
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100500 const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100501
502 // Output storage
503 const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
504 const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
505 const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
506 const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
507 const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
508
509 // Configure GEMM
510 const int tile_rows = iceildiv(output_shape.first, output_tile.height);
511 const int tile_cols = iceildiv(output_shape.second, output_tile.width);
512 const int m = in_shape.n_batches * tile_rows * tile_cols;
513 const int k = in_shape.n_channels;
514 const int n = out_channels;
515 const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
516 const int output_matrix_row_stride = kernel_matrix_row_stride;
517
518 TensorShape a_shape(k, m, 1, n_gemms);
519 Strides a_strides(data_type_size);
520 a_strides.set(1, a_strides[0] * k);
521 //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
522 a_strides.set(2, 0);
523 a_strides.set(3, data_type_size * input_matrix_stride);
524
525 TensorShape b_shape(n, k, n_gemms);
526 Strides b_strides(data_type_size);
527 b_strides.set(1, data_type_size * kernel_matrix_row_stride);
528 b_strides.set(2, data_type_size * kernel_matrix_stride);
529
530 TensorShape d_shape(n, m, 1, n_gemms);
531 Strides d_strides(data_type_size);
532 d_strides.set(1, data_type_size * output_matrix_row_stride);
533 //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
534 d_strides.set(2, 0);
535 d_strides.set(3, data_type_size * output_matrix_stride);
536
537 TensorInfo a_info{};
538 TensorInfo b_info{};
539 TensorInfo d_info{};
540 a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
541 b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
542 d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
543
544 _input_transformed = a_info;
545 _kernel_storage = b_info;
546 _output_transformed = d_info;
547
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100548 const ITensorInfo *input_to_use = src;
549 ITensorInfo *output_to_use = dst;
550 PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
551 const unsigned int max_num_threads = NEScheduler::get().num_threads();
552
553 // Configure the kernel to transform the input tensor from NCHW -> NHWC
554 if(_data_layout == DataLayout::NCHW)
555 {
556 _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100557 input_to_use = &_input_nhwc;
558 weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
559 }
560
561 // Configure input transform kernel
562 transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
563 &_input_transformed, input_matrix_stride, &_input_workspace);
564 const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
Georgios Pinitas66341942021-07-30 12:21:07 +0100565 TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100566 _input_workspace = input_workspace_info;
567
568 // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
569 _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
570 transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
571
572 // Configure GEMM function
573 _gemm_function->configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
574
575 // Configure output transform function
576 // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
577 if(_data_layout == DataLayout::NCHW)
578 {
Georgios Pinitas66341942021-07-30 12:21:07 +0100579 // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
580 TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
581 dst->dimension(1), dst->dimension(3)),
582 1, dst->data_type());
583 _output_nhwc = info;
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100584 output_to_use = &_output_nhwc;
585 }
586 const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
587
588 transform_output_kernel->configure(biases,
589 &_output_transformed,
590 output_matrix_stride,
591 output_to_use,
592 in_shape.n_batches,
593 output_shape.first,
594 output_shape.second,
595 out_channels,
596 &_output_workspace,
597 activation);
598
599 const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
Georgios Pinitas66341942021-07-30 12:21:07 +0100600 TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100601 _output_workspace = output_workspace_info;
602
603 // Reorder the convoluted output to ACL's ordering NCHW
604 if(_data_layout == DataLayout::NCHW)
605 {
606 _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100607 }
608
609 _transform_input_kernel = std::move(transform_input_kernel);
610 _transform_weights_kernel = std::move(transform_weights_kernel);
611 _transform_output_kernel = std::move(transform_output_kernel);
612
613 //Configure Activation Layer
614 _run_activation = act_info.enabled() && !fuse_function_supported(act_info);
615 if(_run_activation)
616 {
617 _activation_func->configure(dst, nullptr, act_info);
618 }
619
620 auto asm_mem_req = _gemm_function->workspace();
621 _aux_mem[GemmWorkspace] = asm_mem_req[GemmWorkspace];
622 _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
623 _aux_mem[InterleavedLHS] = asm_mem_req[InterleavedLHS];
624 _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
625 _aux_mem[TempResult] = asm_mem_req[TempResult];
626
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100627 // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
628 _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
629 _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
630 _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
Georgios Pinitas66341942021-07-30 12:21:07 +0100631 _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100632 _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
633 if(_data_layout == DataLayout::NCHW)
634 {
635 _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
636 _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
637 }
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100638}
639
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100640Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100641 const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
642{
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100643 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
644 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100645
646 // Get indices for the width and height
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100647 const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
648 const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100649
650 // Input shape, kernel size and output tile
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100651 const Size2D input_dims = Size2D(src->dimension(idx_width), src->dimension(idx_height));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100652 const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100653 const DataType data_type = src->data_type();
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100654 const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
655
656 // Check if the Winograd configuration requires fast math
657 if(!enable_fast_math)
658 {
659 ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
660 "This Winograd configuration requires enable_fast_math=true");
661 }
662
663 const WinogradInfo winograd_info = WinogradInfo(output_tile,
664 kernel_size,
665 input_dims,
666 conv_info,
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100667 src->data_layout());
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100668
669 // Validate input transform
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100670 const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
671 const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100672 // Validate filter transform
673 const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
674 const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
675 // Validate batched matrix multiply
676 TensorShape batched_mm_output_shape = input0.tensor_shape();
677 batched_mm_output_shape[0] = input1.tensor_shape()[0];
678 const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
679
680 if(kernel_size == Size2D(3, 3))
681 {
682 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
683 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
684 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
685 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
686 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
687 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
688 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100689 return validate_kernel_3x3(input_dims, src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100690 }
691 else if(kernel_size == Size2D(5, 5))
692 {
693 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
694 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
695 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
696 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
697 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
698 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
699 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100700 return validate_kernel_5x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100701 }
702 if(kernel_size == Size2D(3, 1))
703 {
704 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
705 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
706 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100707 return validate_kernel_3x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100708 }
709 else if(kernel_size == Size2D(1, 3))
710 {
711 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
712 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
713 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100714 return validate_kernel_1x3(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100715 }
716 else if(kernel_size == Size2D(5, 1))
717 {
718 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
719 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
720 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100721 return validate_kernel_5x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100722 }
723 else if(kernel_size == Size2D(1, 5))
724 {
725 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
726 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
727 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100728 return validate_kernel_1x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100729 }
730 else if(kernel_size == Size2D(7, 1))
731 {
732 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
733 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
734 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100735 return validate_kernel_7x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100736 }
737 else if(kernel_size == Size2D(1, 7))
738 {
739 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
740 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
741 ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
Michele Di Giorgiod9cdf142021-07-02 15:17:08 +0100742 return validate_kernel_1x7(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100743 }
744 else
745 {
746 ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
747 }
748}
749
750void CpuWinogradConv2d::run(ITensorPack &tensors)
751{
752 prepare(tensors);
753
754 auto a = tensors.get_const_tensor(ACL_SRC_0);
755 auto c = tensors.get_const_tensor(ACL_SRC_2);
756 auto d = tensors.get_tensor(ACL_DST);
757
758 CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100759 CpuAuxTensorHandler input_transformed(offset_int_vec(TransformedInput), _input_transformed, tensors, true);
760 CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100761
762 const bool is_nchw = _data_layout == DataLayout::NCHW;
763 if(is_nchw)
764 {
765 //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
766 ITensorPack pack{ { ACL_SRC, a }, { ACL_DST, input_nhwc.get() } };
767 _permute_input->run(pack);
768 }
769
770 // Transform input tensor to the winograd domain
771 ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : a }, { ACL_DST, input_transformed.get() }, { ACL_INT, input_workspace.get() } };
772 NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, _transform_input_kernel->window(), transform_input_pack);
773
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100774 CpuAuxTensorHandler output_transformed(offset_int_vec(TransformedOutput), _output_transformed, tensors, true);
775 CpuAuxTensorHandler weights_transformed(offset_int_vec(TransformedWeights), _kernel_storage, tensors, true);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100776
777 // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100778 ITensorPack gemm_pack = tensors;
779 gemm_pack.add_const_tensor(ACL_SRC, input_transformed.get());
780 gemm_pack.add_const_tensor(ACL_SRC_1, weights_transformed.get());
781 gemm_pack.add_const_tensor(ACL_BIAS, nullptr);
782 gemm_pack.add_tensor(ACL_DST, output_transformed.get());
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100783 _gemm_function->run(gemm_pack);
784
785 // Transform output tensor to the spatial domain
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100786 CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
787 CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100788 ITensorPack transform_output_pack{ { ACL_SRC_0, c }, { ACL_SRC_1, output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : d }, { ACL_INT, output_workspace.get() } };
789 NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, _transform_output_kernel->window(), transform_output_pack);
790
791 if(is_nchw)
792 {
793 // Reorder the convoluted output to ACL's ordering NCHW
794 ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, d } };
795 _permute_output->run(pack);
796 }
797
798 if(_run_activation)
799 {
800 ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
801 _activation_func->run(pack);
802 }
803}
804
805void CpuWinogradConv2d::prepare(ITensorPack &tensors)
806{
807 if(!_is_prepared)
808 {
809 // Permute weights
810 const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
811 ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
812 ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
813
814 CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
815 ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
816 _permute_weights->run(permute_tensors);
817
818 // Transform weights
Georgios Pinitas87a74ef2021-08-20 17:26:45 +0100819 ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100820 ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
821
822 CpuAuxTensorHandler transformed_weights(_kernel_storage, *weights_transf);
823 ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
824 NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
825
Georgios Pinitas66341942021-07-30 12:21:07 +0100826 ITensorPack gemm_pack = tensors;
Michalis Spyrou96f977e2021-07-01 12:20:56 +0100827 gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
828 _gemm_function->prepare(gemm_pack);
829
830 _is_prepared = true;
831 }
832}
833
// Expose the auxiliary memory requirements (workspaces, transformed input/output,
// permuted copies) recorded in _aux_mem so the caller can allocate them before run().
experimental::MemoryRequirements CpuWinogradConv2d::workspace() const
{
    return _aux_mem;
}
838} // namespace cpu
839} // namespace arm_compute