/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once
#include <cassert>
#include <cstdint>
#include <cstdlib>

#include "gemm.hpp"
#include "profiler.hpp"
#include "utils.hpp"
#include "shims.hpp"
#include "winograd_gemm.hpp"

#include "transforms.hpp"

#ifndef ALLOC_ALIGN
#define ALLOC_ALIGN 64
#endif  // ALLOC_ALIGN

namespace winograd_shim_nchw {
  /***************************************************************************/
  /* Implementation of the Winograd F(2x2, 3x3) algorithm (operating on 4x4
   * input tiles), using GEMM internally. This shim accepts tensors in NCHW
   * format and re-orders them into the NHWC layout expected by the
   * underlying winograd::Winograd2x2_3x3GEMM implementation.
   */
  template <typename TOut, typename TIn>
  class Winograd2x2_3x3GEMM : public winograd::Winograd2x2_3x3GEMM<TOut, TIn> {
    public:
      /* Instantiate a new Winograd operator. */
      Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage);

      /* Re-order an NCHW input tensor into the NHWC region of the working space. */
      void nchw2nhwc(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input);

      /* Re-order the NHWC output region of the working space into an NCHW output tensor. */
      void nhwc2nchw(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, TOut* const output);

      /* Get pointers to the NHWC output and input regions of the working space. */
      std::pair<TOut*, TIn*> get_nhwc_ptrs(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space);

      static size_t get_working_space_size(const Tensor4DShape &input_shape, const KernelShape &k_shape, const PaddingType padding);

    protected:
      /* Get the memory required to store an NHWC copy of the input tensor. */
      static size_t get_working_nhwc_input_size(const Tensor4DShape &input_shape);

      /* Get the memory required to store an NHWC copy of the output tensor. */
      static size_t get_working_nhwc_output_size(const Tensor4DShape &output_shape, const KernelShape &k_shape, const PaddingType padding);
  };
}  // namespace winograd_shim_nchw
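
/* Example usage: a minimal sketch, not part of the library. It assumes the
 * caller provides `input_shape`, `kernel_shape`, `padding` and a
 * `kernel_storage` buffer sized as required by the base class (not shown
 * here), plus NCHW-ordered `input_nchw`/`output_nchw` buffers; the Winograd
 * transforms and GEMM stages between the two re-ordering calls are also
 * handled by the base class.
 *
 *   using Conv = winograd_shim_nchw::Winograd2x2_3x3GEMM<float, float>;
 *
 *   // Allocate an ALLOC_ALIGN-aligned working space of the required size.
 *   const size_t ws_size = Conv::get_working_space_size(input_shape, kernel_shape, padding);
 *   void *working_space = nullptr;
 *   posix_memalign(&working_space, ALLOC_ALIGN, ws_size);
 *
 *   Conv conv(kernel_shape, input_shape, padding, kernel_storage);
 *   conv.nchw2nhwc(input_shape, padding, working_space, input_nchw);   // NCHW -> NHWC
 *   // ... base-class Winograd transforms and GEMMs run on the NHWC copies ...
 *   conv.nhwc2nchw(input_shape, padding, working_space, output_nchw);  // NHWC -> NCHW
 *   free(working_space);
 */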

/*****************************************************************************/
template <typename TOut, typename TIn>
winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM(
    const KernelShape &kernel_shape, const Tensor4DShape input_shape,
    const PaddingType padding_type, void *kernel_storage
) : winograd::Winograd2x2_3x3GEMM<TOut, TIn>(kernel_shape, input_shape, padding_type, kernel_storage) {
}

/*****************************************************************************/
template <typename TOut, typename TIn>
void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nchw2nhwc(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input) {
    assert(working_space);
    int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);

    // The top chunk of the working space stores the input and output tensors
    // in NHWC format; it begins after the 16 input and 16 output Winograd
    // matrices (one pair per element of the 4x4 tile).
    const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
    const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);

    // Locate the NHWC copy of the input within the working space
    TIn* const input_nhwc = reinterpret_cast<TIn *>(
        ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes)
    );

    // Re-order the input tensor from NCHW to NHWC
    this->prof(
        "NCHW -> NHWC",
        [input, input_shape, input_nhwc] () {
            nchw_to_nhwc(
                input, input_nhwc,
                input_shape.n_batches,
                input_shape.n_channels,
                input_shape.n_rows,
                input_shape.n_cols
            );
        },
        input_shape.size(), 0, input_shape.size()
    );
}
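
/* Working-space layout implied by the pointer arithmetic above and reused by
 * nhwc2nchw() and get_nhwc_ptrs() below (a sketch; the matrix region itself
 * is owned and laid out by the winograd::Winograd2x2_3x3GEMM base class):
 *
 *   ws_bytes
 *   +----------------------------------------------------------------+
 *   | 16 * (in_matrix_stride_bytes + out_matrix_stride_bytes)        |
 *   |   (Winograd input/output matrices, one pair per tile element)  |
 *   +----------------------------------------------------------------+
 *   | NHWC copy of the input   -- get_working_nhwc_input_size()      |
 *   +----------------------------------------------------------------+
 *   | NHWC copy of the output  -- get_working_nhwc_output_size()     |
 *   +----------------------------------------------------------------+
 */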

/*****************************************************************************/
template <typename TOut, typename TIn>
void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nhwc2nchw(const Tensor4DShape& input_shape, const PaddingType padding_type,
                                                                   void *working_space, TOut* const output) {
    assert(working_space);
    int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);

    // The NHWC copy of the output tensor follows the 16 input and 16 output
    // Winograd matrices and the NHWC copy of the input tensor.
    const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
    const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);

    TOut* const output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape));

    // Re-order the output tensor from NHWC back into NCHW
    const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape, this->kernel_shape, padding_type);
    this->prof(
        "NHWC -> NCHW",
        [output_nhwc, output_shape, output] () {
            nhwc_to_nchw(
                output_nhwc, output,
                output_shape.n_batches,
                output_shape.n_rows,
                output_shape.n_cols,
                output_shape.n_channels
            );
        },
        output_shape.size(), 0, output_shape.size()
    );
}

/*****************************************************************************/
template <typename TOut, typename TIn>
std::pair<TOut*, TIn*> winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_nhwc_ptrs(
    const Tensor4DShape& input_shape,
    const PaddingType padding_type,
    void *working_space
) {
    assert(working_space);
    int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);

    // The NHWC copies of the input and output tensors sit in the top chunk of
    // the working space, after the 16 input and 16 output Winograd matrices.
    const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
    const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);

    // Locate the NHWC input and output regions within the working space
    TIn* input_nhwc = reinterpret_cast<TIn *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes));
    TOut* output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape));
    return std::make_pair(output_nhwc, input_nhwc);
}

/*****************************************************************************/
template <typename TOut, typename TIn>
size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
    const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
) {
    // The base class working space, plus the memory required for the NHWC
    // copies of the input and output tensors.
    return winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
        input_shape, k_shape, padding_type)
      + get_working_nhwc_input_size(input_shape)
      + get_working_nhwc_output_size(input_shape, k_shape, padding_type);
}

template <typename TOut, typename TIn>
size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_input_size(
    const Tensor4DShape& input_shape
) {
    return roundup(input_shape.size() * sizeof(TIn), static_cast<size_t>(ALLOC_ALIGN));
}

template <typename TOut, typename TIn>
size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_output_size(
    const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
) {
    const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape, k_shape, padding_type);
    // Size the NHWC copy for the output element type, TOut
    return roundup(output_shape.size() * sizeof(TOut), static_cast<size_t>(ALLOC_ALIGN));
}
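
/* Worked example of the size computation above: a sketch assuming that
 * roundup(x, a) (from utils.hpp, not shown here) rounds x up to the next
 * multiple of a, and that Tensor4DShape::size() returns the element count
 * n_batches * n_rows * n_cols * n_channels:
 *
 *   shape = {1, 3, 5, 7}, element type float
 *   shape.size()                   = 1 * 3 * 5 * 7 = 105 elements
 *   105 * sizeof(float)            = 420 bytes
 *   roundup(420, ALLOC_ALIGN = 64) = 448 bytes
 *
 * Rounding each NHWC copy up to an ALLOC_ALIGN-byte boundary keeps the
 * region that follows it in the working space aligned as well.
 */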
191}