Blame - src/core/NEON/kernels/convolution/winograd/winograd.cpp - ml/ComputeLibrary

blob: 226f303c7d7ad68a49295a65a3b0cb22479d64a1 [file] [log] [blame]

Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	1	/*
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Pablo Tello	79ffade	2018-05-04 11:45:13 +0100	[diff] [blame]	24	#include <cstring>
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	25	#include "winograd.hpp"
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	26	using namespace winograd;
				27
				28	/** Get the output shape of a convolution. */
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	29	template <int kr, int kc, int itr, int itc, WinogradRoots R>
				30	template <typename TOut, typename TIn, typename TInGEMM, typename TOutGEMM>
				31	Tensor4DShape WinogradGEMM<kr, kc, itr, itc, R>::Convolution<TOut, TIn, TInGEMM, TOutGEMM>::get_output_shape(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	32	const KernelShape &kernel_shape,
				33	const Tensor4DShape &in_shape,
				34	const PaddingType padding
				35	)
				36	{
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	37	return Tensor4DShape {
				38	in_shape.n_batches,
Pablo Tello	79ffade	2018-05-04 11:45:13 +0100	[diff] [blame]	39	(padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
				40	(padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	41	kernel_shape.n_output_channels,
				42	in_shape.ordering
				43	};
				44	}
				45
				46	/* Get the memory required to transform the kernel.
				47	*/
				48	template <int kernel_rows, int kernel_cols,
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	49	int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				50	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				51	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_transform_working_size(const KernelShape &shape)
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	52	{
				53	if (shape.ordering == HWIO)
				54	{
				55	// Kernel is already in the correct order, so no additional memory is
				56	// required.
				57	return 0;
				58	}
				59	else
				60	{
				61	// Need to re-order the kernel into HWIO form, require enough space to
				62	// represent the tensor.
				63	return sizeof(TIn) * shape.size();
				64	}
				65	}
				66
				67	/** Get the memory required to store the kernel transformed into the
				68	* Winograd domain.
				69	*/
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	70	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				71	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				72	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_storage_size(const KernelShape &shape)
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	73	{
				74	return N_GEMMS * get_kernel_matrix_size(shape);
				75	}
				76
				77
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	78	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				79	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				80	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_storage_size(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	81	const KernelShape &kernel_shape,
				82	const Tensor4DShape &input_shape,
				83	const PaddingType padding
				84	)
				85	{
				86	return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding);
				87	}
				88
				89
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	90	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				91	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				92	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_storage_size(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	93	const KernelShape &kernel_shape,
				94	const Tensor4DShape &input_shape,
				95	const PaddingType padding
				96	)
				97	{
				98	return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding);
				99	}
				100
				101
				102	/** Get the memory required to apply a Winograd operator to some input.
				103	*/
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	104	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				105	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				106	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_working_space_size(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	107	const KernelShape &kernel_shape,
				108	const Tensor4DShape &input_shape,
				109	const PaddingType padding_type
				110	)
				111	{
				112	const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
				113
				114	// Get the memory required to store the matrices
				115	const size_t matrix_sizes = N_GEMMS * (
				116	get_input_matrix_size(kernel_shape, input_shape, padding_type) +
				117	get_output_matrix_size(kernel_shape, input_shape, padding_type)
				118	);
				119
				120	// Add additional space to re-order the input and output if the input tensor
				121	// is not in NHWC format.
				122	if (input_shape.ordering == NHWC)
				123	{
				124	return matrix_sizes; // No extra spacing required
				125	}
				126	else // NCHW, must reorder the input and output tensors
				127	{
				128	// We only need to re-order the input or output at any one time, so request
				129	// enough memory to do the largest of these.
				130	const size_t extra_memory = std::max(
				131	sizeof(TIn) * input_shape.size(),
				132	sizeof(TOut) * output_shape.size()
				133	);
				134	return matrix_sizes + extra_memory;
				135	}
				136	}
				137
				138
				139	/* Get the memory required by a single "input" matrix.
				140	*/
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	141	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				142	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				143	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_matrix_size(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	144	const KernelShape &kernel_shape,
				145	const Tensor4DShape &input_shape,
				146	const PaddingType padding_type
				147	)
				148	{
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	149	return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGIn);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	150	}
				151
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	152	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				153	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				154	int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_matrix_stride(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	155	const KernelShape &kernel_shape,
				156	const Tensor4DShape &input_shape,
				157	const PaddingType padding_type
				158	)
				159	{
				160	// Compute shape for the GEMM
				161	const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
				162	const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
				163	const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
				164	const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK);
				165	const int K = kernel_shape.n_input_channels;
				166
				167	return M * K;
				168	}
				169
				170
				171	/* Get the memory required by a single "output" matrix.
				172	*/
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	173	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				174	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				175	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_matrix_size(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	176	const KernelShape &kernel_shape,
				177	const Tensor4DShape &input_shape,
				178	const PaddingType padding_type
				179	)
				180	{
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	181	return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGOut);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	182	}
				183
				184
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	185	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				186	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				187	int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_matrix_stride(
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	188	const KernelShape &kernel_shape,
				189	const Tensor4DShape &input_shape,
				190	const PaddingType padding_type
				191	)
				192	{
				193	// Compute shape for the GEMM
				194	const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
				195	const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
				196	const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
				197	const int M = roundup(tile_rows * tile_cols, M_BLOCK);
				198	const int N = roundup(kernel_shape.n_output_channels, N_BLOCK);
				199
				200	return input_shape.n_batches * M * N;
				201	}
				202
				203
				204	/* Get the memory required by a single "kernel" matrix.
				205	*/
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	206	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				207	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				208	size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_matrix_size(const KernelShape &shape)
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	209	{
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	210	return sizeof(TGIn) * get_kernel_matrix_stride(shape);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	211	}
				212
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	213	template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
				214	template <typename TOut, typename TIn, typename TGIn, typename TGOut>
				215	int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_matrix_stride(const KernelShape &shape)
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	216	{
				217	const int K = shape.n_input_channels;
				218	const int N = roundup(shape.n_output_channels, N_BLOCK);
				219	return K * N;
				220	}
				221
				222
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	223	// Instantiate required implementations
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	224	template class WinogradGEMM<2, 2, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
				225	template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	226
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	227	template class WinogradGEMM<1, 6, 1, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
				228	template class WinogradGEMM<6, 1, 3, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	229
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	230	template class WinogradGEMM<2, 2, 5, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	231
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	232	template class WinogradGEMM<1, 4, 1, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
				233	template class WinogradGEMM<4, 1, 5, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	234
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	235	template class WinogradGEMM<1, 2, 1, 7, WinogradRoots::Integers>::Convolution<float, float, float, float>;
				236	template class WinogradGEMM<2, 1, 7, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;