Blame - src/runtime/NEON/functions/NEWinogradLayer.cpp - ml/ComputeLibrary

blob: 7d93bcff075a7d965c21c75debfa8bdb07af3dbe [file] [log] [blame]

Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	1	/*
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	2	* Copyright (c) 2017-2018 ARM Limited.
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
				25
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	26	#include "arm_compute/core/Error.h"
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	27	#include "arm_compute/core/Utils.h"
				28	#include "arm_compute/core/Validate.h"
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	29	#include "arm_compute/core/Validate.h"
				30	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	31	#include "arm_compute/runtime/NEON/AssemblyHelper.h"
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	32	#include "arm_compute/runtime/NEON/NEScheduler.h"
				33	#include "support/ToolchainSupport.h"
				34
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	35	#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
				36
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	37	#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	38
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	39	namespace
				40	{
				41	inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
				42	{
				43	const int in_width = input->info()->dimension(0);
				44	const int in_height = input->info()->dimension(1);
				45	const int in_batches = input->info()->dimension(3);
				46	const int in_channels = input->info()->dimension(2);
				47	return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
				48	}
				49	} /* namespace */
				50
				51	namespace arm_compute
				52	{
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	53	namespace
				54	{
				55	Status validate_arguments(const ITensorInfo input, const ITensorInfo weights, const ITensorInfo biases, const ITensorInfo output, const PadStrideInfo &conv_info)
				56	{
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	57	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
				58	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
				59	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	60	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
Andrew Mundy	4d9379a	2018-03-15 16:47:03 +0000	[diff] [blame]	61	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	62	ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
				63	ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
				64
				65	if(biases != nullptr)
				66	{
				67	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
				68	ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
				69	}
				70
				71	// Get parameters from conv_info
				72	unsigned int stride_x = 0;
				73	unsigned int stride_y = 0;
				74	std::tie(stride_x, stride_y) = conv_info.stride();
				75	ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 \|\| stride_x != 1, "Winograd layer only supports unit strides.");
				76
				77	ARM_COMPUTE_UNUSED(output);
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	78	return Status{};
				79	}
				80	} //namespace
				81
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	82	NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	83	: _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
Isabella Gottardi	3f217ec	2018-02-12 14:59:19 +0000	[diff] [blame]	84	_activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	85	_workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	86	{
				87	} /* arm_compute */
				88
Isabella Gottardi	3f217ec	2018-02-12 14:59:19 +0000	[diff] [blame]	89	void NEWinogradLayer::configure(const ITensor input, const ITensor weights, const ITensor biases, ITensor output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	90	{
Andrew Mundy	4d9379a	2018-03-15 16:47:03 +0000	[diff] [blame]	91	ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	92	ARM_COMPUTE_UNUSED(conv_info);
Andrew Mundy	4d9379a	2018-03-15 16:47:03 +0000	[diff] [blame]	93	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	94
				95	_weights = weights;
				96	_input = input;
				97	_output = output;
				98
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	99	std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
				100	std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
				101	std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
				102
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	103	const int weights_width = weights->info()->dimension(0);
				104	const int weights_height = weights->info()->dimension(1);
				105
				106	int output_tile_rows = 0;
				107	int output_tile_cols = 0;
				108	int n_gemms = 0;
				109	int N_BLOCK = 0; // Size of block used by GEMM.
				110
				111	switch(weights_width)
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	112	{
				113	case 3:
				114	{
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	115	transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
				116	transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
				117	transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	118	output_tile_rows = 2;
				119	output_tile_cols = 2;
				120	n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
				121	N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	122	break;
				123	}
				124	case 5:
				125	{
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	126	transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
				127	transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
				128	transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	129	output_tile_rows = 2;
				130	output_tile_cols = 2;
				131	n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
				132	N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	133	break;
				134	}
				135	default:
				136	{
				137	ARM_COMPUTE_ERROR("Not supported.");
				138	break;
				139	}
				140	}
				141
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	142	const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
				143	const bool use_same_padding = use_padding_type == PADDING_SAME;
				144
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	145	// Get parameters from conv_info
				146	unsigned int stride_x = 0;
				147	unsigned int stride_y = 0;
				148	std::tie(stride_x, stride_y) = conv_info.stride();
				149	ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 \|\| stride_x != 1, "Winograd layer only supports unit strides.");
				150
				151	// Get convolved dimensions
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	152	const int in_channels = input->info()->dimension(2);
				153	const int out_channels = output->info()->dimension(2);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	154
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	155	const Tensor4DShape in_shape(internal_get_input_shape(input));
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	156	const size_t data_type_size = input->info()->element_size();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	157	// Get the memory required to instantiate a new Winograd operator.
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	158	constexpr size_t storage_alignment = 64;
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	159	const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	160	_kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	161	_kernel_storage.allocator()->allocate();
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	162	// Input storage
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	163	const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	164	_input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	165	_input_workspace.allocator()->allocate();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	166
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	167	// Output storage
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	168	const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	169	_output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	170	_output_workspace.allocator()->allocate();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	171
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	172	// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
				173	TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
				174	_output->info()->dimension(1), _output->info()->dimension(3)),
				175	1, _output->info()->data_type());
				176	_output_nhwc.allocator()->init(info);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	177	_output_nhwc.allocator()->allocate();
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	178
				179	// Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
Georgios Pinitas	02ee429	2018-02-15 17:22:36 +0000	[diff] [blame]	180	_permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	181	_weights_hwio.allocator()->allocate();
				182
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	183	// configure the kernel to transform the input tensor from NCHW -> NHWC
				184	_permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	185	_input_nhwc.allocator()->allocate();
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	186
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	187	const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	188
				189	// Configure the InputTransform
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	190	const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
				191	transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	192	reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
				193
				194	// Configure WeightsTransform
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	195	const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
				196	transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	197
				198	// Configure OutputTransform
				199	//The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	200	const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
				201	const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	202
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	203	transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	204	output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
				205	in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	206
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	207	// Configure GEMM
				208	const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
				209	const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
				210	const int m = in_shape.n_batches * tile_rows * tile_cols;
				211	const int k = in_shape.n_channels;
				212	const int n = out_channels;
				213	const int input_matrix_row_stride = in_shape.n_channels;
				214	const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
				215	const int output_matrix_row_stride = kernel_matrix_row_stride;
				216	unsigned int num_threads = NEScheduler::get().num_threads();
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	217
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	218	_arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
				219	_arm_gemm->set_arrays(reinterpret_cast<float >(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float >(_kernel_storage.buffer()),
				220	kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
				221
				222	auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
				223	acl_gemm_wrapper->configure(_arm_gemm.get());
				224	const size_t workspace_size = _arm_gemm->get_working_size();
				225
				226	// Allocate workspace
				227	if(workspace_size > 0)
				228	{
				229	const unsigned int alignment = 4096;
				230	allocate_workspace(workspace_size, _workspace, _memory_group, alignment, 1);
				231	_arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
				232	}
				233
				234	const unsigned int window_size = _arm_gemm->get_window_size();
				235	if(window_size < num_threads)
				236	{
				237	num_threads = window_size;
				238	_arm_gemm->set_nthreads(num_threads);
				239	}
				240
				241	_gemm_kernel = std::move(acl_gemm_wrapper);
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	242
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	243	// Reorder the convoluted output to ACL's ordering NCHW
				244	_permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	245
				246	_transform_input_kernel = std::move(transform_input_kernel);
				247	_transform_weights_kernel = std::move(transform_weights_kernel);
				248	_transform_output_kernel = std::move(transform_output_kernel);
Isabella Gottardi	3f217ec	2018-02-12 14:59:19 +0000	[diff] [blame]	249
				250	//Configure Activation Layer
				251	_is_activationlayer_enabled = act_info.enabled();
				252	if(_is_activationlayer_enabled)
				253	{
				254	_activationlayer_function.configure(output, nullptr, act_info);
				255	}
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	256	}
				257
				258	void NEWinogradLayer::run()
				259	{
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	260	_memory_group.acquire();
				261	if(!_reshaped_kernel)
				262	{
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	263	_reshaped_kernel = true;
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	264	_permute_weights.run();
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	265	NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	266	}
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	267	//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	268	_permute_input.run();
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	269
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	270	// Transform input tensor to the winograd domain
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	271	NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	272
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	273	//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
Michalis Spyrou	2b3129e	2018-04-25 18:10:13 +0100	[diff] [blame^]	274	NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	275
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	276	// Transform output tensor to the spatial domain
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame]	277	NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	278
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	279	// Reorder the convoluted output to ACL's ordering NCHW
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	280	_permute_output.run();
Isabella Gottardi	3f217ec	2018-02-12 14:59:19 +0000	[diff] [blame]	281
				282	if(_is_activationlayer_enabled)
				283	{
				284	_activationlayer_function.run();
				285	}
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	286	_memory_group.release();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	287	}
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	288
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	289	Status NEWinogradLayer::validate(const ITensorInfo input, const ITensorInfo weights, const ITensorInfo biases, const ITensorInfo output, const PadStrideInfo &conv_info,
				290	const ActivationLayerInfo &act_info)
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	291	{
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	292	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	293
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	294	// Get indices for the width and height
				295	const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
				296	const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	297	// Input shape
				298	const TensorShape input_shape = input->tensor_shape();
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	299
				300	// Kernel size
				301	const unsigned int kernel_w = weights->tensor_shape()[idx_width];
				302	const unsigned int kernel_h = weights->tensor_shape()[idx_height];
				303
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	304	const WinogradInfo winograd_info = WinogradInfo(Size2D(2, 2),
				305	Size2D(kernel_w, kernel_h),
				306	Size2D(input_shape[idx_width], input_shape[idx_height]),
				307	conv_info,
				308	input->data_layout());
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	309
				310	// Validate input transform
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	311	const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	312	const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
				313	switch(weights->dimension(0))
				314	{
				315	case 3:
				316	{
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	317	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	318	break;
				319	}
				320	case 5:
				321	{
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	322	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	323	break;
				324	}
				325	default:
				326	{
				327	ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
				328	break;
				329	}
				330	}
				331	// Validate filter transform
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	332	const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	333	const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
				334
				335	switch(weights->dimension(0))
				336	{
				337	case 3:
				338	{
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	339	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	340	break;
				341	}
				342	case 5:
				343	{
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	344	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	345	break;
				346	}
				347	default:
				348	{
				349	ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
				350	break;
				351	}
				352	}
				353	// Validate batched matrix multiply
				354	TensorShape batched_mm_output_shape = input0.tensor_shape();
				355	batched_mm_output_shape[0] = input1.tensor_shape()[0];
				356	const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
				357	switch(weights->dimension(0))
				358	{
				359	case 3:
				360	{
				361	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false,
				362	true /* Reshape weights only for the first run*/))));
				363	// Validate output transform
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	364	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	365	break;
				366	}
				367	case 5:
				368	{
				369	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false,
				370	true /* Reshape weights only for the first run*/))));
				371	// Validate output transform
Vidhya Sudhan Loganathan	84ce1f9	2018-04-25 13:00:09 +0100	[diff] [blame]	372	ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
Vidhya Sudhan Loganathan	3ca9786	2018-04-23 08:20:04 +0100	[diff] [blame]	373	break;
				374	}
				375	default:
				376	{
				377	ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
				378	break;
				379	}
				380	}
				381
				382	// Validate Activation Layer
				383	if(act_info.enabled())
				384	{
				385	NEActivationLayer::validate(output, nullptr, act_info);
				386	}
Isabella Gottardi	6acc6ad	2018-02-02 17:19:18 +0000	[diff] [blame]	387	return Status{};
				388	}
				389
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	390	} // namespace arm_compute