Blame - src/runtime/NEON/functions/NEWinogradLayer.cpp - ml/ComputeLibrary

blob: 215f1bfddf8ed12a5a2811daeaee8db27dbceaee [file] [log] [blame]

Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	1	/*
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	2	* Copyright (c) 2017-2018 ARM Limited.
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
				25
				26	#include "arm_compute/core/Utils.h"
				27	#include "arm_compute/core/Validate.h"
				28	#include "arm_compute/runtime/NEON/NEScheduler.h"
				29	#include "support/ToolchainSupport.h"
				30
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	31	#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	32
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	33	namespace
				34	{
				35	inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
				36	{
				37	const int in_width = input->info()->dimension(0);
				38	const int in_height = input->info()->dimension(1);
				39	const int in_batches = input->info()->dimension(3);
				40	const int in_channels = input->info()->dimension(2);
				41	return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
				42	}
				43	} /* namespace */
				44
				45	namespace arm_compute
				46	{
				47	NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	48	: _memory_group(std::move(memory_manager)), _winograd_kernel(), _transform_input_kernel(), _transform_output_kernel(), _transform_weights_kernel(), _permute_input(), _permute_weights(),
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	49	_permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false)
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	50	{
				51	} /* arm_compute */
				52
				53	void NEWinogradLayer::configure(const ITensor input, const ITensor weights, const ITensor biases, ITensor output, const PadStrideInfo &conv_info)
				54	{
				55	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	56	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	57	ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(1) != 3 \|\| weights->info()->dimension(0) != 3, "Only 3x3 kernels are supported");
				58	ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
				59
				60	if(biases != nullptr)
				61	{
				62	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
				63	ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
				64	}
				65
				66	_weights = weights;
				67	_input = input;
				68	_output = output;
				69
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	70	const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
				71	const bool use_same_padding = use_padding_type == PADDING_SAME;
				72
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	73	// Get parameters from conv_info
				74	unsigned int stride_x = 0;
				75	unsigned int stride_y = 0;
				76	std::tie(stride_x, stride_y) = conv_info.stride();
				77	ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 \|\| stride_x != 1, "Winograd layer only supports unit strides.");
				78
				79	// Get convolved dimensions
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	80	const int in_channels = input->info()->dimension(2);
				81	const int out_channels = output->info()->dimension(2);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	82
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	83	const Tensor4DShape in_shape(internal_get_input_shape(input));
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	84	const size_t data_type_size = input->info()->element_size();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	85	// Get the memory required to instantiate a new Winograd operator.
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	86	constexpr size_t storage_alignment = 64;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	87	const size_t kernel_storage_size = NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3>::get_weight_storage_size(out_channels, in_channels) * data_type_size;
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	88	_kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	89	_memory_group.manage(&_kernel_storage);
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	90	_memory_group.manage(&_input_nhwc);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	91	_kernel_storage.allocator()->allocate();
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	92	// Input storage
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	93
				94	using IT = NEWinogradLayerTransformInputKernel<2, 2, 3, 3>;
				95	const size_t input_storage_size = IT::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	96	_input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
				97	_memory_group.manage(&_input_workspace);
				98	_input_workspace.allocator()->allocate();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	99
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	100	// Output storage
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	101	using OT = NEWinogradLayerTransformOutputKernel<2, 2, 3, 3>;
				102	const size_t output_storage_size = OT::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	103	_output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
				104	_memory_group.manage(&_output_workspace);
				105	_output_workspace.allocator()->allocate();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	106
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	107	// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
				108	TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
				109	_output->info()->dimension(1), _output->info()->dimension(3)),
				110	1, _output->info()->data_type());
				111	_output_nhwc.allocator()->init(info);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	112	_output_nhwc.allocator()->allocate();
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	113
				114	// Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
Georgios Pinitas	02ee429	2018-02-15 17:22:36 +0000	[diff] [blame^]	115	_permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	116	_weights_hwio.allocator()->allocate();
				117
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	118	// configure the kernel to transform the input tensor from NCHW -> NHWC
				119	_permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	120	_input_nhwc.allocator()->allocate();
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	121
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	122	using T = winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
				123	const int weights_width = weights->info()->dimension(0);
				124	const int weights_height = weights->info()->dimension(1);
				125	const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	126
				127	// Configure the InputTransform
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	128	const int input_matrix_stride = T::get_input_matrix_stride(kernel_shape, in_shape, use_padding_type);
				129	_transform_input_kernel.configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	130	reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
				131
				132	// Configure WeightsTransform
				133	const int kernel_matrix_stride = T::get_kernel_matrix_stride(kernel_shape);
				134	_transform_weights_kernel.configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
				135
				136	// Configure OutputTransform
				137	//The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	138	const int output_matrix_stride = T::get_output_matrix_stride(kernel_shape, in_shape, use_padding_type);
				139	const auto output_shape(T::get_output_shape(kernel_shape, in_shape, use_padding_type));
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	140
				141	_transform_output_kernel.configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
				142	output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
				143	in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	144
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	145	// Configure Batched GEMMs
				146	const int tile_rows = iceildiv(output_shape.n_rows, NEWinogradLayerKernel<2, 2, 3, 3>::_output_tile_rows);
				147	const int tile_cols = iceildiv(output_shape.n_cols, NEWinogradLayerKernel<2, 2, 3, 3>::_output_tile_cols);
				148	const int m = in_shape.n_batches * tile_rows * tile_cols;
				149	const int k = in_shape.n_channels;
				150	const int n = out_channels;
				151	const int input_matrix_row_stride = in_shape.n_channels;
				152	const int kernel_matrix_row_stride = roundup(out_channels, NEWinogradLayerKernel<2, 2, 3, 3>::WinogradConv::N_BLOCK);
				153	const int output_matrix_row_stride = kernel_matrix_row_stride;
				154
				155	_winograd_kernel.configure(NEWinogradLayerKernel<2, 2, 3, 3>::WinogradBase::N_GEMMS, m, k, n,
				156	input_matrix_stride, input_matrix_row_stride,
				157	kernel_matrix_stride, kernel_matrix_row_stride,
				158	output_matrix_stride, output_matrix_row_stride,
				159	reinterpret_cast<float >(_input_workspace.buffer()), reinterpret_cast<float >(_kernel_storage.buffer()), reinterpret_cast<float *>(_output_workspace.buffer()));
				160
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	161	// Reorder the convoluted output to ACL's ordering NCHW
				162	_permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	163	}
				164
				165	void NEWinogradLayer::run()
				166	{
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	167	_memory_group.acquire();
				168	if(!_reshaped_kernel)
				169	{
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	170	_reshaped_kernel = true;
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	171	_permute_weights.run();
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	172	NEScheduler::get().schedule(&_transform_weights_kernel, Window::DimX);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	173	}
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	174	//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	175	_permute_input.run();
Pablo Tello	679463a	2018-02-06 11:47:59 +0000	[diff] [blame]	176
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	177	// Transform input tensor to the winograd domain
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	178	NEScheduler::get().schedule(&_transform_input_kernel, Window::DimX);
				179
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	180	//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	181	NEScheduler::get().schedule(&_winograd_kernel, Window::DimX);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	182
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	183	// Transform output tensor to the spatial domain
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	184	NEScheduler::get().schedule(&_transform_output_kernel, Window::DimX);
				185
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	186	// Reorder the convoluted output to ACL's ordering NCHW
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	187	_permute_output.run();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	188	_memory_group.release();
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	189	}
				190	} // namespace arm_compute