Blame - tests/validation/reference/Winograd.cpp - ml/ComputeLibrary

blob: c760663b225160c4eded6c11a79ae60287bc9b2f [file] [log] [blame]

Giorgio Arena	1f9ca1d	2018-03-01 11:13:45 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2018 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "Winograd.h"
				25
				26	#include "tests/validation/Helpers.h"
				27	#include "tests/validation/reference/Utils.h"
				28
Gian Marco Iodice	7e4b239	2018-02-22 16:17:20 +0000	[diff] [blame]	29	#include "arm_compute/core/Types.h"
				30
Giorgio Arena	1f9ca1d	2018-03-01 11:13:45 +0000	[diff] [blame]	31	namespace arm_compute
				32	{
				33	namespace test
				34	{
				35	namespace validation
				36	{
				37	namespace reference
				38	{
				39	namespace
				40	{
				41	template <typename T>
Gian Marco Iodice	7e4b239	2018-02-22 16:17:20 +0000	[diff] [blame]	42	void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
				43	{
				44	// Simple tensor for the 3x3 input tile
				45	SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
				46
				47	// Simple tensor for the transformation matrix
				48	SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
				49
				50	// Simple tensor for the transformation matrix transpose
				51	SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
				52
				53	// Simple tensor for the 4x3 temporary tile
				54	SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
				55
				56	// Simple tensor for the 4x4 output tile
				57	SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
				58
				59	// Initialize transformation matrix
				60	// 1 \| 0 \| 0
				61	// 0.5 \| 0.5 \| 0.5
				62	// 0.5 \|-0.5 \| 0.5
				63	// 0 \| 0 \| 1
				64	trans_matrix[0 + 0 * 3] = 1.0f;
				65	trans_matrix[1 + 0 * 3] = 0.0f;
				66	trans_matrix[2 + 0 * 3] = 0.0f;
				67	trans_matrix[0 + 1 * 3] = 0.5f;
				68	trans_matrix[1 + 1 * 3] = 0.5f;
				69	trans_matrix[2 + 1 * 3] = 0.5f;
				70	trans_matrix[0 + 2 * 3] = 0.5f;
				71	trans_matrix[1 + 2 * 3] = -0.5f;
				72	trans_matrix[2 + 2 * 3] = 0.5f;
				73	trans_matrix[0 + 3 * 3] = 0.0f;
				74	trans_matrix[1 + 3 * 3] = 0.0f;
				75	trans_matrix[2 + 3 * 3] = 1.0f;
				76
				77	// Transpose the transformation matrix
				78	transpose_matrix(trans_matrix, trans_matrix_transposed);
				79
				80	const int num_channels = in.shape()[2];
				81	const int num_filters = in.shape()[3];
				82	const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters);
				83
				84	for(int n = 0; n < num_batches; ++n)
				85	{
				86	for(int w = 0; w < num_filters; ++w)
				87	{
				88	for(int z = 0; z < num_channels; ++z)
				89	{
				90	// Load the 3x3 tile from the input tensor
				91	get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
				92
				93	// First transformation
				94	matrix_multiply(trans_matrix, input_tile, tmp_tile);
				95
				96	// Second transformation
				97	matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
				98
				99	// Store the 4x4 output tile across the 16 channels
				100	const int output_offset = w + z * num_filters;
				101	out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4];
				102	out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4];
				103	out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4];
				104	out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4];
				105	out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4];
				106	out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4];
				107	out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4];
				108	out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4];
				109	out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4];
				110	out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4];
				111	out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
				112	out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
				113	out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
				114	out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
				115	out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
				116	out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
				117	}
				118	}
				119	}
				120	}
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame^]	121
				122	template <typename T>
				123	void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
				124	{
				125	TensorShape shape4x4(4u, 4u);
				126
				127	// Simple tensor for the 4x4 input tile
				128	SimpleTensor<T> src_tile{ shape4x4, src.data_type() };
				129
				130	// Simple tensor for the 4x4 temporary tile
				131	SimpleTensor<T> tmp_tile{ shape4x4, src.data_type() };
				132
				133	// Simple tensor for the 4x4 output tile
				134	SimpleTensor<T> dst_tile{ shape4x4, src.data_type() };
				135
				136	// Simple tensor for the transformation matrix
				137	SimpleTensor<T> matrix{ shape4x4, src.data_type() };
				138
				139	// Simple tensor for the transformation matrix transposed
				140	SimpleTensor<T> matrix_transposed{ shape4x4, src.data_type() };
				141
				142	const float matrix_values[] = { 1.f, 0.f, -1.f, 0.f,
				143	0.f, 1.f, 1.f, 0.f,
				144	0.f, -1.f, 1.f, 0.f,
				145	0.f, 1.f, 0.f, -1.f
				146	};
				147
				148	for(int i = 0; i < matrix.num_elements(); ++i)
				149	{
				150	matrix[i] = matrix_values[i];
				151	}
				152
				153	transpose_matrix(matrix, matrix_transposed);
				154
				155	const int in_w = src.shape().x();
				156	const int in_h = src.shape().y();
				157	const int in_d = src.shape().z();
				158	const int num_batches = src.shape().total_size() / (in_w * in_h * in_d);
				159	const int num_tiles_x = std::ceil((in_w - 2 + conv_info.pad_left() + conv_info.pad_right()) / 2.0f);
				160	const int num_tiles_y = std::ceil((in_h - 2 + conv_info.pad_top() + conv_info.pad_bottom()) / 2.0f);
				161
				162	ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(dst.shape().y()));
				163
				164	for(int b = 0; b < num_batches; ++b)
				165	{
				166	for(int z = 0; z < in_d; ++z)
				167	{
				168	for(int y = 0; y < num_tiles_y; ++y)
				169	{
				170	for(int x = 0; x < num_tiles_x; ++x)
				171	{
				172	int xi = x * 2 - conv_info.pad_left();
				173	int yi = y * 2 - conv_info.pad_top();
				174
				175	// Get the 4x4 tile from the input tensor
				176	get_tile(src, src_tile, Coordinates(xi, yi, z, b));
				177
				178	// Compute the transformation
				179	matrix_multiply(matrix, src_tile, tmp_tile);
				180	matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
				181
				182	// Store the 4x4 output tile across the 16 channels
				183	for(int i = 0; i < 16; ++i)
				184	{
				185	int xo = z;
				186	int yo = x + y * num_tiles_x;
				187	dst[coords2index(dst.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
				188	}
				189	}
				190	}
				191	}
				192	}
				193	}
				194
				195	template <typename T>
				196	void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
				197	{
				198	ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
				199	ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
				200
				201	// Simple tensor for the 3x3 input tile
				202	SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
				203
				204	// Simple tensor for the transformation matrix
				205	SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
				206
				207	// Simple tensor for the transformation matrix transpose
				208	SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
				209
				210	// Simple tensor for the 4x3 temporary tile
				211	SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
				212
				213	// Simple tensor for the 4x4 output tile
				214	SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
				215
				216	// Initialize transformation matrix
				217	// 1 \| 1 \| 1 \| 1
				218	// 0 \| 1 \| -1 \| -1
				219	trans_matrix[0 + 0 * 4] = 1.0f;
				220	trans_matrix[1 + 0 * 4] = 1.0f;
				221	trans_matrix[2 + 0 * 4] = 1.0f;
				222	trans_matrix[3 + 0 * 4] = 0.0f;
				223	trans_matrix[0 + 1 * 4] = 0.0f;
				224	trans_matrix[1 + 1 * 4] = 1.0f;
				225	trans_matrix[2 + 1 * 4] = -1.0f;
				226	trans_matrix[3 + 1 * 4] = -1.0f;
				227
				228	// Transpose the transformation matrix
				229	transpose_matrix(trans_matrix, trans_matrix_transposed);
				230
				231	const int w_in = in.shape()[0];
				232	const int h_in = in.shape()[1];
				233	const int c_in = in.shape()[2];
				234	const int w_out = out.shape()[0];
				235	const int h_out = out.shape()[1];
				236	const int c_out = out.shape()[2];
				237	const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
				238
				239	// Input strides
				240	const int stridey_in = w_in;
				241	const int stridez_in = stridey_in * h_in;
				242	const int stridew_in = stridez_in * c_in;
				243
				244	// Output strides
				245	const int stridey_out = w_out;
				246	const int stridez_out = stridey_out * h_out;
				247	const int stridew_out = stridez_out * c_out;
				248
				249	for(int n = 0; n < num_batches; ++n)
				250	{
				251	for(int y = 0; y < h_in; ++y)
				252	{
				253	for(int x = 0; x < w_in; ++x)
				254	{
				255	// Load the 4x4 tile across the 16 channels of the input tensor
				256	for(int z = 0; z < c_in; ++z)
				257	{
				258	input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
				259	}
				260
				261	// First transformation
				262	matrix_multiply(trans_matrix, input_tile, tmp_tile);
				263
				264	// Second transformation
				265	matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
				266
				267	// Store the 2x2 output tile
				268	const int xo = (y % num_tiles_x) * 2;
				269	const int yo = (y / num_tiles_x) * 2;
				270	const int zo = x;
				271
				272	const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
				273	out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
				274
				275	// Check out-of-bound writes
				276	if(xo + 1 < w_out)
				277	{
				278	out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
				279	}
				280
				281	if(yo + 1 < h_out)
				282	{
				283	out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
				284	}
				285
				286	if((yo + 1 < h_out) && (xo + 1 < w_out))
				287	{
				288	out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
				289	}
				290	}
				291	}
				292	}
				293	}
Giorgio Arena	1f9ca1d	2018-03-01 11:13:45 +0000	[diff] [blame]	294	} // namespace
				295
				296	template <typename T>
				297	SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims)
				298	{
				299	ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
				300	ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
				301
				302	SimpleTensor<T> dst{ dst_shape, src.data_type() };
				303
				304	switch(kernel_dims.width)
				305	{
				306	case 3:
				307	winograd_input_transform3x3(src, dst, conv_info);
				308	break;
				309	default:
				310	ARM_COMPUTE_ERROR("Only 3x3 kernels are supported");
				311	}
				312
				313	return dst;
				314	}
				315
Gian Marco Iodice	7e4b239	2018-02-22 16:17:20 +0000	[diff] [blame]	316	template <typename T>
				317	SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape)
				318	{
				319	ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
				320
				321	// Create reference
				322	SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
				323
				324	switch(in.shape()[0])
				325	{
				326	case 3:
				327	winograd_filter_transform3x3(in, out);
				328	break;
				329	default:
				330	ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
				331	break;
				332	}
				333
				334	return out;
				335	}
				336
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame^]	337	template <typename T>
				338	SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
				339	{
				340	ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
				341	ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
				342	ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
				343
				344	// Create reference
				345	SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
				346
				347	switch(kernel_dims.width)
				348	{
				349	case 3:
				350	winograd_output_transform3x3(in, out, num_tiles.width);
				351	break;
				352	default:
				353	ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
				354	break;
				355	}
				356
				357	return out;
				358	}
				359
Giorgio Arena	1f9ca1d	2018-03-01 11:13:45 +0000	[diff] [blame]	360	template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
Gian Marco Iodice	7e4b239	2018-02-22 16:17:20 +0000	[diff] [blame]	361	template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame^]	362	template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
Giorgio Arena	1f9ca1d	2018-03-01 11:13:45 +0000	[diff] [blame]	363	} // namespace reference
				364	} // namespace validation
				365	} // namespace test
				366	} // namespace arm_compute