Blame - src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp - ml/ComputeLibrary

blob: 05f06a81ee3eceae8dce7acdbf6ad554822c3fe3 [file] [log] [blame]

Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	1	/*
Pablo Tello	5264b7d	2019-10-21 14:25:41 +0100	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	25	#include "output.hpp"
				26	#include "arm.hpp"
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	27
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	28	namespace winograd
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	29	{
				30
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	31	template <>
				32	void OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	33	const int n_channels,
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	34	const float* inptr,
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	35	const int matrix_stride,
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	36	const float* bptr,
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	37	float* const output,
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	38	const int, // No need to stride across rows
Pablo Tello	5264b7d	2019-10-21 14:25:41 +0100	[diff] [blame]	39	const int output_col_stride,
				40	const float output_min,
				41	const float output_max
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	42	)
				43	{
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	44	// Construct a map to the output cells
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	45	float *outptrs[output_tile_cols];
				46	for (int j = 0; j < output_tile_cols; j++)
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	47	{
				48	outptrs[j] = output + j*output_col_stride;
				49	}
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	50
				51	// For each channel of the output
				52	int channels_remaining = n_channels;
				53	#ifdef __arm_any__
				54	for (; channels_remaining >= 4; channels_remaining -= 4)
				55	{
				56	// Matrices used and computed during this transform
				57	float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
				58
				59	// Read a 1x8 tile in the Winograd domain
				60	for (int j = 0; j < inner_tile_cols; j++)
				61	{
				62	F[j] = vld1q_f32(inptr + j*matrix_stride);
				63	}
				64	inptr += 4;
				65
				66	f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
				67	f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
				68	f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
				69	f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
				70	f[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
				71	f[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
				72
				73	// Write out the output tile
				74	if (bptr != 0)
				75	{
				76	b = vld1q_f32(bptr);
				77	bptr += 4;
				78	}
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	79	for (int j = 0; j < output_tile_cols; j++)
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	80	{
Pablo Tello	5264b7d	2019-10-21 14:25:41 +0100	[diff] [blame]	81	const auto y = vminq_f32(vmaxq_f32(f[j] + b, vdupq_n_f32(output_min)),
				82	vdupq_n_f32(output_max));
				83	vst1q_f32(outptrs[j], y);
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	84	outptrs[j] += 4;
				85	}
				86	}
				87	for (; channels_remaining >= 2; channels_remaining -= 2)
				88	{
				89	// Matrices used and computed during this transform
				90	float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
				91
				92	// Read a 1x8 tile in the Winograd domain
				93	for (int j = 0; j < inner_tile_cols; j++)
				94	{
				95	F[j] = vld1_f32(inptr + j*matrix_stride);
				96	}
				97	inptr += 2;
				98
				99	f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
				100	f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
				101	f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
				102	f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
				103	f[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
				104	f[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
				105
				106	// Write out the output tile
				107	if (bptr != 0)
				108	{
				109	b = vld1_f32(bptr);
				110	bptr += 2;
				111	}
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	112	for (int j = 0; j < output_tile_cols; j++)
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	113	{
Pablo Tello	5264b7d	2019-10-21 14:25:41 +0100	[diff] [blame]	114	const auto y = vmin_f32(vmax_f32(f[j] + b, vdup_n_f32(output_min)),
				115	vdup_n_f32(output_max));
				116	vst1_f32(outptrs[j], y);
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	117	outptrs[j] += 2;
				118	}
				119	}
				120	#endif // __arm_any__
				121	for (; channels_remaining; channels_remaining--)
				122	{
				123	// Matrices used and computed during this transform
				124	float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
				125
				126	// Read a 1x8 tile in the Winograd domain
				127	for (int j = 0; j < inner_tile_cols; j++)
				128	{
				129	F[j] = (inptr + jmatrix_stride);
				130	}
				131	inptr++;
				132
				133	f[0] = F[0]1 + F[1]1 + F[2]1 + F[3]1 + F[4]1 + F[5]1 + F[6]*1;
				134	f[1] = F[1]-1 + F[5]-3 + F[3]-2 + F[4]2 + F[6]3 + F[2]1;
				135	f[2] = F[3]4 + F[4]4 + F[5]9 + F[6]9 + F[1]1 + F[2]1;
				136	f[3] = F[1]-1 + F[5]-27 + F[3]-8 + F[4]8 + F[6]27 + F[2]1;
				137	f[4] = F[3]16 + F[4]16 + F[5]81 + F[6]81 + F[1]1 + F[2]1;
				138	f[5] = F[1]-1 + F[5]-243 + F[3]-32 + F[4]32 + F[6]243 + F[2]1 + F[7]*1;
				139
				140	// Write out the output tile
				141	if (bptr != 0)
				142	{
				143	b = *(bptr++);
				144	}
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	145	for (int j = 0; j < output_tile_cols; j++)
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	146	{
Pablo Tello	5264b7d	2019-10-21 14:25:41 +0100	[diff] [blame]	147	*(outptrs[j]++) = std::max(std::min(f[j] + b, output_max), output_min);
Pablo Tello	bda6e4b	2018-08-22 11:40:33 +0100	[diff] [blame]	148	}
				149	}
				150	}
				151
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	152	template class OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
				153	template class OutputTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
Pablo Tello	d3d97d2	2018-10-05 10:59:48 +0100	[diff] [blame]	154
Pablo Tello	8f43d74	2019-03-27 09:28:32 +0000	[diff] [blame]	155	} // namespace