Blame - src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp - ml/ComputeLibrary

blob: feb2a5a2c1f4c8e9c5b07fe5384cabdeb11fcb0c [file] [log] [blame]

Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	1	/*
Pablo Tello	7594f98	2023-01-30 14:19:24 +0000	[diff] [blame]	2	* Copyright (c) 2022-2023 Arm Limited.
Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
ramelg01	a1f7851	2022-06-29 16:28:10 +0100	[diff] [blame]	25	#include <algorithm>
				26	#include <cstddef>
				27	#include <arm_neon.h>
Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	28
ramelg01	a1f7851	2022-06-29 16:28:10 +0100	[diff] [blame]	29	namespace arm_conv {
				30	namespace winograd {
				31	namespace output_transform {
Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	32
/**
 * fp32 Winograd output transform producing a 1x4 output tile from a 1x8 tile
 * in the Winograd domain.
 *
 * For every channel, eight input values are read (one per inner-tile column,
 * consecutive columns being `matrix_stride` floats apart), reduced to four
 * output values using the fixed integer weights written out below, offset by
 * the per-channel bias (when supplied), clamped to
 * [`output_min`, `output_max`] and stored (consecutive output columns being
 * `output_col_stride` floats apart).
 *
 * Channels are consumed four at a time, then two at a time, then one at a
 * time; all three paths apply the same weights in the same order.
 *
 * @param n_channels         Number of channels to transform.
 * @param inptr              Pointer to the first channel of column 0 of the input tile.
 * @param matrix_stride      Distance, in floats, between consecutive input columns.
 * @param bptr               Per-channel bias values, or nullptr for no bias.
 * @param outptr             Pointer to the first channel of column 0 of the output tile.
 * @param (unnamed)          Output row stride; unused as the tile has a single row.
 * @param output_col_stride  Distance, in floats, between consecutive output columns.
 * @param output_min         Lower clamp applied to every output value.
 * @param output_max         Upper clamp applied to every output value.
 */
void arm_fp32_1x4_1x5(
  unsigned int n_channels,
  const float* inptr,
  size_t matrix_stride,
  const float* bptr,
  float *outptr,
  size_t,  // No need to stride across rows
  size_t output_col_stride,
  float output_min,
  float output_max
)
{
  constexpr auto inner_tile_cols = 8u, output_tile_cols = 4u;

  // Four channels per iteration (128-bit vectors)
  for (; n_channels >= 4; n_channels -= 4)
  {
    // Matrices used and computed during this transform
    float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);

    // Read a 1x8 tile in the Winograd domain
    for (auto j = 0u; j < inner_tile_cols; j++)
    {
      F[j] = vld1q_f32(inptr + j*matrix_stride);
    }
    inptr += 4;

    // Apply the 8 -> 4 column reduction; the accumulation order is kept fixed
    // so that results do not change with respect to floating-point rounding.
    f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
    f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
    f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
    f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);

    // Load the bias for these channels, if any
    if (bptr != nullptr)
    {
      b = vld1q_f32(bptr);
      bptr += 4;
    }

    // Write out the output tile: add bias, clamp, store
    const float32x4_t upper = vdupq_n_f32(output_max);
    const float32x4_t lower = vdupq_n_f32(output_min);
    for (auto j = 0u; j < output_tile_cols; j++)
    {
      const auto y = vmaxq_f32(vminq_f32(vaddq_f32(f[j], b), upper), lower);
      vst1q_f32(outptr + j*output_col_stride, y);
    }
    outptr += 4;
  }

  // Two channels per iteration (64-bit vectors)
  for (; n_channels >= 2; n_channels -= 2)
  {
    // Matrices used and computed during this transform
    float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);

    // Read a 1x8 tile in the Winograd domain
    for (auto j = 0u; j < inner_tile_cols; j++)
    {
      F[j] = vld1_f32(inptr + j*matrix_stride);
    }
    inptr += 2;

    f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
    f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
    f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
    f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);

    // Load the bias for these channels, if any
    if (bptr != nullptr)
    {
      b = vld1_f32(bptr);
      bptr += 2;
    }

    // Write out the output tile: add bias, clamp, store
    const float32x2_t upper = vdup_n_f32(output_max);
    const float32x2_t lower = vdup_n_f32(output_min);
    for (auto j = 0u; j < output_tile_cols; j++)
    {
      const auto y = vmax_f32(vmin_f32(vadd_f32(f[j], b), upper), lower);
      vst1_f32(outptr + j*output_col_stride, y);
    }
    outptr += 2;
  }

  // Remaining channels, one at a time (scalar)
  for (; n_channels; n_channels--)
  {
    // Matrices used and computed during this transform
    float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;

    // Read a 1x8 tile in the Winograd domain
    for (auto j = 0u; j < inner_tile_cols; j++)
    {
      F[j] = *(inptr + j*matrix_stride);
    }
    inptr++;

    // Same weights as the vector paths above
    f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
    f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
    f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
    f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1 + F[7]*1;

    // Load the bias for this channel, if any
    if (bptr != nullptr)
    {
      b = *(bptr++);
    }

    // Write out the output tile: add bias, clamp, store
    for (auto j = 0u; j < output_tile_cols; j++)
    {
      const auto y = std::max(std::min(f[j] + b, output_max), output_min);
      *(outptr + j*output_col_stride) = y;
    }
    outptr++;
  }
}
				142
ramelg01	a1f7851	2022-06-29 16:28:10 +0100	[diff] [blame]	143	} // namespace output_transform
Pablo Tello	000d33a	2018-09-03 16:59:20 +0100	[diff] [blame]	144	} // namespace winograd
ramelg01	a1f7851	2022-06-29 16:28:10 +0100	[diff] [blame]	145	} // namespace arm_conv