Blame - src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp - ml/ComputeLibrary

blob: 974be481e7639e1b9d6a8a20b2b6b6472309b792 [file] [log] [blame]

Georgios Pinitas	94672fb	2020-01-22 18:36:27 +0000	[diff] [blame]	1	/*
Michele Di Giorgio	d9eaf61	2020-07-08 11:12:57 +0100	[diff] [blame]	2	* Copyright (c) 2017-2019 Arm Limited.
Georgios Pinitas	94672fb	2020-01-22 18:36:27 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#pragma once
				25
				26	#ifdef __aarch64__
				27
				28	#include "transpose_interleave_common.hpp"
				29
				30	// Generic unblocked transposed 8x32-bit sized specialisation
				31	template <>
				32	template <typename T>
				33	inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
				34	T* out, const T* const in, const int stride,
				35	const int x0, const int xmax, const int k0, const int kmax
				36	) {
				37	// Redirect to a 16 x uint16_t specialisation
				38	TransformImpl<16, 1, true, 2, 2, false>::Transform(
				39	reinterpret_cast<uint16_t *>(out),
				40	reinterpret_cast<const uint16_t *>(in),
				41	stride2, x02, xmax*2, k0, kmax
				42	);
				43	}
				44
				45	// Generic 16x16-bit sized specialisation
				46	template <>
				47	template <typename T>
				48	inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
				49	T* out, const T* const in, const int stride,
				50	const int x0, const int xmax, const int k0, const int kmax
				51	) {
				52	// Redirect to a uint16_t specialisation
				53	Transform(
				54	reinterpret_cast<uint16_t *>(out),
				55	reinterpret_cast<const uint16_t *>(in),
				56	stride, x0, xmax, k0, kmax
				57	);
				58	}
				59
				60	// Specialised 16 x uint16_t version
				61	template <>
				62	inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t &in0, uint16_t const out) {
				63	__asm volatile (
				64	"LDR q0, [%[in0]]\n"
				65	"STR q0, [%[out]]\n"
				66	"LDR q1, [%[in0], #0x10]\n"
				67	"STR q1, [%[out], #0x10]\n"
				68	"ADD %x[in0], %x[in0], #0x20\n"
				69	ASM_PREFETCH("[%[in0], #192]")
				70	: [in0] "+r" (in0)
				71	: [out] "r" (out)
				72	: "v0", "v1", "memory"
				73	);
				74	}
				75
				76	template <>
				77	inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t &in0, const uint16_t &in1, uint16_t *const out) {
				78	__asm volatile (
				79	"LDR q0, [%[in0]]\n"
				80	"STR q0, [%[out]]\n"
				81	"LDR q1, [%[in0], #0x10]\n"
				82	"STR q1, [%[out], #0x10]\n"
				83	"ADD %x[in0], %x[in0], #0x20\n"
				84	ASM_PREFETCH("[%[in0], #192]")
				85
				86	"LDR q2, [%[in1]]\n"
				87	"STR q2, [%[out], #0x20]\n"
				88	"LDR q3, [%[in1], #0x10]\n"
				89	"STR q3, [%[out], #0x30]\n"
				90	"ADD %x[in1], %x[in1], #0x20\n"
				91	ASM_PREFETCH("[%[in1], #192]")
				92	: [in0] "+r" (in0),
				93	[in1] "+r" (in1)
				94	: [out] "r" (out)
				95	: "v0", "v1", "v2", "v3", "memory"
				96	);
				97	}
				98
				99	template <>
				100	inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t &in0, const uint16_t &in1, const uint16_t &in2, const uint16_t &in3, uint16_t *const out) {
				101	__asm __volatile (
				102	"LDR q0, [%[in0]]\n"
				103	"STR q0, [%[out]]\n"
				104	"LDR q1, [%[in0], #0x10]\n"
				105	"STR q1, [%[out], #0x10]\n"
				106	"ADD %x[in0], %x[in0], #0x20\n"
				107	ASM_PREFETCH("[%[in0], #192]")
				108
				109	"LDR q2, [%[in1]]\n"
				110	"STR q2, [%[out], #0x20]\n"
				111	"LDR q3, [%[in1], #0x10]\n"
				112	"STR q3, [%[out], #0x30]\n"
				113	"ADD %x[in1], %x[in1], #0x20\n"
				114	ASM_PREFETCH("[%[in1], #192]")
				115
				116	"LDR q0, [%[in2]]\n"
				117	"STR q0, [%[out], #0x40]\n"
				118	"LDR q1, [%[in2], #0x10]\n"
				119	"STR q1, [%[out], #0x50]\n"
				120	"ADD %x[in2], %x[in2], #0x20\n"
				121	ASM_PREFETCH("[%[in2], #192]")
				122
				123	"LDR q2, [%[in3]]\n"
				124	"STR q2, [%[out], #0x60]\n"
				125	"LDR q3, [%[in3], #0x10]\n"
				126	"STR q3, [%[out], #0x70]\n"
				127	"ADD %x[in3], %x[in3], #0x20\n"
				128	ASM_PREFETCH("[%[in3], #192]")
				129	: [in0] "+r" (in0),
				130	[in1] "+r" (in1),
				131	[in2] "+r" (in2),
				132	[in3] "+r" (in3)
				133	: [out] "r" (out)
				134	: "v0", "v1", "v2", "v3", "memory"
				135	);
				136	}
				137
				138	template <>
				139	template <>
				140	inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
				141	uint16_t* out, const uint16_t* const in, const int stride,
				142	const int x0, const int xmax, const int k0, const int kmax
				143	) {
				144	TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
				145	}
				146
				147	#endif // __aarch64__