Blame - src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp - ml/ComputeLibrary

blob: ed4cfb86b93237b5951ea9cee03715fe1e2d8aaf [file] [log] [blame]

Georgios Pinitas	20c246a	2018-09-12 16:45:53 +0100	[diff] [blame^]	1	/*
				2	* Copyright (c) 2018 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	/*
				26	* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
				27	*
				28	* NOTE: Header to be included by implementation files only.
				29	*
				30	* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
				31	*/
				32	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				33	#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
				34	#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
				35
				36	#pragma once
				37
				38	namespace depthwise
				39	{
				40	// Partial specialisation for FP16 to FP16
				41	template <int OutputTileRows, int OutputTileCols,
				42	int KernelRows, int KernelCols,
				43	int StrideRows, int StrideCols>
				44	struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
				45	{
				46	typedef DepthwiseConvolution<
				47	OutputTileRows, OutputTileCols,
				48	KernelRows, KernelCols,
				49	StrideRows, StrideCols,
				50	float16_t, float16_t
				51	> DWC;
				52
				53	template <
				54	bool Specialize=false, // Specialize (or not) the method
				55	int InPadTop=0, // If specialized, top padding
				56	int InPadLeft=0, // If specialized, left padding
				57	int InPadBottom=0, // If specialized, bottom padding
				58	int InPadRight=0, // If specialized, right padding
				59	int OutPadBottom=0, // If specialized, bottom output padding
				60	int OutPadRight=0 // If specialized, bottom right padding
				61	>
				62	static void process_tile(
				63	const int n_channels,
				64	const float16_t* const weights,
				65	const int weight_row_stride,
				66	const int weight_col_stride,
				67	const float16_t* const inptr,
				68	const int in_row_stride,
				69	const int in_col_stride,
				70	float16_t* const outptr,
				71	const int out_row_stride,
				72	const int out_col_stride,
				73	const int in_pad_top=0,
				74	const int in_pad_left=0,
				75	const int in_pad_bottom=0,
				76	const int in_pad_right=0,
				77	const int out_pad_bottom=0,
				78	const int out_pad_right=0
				79	);
				80	};
				81
				82
				83	template <int OTR, int OTC, int KR, int KC, int SR, int SC>
				84	template <
				85	bool Specialize,
				86	int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
				87	int OutPadBottom, int OutPadRight
				88	>
				89	void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
				90	const int n_channels,
				91	const float16_t *__restrict__ const weights,
				92	const int weight_row_stride,
				93	const int weight_col_stride,
				94	const float16_t *__restrict__ const inptr,
				95	const int in_row_stride,
				96	const int in_col_stride,
				97	float16_t *__restrict__ const outptr,
				98	const int out_row_stride,
				99	const int out_col_stride,
				100	const int _in_pad_top,
				101	const int _in_pad_left,
				102	const int _in_pad_bottom,
				103	const int _in_pad_right,
				104	const int _out_pad_bottom,
				105	const int _out_pad_right
				106	)
				107	{
				108	constexpr auto inner_tile_rows = DWC::inner_tile_rows;
				109	constexpr auto inner_tile_cols = DWC::inner_tile_cols;
				110	constexpr auto kernel_rows = DWC::kernel_rows;
				111	constexpr auto kernel_cols = DWC::kernel_cols;
				112	constexpr auto output_tile_rows = DWC::output_tile_rows;
				113	constexpr auto output_tile_cols = DWC::output_tile_cols;
				114	constexpr auto stride_rows = DWC::stride_rows;
				115	constexpr auto stride_cols = DWC::stride_cols;
				116
				117	// Extract parameters
				118	const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
				119	const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
				120	const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
				121	const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
				122	const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
				123	const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
				124
				125	// Compute valid ranges of the tile
				126	const int in_cells_i = inner_tile_rows - in_pad_bottom;
				127	const int in_cells_j = inner_tile_cols - in_pad_right;
				128	const int out_cells_i = output_tile_rows - out_pad_bottom;
				129	const int out_cells_j = output_tile_cols - out_pad_right;
				130
				131	// Instantiate pointers
				132	const float16_t* __restrict__ inptr_base = inptr;
				133	const float16_t* __restrict__ wptr_base = weights;
				134	float16_t* __restrict__ outptr_base = outptr;
				135
				136	// Perform the depthwise convolution
				137	int channels_remaining = n_channels;
				138	#ifdef __aarch64__
				139	for (; channels_remaining >= 8; channels_remaining -= 8)
				140	{
				141	// Load input tile
				142	float16x8_t u[inner_tile_rows][inner_tile_cols];
				143	for (int i = 0; i < inner_tile_rows; i++)
				144	{
				145	const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
				146	for (int j = 0; j < inner_tile_cols; j++)
				147	{
				148	if (i < in_pad_top \|\| in_cells_i <= i \|\|
				149	j < in_pad_left \|\| in_cells_j <= j)
				150	{
				151	u[i][j] = vdupq_n_f16(0.0f);
				152	}
				153	else
				154	{
				155	u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
				156	}
				157	}
				158	}
				159	inptr_base += 8;
				160
				161	// Load weights tile
				162	float16x8_t w[kernel_rows][kernel_cols];
				163	for (int i = 0; i < kernel_rows; i++)
				164	{
				165	const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
				166	for (int j = 0; j < kernel_cols; j++)
				167	{
				168	w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
				169	}
				170	}
				171	wptr_base += 8;
				172
				173	// Perform the convolution
				174	float16x8_t v[output_tile_rows][output_tile_cols];
				175	for (int out_i = 0; out_i < out_cells_i; out_i++)
				176	{
				177	for (int out_j = 0; out_j < out_cells_j; out_j++)
				178	{
				179	// Base co-ordinate
				180	const int base_i = out_i * stride_rows;
				181	const int base_j = out_j * stride_cols;
				182
				183	// Fill the accumulator
				184	for (int in_i = 0; in_i < kernel_rows; in_i++)
				185	{
				186	const int i = base_i + in_i;
				187	for (int in_j = 0; in_j < kernel_cols; in_j++)
				188	{
				189	const int j = base_j + in_j;
				190	if (in_i == 0 && in_j == 0)
				191	{
				192	// v[out_i][out_j] = w[in_i][in_j] * u[i][j];
				193	v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
				194	}
				195	else
				196	{
				197	// v[out_i][out_j] += w[in_i][in_j] * u[i][j];
				198	v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
				199	}
				200	}
				201	}
				202	}
				203	}
				204
				205	// Store the output tile
				206	for (int i = 0; i < out_cells_i; i++)
				207	{
				208	float16_t* const outptr_row = outptr_base + i*out_row_stride;
				209	for (int j = 0; j < out_cells_j; j++)
				210	{
				211	vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
				212	}
				213	}
				214	outptr_base += 8;
				215	}
				216	#endif // __aarch64__
				217	for (; channels_remaining; channels_remaining--)
				218	{
				219	// Load input tile
				220	float16_t u[inner_tile_rows][inner_tile_cols];
				221	for (int i = 0; i < inner_tile_rows; i++)
				222	{
				223	const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
				224	for (int j = 0; j < inner_tile_cols; j++)
				225	{
				226	if (i < in_pad_top \|\| in_cells_i <= i \|\|
				227	j < in_pad_left \|\| in_cells_j <= j)
				228	{
				229	u[i][j] = static_cast<float16_t>(0);
				230	}
				231	else
				232	{
				233	u[i][j] = (inptr_row + (j - in_pad_left)in_col_stride);
				234	}
				235	}
				236	}
				237	inptr_base++;
				238
				239	// Load weights tile
				240	float16_t w[kernel_rows][kernel_cols];
				241	for (int i = 0; i < kernel_rows; i++)
				242	{
				243	const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
				244	for (int j = 0; j < kernel_cols; j++)
				245	{
				246	w[i][j] = (wptr_row + jweight_col_stride);
				247	}
				248	}
				249	wptr_base++;
				250
				251	// Perform the convolution
				252	float16_t v[output_tile_rows][output_tile_cols];
				253	for (int out_i = 0; out_i < out_cells_i; out_i++)
				254	{
				255	for (int out_j = 0; out_j < out_cells_j; out_j++)
				256	{
				257	// Clear the accumulator
				258	v[out_i][out_j] = static_cast<float16_t>(0);
				259
				260	// Base co-ordinate
				261	const int base_i = out_i * stride_rows;
				262	const int base_j = out_j * stride_cols;
				263
				264	// Fill the accumulator
				265	for (int in_i = 0; in_i < kernel_rows; in_i++)
				266	{
				267	const int i = base_i + in_i;
				268	for (int in_j = 0; in_j < kernel_cols; in_j++)
				269	{
				270	const int j = base_j + in_j;
				271	v[out_i][out_j] += w[in_i][in_j] * u[i][j];
				272	}
				273	}
				274	}
				275	}
				276
				277	// Store the output tile
				278	for (int i = 0; i < out_cells_i; i++)
				279	{
				280	float16_t* const outptr_row = outptr_base + i*out_row_stride;
				281	for (int j = 0; j < out_cells_j; j++)
				282	{
				283	(outptr_row + jout_col_stride) = v[i][j];
				284	}
				285	}
				286	outptr_base++;
				287	}
				288	}
				289	} // namespace depthwise
				290	#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC