Blame - src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp - ml/ComputeLibrary

blob: 840086f917c11b618a204b3f4eb4ecef2d4aecfd [file] [log] [blame]

Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2018 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	/*
				26	* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
				27	*
				28	* NOTE: Header to be included by implementation files only.
				29	*
				30	* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
				31	*/
				32
				33	#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
				34	#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
				35
				36	#pragma once
				37
				38	namespace depthwise
				39	{
				40	// Partial specialisation for FP32 to FP32
				41	template <int OutputTileRows, int OutputTileCols,
				42	int KernelRows, int KernelCols,
				43	int StrideRows, int StrideCols>
				44	struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
				45	{
				46	typedef DepthwiseConvolution<
				47	OutputTileRows, OutputTileCols,
				48	KernelRows, KernelCols,
				49	StrideRows, StrideCols,
				50	float, float
				51	> DWC;
				52
				53	template <
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	54	bool Specialize=false, // Specialize (or not) the method
				55	int InPadTop=0, // If specialized, top padding
				56	int InPadLeft=0, // If specialized, left padding
				57	int InPadBottom=0, // If specialized, bottom padding
				58	int InPadRight=0, // If specialized, right padding
				59	int OutPadBottom=0, // If specialized, bottom output padding
				60	int OutPadRight=0 // If specialized, bottom right padding
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	61	>
				62	static void process_tile(
				63	const int n_channels,
				64	const float* const weights,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	65	const int weight_row_stride,
				66	const int weight_col_stride,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	67	const float* const inptr,
				68	const int in_row_stride,
				69	const int in_col_stride,
				70	float* const outptr,
				71	const int out_row_stride,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	72	const int out_col_stride,
				73	const int in_pad_top=0,
				74	const int in_pad_left=0,
				75	const int in_pad_bottom=0,
				76	const int in_pad_right=0,
				77	const int out_pad_bottom=0,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	78	const int out_pad_right=0,
				79	const int input_offset=0,
				80	const int weights_offset=0
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	81	);
				82	};
				83
				84
				85	template <int OTR, int OTC, int KR, int KC, int SR, int SC>
				86	template <
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	87	bool Specialize,
				88	int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
				89	int OutPadBottom, int OutPadRight
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	90	>
				91	void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
				92	const int n_channels,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	93	const float *__restrict__ const weights,
				94	const int weight_row_stride,
				95	const int weight_col_stride,
				96	const float *__restrict__ const inptr,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	97	const int in_row_stride,
				98	const int in_col_stride,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	99	float *__restrict__ const outptr,
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	100	const int out_row_stride,
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	101	const int out_col_stride,
				102	const int _in_pad_top,
				103	const int _in_pad_left,
				104	const int _in_pad_bottom,
				105	const int _in_pad_right,
				106	const int _out_pad_bottom,
Georgios Pinitas	a799ce0	2018-09-12 20:11:34 +0100	[diff] [blame]	107	const int _out_pad_right,
				108	const int _input_offset,
				109	const int _weights_offset
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	110	)
				111	{
				112	constexpr auto inner_tile_rows = DWC::inner_tile_rows;
				113	constexpr auto inner_tile_cols = DWC::inner_tile_cols;
				114	constexpr auto kernel_rows = DWC::kernel_rows;
				115	constexpr auto kernel_cols = DWC::kernel_cols;
				116	constexpr auto output_tile_rows = DWC::output_tile_rows;
				117	constexpr auto output_tile_cols = DWC::output_tile_cols;
				118	constexpr auto stride_rows = DWC::stride_rows;
				119	constexpr auto stride_cols = DWC::stride_cols;
				120
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	121	// Extract parameters
				122	const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
				123	const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
				124	const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
				125	const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
				126	const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
				127	const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
				128
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	129	// Compute valid ranges of the tile
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	130	const int in_cells_i = inner_tile_rows - in_pad_bottom;
				131	const int in_cells_j = inner_tile_cols - in_pad_right;
				132	const int out_cells_i = output_tile_rows - out_pad_bottom;
				133	const int out_cells_j = output_tile_cols - out_pad_right;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	134
				135	// Instantiate pointers
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	136	const float* __restrict__ inptr_base = inptr;
				137	const float* __restrict__ wptr_base = weights;
				138	float* __restrict__ outptr_base = outptr;
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	139
				140	// Perform the depthwise convolution
				141	int channels_remaining = n_channels;
				142	#ifdef __aarch64__
				143	for (; channels_remaining >= 4; channels_remaining -= 4)
				144	{
				145	// Load input tile
				146	float32x4_t u[inner_tile_rows][inner_tile_cols];
				147	for (int i = 0; i < inner_tile_rows; i++)
				148	{
				149	const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
				150	for (int j = 0; j < inner_tile_cols; j++)
				151	{
				152	if (i < in_pad_top \|\| in_cells_i <= i \|\|
				153	j < in_pad_left \|\| in_cells_j <= j)
				154	{
				155	u[i][j] = vdupq_n_f32(0.0f);
				156	}
				157	else
				158	{
				159	u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
				160	}
				161	}
				162	}
				163	inptr_base += 4;
				164
				165	// Load weights tile
				166	float32x4_t w[kernel_rows][kernel_cols];
				167	for (int i = 0; i < kernel_rows; i++)
				168	{
				169	const float* const wptr_row = wptr_base + i*weight_row_stride;
				170	for (int j = 0; j < kernel_cols; j++)
				171	{
				172	w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
				173	}
				174	}
				175	wptr_base += 4;
				176
				177	// Perform the convolution
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	178	float32x4_t v[output_tile_rows][output_tile_cols];
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	179	for (int out_i = 0; out_i < out_cells_i; out_i++)
				180	{
				181	for (int out_j = 0; out_j < out_cells_j; out_j++)
				182	{
				183	// Base co-ordinate
				184	const int base_i = out_i * stride_rows;
				185	const int base_j = out_j * stride_cols;
				186
				187	// Fill the accumulator
				188	for (int in_i = 0; in_i < kernel_rows; in_i++)
				189	{
				190	const int i = base_i + in_i;
				191	for (int in_j = 0; in_j < kernel_cols; in_j++)
				192	{
				193	const int j = base_j + in_j;
				194	if (in_i == 0 && in_j == 0)
				195	{
				196	// v[out_i][out_j] = w[in_i][in_j] * u[i][j];
				197	v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
				198	}
				199	else
				200	{
				201	// v[out_i][out_j] += w[in_i][in_j] * u[i][j];
				202	v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
				203	}
				204	}
				205	}
				206	}
				207	}
				208
				209	// Store the output tile
				210	for (int i = 0; i < out_cells_i; i++)
				211	{
				212	float* const outptr_row = outptr_base + i*out_row_stride;
				213	for (int j = 0; j < out_cells_j; j++)
				214	{
				215	vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
				216	}
				217	}
				218	outptr_base += 4;
				219	}
				220	#endif // __aarch64__
				221	for (; channels_remaining; channels_remaining--)
				222	{
				223	// Load input tile
				224	float u[inner_tile_rows][inner_tile_cols];
				225	for (int i = 0; i < inner_tile_rows; i++)
				226	{
				227	const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
				228	for (int j = 0; j < inner_tile_cols; j++)
				229	{
				230	if (i < in_pad_top \|\| in_cells_i <= i \|\|
				231	j < in_pad_left \|\| in_cells_j <= j)
				232	{
				233	u[i][j] = static_cast<float>(0);
				234	}
				235	else
				236	{
				237	u[i][j] = (inptr_row + (j - in_pad_left)in_col_stride);
				238	}
				239	}
				240	}
				241	inptr_base++;
				242
				243	// Load weights tile
				244	float w[kernel_rows][kernel_cols];
				245	for (int i = 0; i < kernel_rows; i++)
				246	{
				247	const float* const wptr_row = wptr_base + i*weight_row_stride;
				248	for (int j = 0; j < kernel_cols; j++)
				249	{
				250	w[i][j] = (wptr_row + jweight_col_stride);
				251	}
				252	}
				253	wptr_base++;
				254
				255	// Perform the convolution
Georgios Pinitas	be0ae93	2018-03-13 13:08:12 +0000	[diff] [blame]	256	float v[output_tile_rows][output_tile_cols];
Georgios Pinitas	4074c99	2018-01-30 18:13:46 +0000	[diff] [blame]	257	for (int out_i = 0; out_i < out_cells_i; out_i++)
				258	{
				259	for (int out_j = 0; out_j < out_cells_j; out_j++)
				260	{
				261	// Clear the accumulator
				262	v[out_i][out_j] = static_cast<float>(0);
				263
				264	// Base co-ordinate
				265	const int base_i = out_i * stride_rows;
				266	const int base_j = out_j * stride_cols;
				267
				268	// Fill the accumulator
				269	for (int in_i = 0; in_i < kernel_rows; in_i++)
				270	{
				271	const int i = base_i + in_i;
				272	for (int in_j = 0; in_j < kernel_cols; in_j++)
				273	{
				274	const int j = base_j + in_j;
				275	v[out_i][out_j] += w[in_i][in_j] * u[i][j];
				276	}
				277	}
				278	}
				279	}
				280
				281	// Store the output tile
				282	for (int i = 0; i < out_cells_i; i++)
				283	{
				284	float* const outptr_row = outptr_base + i*out_row_stride;
				285	for (int j = 0; j < out_cells_j; j++)
				286	{
				287	(outptr_row + jout_col_stride) = v[i][j];
				288	}
				289	}
				290	outptr_base++;
				291	}
				292	}
				293
				294	} // namespace depthwise