/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"

#pragma once

using namespace neon_convolution_kernels;

namespace depthwise
{

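// This header provides the float16_t specialisation of DepthwiseConvolution,
// compiled only when the target supports FP16 vector arithmetic (see the
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC guard above). The constructor simply
// forwards to the Base class; the execute_tile() overloads below do the
// per-tile work with FP16 NEON intrinsics.
//
// Illustrative instantiation only (the concrete tile/kernel/stride shapes are
// selected elsewhere in the library):
//   DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t, float16_t>
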
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  ActivationFunction activation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
  n_batches, n_input_rows, n_input_cols, n_channels, activation,
  padding_top, padding_left, padding_bottom, padding_right
)
{
}

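// Tile execution, strided-pointer form: the input and output tiles are
// addressed as a base pointer plus row and column strides (in elements).
//
// weights_biases_ptr points at the packed parameters for the convolution.
// For each channel (or group of eight channels in the vectorised loop below)
// the bias comes first, followed by the KernelRows x KernelCols weights in
// row-major order; the loads below consume the buffer in exactly that order.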
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float16_t *input,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  float16_t *output,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Instantiate pointers
  const float16_t* __restrict__ inptr_base = input;
  float16_t* __restrict__ outptr_base = output;
  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);

  // Perform the depthwise convolution
  int channels_remaining = n_channels;
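  // Main loop: process eight channels per iteration using float16x8_t
  // vectors; inptr_base and outptr_base advance by eight channels each time.
  // The remaining (n_channels % 8) channels are handled by the scalar tail
  // loop further down.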
  for (; channels_remaining >= 8; channels_remaining -= 8)
  {
    // Load input tile
    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
      }
    }
    inptr_base += 8;

    // Load weights tile
    float16x8_t vbias = vld1q_f16(params);
    params += 8;

    float16x8_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f16(params);
        params += 8;
      }
    }

    // Perform the convolution
    float16x8_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        v[out_i][out_j] = vbias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
          }
        }

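        // Activation is a compile-time template parameter, so the branches
        // below are resolved when the kernel is instantiated.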
        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float16_t* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
      }
    }
    outptr_base += 8;
  }
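  // Scalar tail: handle the remaining (n_channels % 8) channels one at a time.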
  for (; channels_remaining; channels_remaining--)
  {
    // Load input tile
    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptr_row + j*in_col_stride);
      }
    }
    inptr_base++;

    // Load weights tile
    float16_t bias = *(params++);
    float16_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float16_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float16_t* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptr_row + j*out_col_stride) = v[i][j];
      }
    }
    outptr_base++;
  }
}

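// Tile execution, pre-computed-pointer form: instead of a base pointer plus
// strides, the caller supplies one pointer per inner-tile input element and
// per output-tile element, and n tracks the current channel offset into every
// pointer. The structure otherwise mirrors the overload above: a vectorised
// loop over groups of eight channels followed by a scalar tail.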
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Instantiate pointers
  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
  int n = 0;

  // Perform the depthwise convolution
  int channels_remaining = n_channels;
  for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
  {
    // Load input tile
    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f16(inptrs[i][j] + n);
      }
    }

    // Load weights tile
    float16x8_t vbias = vld1q_f16(params);
    params += 8;

    float16x8_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f16(params);
        params += 8;
      }
    }

    // Perform the convolution
    float16x8_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        v[out_i][out_j] = vbias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f16(outptrs[i][j] + n, v[i][j]);
      }
    }
  }
  for (; channels_remaining; channels_remaining--, n++)
  {
    // Load input tile
    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Load weights tile
    float16_t bias = *(params++);
    float16_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float16_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptrs[i][j] + n) = v[i][j];
      }
    }
  }
}

} // namespace depthwise
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC