/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
Georgios Pinitas30271c72019-06-24 14:56:34 +010033#include "arm.hpp"
34#include "impl_base.hpp"
Georgios Pinitas4074c992018-01-30 18:13:46 +000035
36#pragma once
37
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000038using namespace neon_convolution_kernels;
39
Georgios Pinitas4074c992018-01-30 18:13:46 +000040namespace depthwise
41{
Georgios Pinitas4074c992018-01-30 18:13:46 +000042
Georgios Pinitas4074c992018-01-30 18:13:46 +000043template <
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000044 unsigned int OutputTileRows, unsigned int OutputTileCols,
45 unsigned int KernelRows, unsigned int KernelCols,
46 unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas4074c992018-01-30 18:13:46 +000047>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000048DepthwiseConvolution<
49 OutputTileRows, OutputTileCols,
50 KernelRows, KernelCols, StrideRows, StrideCols,
51 float, float, float
52>::DepthwiseConvolution(
53 int n_batches, int n_input_rows, int n_input_cols, int n_channels,
54 ActivationFunction activation,
55 unsigned int padding_top,
56 unsigned int padding_left,
57 unsigned int padding_bottom,
58 unsigned int padding_right
59) : Base(
60 n_batches, n_input_rows, n_input_cols, n_channels, activation,
61 padding_top, padding_left, padding_bottom, padding_right
62 )
63{
64}
65
Georgios Pinitas30271c72019-06-24 14:56:34 +010066template <
67 unsigned int OutputTileRows, unsigned int OutputTileCols,
68 unsigned int KernelRows, unsigned int KernelCols,
69 unsigned int StrideRows, unsigned int StrideCols
70>
71DepthwiseConvolution<
72 OutputTileRows, OutputTileCols,
73 KernelRows, KernelCols, StrideRows, StrideCols,
74 float, float, float
75>::DepthwiseConvolution(
76 int n_batches, int n_input_rows, int n_input_cols, int n_channels,
77 int n_output_rows, int n_output_cols,
78 ActivationFunction activation,
79 unsigned int padding_top,
80 unsigned int padding_left,
81 unsigned int padding_bottom,
82 unsigned int padding_right
83) : Base(
84 n_batches, n_input_rows, n_input_cols, n_channels,
85 n_output_rows, n_output_cols, activation,
86 padding_top, padding_left, padding_bottom, padding_right
87 )
88{
89}
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000090
/*
 * Compute one output tile of the FP32 depthwise convolution, addressing the
 * input/output through a base pointer plus row/column strides.
 *
 * Channels are processed four at a time with NEON vectors, then a scalar
 * loop handles the remaining (n_channels % 4) channels.  The data is laid
 * out channel-innermost: each vector load reads four consecutive floats
 * (four channels) and the base pointers advance by 4 (vector loop) or 1
 * (scalar loop) per iteration, while rows/columns are reached via the
 * strides (which are in elements, not bytes, since arithmetic is on float*).
 *
 * The parameter blob at `weights_biases_ptr` is consumed strictly
 * sequentially: for each group of four channels, 4 bias values followed by
 * KernelRows*KernelCols groups of 4 weights; for each tail channel, 1 bias
 * followed by KernelRows*KernelCols weights.  Do not reorder the loads —
 * `params` is bumped as it is read.
 *
 * `Activation` is a compile-time template parameter; the ReLU/ReLU6
 * clamping below branches only on that constant.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float, float, float
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float *input,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  float *output,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Instantiate pointers.  __restrict__ promises no aliasing between the
  // input, output and parameter streams, enabling better code generation.
  const float* __restrict__ inptr_base = input;
  float* __restrict__ outptr_base = output;
  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);

  // Perform the depthwise convolution, four channels per iteration.
  int channels_remaining = n_channels;
  for (; channels_remaining >= 4; channels_remaining -= 4)
  {
    // Load the full input tile for these four channels.
    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
      }
    }
    inptr_base += 4;  // advance to the next group of four channels

    // Load weights tile: 4 biases first, then the 4-wide kernel weights.
    float32x4_t vbias = vld1q_f32(params);
    params += 4;

    float32x4_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f32(params);
        params += 4;
      }
    }

    // Perform the convolution for every position in the output tile.
    float32x4_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Start the accumulator from the bias.
        v[out_i][out_j] = vbias;

        // Base co-ordinate of this output position within the input tile.
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator: multiply-accumulate over the kernel window.
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
          }
        }

        // Apply the activation function (compile-time selected).
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
        }
      }
    }

    // Store the output tile for these four channels.
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
      }
    }
    outptr_base += 4;
  }
  // Scalar tail: same algorithm, one channel at a time.
  for (; channels_remaining; channels_remaining--)
  {
    // Load input tile
    float u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptr_row + j*in_col_stride);
      }
    }
    inptr_base++;

    // Load weights tile: one bias, then KernelRows*KernelCols weights.
    float bias = *(params++);
    float w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator (initialise with the bias).
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptr_row + j*out_col_stride) = v[i][j];
      }
    }
    outptr_base++;
  }
}
270
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100271
/*
 * Compute one output tile of the FP32 depthwise convolution, addressing the
 * input/output through per-position pointer arrays instead of base+stride:
 * inptrs[i][j] points at the channel-0 element of input position (i, j),
 * and outptrs[i][j] likewise for the output tile.  The running channel
 * offset `n` is simply added to every pointer, so the data must be
 * channel-innermost (consecutive floats are consecutive channels).
 *
 * As in the strided overload, channels are processed four at a time with
 * NEON vectors and a scalar loop handles the (n_channels % 4) tail.  The
 * parameter blob is consumed strictly sequentially: per four-channel group,
 * 4 biases then KernelRows*KernelCols groups of 4 weights; per tail
 * channel, 1 bias then KernelRows*KernelCols weights.
 *
 * `Activation` is a compile-time template parameter; the ReLU/ReLU6
 * clamping below branches only on that constant.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float, float, float
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Weights/biases are read strictly in order; `params` is bumped as it goes.
  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);

  // Perform the depthwise convolution, four channels per iteration.
  // `n` tracks the current channel offset applied to every tile pointer.
  int channels_remaining = n_channels;
  int n = 0;
  for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
  {
    // Load the full input tile for these four channels.
    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f32(inptrs[i][j] + n);
      }
    }

    // Load weights tile: 4 biases first, then the 4-wide kernel weights.
    float32x4_t vbias = vld1q_f32(params);
    params += 4;

    float32x4_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f32(params);
        params += 4;
      }
    }

    // Perform the convolution for every position in the output tile.
    float32x4_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Start the accumulator from the bias.
        v[out_i][out_j] = vbias;

        // Base co-ordinate of this output position within the input tile.
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator: multiply-accumulate over the kernel window.
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
          }
        }

        // Apply the activation function (compile-time selected).
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
        }
      }
    }

    // Store the output tile for these four channels.
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f32(outptrs[i][j] + n, v[i][j]);
      }
    }
  }
  // Scalar tail: same algorithm, one channel at a time.
  for (; channels_remaining; channels_remaining--, n++)
  {
    // Load input tile
    float u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Load weights tile: one bias, then KernelRows*KernelCols weights.
    float bias = *(params++);
    float w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator (initialise with the bias).
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptrs[i][j] + n) = v[i][j];
      }
    }
  }
}
437
Georgios Pinitas4074c992018-01-30 18:13:46 +0000438} // namespace depthwise