Blame - src/core/NEON/kernels/NEWinogradLayerKernel.cpp - ml/ComputeLibrary

blob: b2e44f8e09eb1b1268f28bc08b120f7bb7cd49bf [file] [log] [blame]

Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	1	/*
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	2	* Copyright (c) 2017-2018 ARM Limited.
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
				29	#include "arm_compute/core/TensorInfo.h"
Pablo Tello	3d4968a	2017-12-04 15:03:35 +0000	[diff] [blame]	30	#include "support/ToolchainSupport.h"
				31
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	32	namespace arm_compute
				33	{
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	34	//Batched Gemms
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	35	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				36	NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerBatchedGEMMKernel()
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	37	: _gemms()
Pablo Tello	3d4968a	2017-12-04 15:03:35 +0000	[diff] [blame]	38	{
				39	}
				40
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	41	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				42	void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	43	const unsigned int n_gemms,
				44	const int M, const int K, const int N,
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	45	const int a_matrix_stride,
				46	const int a_row_stride,
				47	const int b_matrix_stride,
				48	const int b_row_stride,
				49	const int c_matrix_stride,
				50	const int c_row_stride,
				51	const TIn *const a_ptr,
				52	const TIn *const b_ptr,
				53	TOut *const c_ptr)
Pablo Tello	3d4968a	2017-12-04 15:03:35 +0000	[diff] [blame]	54	{
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	55	_gemms = support::cpp14::make_unique<MultiGEMM>(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr);
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	56	Window win;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	57	auto win_last = _gemms->get_window();
Pablo Tello	9ceebbe	2018-01-10 16:44:13 +0000	[diff] [blame]	58	win.set(Window::DimX, Window::Dimension(0, win_last, 1));
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	59	INEKernel::configure(win);
				60	}
				61
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	62	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				63	void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	64	{
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	65	ARM_COMPUTE_UNUSED(info);
				66	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Pablo Tello	02541fb	2017-12-15 09:48:59 +0000	[diff] [blame]	67	const size_t first_gemm = window.x().start();
				68	const size_t last_gemm = window.x().end();
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	69	_gemms->run(first_gemm, last_gemm);
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	70	}
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	71
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	72	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				73	unsigned int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_gemms() const
				74	{
				75	return WinogradBase::N_GEMMS;
				76	}
				77
				78	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				79	int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_rows() const
				80	{
				81	return _output_tile_rows;
				82	}
				83
				84	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				85	int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_cols() const
				86	{
				87	return _output_tile_cols;
				88	}
				89
				90	template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				91	int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_blocks() const
				92	{
				93	return WinogradConv::N_BLOCK;
				94	}
				95
				96	template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>;
				97	template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>;
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	98
				99	// Weights transform
				100
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	101	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				102	unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int n_output_channels, int n_input_channels) const
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	103	{
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	104	const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels);
				105	return static_cast<unsigned int>(
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	106	// WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
				107	WinogradConv::get_kernel_storage_size(shape) / sizeof(T));
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	108	}
				109
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	110	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				111	NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	112	: _transform()
				113	{
				114	}
				115
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	116	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				117	int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(const KernelShape &kernel_shape) const
				118	{
				119	return WinogradConv::get_kernel_matrix_stride(kernel_shape);
				120	}
				121
				122	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				123	void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	124	const ITensor *weights_hwio,
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	125	T *const output,
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	126	const int matrix_stride, /** Stride across matrices in the output. */
				127	const int n_output_channels, /** Number of filters. */
				128	const int n_input_channels) /** Number of channels in each filter. */
				129	{
				130	const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK);
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	131	_transform = support::cpp14::make_unique<WeightsTransform>(reinterpret_cast<T *>(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels,
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	132	n_input_channels);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	133	Window win;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	134	auto win_last = _transform->get_window();
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	135	win.set(Window::DimX, Window::Dimension(0, win_last, 1));
				136	INEKernel::configure(win);
				137	}
				138
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	139	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				140	void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	141	{
				142	ARM_COMPUTE_UNUSED(info);
				143	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				144	const size_t fst = window.x().start();
				145	const size_t lst = window.x().end();
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	146	_transform->run(fst, lst);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	147	}
				148
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	149	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				150	bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	151	{
				152	return false;
				153	}
				154
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	155	template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
				156	template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	157
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	158	// Input transform
				159
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	160	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				161	unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	162	int n_batches, /** Number of batches in the input tensor. */
				163	int n_channels, /** Number of feature maps in the input tensor. */
				164	int n_rows, /** Number of rows in each feature map. */
				165	int n_cols, /** Number of columns in each feature map. */
				166	bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	167	) const
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	168	{
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	169	// Construct shapes for the input and kernel tensors.
				170	const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
				171	const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels);
				172	const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
				173	// Return the size, converted into units of TIn
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	174	return static_cast<unsigned int>(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T));
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	175	}
				176
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	177	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				178	int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
				179	const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
				180	{
				181	return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type);
				182	}
				183
				184	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				185	NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	186	: _transform()
				187	{
				188	}
				189
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	190	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				191	void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
				192	const T const input, /* Input tensor data */
				193	const int n_batches, /** Number of batches in input tensor. */
				194	const int n_rows, /** Number of rows in input tensor. */
				195	const int n_cols, /** Number of columns in input tensor. */
				196	const int n_channels, /** Number of channels in input tensor. */
				197	const PaddingType padding, /** Padding type. */
				198	T const output, /* Base of output matrices. */
				199	const int matrix_stride) /** Stride between output matrices. */
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	200	{
				201	// _input_matrix_row_stride(n_input_channels),
				202	_transform = support::cpp14::make_unique<InputTransform>(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	203	Window win;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	204	auto win_last = _transform->get_window();
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	205	win.set(Window::DimX, Window::Dimension(0, win_last, 1));
				206	INEKernel::configure(win);
				207	}
				208
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	209	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				210	void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	211	{
				212	ARM_COMPUTE_UNUSED(info);
				213	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				214	const size_t fst = window.x().start();
				215	const size_t lst = window.x().end();
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	216	_transform->run(fst, lst);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	217	}
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	218
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	219	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				220	bool NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	221	{
				222	return false;
				223	}
				224
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	225	template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
				226	template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	227
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	228	// Output transform
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	229
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	230	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				231	unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	232	int n_batches, /** Number of batches in the output tensor. */
				233	int n_rows, /** Number of rows in each feature map of the input tensor. */
				234	int n_cols, /** Number of columns in each feature map of the input tensor. */
				235	int n_output_channels, /** Number of feature maps in the output tensor. */
				236	bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	237	) const
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	238	{
				239	// Construct shapes for the input and kernel tensors.
				240	const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
				241	const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1);
				242	const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
				243
				244	// Return the size, converted into units of TOut
				245	return static_cast<unsigned int>(
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	246	WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T));
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	247	}
				248
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	249	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				250	NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	251	: _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0)
				252	{
				253	}
				254
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	255	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				256	int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
				257	const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
				258	{
				259	return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type);
				260	}
				261	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				262	Tensor4DShape NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
				263	const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const
				264	{
				265	return WinogradConv::get_output_shape(kernel_shape, in_shape, padding);
				266	}
				267
				268	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				269	void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
				270	const ITensor *biases,
				271	const T *const output_workingspace,
				272	const int matrix_stride,
				273	T *const output,
				274	const int n_batches,
				275	const int n_rows,
				276	const int n_cols,
				277	const int n_channels)
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	278	{
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	279	_biases = biases;
				280	_output_workspace = output_workingspace;
				281	_matrix_stride = matrix_stride;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	282	_matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK);
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	283	_output = output;
				284	_n_batches = n_batches;
				285	_n_rows = n_rows;
				286	_n_cols = n_cols;
				287	_n_channels = n_channels;
				288
				289	// We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window
				290	OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels);
				291	Window win;
				292	auto win_last = output_transform.get_window();
				293	win.set(Window::DimX, Window::Dimension(0, win_last, 1));
				294	INEKernel::configure(win);
				295	}
				296
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	297	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				298	void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	299	{
				300	ARM_COMPUTE_UNUSED(info);
				301	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				302	ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer());
				303	ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
				304	ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
				305
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	306	OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	307	reinterpret_cast<T *>(_biases->buffer()), _output,
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	308	_n_batches, _n_rows, _n_cols, _n_channels);
				309
				310	// The code below cannot be moved to configure because biases hasn't been allocated at that point
				311	const size_t fst = window.x().start();
				312	const size_t lst = window.x().end();
				313	output_transform.run(fst, lst);
				314	}
				315
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	316	template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
				317	bool NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
Pablo Tello	d6ca478	2018-01-23 09:36:04 +0000	[diff] [blame]	318	{
				319	return false;
				320	}
				321
Pablo Tello	f6c572c	2018-02-14 12:47:30 +0000	[diff] [blame^]	322	template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
				323	template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
Pablo Tello	52140b4	2018-01-30 14:48:11 +0000	[diff] [blame]	324
Pablo Tello	8951933	2017-11-17 11:52:36 +0000	[diff] [blame]	325	} // namespace arm_compute