blob: 66a858d3ed2527a6ebcdfc09b567e7d797b0cef4 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
25
Gian Marco Iodice13edbff2017-06-26 17:20:16 +010026#include "arm_compute/core/Size2D.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Validate.h"
28#include "arm_compute/runtime/CL/CLScheduler.h"
29
30#include <algorithm>
31#include <cmath>
32
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010033namespace arm_compute
34{
// Default-construct all kernels and the intermediate tensor. Both mode flags start
// as false; configure() decides whether a transpose and/or a 1xW interleave is run.
CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
{
}
39
40void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
41{
Gian Marco Iodice7d323a62017-07-05 20:05:23 +010042 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010043 ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044 ARM_COMPUTE_ERROR_ON(output == nullptr);
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010045 ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010047 const DataType data_type = input->info()->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048 const int fixed_point_position = input->info()->fixed_point_position();
49
50 _transpose_weights = transpose_weights;
51 _is_batched_fc_layer = is_batched_fc_layer;
52
53 // Check if we need to transpose the weights
54 if(_transpose_weights)
55 {
56 if(_is_batched_fc_layer)
57 {
58 // Initialize the output tensor for transpose
59 TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010060 _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010061 _transpose_kernel.configure(input, &_transpose_output);
62
63 // Configure transpose 1xW kernel
64 _transpose1xW_kernel.configure(&_transpose_output, output);
65
66 // Allocate temporary tensor used for transposing the weights
67 _transpose_output.allocator()->allocate();
68 }
69 else
70 {
71 _transpose_kernel.configure(input, output);
72 }
73 }
74 else
75 {
76 if(_is_batched_fc_layer)
77 {
78 // Configure transpose 1xW kernel
79 _transpose1xW_kernel.configure(input, output);
80 }
81 else
82 {
83 ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
84 }
85 }
86}
87
88void CLFullyConnectedLayerReshapeWeights::run()
89{
90 if(_transpose_weights)
91 {
92 CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
93 }
Moritz Pflanzer768e9f12017-08-11 15:33:30 +010094
Anthony Barbier6ff3b192017-09-04 18:44:23 +010095 if(_is_batched_fc_layer)
96 {
97 CLScheduler::get().enqueue(_transpose1xW_kernel);
98 }
99}
100
// Default-construct all kernels and intermediate tensors. The four flags are set by
// configure() to record which optional stages (weights reshape, im2col linearization,
// 4x4 interleave, bias accumulation) this instance runs.
CLFullyConnectedLayer::CLFullyConnectedLayer()
    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
      _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
{
}
106
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100107void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
108{
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100109 // With the Fully Connected layer we can have 4 different cases:
110 // 1) Convolution layer -> Fully Connected layer without batches
111 // 2) Fully Connected layer -> Fully Connected layer without batches
112 // 3) Convolution layer -> Fully Connected layer with batches
113 // 4) Fully Connected layer -> Fully Connected layer with batches
114
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100115 // Expected shape before transpose and reshaping
116 // Input: In x B (In and B can be multi-dimensional)
117 // Weights: flat(In) x Out
118 // Biases: Out
119 // Output: Out x B (B can be multi-dimensional)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100120
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100121 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
122 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
123 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100125 const DataType data_type = input->info()->data_type();
126 const int fixed_point_position = input->info()->fixed_point_position();
127 const int num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
128 const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
129 const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
130
131 _linearize_input = input->info()->tensor_shape().x() != linear_input_size;
132 _are_weights_reshaped = are_weights_reshaped;
133 _accumulate_biases = biases != nullptr;
134 _is_batched_fc_layer = num_batch_dimensions > 0;
135
136 // Check if number of batches match
137 ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1));
138 ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
139
140 const size_t interleave_width = 16 / input->info()->element_size();
141 const ICLTensor *weights_to_use = weights;
142
143 if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100145 weights_to_use = &_reshape_weights_output;
146
147 TensorShape reshaped_weights_shape(weights->info()->tensor_shape());
148
149 // Transpose weights if the user hasn't done it
150 if(transpose_weights)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100151 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100152 const size_t shape_x = reshaped_weights_shape.x();
153 reshaped_weights_shape.set(0, reshaped_weights_shape.y());
154 reshaped_weights_shape.set(1, shape_x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155 }
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100156
157 // If the we run multiple batches we need 1xW transpose, too.
158 if(_is_batched_fc_layer)
159 {
160 const float shape_x = reshaped_weights_shape.x();
161 reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width);
162 reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(shape_x / interleave_width)));
163 }
164
165 _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position));
166
167 // Reshape the weights
168 _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
169 }
170
171 // Check correct shape of weights
172 if(_is_batched_fc_layer)
173 {
174 // Transpose + Transpose1xW
175 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width);
176 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->info()->tensor_shape().x()) / interleave_width)));
177 }
178 else
179 {
180 // Transpose
181 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x());
182 ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size);
183 }
184
185 const ICLTensor *multiply_input = input;
186
187 if(_linearize_input)
188 {
189 TensorShape shape_im2col(input->info()->tensor_shape());
190 shape_im2col.collapse(num_input_dimensions);
191 _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position));
192
193 // Configure im2col kernel
194 _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
195
196 multiply_input = &_im2col_output;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197 }
198
199 if(_is_batched_fc_layer)
200 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100201 TensorShape shape_interleaved(multiply_input->info()->tensor_shape());
202 shape_interleaved.set(0, shape_interleaved.x() * 4);
203 shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
204 _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100205
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100206 // Configure interleave4x4 kernel
207 _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
208
209 multiply_input = &_interleave4x4_output;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100210 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100211
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100212 // Configure matrix multiply kernel
213 _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f);
214
215 if(_accumulate_biases)
216 {
217 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
218 ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x());
219
220 // Configure accumulate biases kernel
221 _accumulate_biases_kernel.configure(output, biases);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100222 }
223
224 // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100225 if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100226 {
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100227 // Allocate the tensor for the weights reshaped
228 _reshape_weights_output.allocator()->allocate();
229 }
230
231 if(_linearize_input)
232 {
233 _im2col_output.allocator()->allocate();
234 }
235
236 if(_is_batched_fc_layer)
237 {
238 _interleave4x4_output.allocator()->allocate();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100239 }
240}
241
242void CLFullyConnectedLayer::run()
243{
244 // Reshape of the weights (happens only once)
245 if(!_are_weights_reshaped)
246 {
247 _are_weights_reshaped = true;
248 _reshape_weights_kernel.run();
249 }
250
251 // Linearize input if it comes from a convolutional layer
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100252 if(_linearize_input)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100253 {
254 CLScheduler::get().enqueue(_im2col_kernel, false);
255 }
256
257 // Interleave input
258 if(_is_batched_fc_layer)
259 {
260 CLScheduler::get().enqueue(_interleave4x4_kernel, false);
261 }
262
263 // Run matrix multiply
264 CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
265
266 // Accumulate biases if provided
267 if(_accumulate_biases)
268 {
269 CLScheduler::get().enqueue(_accumulate_biases_kernel);
270 }
271}
Moritz Pflanzer768e9f12017-08-11 15:33:30 +0100272} // namespace arm_compute