Blame - src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp - ml/ComputeLibrary

blob: 2b7b391c432deb9609f468d3dae79687413a7108 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
				25
				26	#include "arm_compute/core/AccessWindowTranspose.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/IAccessWindow.h"
				30	#include "arm_compute/core/ITensor.h"
				31	#include "arm_compute/core/NEON/NEFixedPoint.h"
				32	#include "arm_compute/core/TensorInfo.h"
				33	#include "arm_compute/core/Types.h"
				34	#include "arm_compute/core/Utils.h"
				35	#include "arm_compute/core/Validate.h"
				36	#include "arm_compute/core/Window.h"
				37
				38	#include <arm_neon.h>
				39	#include <cstddef>
				40	#include <cstdint>
				41	#include <tuple>
				42
				43	using namespace arm_compute;
				44
				45	namespace arm_compute
				46	{
				47	class Coordinates;
				48	} // namespace arm_compute
				49
				50	namespace
				51	{
Pablo Tello	afde732	2017-07-25 09:19:46 +0100	[diff] [blame^]	52	void vector_matrix_multiply_f16(const ITensor input0, const ITensor input1, ITensor *output, const Window &window)
				53	{
				54	#ifdef ARM_COMPUTE_ENABLE_FP16
				55	const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
				56	const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
				57	const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
				58
				59	// The implementation computes 16 elements per iteration
				60	const int window_start_x = 16 * window.thread_id();
				61	const int window_step_x = 16 * window.num_threads();
				62	// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
				63	const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
				64
				65	Window win_out(window);
				66	win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
				67
				68	Window win_a(window);
				69	win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
				70
				71	Iterator ina(input0, win_a);
				72	Iterator out(output, win_out);
				73
				74	execute_window_loop(win_out, [&](const Coordinates & id)
				75	{
				76	if(id.x() > width_matrix_b)
				77	{
				78	return;
				79	}
				80
				81	float16x8_t acc0 = vdupq_n_f16(0.f);
				82	float16x8_t acc1 = vdupq_n_f16(0.f);
				83	float16x8_t acc2 = vdupq_n_f16(0.f);
				84	float16x8_t acc3 = vdupq_n_f16(0.f);
				85
				86	auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
				87	auto matrix_b = reinterpret_cast<const float16_t *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
				88
				89	const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
				90
				91	for(; vec_a <= (vec_a_end_addr - 4);)
				92	{
				93	const float16x4_t a0l = vld1_f16(vec_a);
				94
				95	float16x8_t b00 = vld1q_f16(matrix_b);
				96	float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
				97	float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
				98	float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
				99
				100	float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
				101	float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
				102	float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
				103	float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
				104
				105	acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
				106	acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
				107	acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
				108	acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
				109	acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
				110	acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
				111	acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
				112	acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
				113
				114	matrix_b += 2 * in_b_stride;
				115
				116	b00 = vld1q_f16(matrix_b);
				117	b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
				118	b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
				119	b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
				120	b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
				121	b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
				122	b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
				123	b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
				124
				125	acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
				126	acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
				127	acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
				128	acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
				129	acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
				130	acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
				131	acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
				132	acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
				133
				134	vec_a += 4;
				135	matrix_b += 2 * in_b_stride;
				136	}
				137
				138	for(; vec_a < vec_a_end_addr;)
				139	{
				140	const float16_t a0 = *vec_a;
				141	const float16x8_t b00 = vld1q_f16(matrix_b);
				142	const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
				143	const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
				144	const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
				145
				146	acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
				147	acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
				148	acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
				149	acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
				150
				151	vec_a += 1;
				152	matrix_b += in_b_stride;
				153	}
				154
				155	const auto vec_out = reinterpret_cast<float16_t *>(out.ptr());
				156
				157	vst1q_f16(vec_out + 0, acc0);
				158	vst1q_f16(vec_out + 8, acc1);
				159	vst1q_f16(vec_out + 16, acc2);
				160	vst1q_f16(vec_out + 24, acc3);
				161	},
				162	ina, out);
				163	#else /* ARM_COMPUTE_ENABLE_FP16 */
				164	ARM_COMPUTE_UNUSED(input0);
				165	ARM_COMPUTE_UNUSED(input1);
				166	ARM_COMPUTE_UNUSED(output);
				167	ARM_COMPUTE_UNUSED(window);
				168	ARM_COMPUTE_ERROR("Not supported, recompile with -march=armv8.2-a+fp16+simd.");
				169	#endif /* ARM_COMPUTE_ENABLE_FP16 */
				170	}
				171
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	172	void vector_matrix_multiply_f32(const ITensor input0, const ITensor input1, ITensor *output, const Window &window)
				173	{
				174	const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
				175	const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
				176	const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
				177
				178	// The implementation computes 16 elements per iteration
				179	const int window_start_x = 16 * window.thread_id();
				180	const int window_step_x = 16 * window.num_threads();
				181	// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
				182	const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
				183
				184	Window win_out(window);
				185	win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
				186
				187	Window win_a(window);
				188	win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
				189
				190	Iterator ina(input0, win_a);
				191	Iterator out(output, win_out);
				192
				193	execute_window_loop(win_out, [&](const Coordinates & id)
				194	{
				195	if(id.x() > width_matrix_b)
				196	{
				197	return;
				198	}
				199
				200	float32x4_t acc0 = vdupq_n_f32(0.f);
				201	float32x4_t acc1 = vdupq_n_f32(0.f);
				202	float32x4_t acc2 = vdupq_n_f32(0.f);
				203	float32x4_t acc3 = vdupq_n_f32(0.f);
				204
				205	auto vec_a = reinterpret_cast<const float *>(ina.ptr());
				206	auto matrix_b = reinterpret_cast<const float *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
				207
				208	#if __arm__
				209	asm volatile("PLD [%0, #1284]" ::"r"(reinterpret_cast<const uint8_t >(vec_a)));
				210	asm volatile("PLD [%0, #1284]" ::"r"(reinterpret_cast<const uint8_t >(matrix_b)));
				211	asm volatile("PLD [%0, #1284]" ::"r"(reinterpret_cast<const uint8_t >(matrix_b + in_b_stride)));
Anthony Barbier	ac69aa1	2017-07-03 17:39:37 +0100	[diff] [blame]	212	#endif /* __arm__ */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	213
				214	const float *vec_a_end_addr = vec_a + num_elems_vec_a;
				215
				216	for(; vec_a <= (vec_a_end_addr - 4);)
				217	{
				218	float32x2_t a0l = vld1_f32(vec_a);
				219
				220	float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
				221	float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
				222	float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
				223	float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
				224
				225	float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
				226	float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
				227	float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
				228	float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
				229
				230	#if __arm__
				231	asm volatile("PLD [%0, #1284]" ::"r"(reinterpret_cast<const uint8_t >(vec_a)));
				232	asm volatile("PLD [%0, #1281]" ::"r"(reinterpret_cast<const uint8_t >(matrix_b + 1 * in_b_stride)));
				233	asm volatile("PLD [%0, #1281]" ::"r"(reinterpret_cast<const uint8_t >(matrix_b + 2 * in_b_stride)));
				234	asm volatile("PLD [%0, #1281]" ::"r"(reinterpret_cast<const uint8_t >(matrix_b + 3 * in_b_stride)));
				235	asm volatile("PLD [%0, #1281]" ::"r"(reinterpret_cast<const uint8_t >(matrix_b + 4 * in_b_stride)));
Anthony Barbier	ac69aa1	2017-07-03 17:39:37 +0100	[diff] [blame]	236	#endif /* __arm __ */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	237
				238	acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
				239	acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
				240	acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
				241	acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
				242
				243	acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
				244	acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
				245	acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
				246	acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
				247
				248	vec_a += 2;
				249	matrix_b += 2 * in_b_stride;
				250
				251	a0l = vld1_f32(vec_a);
				252
				253	b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
				254	b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
				255	b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
				256	b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
				257
				258	b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
				259	b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
				260	b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
				261	b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
				262
				263	acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
				264	acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
				265	acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
				266	acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
				267
				268	acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
				269	acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
				270	acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
				271	acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
				272
				273	vec_a += 2;
				274	matrix_b += 2 * in_b_stride;
				275	}
				276
				277	for(; vec_a < vec_a_end_addr;)
				278	{
				279	const float a0 = *vec_a;
				280
				281	const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
				282	const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
				283	const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
				284	const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
				285
				286	acc0 = vmlaq_n_f32(acc0, b00, a0);
				287	acc1 = vmlaq_n_f32(acc1, b01, a0);
				288	acc2 = vmlaq_n_f32(acc2, b02, a0);
				289	acc3 = vmlaq_n_f32(acc3, b03, a0);
				290
				291	vec_a += 1;
				292	matrix_b += in_b_stride;
				293	}
				294
				295	const auto vec_out = reinterpret_cast<float *>(out.ptr());
				296
				297	vst1q_f32(vec_out + 0, acc0);
				298	vst1q_f32(vec_out + 4, acc1);
				299	vst1q_f32(vec_out + 8, acc2);
				300	vst1q_f32(vec_out + 12, acc3);
				301	},
				302	ina, out);
				303	}
				304	} // namespace
				305
				306	NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
				307	: _input0(nullptr), _input1(nullptr), _output(nullptr)
				308	{
				309	}
				310
				311	void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor input0, const ITensor input1, ITensor *output)
				312	{
Pablo Tello	afde732	2017-07-25 09:19:46 +0100	[diff] [blame^]	313	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
				314	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
				315	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
				316	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	317	ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
				318
				319	_input0 = input0;
				320	_input1 = input1;
				321	_output = output;
				322
Pablo Tello	afde732	2017-07-25 09:19:46 +0100	[diff] [blame^]	323	const unsigned int num_elems_processed_per_iteration_x = 16;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	324
				325	// Configure kernel window
				326	Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
				327
				328	AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
				329
				330	update_window_and_padding(win,
				331	AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
				332	AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
				333	output_access);
				334
				335	output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
				336
				337	INEKernel::configure(win);
				338	}
				339
				340	void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
				341	{
				342	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				343	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				344
Pablo Tello	afde732	2017-07-25 09:19:46 +0100	[diff] [blame^]	345	switch(_input0->info()->data_type())
				346	{
				347	case DataType::F16:
				348	{
				349	vector_matrix_multiply_f16(_input0, _input1, _output, window);
				350	break;
				351	}
				352	case DataType::F32:
				353	{
				354	vector_matrix_multiply_f32(_input0, _input1, _output, window);
				355	break;
				356	}
				357	default:
				358	{
				359	ARM_COMPUTE_ERROR("Data type not supported");
				360	break;
				361	}
				362	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	363	}