/*
 * Copyright (c) 2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernels.h"
#include "arm_compute/core/utils/misc/Cast.h"

namespace arm_compute
{
namespace tuners
{
namespace
{
/** Tunes a @ref CLDirectConvolutionLayerKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */
void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k)
{
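    // Start from the kernel's current LWS hint; it is overridden below only for
    // the tuned Bifrost configurations.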
    cl::NDRange lws_hint = k.lws_hint();

    const GPUTarget    gpu_target    = k.get_target();
    const DataType     dt            = k._input->info()->data_type();
    const TensorShape  weights_shape = k._weights->info()->tensor_shape();
    const TensorShape  inputs_shape  = k._input->info()->tensor_shape();
    const size_t       kernel_size   = weights_shape.x();
    const unsigned int stride_x      = k._conv_stride_x;
    const unsigned int stride_y      = k._conv_stride_y;

    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32))
    {
        // Through extensive experimentation with over 30 representative tensor
        // shapes, we found a small number of local work size configurations
        // that result in nearly optimal execution times. Selecting the right
        // lws for a given shape, however, required a complex decision tree,
        // until we constructed a simple feature as described below.
        //
        // We started from the number of multiply-accumulate operations for a
        // convolution layer, which is equal to the product of the input
        // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
        // this resulted in ties between distinct shapes that required distinct
        // lws configurations. Replacing the width of the input with the kernel
        // size, however, resulted in nearly optimal predictions. We use underscores
        // in variable names to indicate when they are intentionally misleading.
        const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2];
        const size_t product_of_input_dimensions_  = inputs_shape[0] * inputs_shape[1] * inputs_shape[2];
        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
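        // Worked example (illustrative shape, not from the tuning set): a 3x3x64
        // kernel over a 56x56x64 input gives mega_ops_ = 1e-6 * (3*3*64) * (56*56*64)
        // ~= 115.6, so the switch below selects lws_hint = cl::NDRange(2, 1, 6).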

        switch(kernel_size)
        {
            case 1:
            {
                if(mega_ops_ < 1.f)
                {
                    lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 7.f)
                {
                    lws_hint = cl::NDRange(1, 1, 4);
                }
                else
                {
                    lws_hint = cl::NDRange(1, 1, 2);
                }
                break;
            }
            case 3:
            {
                if(mega_ops_ < 1.f)
                {
                    lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 13.f)
                {
                    lws_hint = cl::NDRange(2, 1, 4);
                }
                else if(mega_ops_ < 50.f)
                {
                    lws_hint = cl::NDRange(3, 1, 4);
                }
                else
                {
                    lws_hint = cl::NDRange(2, 1, 6);
                }
                break;
            }
            case 5:
            {
                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
                {
                    lws_hint = cl::NDRange(2, 1, 4);
                }
                else
                {
                    lws_hint = cl::NDRange(2, 1, 8);
                }
                break;
            }
            default:
                break;
        }
        k.set_lws_hint(lws_hint);
    }
}

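/** Tunes a @ref CLCol2ImKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */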
void tune_col2im_kernel(CLCol2ImKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size for Bifrost with a value obtained
    // via exhaustive autotuning over 30 representative tensor shapes.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT))
    {
        if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14))
        {
            lws_hint = cl::NDRange(1, 7, 1);
        }
        else
        {
            lws_hint = cl::NDRange(1, 8, 1);
        }
    }

    k.set_lws_hint(lws_hint);
}

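/** Tunes a @ref CLIm2ColKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */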
void tune_im2col_kernel(CLIm2ColKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Local work size optimized for the 11x11 AlexNet convolution on Bifrost.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT)
       && k._kernel_dims.width == 11)
    {
        const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height);
        if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding())
        {
            lws_hint = cl::NDRange(1, 1, 1);
        }
    }
    k.set_lws_hint(lws_hint);
}

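/** Tunes a @ref CLDepthwiseIm2ColKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */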
void tune_depthwise_im2col_kernel(CLDepthwiseIm2ColKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size for Bifrost with a value obtained
    // via exhaustive autotuning for the MobileNets tensor shapes.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT))
    {
        lws_hint = cl::NDRange(1, 2, 1);
    }

    k.set_lws_hint(lws_hint);
}

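/** Tunes a @ref CLGEMMMatrixVectorMultiplyKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */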
void tune_gemv_kernel(CLGEMMMatrixVectorMultiplyKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size for Bifrost with a value obtained
    // via exhaustive autotuning for the MobileNets tensor shapes.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT))
    {
        lws_hint = cl::NDRange(1, 1, 1);
    }

    k.set_lws_hint(lws_hint);
}

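/** Tunes a @ref CLGEMMMatrixMultiplyKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */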
void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure LWS hint
    switch(gpu_target)
    {
        case GPUTarget::G71:
        case GPUTarget::G72:
        case GPUTarget::G51:
        case GPUTarget::G51BIG:
        case GPUTarget::G51LIT:
        case GPUTarget::G52:
        case GPUTarget::G52LIT:
        case GPUTarget::G76:
            if(k._input1->info()->dimension(1) == 24)
            {
                // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
                lws_hint = cl::NDRange(2, 2);
            }
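            // Presumably tuned for outputs with 196 (= 14 * 14) rows, i.e. flattened
            // 14x14 feature maps; the exact benchmarked shapes are an assumption.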
            else if(k._output->info()->dimension(1) == 196)
            {
                lws_hint = cl::NDRange(1, 7);
            }
            else
            {
                lws_hint = cl::NDRange(8, 8);
            }
            break;
        default:
            lws_hint = cl::NullRange;
    }

    k.set_lws_hint(lws_hint);
}

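/** Tunes a @ref CLPoolingLayerKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */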
void tune_pooling_kernel(CLPoolingLayerKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size (hint) from the first two dimensions of the global work size.
    // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
    // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
    // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
    if(k._input->info()->data_layout() == DataLayout::NCHW)
    {
        if(gpu_target_is_in(gpu_target,
                            GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                            GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                            GPUTarget::G52, GPUTarget::G52LIT))
        {
            cl::NDRange gws = ICLKernel::gws_from_window(k.window());
            lws_hint        = cl::NDRange(gws[0], gws[1], 1);
        }
    }

    k.set_lws_hint(lws_hint);
}

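/** Tunes a @ref CLScaleKernel for a Bifrost target
 *
 * @param[in] k Kernel to tune
 */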
void tune_scale_kernel(CLScaleKernel &k)
{
    cl::NDRange               lws_hint      = k.lws_hint();
    const GPUTarget           gpu_target    = k.get_target();
    const DataType            dt            = k.input()->info()->data_type();
    const InterpolationPolicy interpolation = k._interpolationPolicy;

    // Configure the local work size for Bifrost, bilinear interpolation and data type F32.
    // The values were obtained via exhaustive autotuning.
    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR))
    {
        auto dim_0 = k.output()->info()->dimension(0);
        if(dim_0 == 480)
        {
            lws_hint = cl::NDRange(2, 1);
        }
        else if(dim_0 == 3120)
        {
            lws_hint = cl::NDRange(2, 8);
        }
        else if(dim_0 == 4160)
        {
            lws_hint = cl::NDRange(4, 8);
        }
        k.set_lws_hint(lws_hint);
    }
}
} // namespace

void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
{
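    // Probe the concrete kernel type with dynamic_cast (nullptr on mismatch),
    // then hand the kernel to the matching tuner via a checked downcast.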
    if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
    {
        tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
    }
    else if(dynamic_cast<CLCol2ImKernel *>(&kernel) != nullptr)
    {
        tune_col2im_kernel(*utils::cast::polymorphic_downcast<CLCol2ImKernel *>(&kernel));
    }
    else if(dynamic_cast<CLIm2ColKernel *>(&kernel) != nullptr)
    {
        tune_im2col_kernel(*utils::cast::polymorphic_downcast<CLIm2ColKernel *>(&kernel));
    }
    else if(dynamic_cast<CLDepthwiseIm2ColKernel *>(&kernel) != nullptr)
    {
        tune_depthwise_im2col_kernel(*utils::cast::polymorphic_downcast<CLDepthwiseIm2ColKernel *>(&kernel));
    }
    else if(dynamic_cast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel) != nullptr)
    {
        tune_gemv_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel));
    }
    else if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr)
    {
        tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel));
    }
    else if(dynamic_cast<CLPoolingLayerKernel *>(&kernel) != nullptr)
    {
        tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel));
    }
    else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr)
    {
        tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel));
    }
}

void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
{
    ARM_COMPUTE_UNUSED(kernel);
}
} // namespace tuners
} // namespace arm_compute
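
// Usage sketch (illustrative, not part of this file): the tuner is installed on
// the CLScheduler before any function is configured, so that tune_kernel_static()
// runs when each kernel is configured:
//
//   #include "arm_compute/runtime/CL/CLScheduler.h"
//   #include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
//
//   arm_compute::tuners::BifrostTuner tuner;
//   arm_compute::CLScheduler::get().default_init(&tuner);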