Blame - src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp - ml/ComputeLibrary

blob: 870de64eb871b2f5025883ef8bd82e1f9c202841 [file] [log] [blame]

SiCong Li	f44bbc5	2022-08-29 18:25:51 +0100	[diff] [blame^]	1	/*
				2	* Copyright (c) 2022 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "ClTemplateDirectConv2d.h"
				25
				26	#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
				27	#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
				28
				29	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
				30	#include "src/core/helpers/WindowHelpers.h"
				31
				32	#include "support/StringSupport.h"
				33
				34	namespace arm_compute
				35	{
				36	namespace experimental
				37	{
				38	namespace dynamic_fusion
				39	{
				40	ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
				41	const ArgumentPack<ITensorInfo> &tensors,
				42	const Attributes &attributes,
				43	const Settings &settings)
				44	: IGpuTemplateComponentWriter{ id, tensors },
				45	_src{},
				46	_weight{},
				47	_bias{},
				48	_dst{},
				49	_attributes{ attributes },
				50	_settings{ settings }
				51	{
				52	_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
				53	_weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
				54	if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
				55	{
				56	_bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
				57	}
				58	_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
				59	ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
				60	}
				61
				62	std::string ClTemplateDirectConv2d::get_name() const
				63	{
				64	return "direct_conv2d";
				65	}
				66
				67	std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
				68	{
				69	ARM_COMPUTE_UNUSED(comp_group);
				70
				71	const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
				72	const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
				73	const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
				74
				75	std::string code = R"_(
				76	//------------------ START KERNEL {{meta_kernel_id}} ---------------------
				77	// IN_0(src) {{src}}
				78	// IN_1(wei) {{weight}}
				79	)_";
				80	if(_bias && _bias->has_valid_id())
				81	{
				82	code += R"_(
				83	// IN_1(bia) {{bias}}
				84	)_";
				85	}
				86	code += R"_(
				87	// OUT(dst, accum) {{dst}}
				88
				89	// Initialize the accumulators
				90	TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
				91	{
				92	// All the tensor dimensions are passed at compile time.
				93	// In case of dynamic tensor support, the following dimensions should be passed as function argument.
				94	#define _IWEI_WIDTH {{WEI_WIDTH}}
				95	#define _IWEI_HEIGHT {{WEI_HEIGHT}}
				96	#define _ISRC_WIDTH {{src}}_w
				97	#define _ISRC_HEIGHT {{src}}_h
				98	#define _ISRC_CHANNELS {{src}}_c
				99	#define _IDST_WIDTH {{arg_dst}}_w
				100	#define _IDST_HEIGHT {{arg_dst}}_h
				101	#define _IDST_CHANNELS {{arg_dst}}_c
				102	#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
				103
				104	// .v = access the whole vector (OpenCL vector)
				105	// .s[x] = access the vector element at position x (scalar access)
				106	TILE(int, M0, 1, xi);
				107	TILE(int, M0, 1, yi);
				108
				109	// Convert the linear index to coordinate
				110	LOOP_UNROLLING(int, i, 0, 1, M0,
				111	{
				112	xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
				113	yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
				114	xi[i].v -= {{PAD_LEFT}};
				115	yi[i].v -= {{PAD_TOP}};
				116	})
				117
				118	LOOP_UNROLLING(int, i, 0, 1, M0,
				119	{
				120	{{dst}}[i].v = 0;
				121	})
				122
				123	for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
				124	{
				125	int ck = 0;
				126	int xk = i % _IWEI_WIDTH;
				127	int yk = i / _IWEI_WIDTH;
				128
				129	int k = 0;
				130	for(; k <= (_ISRC_CHANNELS - K0); k += K0)
				131	{
				132	TILE({{SRC_DATA_TYPE}}, M0, K0, a);
				133	TILE({{WEI_DATA_TYPE}}, N0, K0, b);
				134
				135	// Initialize tiles
				136	LOOP_UNROLLING(int, i, 0, 1, M0,
				137	{
				138	a[i].v = {{ZERO_VALUE}};
				139	})
				140
				141	LOOP_UNROLLING(int, i, 0, 1, N0,
				142	{
				143	b[i].v = {{ZERO_VALUE}};
				144	})
				145
				146	// Load tile from the src tensor
				147	T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
				148
				149	// Load tile from the weights tensor
				150	T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
				151
				152	// Compute the matrix multiplication between two tiles
				153	T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
				154
				155	ck += K0;
				156	}
				157
				158	// We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
				159	// This #if directive should be removed in case of dynamic tensor support
				160	)_";
				161
				162	if(leftover_loop)
				163	{
				164	code += R"_(
				165	// Left-over accumulations
				166	for(; k < _ISRC_CHANNELS; ++k)
				167	{
				168	TILE({{SRC_DATA_TYPE}}, M0, 1, a);
				169	TILE({{WEI_DATA_TYPE}}, N0, 1, b);
				170
				171	// Initialize tiles
				172	LOOP_UNROLLING(int, i, 0, 1, M0,
				173	{
				174	a[i].v = {{ZERO_VALUE}};
				175	})
				176
				177	LOOP_UNROLLING(int, i, 0, 1, N0,
				178	{
				179	b[i].v = {{ZERO_VALUE}};
				180	})
				181
				182	// Load tile from the src tensor
				183	T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
				184
				185	// Load tile from the weights tensor
				186	// The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
				187	T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
				188
				189	// Compute the matrix multiplication between two tiles
				190	T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
				191
				192	++ck;
				193	}
				194	)_";
				195	}
				196
				197	code += R"_(
				198	#undef _I_WEI_WIDTH
				199	#undef _I_WEI_HEIGHT
				200	#undef _ISRC_WIDTH
				201	#undef _ISRC_HEIGHT
				202	#undef _ISRC_CHANNELS
				203	#undef _IDST_WIDTH
				204	#undef _IDST_HEIGHT
				205	#undef _IDST_CHANNELS
				206	#undef _IY_MULTIPLIER
				207
				208	}
				209	)_";
				210
				211	if(_bias && _bias->has_valid_id())
				212	{
				213	code += R"_(
				214	TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
				215
				216	T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
				217
				218	// c = c + bias[broadcasted]
				219	T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
				220	)_";
				221	}
				222
				223	code += R"_(
				224	}
				225	//------------------ END KERNEL {{meta_kernel_id}} ---------------------
				226	)_";
				227	return code;
				228	}
				229
				230	void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
				231	{
				232	vtable.declare_variable(
				233	_src,
				234	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
				235	comp_group.is_intermediate_tensor(_src),
				236	"src");
				237
				238	const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
				239	vtable.declare_variable(
				240	_weight,
				241	GpuKernelArgumentInfo(weight_type),
				242	comp_group.is_intermediate_tensor(_weight),
				243	"weight");
				244
				245	if(_bias && _bias->has_valid_id()) // optional bias
				246	{
				247	vtable.declare_variable(
				248	_bias,
				249	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
				250	comp_group.is_intermediate_tensor(_bias),
				251	"bias");
				252	}
				253	vtable.declare_variable(
				254	_dst,
				255	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
				256	comp_group.is_intermediate_tensor(_dst),
				257	"dst");
				258	}
				259
				260	TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
				261	{
				262	TagLUT lut{};
				263	// Arguments and global shared variables
				264	lut["src"] = vtable.get_variable(_src);
				265	lut["weight"] = vtable.get_variable(_weight);
				266
				267	if(_bias && _bias->has_valid_id()) // optional bias
				268	{
				269	lut["bias"] = vtable.get_variable(_bias);
				270	lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
				271	}
				272	lut["dst"] = vtable.get_variable(_dst);
				273
				274	const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
				275	lut["arg_dst"] = dst_argument.uniq_name;
				276
				277	// Local build options
				278	lut["meta_kernel_id"] = id();
				279	lut["ACC_DATA_TYPE"] = _src->data_type();
				280	lut["SRC_DATA_TYPE"] = _src->data_type();
				281	lut["WEI_DATA_TYPE"] = _weight->data_type();
				282
				283	lut["SRC_TENSOR_TYPE"] = "BUFFER";
				284	switch(vtable.get_variable(_weight).kernel_argument_info.type)
				285	{
				286	case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
				287	case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
				288	case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
				289	{
				290	lut["WEI_TENSOR_TYPE"] = "IMAGE";
				291	break;
				292	}
				293	default:
				294	{
				295	lut["WEI_TENSOR_TYPE"] = "BUFFER";
				296	break;
				297	}
				298	}
				299	const auto width_idx = 1;
				300	const auto height_idx = 2;
				301	lut["WEI_WIDTH"] = _weight->dimension(width_idx);
				302	lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
				303
				304	lut["STRIDE_X"] = _attributes.stride().x();
				305	lut["STRIDE_Y"] = _attributes.stride().y();
				306
				307	lut["PAD_LEFT"] = _attributes.pad().left;
				308	lut["PAD_TOP"] = _attributes.pad().top;
				309
				310	lut["ZERO_VALUE"] = 0;
				311
				312	return lut;
				313	}
				314
				315	CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
				316	{
				317	const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
				318	const DataType data_type = _src->data_type();
				319
				320	/// NOTE: For now tile sizes (n0, m0, n0) are set by the execution window. This may change in the future
				321	const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
				322	const unsigned int n0 = root_window.x().step();
				323	const unsigned int m0 = root_window.y().step();
				324	const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
				325	const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
				326
				327	CLBuildOptions build_opts{};
				328	if(_settings.fast_relaxed_math())
				329	{
				330	build_opts.add_option("-cl-fast-relaxed-math");
				331	}
				332	else
				333	{
				334	// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
				335	// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
				336	build_opts.add_option("-cl-unsafe-math-optimizations");
				337	}
				338	build_opts.add_option("-DIS_TILED");
				339	build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
				340	build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
				341	build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
				342	build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
				343
				344	return build_opts;
				345	}
				346
				347	std::string ClTemplateDirectConv2d::get_config_id() const
				348	{
				349	const DataType data_type = _src->data_type();
				350	const DataLayout data_layout = _src->data_layout();
				351
				352	const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
				353	const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
				354
				355	const unsigned int kernel_size = _weight->dimension(width_idx);
				356
				357	std::string config_id{};
				358	config_id += lower_string(string_from_data_type(data_type));
				359	config_id += "_";
				360	config_id += support::cpp11::to_string(kernel_size);
				361	config_id += "_";
				362	config_id += support::cpp11::to_string(_attributes.stride().x());
				363	config_id += "_";
				364	config_id += support::cpp11::to_string(_attributes.stride().y());
				365	config_id += "_";
				366	config_id += support::cpp11::to_string(_dst->dimension(width_idx));
				367	config_id += "_";
				368	config_id += support::cpp11::to_string(_dst->dimension(height_idx));
				369	config_id += "_";
				370	config_id += lower_string(string_from_data_layout(data_layout));
				371	return config_id;
				372	}
				373
				374	std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
				375	{
				376	return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
				377	}
				378
				379	Window ClTemplateDirectConv2d::get_window() const
				380	{
				381	ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
				382
				383	const auto output_shape = _dst->tensor_shape();
				384
				385	const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
				386	const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
				387
				388	// Create and configure kernel window
				389	Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
				390
				391	const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
				392	win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
				393	win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
				394
				395	return win;
				396	}
				397
				398	} // namespace dynamic_fusion
				399	} // namespace experimental
				400	} // namespace arm_compute