Blame - src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp - ml/ComputeLibrary

blob: 6fa77aafe3189c68f7d51f1178568cd54e76fe1c [file] [log] [blame]

Gunes Bayir	7dc0234	2022-11-21 21:46:50 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2022 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "ClTemplateDepthwiseConv2d.h"
				25
				26	#include "src/core/helpers/WindowHelpers.h"
				27	#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
				28
				29	namespace arm_compute
				30	{
				31	namespace experimental
				32	{
				33	namespace dynamic_fusion
				34	{
				35	ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId id,
				36	const ArgumentPack<ITensorInfo> &tensors,
				37	const Attributes &attributes,
				38	const Settings &settings)
				39	: IGpuTemplateComponentWriter{ id, tensors },
				40	_src{},
				41	_weight{},
				42	_bias{},
				43	_dst{},
				44	_attributes{ attributes },
				45	_settings{ settings }
				46	{
				47	_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
				48	_weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
				49	if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
				50	{
				51	_bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
				52	}
				53	_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
				54	ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
				55	}
				56
				57	std::string ClTemplateDepthwiseConv2d::get_name() const
				58	{
				59	return "depthwise_conv2d";
				60	}
				61
				62	std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &comp_group) const
				63	{
				64	ARM_COMPUTE_UNUSED(comp_group);
				65
				66	constexpr int height_idx = 2; // Data Layout is NHWC
				67
				68	std::string code = R"_(
				69	//------------------ START KERNEL {{meta_kernel_id}} ---------------------
				70	// IN_0(src) {{src}}
				71	// IN_1(wei) {{weight}}
				72	)_";
				73
				74	if(_bias != nullptr && _bias->has_valid_id())
				75	{
				76	code += R"_(
				77	// IN_1(bia) {{bias}}
				78	)_";
				79	}
				80
				81	code += R"_(
				82	// OUT(dst, accum) {{dst}}
				83
				84	TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
				85	TILE(uint, M0, 1, g_dst_indirect_y);
				86
				87	{
				88	#define _IWEI_WIDTH {{WEI_WIDTH}}
				89	#define _IWEI_HEIGHT {{WEI_HEIGHT}}
				90	#define _IDST_WIDTH {{arg_dst}}_w
				91	#define _IDST_HEIGHT {{arg_dst}}_h
				92	#define _IM0_A M0_A
				93	#define _IN0_A N0_A
				94	#define _IM0_B _IWEI_WIDTH
				95	#define _IN0_B N0
				96	#define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1)))
				97	)_";
				98
				99	code += R"_(
				100	const int yo = g_ind_2 % {{arg_dst}}_h;
				101	const int bout = g_ind_2 / {{arg_dst}}_h;
				102	)_";
				103
				104	code += R"_(
				105
				106	int xi = g_ind_1 * {{STRIDE_X}};
				107	int yi = yo * {{STRIDE_Y}};
				108	xi -= {{PAD_LEFT}};
				109	yi -= {{PAD_TOP}};
				110
				111	LOOP_UNROLLING(int, i, 0, 1, M0,
				112	{
				113	{{dst}}[i].v = 0;
				114	})
				115	)_";
				116
				117	if(_weight->dimension(height_idx) < 5)
				118	{
				119	code += R"_(
				120	LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
				121	)_";
				122	}
				123	else
				124	{
				125	code += R"_(
				126	for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
				127	)_";
				128	}
				129
				130	code += R"_(
				131	{
				132	TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a);
				133
				134	LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
				135	{
				136	a[i].v = 0;
				137	})
				138
				139	T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a);
				140
				141	TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b);
				142
				143	T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b);
				144
				145	LOOP_UNROLLING(int, m0, 0, 1, M0,
				146	{
				147	LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
				148	{
				149	)_";
				150
				151	if(!_settings.is_fma_available())
				152	{
				153	code += R"_(
				154	{{dst}}[m0].v += a[xk + m0].v * b[xk].v;
				155	)_";
				156	}
				157	else
				158	{
				159	code += R"_(
				160	{{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v);
				161	)_";
				162	}
				163
				164	code += R"_(
				165	})
				166	})
				167	}
				168	)_";
				169
				170	if(_weight->dimension(height_idx) < 5)
				171	{
				172	code += R"_(
				173	)
				174	)_";
				175	}
				176
				177	if(_bias && _bias->has_valid_id())
				178	{
				179	code += R"_(
				180	TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});
				181
				182	T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}});
				183
				184	T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}});
				185	)_";
				186	}
				187
				188	code += R"_(
				189	LOOP_UNROLLING(int, i, 0, 1, M0,
				190	{
				191	g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
				192	g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
				193	g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
				194	})
				195	}
				196	//------------------ END KERNEL {{meta_kernel_id}} ---------------------
				197	)_";
				198
				199	return code;
				200	}
				201
				202	void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
				203	{
				204	const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ?
				205	GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
				206	GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
				207
				208	vtable.declare_variable(
				209	_src,
				210	GpuKernelArgumentInfo(input_type),
				211	comp_group.is_intermediate_tensor(_src),
				212	"src");
				213
				214	const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
				215	GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
				216	GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
				217
				218	vtable.declare_variable(
				219	_weight,
				220	GpuKernelArgumentInfo(weight_type),
				221	comp_group.is_intermediate_tensor(_weight),
				222	"weight");
				223
				224	if(_bias != nullptr && _bias->has_valid_id()) // optional bias
				225	{
				226	vtable.declare_variable(
				227	_bias,
				228	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
				229	comp_group.is_intermediate_tensor(_bias),
				230	"bias");
				231	}
				232	vtable.declare_variable(
				233	_dst,
				234	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
				235	comp_group.is_intermediate_tensor(_dst),
				236	"dst");
				237	}
				238
				239	TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
				240	{
				241	TagLUT lut{};
				242
				243	// Arguments and global shared variables
				244	lut["src"] = vtable.get_variable(_src);
				245	lut["weight"] = vtable.get_variable(_weight);
				246
				247	if(_bias != nullptr && _bias->has_valid_id()) // optional bias
				248	{
				249	lut["bias"] = vtable.get_variable(_bias);
				250	lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
				251	}
				252	lut["dst"] = vtable.get_variable(_dst);
				253
Viet-Hoa Do	04f4620	2022-12-14 14:49:56 +0000	[diff] [blame]	254	const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
Gunes Bayir	7dc0234	2022-11-21 21:46:50 +0000	[diff] [blame]	255	lut["arg_dst"] = dst_argument.uniq_name;
				256
				257	// Local build options
				258	lut["meta_kernel_id"] = id();
				259	lut["ACC_DATA_TYPE"] = _src->data_type();
				260	lut["SRC_DATA_TYPE"] = _src->data_type();
				261	lut["WEI_DATA_TYPE"] = _weight->data_type();
				262
				263	switch(vtable.get_variable(_src).kernel_argument_info.type)
				264	{
				265	case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
				266	case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
				267	case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
				268	lut["SRC_TENSOR_TYPE"] = "IMAGE";
				269	break;
				270	default:
				271	lut["SRC_TENSOR_TYPE"] = "BUFFER";
				272	break;
				273	}
				274
				275	switch(vtable.get_variable(_weight).kernel_argument_info.type)
				276	{
				277	case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
				278	case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
				279	case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
				280	lut["WEI_TENSOR_TYPE"] = "IMAGE";
				281	break;
				282	default:
				283	lut["WEI_TENSOR_TYPE"] = "BUFFER";
				284	break;
				285	}
				286
				287	// Data Layout is NHWC
				288	constexpr int width_idx = 1;
				289	constexpr int height_idx = 2;
				290
				291	lut["WEI_WIDTH"] = _weight->dimension(width_idx);
				292	lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
				293
				294	lut["STRIDE_X"] = _attributes.stride().x();
				295	lut["STRIDE_Y"] = _attributes.stride().y();
				296
				297	lut["PAD_LEFT"] = _attributes.pad().left;
				298	lut["PAD_TOP"] = _attributes.pad().top;
				299
				300	lut["DILATION_X"] = _attributes.dilation().x();
				301	lut["DILATION_Y"] = _attributes.dilation().y();
				302
				303	lut["DEPTH_MULTIPLIER"] = _attributes.depth_multiplier();
				304
				305	return lut;
				306	}
				307
				308	CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup &comp_group) const
				309	{
				310	ARM_COMPUTE_UNUSED(comp_group);
				311
				312	constexpr unsigned int width_idx = 1; // Data Layout is NHWC
				313
				314	const unsigned int n0 = _settings.n0();
				315	const unsigned int m0 = _settings.m0();
				316	const unsigned int m0_a = _weight->dimension(width_idx) + m0 - 1;
				317	const unsigned int n0_a = _attributes.depth_multiplier() > 1 ? 1 : n0;
				318	const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
				319
				320	CLBuildOptions build_opts{};
				321
				322	if(_settings.fast_relaxed_math())
				323	{
				324	build_opts.add_option("-cl-fast-relaxed-math");
				325	}
				326	else
				327	{
				328	// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
				329	// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
				330	build_opts.add_option("-cl-unsafe-math-optimizations");
				331	}
				332
				333	build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
				334	build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
				335	build_opts.add_option("-DN0_A=" + support::cpp11::to_string(n0_a));
				336	build_opts.add_option("-DM0_A=" + support::cpp11::to_string(m0_a));
				337	build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
				338
				339	return build_opts;
				340	}
				341
				342	std::string ClTemplateDepthwiseConv2d::get_config_id() const
				343	{
				344	std::string config_id{};
				345
				346	config_id += support::cpp11::to_string(_src->dimension(0));
				347	config_id += "_";
				348	config_id += support::cpp11::to_string(_src->dimension(1));
				349	config_id += "_";
				350	config_id += support::cpp11::to_string(_src->dimension(2));
				351	config_id += "_";
				352	config_id += support::cpp11::to_string(_dst->dimension(0));
				353	config_id += "_";
				354	config_id += support::cpp11::to_string(_dst->dimension(1));
				355	config_id += "_";
				356	config_id += support::cpp11::to_string(_dst->dimension(2));
				357	config_id += "_";
				358	config_id += string_from_data_type(_src->data_type());
				359
				360	return config_id;
				361	}
				362
				363	std::set<std::string> ClTemplateDepthwiseConv2d::get_headers_list() const
				364	{
				365	return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
				366	}
				367
				368	Window ClTemplateDepthwiseConv2d::get_window() const
				369	{
				370	ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
				371
				372	Window win = calculate_max_window(*_dst, Steps(_settings.n0(), _settings.m0()));
				373	return win.collapse(win, Window::DimZ);
				374	}
				375
				376	} // namespace dynamic_fusion
				377	} // namespace experimental
				378	} // namespace arm_compute