Blame - src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp - ml/ComputeLibrary

blob: fb86e163f204bdcce0af7a571b499bb6a64af9a0 [file] [log] [blame]

Mohammed Suhail Munshi	a18d85c	2023-01-03 10:16:16 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2023 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "ClTemplatePool2d.h"
				25
				26	#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
				27	#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
				28
				29	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
				30	#include "src/core/helpers/WindowHelpers.h"
				31
				32	#include "support/StringSupport.h"
				33
				34	namespace arm_compute
				35	{
				36	namespace experimental
				37	{
				38	namespace dynamic_fusion
				39	{
				40	namespace
				41	{
				42	// Shape indexes for NHWC Datalayout
Mohammed Suhail Munshi	a18d85c	2023-01-03 10:16:16 +0000	[diff] [blame]	43	constexpr static int32_t height_idx = 2;
				44	constexpr static int32_t width_idx = 1;
				45	constexpr static int32_t channel_idx = 0;
Omar Al Khatib	3c7c1fa	2023-03-07 09:57:49 +0000	[diff] [blame]	46	} // namespace
Mohammed Suhail Munshi	a18d85c	2023-01-03 10:16:16 +0000	[diff] [blame]	47	ClTemplatePool2d::ClTemplatePool2d(ComponentId id,
				48	const ArgumentPack<ITensorInfo> &tensors,
				49	const Attributes &attributes,
				50	const Settings &settings)
				51	: IGpuTemplateComponentWriter{ id, tensors },
				52	_src{},
				53	_dst{},
				54	_attributes{ attributes },
				55	_settings{ settings }
				56	{
				57	_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
				58	_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
				59	ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
				60	}
				61
				62	std::string ClTemplatePool2d::get_name() const
				63	{
				64	return "pool2d";
				65	}
				66
				67	std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
				68	{
				69	ARM_COMPUTE_UNUSED(comp_group);
				70
				71	// Condition to use 2x2 optimized kernel
				72	if(_attributes.pool_size() == Size2D(2, 2))
				73	{
				74	return get_2x2_kernel_code();
				75	}
				76	else
				77	{
				78	return get_MxN_kernel_code();
				79	}
				80	}
				81
				82	std::string ClTemplatePool2d::get_MxN_kernel_code() const
				83	{
				84	const auto pool_type = _attributes.pool_type();
				85	const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
				86
				87	// Define pool op macro.
				88	std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
				89
				90	// Kernel start
				91	// Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
				92	// Note: If C is less than N0, N0 should be SHRINKED to the closest smaller N0. This operation is performed on the host side
				93	std::string code = R"_(
				94	//------------------ START KERNEL {{meta_kernel_id}} ---------------------
				95	// IN_0(src) {{src}}
				96	// OUT(dst, accum) {{dst}}
				97
				98	{
				99	const int idx_out_c = g_ind_0;
				100	const int idx_out_w = g_ind_1;
				101	)_";
				102
				103	// Add macro for POOL_OP
				104	code += "\n" + pool_op + "\n";
				105
				106	code += R"_(
				107	const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
				108	const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
				109	)_";
				110
				111	// Define common variables.
				112	code += R"_(
				113	__global unsigned char in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
				114
				115	__global unsigned char out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;
				116
				117	VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
				118	res0 = {{INITIAL_VALUE}};
				119
				120	const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
				121	const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
				122
				123	const int pool_x_s = max((int)0, -idx_in_w);
				124	const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
				125	const int pool_y_s = max((int)0, -idx_in_h);
				126	const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
				127	)_";
				128
				129	// Determine filter size depending on if padding is excluded or not
				130	if(_attributes.exclude_padding())
				131	{
				132	code += R"_(
				133	const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
				134	)_";
				135	}
				136	else
				137	{
				138	code += R"_(
				139	const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
				140	)_";
				141	}
				142
				143	// Loop through pool size
				144	// if global pooling
				145	if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
				146	{
				147	// Begin loop
				148	code += R"_(
				149	// Global pooling path
				150	for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
				151	{
				152	#pragma unroll 8
				153	for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
				154	{
				155	VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
				156	data0;
				157	)_";
				158	}
				159	else // if local pooling size
				160	{
				161	code += R"_(
				162	for(int y = pool_y_s; y < pool_y_e; ++y)
				163	{
				164	#pragma unroll 8
				165	for(int x = pool_x_s; x < pool_x_e; ++x)
				166	{
				167	VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
				168	data0;
				169	)_";
				170	} // end else
				171
				172	// if condition inside loop - use 32bit acc if mixed_precision.
				173	// End loop through pooling section.
				174	if(fp_mixed_precision)
				175	{
				176	// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
				177	code += R"_(
				178	data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + (x + idx_in_w) {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
				179	res0 = POOL_OP(res0, data0);
				180	}
				181	}
				182	)_";
				183	}
				184	else // load data, compute result and end loop
				185	{
				186	code += R"_(
				187	data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + (x + idx_in_w) {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
				188	res0 = POOL_OP(res0, data0);
				189	}
				190	}
				191	)_";
				192	}
				193
				194	// For Pool AVG ONLY, divide pool output by filter size
				195	if(pool_type == PoolingType::AVG)
				196	{
				197	code += R"_(
				198	res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
				199	)_";
				200	}
				201
				202	// If mixed precision convert datatype before storing. Then end kernel.
				203	if(fp_mixed_precision)
				204	{
				205	code += R"_(
				206	VEC_DATA_TYPE({{DATA_TYPE}}, N0)
				207	res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
				208	STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
				209	)_";
				210	}
				211	else
				212	{
				213	// Store data
				214	code += R"_(
				215	STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
				216	)_";
				217	}
				218
				219	code += R"_(
				220	//------------------ END KERNEL {{meta_kernel_id}} ---------------------
				221	}
				222	)_";
				223
				224	return code;
				225	}
				226
				227	std::string ClTemplatePool2d::get_2x2_kernel_code() const
				228	{
				229	const auto pool_type = _attributes.pool_type();
				230	const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
				231	std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
				232
				233	std::string code = R"_(
				234	//------------------ START KERNEL {{meta_kernel_id}} ---------------------
				235	// IN_0(src) {{src}}
				236	// OUT(dst, accum) {{dst}}
				237
				238	#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
				239
				240	{
				241	const int idx_out_c = g_ind_0;
				242	const int idx_out_w = g_ind_1;
				243	)_";
				244
				245	// Add pool op macro
				246	code += "\n" + pool_op + "\n";
				247
				248	// If batch size != 1, the batch size dimension is collapsed over the height dimension
				249	code += R"_(
				250	const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
				251	const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
				252	)_";
				253
				254	code += R"_(
				255	const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
				256	const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
				257
				258	__global unsigned char in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
				259	__global unsigned char out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n *
				260	{{dst}}_stride_w;
				261	const int pool_x_s = max((int)0, -idx_in_w);
				262	const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
				263	const int pool_y_s = max((int)0, -idx_in_h);
				264	const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);
				265
				266	const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
				267	const int x0 = pool_x_s + idx_in_w;
				268	const int y0 = pool_y_s + idx_in_h;
				269	const int x1 = pool_x_e - 1 + idx_in_w;
				270	const int y1 = pool_y_e - 1 + idx_in_h;
				271
				272	REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
				273	)_";
				274
				275	if(fp_mixed_precision)
				276	{
				277	// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
				278	code += R"_(
				279	data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x0 {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
				280	data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x1 {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
				281	data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x0 {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
				282	data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x1 {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
				283	)_";
				284	}
				285	else
				286	{
				287	code += R"_(
				288	data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x0 {{src}}_stride_y + y0 * {{src}}_stride_z));
				289	data1 = VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x1 {{src}}_stride_y + y0 * {{src}}_stride_z));
				290	data2 = VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x0 {{src}}_stride_y + y1 * {{src}}_stride_z));
				291	data3 = VLOAD(N0)(0, (__global {{DATA_TYPE}} )(in_base_ptr + x1 {{src}}_stride_y + y1 * {{src}}_stride_z));
				292	)_";
				293	}
				294
				295	if(pool_type != PoolingType::MAX)
				296	{
				297	// Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
				298	code += R"_(
				299	if(filter_size != 4)
				300	{
				301	SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
				302	SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
				303	SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
				304	SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);
				305
				306	data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s \| cond_h_s));
				307	data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e \| cond_h_s));
				308	data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s \| cond_h_e));
				309	data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e \| cond_h_e));
				310	}
				311	)_";
				312	}
				313
				314	code += R"_(
				315	VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
				316	res0 = data0;
				317	res0 = POOL_OP(res0, data1);
				318	res0 = POOL_OP(res0, data2);
				319	res0 = POOL_OP(res0, data3);
				320	)_";
				321
				322	if(pool_type == PoolingType::AVG)
				323	{
				324	// If avg pooling divide result accordingly.
				325	if(_attributes.exclude_padding())
				326	{
				327	code += R"_(
				328	res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
				329	)_";
				330	}
				331	else
				332	{
				333	code += R"_(
				334	res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
				335	)_";
				336	}
				337	}
				338
				339	// Store result
				340	if(fp_mixed_precision)
				341	{
				342	code += R"_(
				343	VEC_DATA_TYPE({{DATA_TYPE}}, N0)
				344	res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
				345	STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
				346	)_";
				347	}
				348	else
				349	{
				350	code += R"_(
				351	STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
				352	)_";
				353	}
				354
				355	code += R"_(
				356	//------------------ END KERNEL {{meta_kernel_id}} ---------------------
				357	}
				358	#undef SELECT_TYPE
				359	)_";
				360
				361	return code;
				362	}
				363
				364	void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
				365	{
				366	vtable.declare_variable(
				367	comp_group,
				368	_src,
				369	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
				370	"src");
				371
				372	vtable.declare_variable(
				373	comp_group,
				374	_dst,
				375	GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
				376	"dst");
				377	}
				378
				379	TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
				380	{
				381	ARM_COMPUTE_UNUSED(comp_group);
				382
				383	TagLUT lut{};
				384	// Arguments and global shared variables
				385	lut["src"] = vtable.get_variable(_src);
				386	lut["dst"] = vtable.get_variable(_dst);
				387
				388	// Local build options
				389	lut["meta_kernel_id"] = id();
				390
				391	// Retrieve relevant data
Omar Al Khatib	3c7c1fa	2023-03-07 09:57:49 +0000	[diff] [blame]	392	const auto padding = _attributes.pad();
				393	const auto stride = _attributes.stride();
				394	const auto pool_size = _attributes.pool_size();
				395	const auto data_type = _src->data_type();
				396	const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
				397	const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
Mohammed Suhail Munshi	a18d85c	2023-01-03 10:16:16 +0000	[diff] [blame]	398
				399	// pool specific
				400	lut["STRIDE_X"] = stride.x();
				401	lut["STRIDE_Y"] = stride.y();
				402	lut["PAD_X"] = padding.left;
				403	lut["PAD_Y"] = padding.top;
				404	lut["POOL_SIZE_X"] = pool_size.width;
				405	lut["POOL_SIZE_Y"] = pool_size.height;
				406
				407	// Datatypes and variables
				408	lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
				409	lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
				410	lut["SRC_WIDTH"] = _src->dimension(width_idx);
				411	lut["SRC_HEIGHT"] = _src->dimension(height_idx);
Adnan AlSinan	227db8d	2023-02-14 14:24:09 +0000	[diff] [blame]	412	lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? max_initial_value : std::string("0");
Mohammed Suhail Munshi	a18d85c	2023-01-03 10:16:16 +0000	[diff] [blame]	413
				414	// Tensor specific data
				415	lut["DST_HEIGHT"] = _dst->dimension(height_idx);
				416
				417	return lut;
				418	}
				419
				420	CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
				421	{
				422	const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
				423	const unsigned int n0 = root_window.x().step();
				424	const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
				425
				426	CLBuildOptions build_opts{};
				427	build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
				428	build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
				429
				430	return build_opts;
				431	}
				432
				433	std::string ClTemplatePool2d::get_config_id() const
				434	{
				435	const DataType data_type = _src->data_type();
				436	const DataLayout data_layout = _src->data_layout();
				437
				438	std::string config_id{};
				439	config_id += "pooling_layer_2d_";
				440	config_id += lower_string(string_from_data_type(data_type));
				441	config_id += "_";
				442	config_id += lower_string(string_from_data_layout(data_layout));
				443	config_id += "_";
				444	config_id += support::cpp11::to_string(_dst->dimension(width_idx));
				445	config_id += "_";
				446	config_id += support::cpp11::to_string(_dst->dimension(height_idx));
				447	config_id += "_";
				448	config_id += support::cpp11::to_string(_dst->dimension(channel_idx));
				449
				450	return config_id;
				451	}
				452
				453	std::set<std::string> ClTemplatePool2d::get_headers_list() const
				454	{
				455	return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
				456	}
				457
				458	Window ClTemplatePool2d::get_window() const
				459	{
				460	ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
				461	const auto output_shape = _dst->tensor_shape();
				462	const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
				463
				464	// Create and configure kernel window
				465	auto win = calculate_max_window(output_shape, Steps(vec_size));
				466	win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
				467	return win;
				468	}
				469
				470	} // namespace dynamic_fusion
				471	} // namespace experimental
				472	} // namespace arm_compute