/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)

#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"

#include "src/core/utils/helpers/float_ops.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
#include "tests/CL/CLAccessor.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/GEMM.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <chrono>

using namespace arm_compute::experimental::dynamic_fusion;

namespace arm_compute
{
namespace test
{
namespace validation
{
namespace
{
/** Macros that measure the wall-clock time of the enclosed region and record it
 * into the map @p measurement_map under the key @p clock_name.
 * TOCK_AVG divides the elapsed time by @p num_iterations to record a per-iteration average. */
#define TICK(clock_name) \
    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
#define TOCK(clock_name, measurement_map)                                 \
    auto clock_name##_tock = std::chrono::high_resolution_clock::now();   \
    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
#define TOCK_AVG(clock_name, measurement_map, num_iterations)             \
    auto clock_name##_tock = std::chrono::high_resolution_clock::now();   \
    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));

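// A minimal usage sketch (it mirrors the Benchmark test case below); it assumes
// `using std::chrono::duration_cast;` and `using std::chrono::microseconds;`
// are in scope at the point of expansion:
//
//   std::map<std::string, std::chrono::microseconds> measurements;
//   TICK(configure_time)
//   kernel.configure(...);
//   TOCK(configure_time, measurements) // stored under the key "\"configure_time\""
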
template <typename T, typename U>
void fill(U &&tensor, int seed)
{
    static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
    using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;

    DistributionType distribution{ T(-1.0f), T(1.0f) };
    library->fill(tensor, distribution, seed);

    // Fill the borders with infinity so that any accidental border access surfaces as a NaN (i.e. inf * 0) in the output
    DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
    library->fill_borders_with_garbage(tensor, distribution_inf, seed);
}

void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc,
                       const TensorInfo &t_lhs_info,
                       const TensorInfo &t_rhs_info,
                       const TensorInfo *t_bias_info,
                       const TensorInfo &t_dst_info)
{
    CLBuildOptions ref_cl_build_options;
    {
        // If reinterpret_input_as_3d = reinterpret_output_as_3d = true,
        // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
        // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
        auto reinterpret_input_as_3d  = gemm_native_desc.reinterpret_input_as_3d;
        auto reinterpret_output_as_3d = gemm_native_desc.depth_output_gemm3d != 0;
        auto _slide_matrix_b          = (t_rhs_info.num_dimensions() >= t_lhs_info.num_dimensions());
        auto _use_dummy_work_items    = false;
        // In case both input and dst have to be reinterpreted as 3D tensors,
        // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
        if(reinterpret_input_as_3d == reinterpret_output_as_3d)
        {
            reinterpret_input_as_3d  = false;
            reinterpret_output_as_3d = false;
        }

        const unsigned int internal_m = reinterpret_output_as_3d ? gemm_native_desc.m : t_dst_info.dimension(1);

        const unsigned int h_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(1) : t_lhs_info.dimension(1);
        const unsigned int d_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(2) : t_lhs_info.dimension(2);

        // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
        const unsigned int partial_store_m0 = internal_m % gemm_native_desc.lhs_info.m0;
        const unsigned int partial_store_n0 = gemm_native_desc.n % gemm_native_desc.rhs_info.n0;
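        // Worked example (hypothetical numbers, not taken from this test): with
        // internal_m = 5 and lhs_info.m0 = 4, partial_store_m0 = 5 % 4 = 1, so the
        // final block along M stores a single row instead of requiring the tensor
        // to be padded up to a multiple of 4. A value of 0 means M divides evenly.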

        // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
        const unsigned int internal_m0 = std::min(internal_m, gemm_native_desc.lhs_info.m0);

        ref_cl_build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info.data_type()));
        ref_cl_build_options.add_option_if(!(helpers::float_ops::is_one(gemm_native_desc.alpha)), "-DALPHA=" + float_to_string_with_full_precision(gemm_native_desc.alpha));
        ref_cl_build_options.add_option_if(t_bias_info != nullptr, "-DBETA=" + float_to_string_with_full_precision(gemm_native_desc.beta));
        ref_cl_build_options.add_option_if(helpers::float_ops::is_one(gemm_native_desc.beta), "-DUNIT_BETA");
        ref_cl_build_options.add_option_if(gemm_native_desc.broadcast_bias, "-DBROADCAST_BIAS");
        ref_cl_build_options.add_option_if(reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
        ref_cl_build_options.add_option_if(reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
        ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
        ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
        ref_cl_build_options.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(t_rhs_info.dimension(2)));
        ref_cl_build_options.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
        ref_cl_build_options.add_option("-DM=" + support::cpp11::to_string(internal_m));
        ref_cl_build_options.add_option("-DN=" + support::cpp11::to_string(gemm_native_desc.n));
        ref_cl_build_options.add_option("-DK=" + support::cpp11::to_string(gemm_native_desc.k));
        ref_cl_build_options.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
        ref_cl_build_options.add_option("-DN0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.n0));
        ref_cl_build_options.add_option("-DK0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.k0));
        ref_cl_build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
        ref_cl_build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
        // Manually add PostOps
        {
            ref_cl_build_options.add_option("-DOP=ADD_X_POS_1");
            ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_HEIGHT=" + support::cpp11::to_string(t_dst_info.dimension(1)));
            ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_WIDTH=" + support::cpp11::to_string(t_dst_info.dimension(0)));
        }
    }
    cl_code.build_options = ref_cl_build_options;
}
} // namespace

TEST_SUITE(CL)
TEST_SUITE(UNIT)
TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(ClCompositeKernel)
TEST_SUITE(Validate)

TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
{
    /* Computation:
     * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast)
     */
    const auto data_type   = DataType::F32;
    const auto m           = 5U;
    const auto n           = 4U;
    const auto k           = 3U;
    const auto t_lhs_shape = TensorShape(k, m);
    const auto t_rhs_shape = TensorShape(n, k);
    const auto t_dst_shape = TensorShape(n, m);
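    // Note (added for clarity, not part of the computation): TensorShape lists the
    // innermost (width) dimension first, so TensorShape(k, m) describes the m x k
    // LHS of the logical GEMM (m x k) * (k x n) = (m x n).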
    auto t_lhs_info  = TensorInfo(t_lhs_shape, 1, data_type);
    auto t_rhs_info  = TensorInfo(t_rhs_shape, 1, data_type);
    auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
    auto t_dst_info  = TensorInfo(t_dst_shape, 1, data_type);

    const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
    const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
    const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
    const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
    const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };

    ClKernelBlueprint bp;
    ArgumentID        tid_lhs;
    ArgumentID        tid_rhs;
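    // The bias is not fused in this test: tid_l0_bias stays a placeholder,
    // set_build_options is called further down with a nullptr bias, and the
    // reference GEMM below passes beta = 0.f to disable it.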
    ArgumentID        tid_l0_bias = g_arg_placeholder;
    ArgumentID        tid_l1_addend;
    ArgumentID        tid_dst;
    auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
    st      = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
    st      = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
    st      = add_tensor_argument(bp, t_dst_desc, tid_dst);

    const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
    const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k };
    const GEMMKernelInfo       gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
    const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
    const TileDescriptor       store_tile_info{};

    ArgumentID tid_acc;
    st = add_tensor_intermed(bp, tid_acc);
    st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
    st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, tid_l1_addend, tid_acc, tid_acc);
    st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);

    ClKernelCode cl_code;

    st = set_tile_info(bp, store_tile_info);
    st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
    set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);

    ClExecutionDescriptor exec_desc;
    st = tune_static(exec_desc, cl_code);

    CLScheduler::get().default_init();
    ClCompositeKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);

    // Construct tensors
    CLTensor t_lhs{};
    CLTensor t_rhs{};
    CLTensor t_l1_addend{};
    CLTensor t_dst{};
    // Init tensors
    {
        t_lhs.allocator()->init(t_lhs_info);
        t_rhs.allocator()->init(t_rhs_info);
        t_l1_addend.allocator()->init(t_dst_info);
        t_dst.allocator()->init(t_dst_info);
    }
    // "Pack" tensors
    TensorBinding tensors({ { tid_lhs, &t_lhs },
        { tid_rhs, &t_rhs },
        { tid_l1_addend, &t_l1_addend },
        { tid_dst, &t_dst }
    });
    // Allocate and fill tensors
    {
        t_lhs.allocator()->allocate();
        t_rhs.allocator()->allocate();
        t_l1_addend.allocator()->allocate();
        t_dst.allocator()->allocate();
        fill<float>(CLAccessor(t_lhs), 0);
        fill<float>(CLAccessor(t_rhs), 1);
        fill<float>(CLAccessor(t_l1_addend), 2);
    }

    CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);

    // Create reference
    SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
    SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };

    // Fill reference
    fill<float>(ref_t_lhs, 0);
    fill<float>(ref_t_rhs, 1);
    fill<float>(ref_t_l1_addend, 2);
    const auto ref_t_dst = reference::arithmetic_operation(
        ArithmeticOperation::ADD,
        ref_t_l1_addend,
        reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
        data_type,
        eltwise_add_desc.convert_policy);

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32);
}

TEST_SUITE_END() // Validate

TEST_SUITE(Benchmark)
TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
{
    using std::chrono::duration_cast;
    using std::chrono::microseconds;
    const int num_iterations = 200;
    std::map<std::string, std::chrono::microseconds> measurements;
    /* Computation:
     * out = add(addend, gemm_native(lhs, rhs, bias))
     */
    const auto         data_type = DataType::F32;
    const unsigned int m         = 12 * 12;
    const unsigned int n         = 64;
    const unsigned int k         = 384;
    const auto t_lhs_shape   = TensorShape(k, m);
    const auto t_rhs_shape   = TensorShape(n, k);
    const auto t_dst_shape   = TensorShape(n, m);
    auto       t_lhs_info    = TensorInfo(t_lhs_shape, 1, data_type);
    auto       t_rhs_info    = TensorInfo(t_rhs_shape, 1, data_type);
    auto       t_bias_info   = TensorInfo(TensorShape(), 1, data_type);
    auto       t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
    auto       t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
    auto       t_dst_info    = TensorInfo(t_dst_shape, 1, data_type);

    const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
    const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
    const GEMMKernelInfo       gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
    const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
    const TileDescriptor       store_tile_info{};

    // Create reference
    SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
    SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };

    // Fill reference
    fill<float>(ref_t_lhs, 0);
    fill<float>(ref_t_rhs, 1);
    fill<float>(ref_t_l1_addend, 2);
    const auto ref_t_dst = reference::arithmetic_operation(
        ArithmeticOperation::ADD,
        ref_t_l1_addend,
        reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
        data_type,
        eltwise_add_desc.convert_policy);

    CLScheduler::get().default_init();

    /* Condition 0: Dynamic Fused Kernel */
    CLTensor cond0_t_dst{};
    {
        TICK(cond0_0_startup_time);

        ClKernelBlueprint bp;
        ArgumentID        tid_lhs;
        ArgumentID        tid_rhs;
        ArgumentID        tid_l0_bias = g_arg_placeholder;
        ArgumentID        tid_l1_addend;
        ArgumentID        tid_dst;

        const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
        const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
        const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
        const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
        const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };

        ClKernelCode cl_code;
        TICK(cond0_build_time)
        auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
        st      = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
        st      = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
        st      = add_tensor_argument(bp, t_dst_desc, tid_dst);

        ArgumentID tid_acc;
        st = add_tensor_intermed(bp, tid_acc);
        st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);

        st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, tid_l1_addend, tid_acc, tid_acc);

        st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);

        st = set_tile_info(bp, store_tile_info);
        st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
        set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
        TOCK(cond0_build_time, measurements)

        TICK(cond0_tune_time)
        ClExecutionDescriptor exec_desc;
        st = tune_static(exec_desc, cl_code);
        TOCK(cond0_tune_time, measurements)

        TICK(cond0_configure_time)
        ClCompositeKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
        TOCK(cond0_configure_time, measurements)

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond0_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond0_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } });

        CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
        CLScheduler::get().sync();
        TOCK(cond0_0_startup_time, measurements)

        TICK(cond0_1_latency)
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond0_1_latency, measurements, num_iterations)
    }
    /* Condition 1: Dynamic Unfused Kernel */
    /* Condition 2: Static Fused Kernel (current) */
    CLTensor cond2_t_dst{};
    {
        TICK(cond2_0_startup_time);
        arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;

        TICK(cond2_configure_time);
        experimental::PostOpList<ITensorInfo *> post_ops;
        post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(&t_dst_info, 1, eltwise_add_desc.convert_policy);
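        // Reading of the arguments above (an interpretation, not from the original
        // comments): the addend reuses the dst TensorInfo, and the `1` gives the
        // position of the previous op's destination among the addition's operands,
        // matching the -DOP=ADD_X_POS_1 build option set in set_build_options.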
        GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops };
        l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
                             gemm_native_desc.rhs_info, gemm_info);
        TOCK(cond2_configure_time, measurements);

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond2_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond2_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        ITensorPack tensors
        {
            { ACL_SRC_0, &t_lhs },
            { ACL_SRC_1, &t_rhs },
            { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend },
            { ACL_DST, &cond2_t_dst },
        };
        CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
        CLScheduler::get().sync();
        TOCK(cond2_0_startup_time, measurements);

        TICK(cond2_1_latency);
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond2_1_latency, measurements, num_iterations);
    }
    /* Condition 3: Static Unfused Kernel (current) */
    CLTensor cond3_t_dst{};
    {
        TICK(cond3_0_startup_time);
        arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;
        arm_compute::opencl::kernels::ClSaturatedArithmeticKernel      l1_add;

        TICK(cond3_configure_time);
        GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
        l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
                             gemm_native_desc.rhs_info, gemm_info);
        l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy);
        TOCK(cond3_configure_time, measurements);

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l0_dst{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l0_dst.allocator()->init(t_l0_dst_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond3_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l0_dst.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond3_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        ITensorPack tensors_l0
        {
            { ACL_SRC_0, &t_lhs },
            { ACL_SRC_1, &t_rhs },
            { ACL_DST, &t_l0_dst },
        };
        ITensorPack tensors_l1
        {
            { ACL_SRC_0, &t_l0_dst },
            { ACL_SRC_1, &t_l1_addend },
            { ACL_DST, &cond3_t_dst },
        };
        CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
        CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
        CLScheduler::get().sync();
        TOCK(cond3_0_startup_time, measurements);

        TICK(cond3_1_latency);
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
            CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond3_1_latency, measurements, num_iterations);
    }

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    std::cout << "cond0 validation: " << std::endl;
    validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32);
    std::cout << "cond2 validation: " << std::endl;
    validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32);
    std::cout << "cond3 validation: " << std::endl;
    validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32);

    /* Report */
    std::cout << "Performance comparison (gemm native + add)" << std::endl;
    std::cout << "cond0: dynamic fusion module" << std::endl;
    std::cout << "cond2: static fused with post ops" << std::endl;
    std::cout << "cond3: static unfused" << std::endl;
    for(const auto &measurement : measurements)
    {
        std::cout << measurement.first << ": " << measurement.second.count() << "us" << std::endl;
    }
}
TEST_SUITE_END() // Benchmark
TEST_SUITE_END() // ClCompositeKernel
TEST_SUITE_END() // DYNAMIC_FUSION
TEST_SUITE_END() // UNIT
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute

#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)