/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)

#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"

#include "src/core/utils/helpers/float_ops.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
#include "tests/CL/CLAccessor.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/GEMM.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <chrono>

using namespace arm_compute::experimental::dynamic_fusion;

namespace arm_compute
{
namespace test
{
namespace validation
{
namespace
{
/** Macros which measure the wall-clock time and record it in the map measurement_map under the key clock_name */
#define TICK(clock_name) \
    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
#define TOCK(clock_name, measurement_map) \
    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));

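/** Fill the given tensor with random values in [-1, 1] and fill its border with infinity so that invalid accesses show up as NaN (i.e. inf * 0) */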
template <typename T, typename U>
void fill(U &&tensor, int seed)
{
    static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
    using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;

    DistributionType distribution{ T(-1.0f), T(1.0f) };
    library->fill(tensor, distribution, seed);

    // Fill the border with infinity in order to check for the presence of NaN values (i.e. inf * 0)
    DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
    library->fill_borders_with_garbage(tensor, distribution_inf, seed);
}
} // namespace

TEST_SUITE(CL)
TEST_SUITE(UNIT)
TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(ClCompositeKernel)
TEST_SUITE(Validate)

TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
{
    /* Computation:
     * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast)
     */
    const auto data_type   = DataType::F32;
    const auto m           = 5U;
    const auto n           = 4U;
    const auto k           = 3U;
    const auto t_lhs_shape = TensorShape(k, m);
    const auto t_rhs_shape = TensorShape(n, k);
    const auto t_dst_shape = TensorShape(n, m);
    auto       t_lhs_info  = TensorInfo(t_lhs_shape, 1, data_type);
    auto       t_rhs_info  = TensorInfo(t_rhs_shape, 1, data_type);
    auto       t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
    auto       t_dst_info  = TensorInfo(t_dst_shape, 1, data_type);

    const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
    const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
    const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
    const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
    const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };

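    // Build the fused kernel blueprint: register the tensor arguments, then chain the GEMM (native), elementwise add and store components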
    ClKernelBlueprint bp;
    ArgumentID        tid_lhs;
    ArgumentID        tid_rhs;
    ArgumentID        tid_l0_bias = g_arg_placeholder;
    ArgumentID        tid_l1_addend;
    ArgumentID        tid_dst;
    auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
    st      = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
    st      = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
    st      = add_tensor_argument(bp, t_dst_desc, tid_dst);

    const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
    const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k };
    const GEMMKernelInfo       gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
    const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
    const TileDescriptor       store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };

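    // tid_acc is the intermediate accumulator shared by the fused components: the GEMM writes into it, the addition accumulates onto it, and the store component writes it out to the destination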
    ArgumentID tid_acc;
    st = add_tensor_intermed(bp, tid_acc);
    st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
    st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);
    st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);

    ClKernelCode cl_code;

    st = set_tile_info(bp, store_tile_info);
    st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);

    ClExecutionDescriptor exec_desc;
    st = tune_static(exec_desc, cl_code);

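    // Initialise the CL scheduler and configure the composite kernel with the generated kernel code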
    CLScheduler::get().default_init();
    ClCompositeKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);

    // Construct tensors
    CLTensor t_lhs{};
    CLTensor t_rhs{};
    CLTensor t_l1_addend{};
    CLTensor t_dst{};
    // Init tensors
    {
        t_lhs.allocator()->init(t_lhs_info);
        t_rhs.allocator()->init(t_rhs_info);
        t_l1_addend.allocator()->init(t_dst_info);
        t_dst.allocator()->init(t_dst_info);
    }
    // "Pack" tensors
    TensorBinding tensors({ { tid_lhs, &t_lhs },
                            { tid_rhs, &t_rhs },
                            { tid_l1_addend, &t_l1_addend },
                            { tid_dst, &t_dst }
                          });
    // Allocate and fill tensors
    {
        t_lhs.allocator()->allocate();
        t_rhs.allocator()->allocate();
        t_l1_addend.allocator()->allocate();
        t_dst.allocator()->allocate();
        fill<float>(CLAccessor(t_lhs), 0);
        fill<float>(CLAccessor(t_rhs), 1);
        fill<float>(CLAccessor(t_l1_addend), 2);
    }

    CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);

    // Create reference
    SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
    SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };

    // Fill reference
    fill<float>(ref_t_lhs, 0);
    fill<float>(ref_t_rhs, 1);
    fill<float>(ref_t_l1_addend, 2);
    const auto ref_t_dst = reference::arithmetic_operation(
                               ArithmeticOperation::ADD,
                               ref_t_l1_addend,
                               reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
                               data_type,
                               eltwise_add_desc.convert_policy);

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32);
}

TEST_SUITE_END() // Validate

TEST_SUITE(Benchmark)
TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
{
    using std::chrono::duration_cast;
    using std::chrono::microseconds;
    const int num_iterations = 200;
    std::map<std::string, std::chrono::microseconds> measurements;
    /* Computation:
     * out = add(addend, gemm_native(lhs, rhs, bias))
     */
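    // The same computation is timed under three conditions: cond0 (dynamic fusion module), cond2 (static fused kernel using post ops) and cond3 (static unfused kernels); cond1 (dynamic unfused) is listed below but not implemented here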
    const auto         data_type = DataType::F32;
    const unsigned int m         = 12 * 12;
    const unsigned int n         = 64;
    const unsigned int k         = 384;
    const auto         t_lhs_shape   = TensorShape(k, m);
    const auto         t_rhs_shape   = TensorShape(n, k);
    const auto         t_dst_shape   = TensorShape(n, m);
    auto               t_lhs_info    = TensorInfo(t_lhs_shape, 1, data_type);
    auto               t_rhs_info    = TensorInfo(t_rhs_shape, 1, data_type);
    auto               t_bias_info   = TensorInfo(TensorShape(), 1, data_type);
    auto               t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
    auto               t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
    auto               t_dst_info    = TensorInfo(t_dst_shape, 1, data_type);

    const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
    const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
    const GEMMKernelInfo       gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
    const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
    const TileDescriptor       store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };

    // Create reference
    SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
    SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };

    // Fill reference
    fill<float>(ref_t_lhs, 0);
    fill<float>(ref_t_rhs, 1);
    fill<float>(ref_t_l1_addend, 2);
    const auto ref_t_dst = reference::arithmetic_operation(
                               ArithmeticOperation::ADD,
                               ref_t_l1_addend,
                               reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
                               data_type,
                               eltwise_add_desc.convert_policy);

    CLScheduler::get().default_init();

    /* Condition 0: Dynamic Fused Kernel */
    CLTensor cond0_t_dst{};
    {
        TICK(cond0_0_startup_time);

        ClKernelBlueprint bp;
        ArgumentID        tid_lhs;
        ArgumentID        tid_rhs;
        ArgumentID        tid_l0_bias = g_arg_placeholder;
        ArgumentID        tid_l1_addend;
        ArgumentID        tid_dst;

        const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
        const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
        const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
        const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
        const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };

        ClKernelCode cl_code;
        TICK(cond0_build_time)
        auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
        st      = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
        st      = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
        st      = add_tensor_argument(bp, t_dst_desc, tid_dst);

        ArgumentID tid_acc;
        st = add_tensor_intermed(bp, tid_acc);
        st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);

        st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);

        st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);

        st = set_tile_info(bp, store_tile_info);
        st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
        TOCK(cond0_build_time, measurements)

        TICK(cond0_tune_time)
        ClExecutionDescriptor exec_desc;
        st = tune_static(exec_desc, cl_code);
        TOCK(cond0_tune_time, measurements)

        TICK(cond0_configure_time)
        ClCompositeKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
        TOCK(cond0_configure_time, measurements)

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond0_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond0_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } });

        CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
        CLScheduler::get().sync();
        TOCK(cond0_0_startup_time, measurements)

        TICK(cond0_1_latency)
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond0_1_latency, measurements, num_iterations)
    }
    /* Condition 1: Dynamic Unfused Kernel */
    /* Condition 2: Static Fused Kernel (current) */
    CLTensor cond2_t_dst{};
    {
        TICK(cond2_0_startup_time);
        arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;

        TICK(cond2_configure_time);
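        // Fuse the addition into the native GEMM kernel as a post op; the addend is bound as the first post-op argument at enqueue time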
        experimental::PostOpList<ITensorInfo *> post_ops;
        post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(&t_dst_info, 1, eltwise_add_desc.convert_policy);
        GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops };
        l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
                             gemm_native_desc.rhs_info, gemm_info);
        TOCK(cond2_configure_time, measurements);

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond2_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond2_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        ITensorPack tensors
        {
            { ACL_SRC_0, &t_lhs },
            { ACL_SRC_1, &t_rhs },
            { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend },
            { ACL_DST, &cond2_t_dst },
        };
        CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
        CLScheduler::get().sync();
        TOCK(cond2_0_startup_time, measurements);

        TICK(cond2_1_latency);
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond2_1_latency, measurements, num_iterations);
    }
    /* Condition 3: Static Unfused Kernel (current) */
    CLTensor cond3_t_dst{};
    {
        TICK(cond3_0_startup_time);
        arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;
        arm_compute::opencl::kernels::ClSaturatedArithmeticKernel      l1_add;

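        // Run the GEMM and the addition as two separate kernels; t_l0_dst holds the intermediate GEMM result consumed by the add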
        TICK(cond3_configure_time);
        GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
        l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
                             gemm_native_desc.rhs_info, gemm_info);
        l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy);
        TOCK(cond3_configure_time, measurements);

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l0_dst{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l0_dst.allocator()->init(t_l0_dst_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond3_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l0_dst.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond3_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        ITensorPack tensors_l0
        {
            { ACL_SRC_0, &t_lhs },
            { ACL_SRC_1, &t_rhs },
            { ACL_DST, &t_l0_dst },
        };
        ITensorPack tensors_l1
        {
            { ACL_SRC_0, &t_l0_dst },
            { ACL_SRC_1, &t_l1_addend },
            { ACL_DST, &cond3_t_dst },
        };
        CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
        CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
        CLScheduler::get().sync();
        TOCK(cond3_0_startup_time, measurements);

        TICK(cond3_1_latency);
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
            CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond3_1_latency, measurements, num_iterations);
    }

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    std::cout << "cond0 validation: " << std::endl;
    validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32);
    std::cout << "cond2 validation: " << std::endl;
    validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32);
    std::cout << "cond3 validation: " << std::endl;
    validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32);

    /* Report */
    std::cout << "Performance comparison (gemm native + add)" << std::endl;
    std::cout << "cond0: dynamic fusion module" << std::endl;
    std::cout << "cond2: static fused with post ops" << std::endl;
    std::cout << "cond3: static unfused" << std::endl;
    for(auto m : measurements)
    {
        std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
    }
}
TEST_SUITE_END() // Benchmark
TEST_SUITE_END() // ClCompositeKernel
TEST_SUITE_END() // DYNAMIC_FUSION
TEST_SUITE_END() // UNIT
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute

#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)