/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)

#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"

#include "src/core/utils/helpers/float_ops.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
#include "tests/CL/CLAccessor.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/GEMM.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <chrono>

using namespace arm_compute::experimental::dynamic_fusion;

namespace arm_compute
{
namespace test
{
namespace validation
{
namespace
{
/** Macros which measure the wall-clock time and record it in the map measurement_map under the key clock_name */
#define TICK(clock_name) \
    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
#define TOCK(clock_name, measurement_map) \
    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));

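/** Fill the given tensor with random values in [-1, 1] and fill its border with infinity so that invalid accesses show up as NaN (i.e. inf * 0) */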
template <typename T, typename U>
void fill(U &&tensor, int seed)
{
    static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
    using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;

    DistributionType distribution{ T(-1.0f), T(1.0f) };
    library->fill(tensor, distribution, seed);

    // Fill the border with infinity in order to check for the presence of NaN values (i.e. inf * 0)
    DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
    library->fill_borders_with_garbage(tensor, distribution_inf, seed);
}
} // namespace

TEST_SUITE(CL)
TEST_SUITE(UNIT)
TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(ClCompositeKernel)
TEST_SUITE(Validate)

TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
{
    /* Computation:
     * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast)
     */
    const auto data_type   = DataType::F32;
    const auto m           = 5U;
    const auto n           = 4U;
    const auto k           = 3U;
    const auto t_lhs_shape = TensorShape(k, m);
    const auto t_rhs_shape = TensorShape(n, k);
    const auto t_dst_shape = TensorShape(n, m);
    auto       t_lhs_info  = TensorInfo(t_lhs_shape, 1, data_type);
    auto       t_rhs_info  = TensorInfo(t_rhs_shape, 1, data_type);
    auto       t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
    auto       t_dst_info  = TensorInfo(t_dst_shape, 1, data_type);

    const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
    const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
    const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
    const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
    const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };

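    // Build the fused kernel blueprint: register the tensor arguments, then chain the GEMM (native), elementwise add and store components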
    ClKernelBlueprint bp;
    ArgumentID        tid_lhs;
    ArgumentID        tid_rhs;
    ArgumentID        tid_l0_bias = g_arg_placeholder;
    ArgumentID        tid_l1_addend;
    ArgumentID        tid_dst;
    auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
    st      = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
    st      = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
    st      = add_tensor_argument(bp, t_dst_desc, tid_dst);

    const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
    const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k };
    const GEMMKernelInfo       gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
    const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
    const TileDescriptor       store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };

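    // tid_acc is the intermediate accumulator shared by the fused components: the GEMM writes into it, the addition accumulates onto it, and the store component writes it out to the destination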
    ArgumentID tid_acc;
    st = add_tensor_intermed(bp, tid_acc);
    st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
    st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);
    st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);

    ClKernelCode cl_code;

    st = set_tile_info(bp, store_tile_info);
    st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);

    ClExecutionDescriptor exec_desc;
    st = tune_static(exec_desc, cl_code);

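    // Initialise the CL scheduler and configure the composite kernel with the generated kernel code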
    CLScheduler::get().default_init();
    ClCompositeKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);

    // Construct tensors
    CLTensor t_lhs{};
    CLTensor t_rhs{};
    CLTensor t_l1_addend{};
    CLTensor t_dst{};
    // Init tensors
    {
        t_lhs.allocator()->init(t_lhs_info);
        t_rhs.allocator()->init(t_rhs_info);
        t_l1_addend.allocator()->init(t_dst_info);
        t_dst.allocator()->init(t_dst_info);
    }
    // "Pack" tensors
    TensorBinding tensors({ { tid_lhs, &t_lhs },
                            { tid_rhs, &t_rhs },
                            { tid_l1_addend, &t_l1_addend },
                            { tid_dst, &t_dst }
                          });
    // Allocate and fill tensors
    {
        t_lhs.allocator()->allocate();
        t_rhs.allocator()->allocate();
        t_l1_addend.allocator()->allocate();
        t_dst.allocator()->allocate();
        fill<float>(CLAccessor(t_lhs), 0);
        fill<float>(CLAccessor(t_rhs), 1);
        fill<float>(CLAccessor(t_l1_addend), 2);
    }

    CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);

    // Create reference
    SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
    SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };

    // Fill reference
    fill<float>(ref_t_lhs, 0);
    fill<float>(ref_t_rhs, 1);
    fill<float>(ref_t_l1_addend, 2);
    const auto ref_t_dst = reference::arithmetic_operation(
                               ArithmeticOperation::ADD,
                               ref_t_l1_addend,
                               reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
                               data_type,
                               eltwise_add_desc.convert_policy);

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32);
}

TEST_SUITE_END() // Validate

TEST_SUITE(Benchmark)
TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
{
    using std::chrono::duration_cast;
    using std::chrono::microseconds;
    const int num_iterations = 200;
    std::map<std::string, std::chrono::microseconds> measurements;
    /* Computation:
     * out = add(addend, gemm_native(lhs, rhs, bias))
     */
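    // The same computation is timed under three conditions: cond0 (dynamic fusion module), cond2 (static fused kernel using post ops) and cond3 (static unfused kernels); cond1 (dynamic unfused) is listed below but not implemented here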
    const auto         data_type = DataType::F32;
    const unsigned int m         = 12 * 12;
    const unsigned int n         = 64;
    const unsigned int k         = 384;
    const auto         t_lhs_shape   = TensorShape(k, m);
    const auto         t_rhs_shape   = TensorShape(n, k);
    const auto         t_dst_shape   = TensorShape(n, m);
    auto               t_lhs_info    = TensorInfo(t_lhs_shape, 1, data_type);
    auto               t_rhs_info    = TensorInfo(t_rhs_shape, 1, data_type);
    auto               t_bias_info   = TensorInfo(TensorShape(), 1, data_type);
    auto               t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
    auto               t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
    auto               t_dst_info    = TensorInfo(t_dst_shape, 1, data_type);

    const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
    const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
    const GEMMKernelInfo       gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
    const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
    const TileDescriptor       store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };

    // Create reference
    SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
    SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };

    // Fill reference
    fill<float>(ref_t_lhs, 0);
    fill<float>(ref_t_rhs, 1);
    fill<float>(ref_t_l1_addend, 2);
    const auto ref_t_dst = reference::arithmetic_operation(
                               ArithmeticOperation::ADD,
                               ref_t_l1_addend,
                               reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
                               data_type,
                               eltwise_add_desc.convert_policy);

    CLScheduler::get().default_init();

    /* Condition 0: Dynamic Fused Kernel */
    CLTensor cond0_t_dst{};
    {
        TICK(cond0_0_startup_time);

        ClKernelBlueprint bp;
        ArgumentID        tid_lhs;
        ArgumentID        tid_rhs;
        ArgumentID        tid_l0_bias = g_arg_placeholder;
        ArgumentID        tid_l1_addend;
        ArgumentID        tid_dst;

        const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
        const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
        const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
        const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
        const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };

        ClKernelCode cl_code;
        TICK(cond0_build_time)
        auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
        st      = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
        st      = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
        st      = add_tensor_argument(bp, t_dst_desc, tid_dst);

        ArgumentID tid_acc;
        st = add_tensor_intermed(bp, tid_acc);
        st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);

        st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);

        st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);

        st = set_tile_info(bp, store_tile_info);
        st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
        TOCK(cond0_build_time, measurements)

        TICK(cond0_tune_time)
        ClExecutionDescriptor exec_desc;
        st = tune_static(exec_desc, cl_code);
        TOCK(cond0_tune_time, measurements)

        TICK(cond0_configure_time)
        ClCompositeKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
        TOCK(cond0_configure_time, measurements)

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond0_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond0_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } });

        CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
        CLScheduler::get().sync();
        TOCK(cond0_0_startup_time, measurements)

        TICK(cond0_1_latency)
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond0_1_latency, measurements, num_iterations)
    }
    /* Condition 1: Dynamic Unfused Kernel */
    /* Condition 2: Static Fused Kernel (current) */
    CLTensor cond2_t_dst{};
    {
        TICK(cond2_0_startup_time);
        arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;

        TICK(cond2_configure_time);
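        // Fuse the addition into the native GEMM kernel as a post op; the addend is bound as the first post-op argument at enqueue time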
        experimental::PostOpList<ITensorInfo *> post_ops;
        post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(&t_dst_info, 1, eltwise_add_desc.convert_policy);
        GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops };
        l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
                             gemm_native_desc.rhs_info, gemm_info);
        TOCK(cond2_configure_time, measurements);

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond2_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond2_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        ITensorPack tensors
        {
            { ACL_SRC_0, &t_lhs },
            { ACL_SRC_1, &t_rhs },
            { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend },
            { ACL_DST, &cond2_t_dst },
        };
        CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
        CLScheduler::get().sync();
        TOCK(cond2_0_startup_time, measurements);

        TICK(cond2_1_latency);
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond2_1_latency, measurements, num_iterations);
    }
    /* Condition 3: Static Unfused Kernel (current) */
    CLTensor cond3_t_dst{};
    {
        TICK(cond3_0_startup_time);
        arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;
        arm_compute::opencl::kernels::ClSaturatedArithmeticKernel      l1_add;

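        // Run the GEMM and the addition as two separate kernels; t_l0_dst holds the intermediate GEMM result consumed by the add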
        TICK(cond3_configure_time);
        GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
        l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
                             gemm_native_desc.rhs_info, gemm_info);
        l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy);
        TOCK(cond3_configure_time, measurements);

        // Construct tensors
        CLTensor t_lhs{};
        CLTensor t_rhs{};
        CLTensor t_l0_dst{};
        CLTensor t_l1_addend{};

        // Init tensors
        {
            t_lhs.allocator()->init(t_lhs_info);
            t_rhs.allocator()->init(t_rhs_info);
            t_l0_dst.allocator()->init(t_l0_dst_info);
            t_l1_addend.allocator()->init(t_dst_info);
            cond3_t_dst.allocator()->init(t_dst_info);
        }
        // Allocate tensors
        {
            t_lhs.allocator()->allocate();
            t_rhs.allocator()->allocate();
            t_l0_dst.allocator()->allocate();
            t_l1_addend.allocator()->allocate();
            cond3_t_dst.allocator()->allocate();
            fill<float>(CLAccessor(t_lhs), 0);
            fill<float>(CLAccessor(t_rhs), 1);
            fill<float>(CLAccessor(t_l1_addend), 2);
        }

        // "Pack" tensors
        ITensorPack tensors_l0
        {
            { ACL_SRC_0, &t_lhs },
            { ACL_SRC_1, &t_rhs },
            { ACL_DST, &t_l0_dst },
        };
        ITensorPack tensors_l1
        {
            { ACL_SRC_0, &t_l0_dst },
            { ACL_SRC_1, &t_l1_addend },
            { ACL_DST, &cond3_t_dst },
        };
        CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
        CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
        CLScheduler::get().sync();
        TOCK(cond3_0_startup_time, measurements);

        TICK(cond3_1_latency);
        for(int i = 0; i < num_iterations; ++i)
        {
            CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
            CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
        }
        CLScheduler::get().sync();
        TOCK_AVG(cond3_1_latency, measurements, num_iterations);
    }

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    std::cout << "cond0 validation: " << std::endl;
    validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32);
    std::cout << "cond2 validation: " << std::endl;
    validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32);
    std::cout << "cond3 validation: " << std::endl;
    validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32);

    /* Report */
    std::cout << "Performance comparison (gemm native + add)" << std::endl;
    std::cout << "cond0: dynamic fusion module" << std::endl;
    std::cout << "cond2: static fused with post ops" << std::endl;
    std::cout << "cond3: static unfused" << std::endl;
    for(auto m : measurements)
    {
        std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
    }
}
TEST_SUITE_END() // Benchmark
TEST_SUITE_END() // ClCompositeKernel
TEST_SUITE_END() // DYNAMIC_FUSION
TEST_SUITE_END() // UNIT
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute

#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)