COMPMID-1706: Fuse the bias addition within CLGEMM

Change-Id: I378f2023f4fa010f195f76716ac07aa86279bfae
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/280
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>

commit: ebc3a90721fe4a41b8e141466894d4d7185c01b7 [log] [tgz]
author: Michele Di Giorgio <michele.digiorgio@arm.com> Fri Nov 16 16:04:25 2018 +0000
committer: Michele Di Giorgio <michele.digiorgio@arm.com> Thu Feb 07 09:44:08 2019 +0000
tree: 9149764caa37edbdc6bb6c69d503d37dbb28449f
parent: 4632e5e44e9a78b15884d0947007bb030fde0aea [diff] [blame]
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index cd40fc6..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp

@@ -160,6 +160,10 @@
     const auto workload   = static_cast<float>((m * n) / 20.0f);
     _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
 
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(_is_interleaved_transposed)
     {
@@ -202,9 +206,8 @@
     if(!_is_new_gemm_reshaped)
     {
         // Configure and tune matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                            mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                            depth_output_gemm3d, reinterpret_input_as_3d),
+        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
                              gemm_info.fp_mixed_precision());
         CLScheduler::get().tune_kernel_static(_mm_kernel);
     }
@@ -220,7 +223,7 @@
     }
 
     // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         _ma_kernel.configure(c, output, beta);
         _run_addition = true;
@@ -284,6 +287,10 @@
     const auto workload             = static_cast<float>((m * n) / 20.0f);
     const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
 
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(run_interleave_transpose)
     {
@@ -328,10 +335,11 @@
     if(!is_new_gemm_reshaped)
     {
         // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
     }
 
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
commit	ebc3a90721fe4a41b8e141466894d4d7185c01b7	[log] [tgz]
author	Michele Di Giorgio <michele.digiorgio@arm.com>	Fri Nov 16 16:04:25 2018 +0000
committer	Michele Di Giorgio <michele.digiorgio@arm.com>	Thu Feb 07 09:44:08 2019 +0000
tree	9149764caa37edbdc6bb6c69d503d37dbb28449f
parent	4632e5e44e9a78b15884d0947007bb030fde0aea [diff] [blame]