COMPMID-3560: Fix F16 performance regression (OpenCL)

The performance regression was caused by a change in the interface of
the OpenCL kernels gemm_mm_reshaped_lhs_*

Change-Id: I030df4975dc040886c17e71710a27137b50edd9b
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3465
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>

commit: e5563d9b0102846973f144cba42fb9002bebd09b [log] [tgz]
author: Gian Marco Iodice <gianmarco.iodice@arm.com> Thu Jun 25 17:18:36 2020 +0100
committer: Gian Marco Iodice <gianmarco.iodice@arm.com> Fri Jun 26 10:15:10 2020 +0000
tree: 3ede792d30aad726a81b371e34bae16f30f5d81c
parent: 6cb26ce7ff35e0c9b634160603560feeb23b0cee [diff] [blame]
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index ba1c8a9..22bde63 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp

@@ -225,7 +225,7 @@
 
 CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), _add_bias(false),
-      _broadcast_bias(false), _export_to_cl_image(false)
+      _broadcast_bias(false), _export_to_cl_image(false), _k(1)
 {
 }
 
@@ -254,6 +254,7 @@
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = gemm_info.broadcast_bias;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
+    _k                        = gemm_info.k;
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -435,6 +436,9 @@
         // Output buffer
         add_2D_tensor_argument(idx, _output, slice);
 
+        // K dimension (not used if _export_to_cl_image == true)
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+
         // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
commit	e5563d9b0102846973f144cba42fb9002bebd09b	[log] [tgz]
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	Thu Jun 25 17:18:36 2020 +0100
committer	Gian Marco Iodice <gianmarco.iodice@arm.com>	Fri Jun 26 10:15:10 2020 +0000
tree	3ede792d30aad726a81b371e34bae16f30f5d81c
parent	6cb26ce7ff35e0c9b634160603560feeb23b0cee [diff] [blame]