Integrate new pretranspose_b_array with extra fused transpose of B. This patch fuses the transposition taking place in Acl with the transformations done in arm_gemm (called pretranspose_b_array) if the underlying kernel and transform support it. This should improve start-up time (as it's for constant Rhs matrices) and memory footprint. The transformations in arm_gemm are kernel specific. The Rhs matrix is transformed into certain layouts to improve the performance. Resolves: COMPMID-6595 Change-Id: Id2932dd966e59f903c279417bebcea83d9a42464 Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11144 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>

commit: ef637398a8c2060e15de438020c53331da8bd6dd [log] [tgz]
author: Gunes Bayir <gunes.bayir@arm.com> Mon Feb 12 21:32:51 2024 +0000
committer: Gunes Bayir <gunes.bayir@arm.com> Wed Feb 21 10:36:22 2024 +0000
tree: b1a1738736c9b6b49e76767e44bf4b77bf732876
parent: 0a48c4c83b598991b4d4235f870c24d9e6634b20 [diff] [blame]
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 436316c..a6c9677 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -221,7 +221,9 @@
         return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
     }
 
-    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+        assert(!transposed);
+
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
         _B_transposed = buffer;
         strategy strat(_ci);
@@ -237,7 +239,7 @@
                     const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
 
                     strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
-                                               x0, xmax, k0, kmax);
+                                               x0, xmax, k0, kmax, false);
 
                     buffer += size;
                 }
commit	ef637398a8c2060e15de438020c53331da8bd6dd	[log] [tgz]
author	Gunes Bayir <gunes.bayir@arm.com>	Mon Feb 12 21:32:51 2024 +0000
committer	Gunes Bayir <gunes.bayir@arm.com>	Wed Feb 21 10:36:22 2024 +0000
tree	b1a1738736c9b6b49e76767e44bf4b77bf732876
parent	0a48c4c83b598991b4d4235f870c24d9e6634b20 [diff] [blame]