[ONCPUML-970] Fast math mode for fixed format kernels

Minor tweaks and test for running fixed format kernels with BF16 operations when specified by the user.

Change-Id: Ic8167f67b86b1298da65e46cfebed9f3b86940e4
Signed-off-by: Milos Puzovic <milos.puzovic@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8000
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>

commit: 93581a524a8e66ed29ace892bc5cb297287802af [log] [tgz]
author: Pablo Marquez Tello <pablo.tello@arm.com> Thu Jul 21 13:55:27 2022 +0100
committer: Gunes Bayir <gunes.bayir@arm.com> Thu Aug 04 12:56:07 2022 +0000
tree: c585e366b4504f29f10326b0b34768574741eb82
parent: f67903b8ab8205b47f0ee2c27aeca8bed405c58e [diff] [blame]
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index df02d64..77da830 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp

@@ -25,6 +25,7 @@
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/AssemblyUtils.h"
 #include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
@@ -507,14 +508,20 @@
             const TensorShape tensor_shape    = tensor_info->tensor_shape();
             const int         tensor_height   = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
             const int         tensor_width    = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
-            const int         tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
+            int               tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
             const int         interleave_by   = arm_compute::interleave_by(wf);
+            const int         blocked_by      = arm_compute::block_by(wf);
             // We need to find a new stride that is distance from the data for one
             // set of output channels to the next
             if(ldb == tensor_channels && multi_stride_b == tensor_channels * tensor_width)
             {
                 // In this case dimensions that are packed are height, width and channel
                 // so we need to stride it by interleave_by
+                if(tensor_channels % blocked_by != 0)
+                {
+                    // We need to pad
+                    tensor_channels = arm_gemm::iceildiv(tensor_channels, blocked_by) * blocked_by;
+                }
                 ldb = interleave_by * tensor_height * tensor_width * tensor_channels;
             }
             else if(multi_stride_b == 0 || (ldb == tensor_width && multi_stride_b == tensor_height * tensor_width))
commit	93581a524a8e66ed29ace892bc5cb297287802af	[log] [tgz]
author	Pablo Marquez Tello <pablo.tello@arm.com>	Thu Jul 21 13:55:27 2022 +0100
committer	Gunes Bayir <gunes.bayir@arm.com>	Thu Aug 04 12:56:07 2022 +0000
tree	c585e366b4504f29f10326b0b34768574741eb82
parent	f67903b8ab8205b47f0ee2c27aeca8bed405c58e [diff] [blame]