COMPMID-1117: TransposeAccessWindow leads to high padding

Switches CLGEMMMatrixMultiplyKernel and CLGEMMTranspose1xWKernel to use
AccessWindowStatic

Change-Id: I21533d4218215d5b8f84b23c603062678eccb1ed
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/130244
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 805a594..2761247 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -126,7 +126,9 @@
         win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
         AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowStatic    input1_access(input1, 0, 0,
+                                            ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                            ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
         AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
         window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index f69a39e..54abb1c 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -80,7 +81,7 @@
     // Configure window in case of configured output
     if(output->total_size() != 0)
     {
-        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+        AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), scale_x), output->dimension(1));
         window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
     }