COMPMID-708 Fix AccessWindowTranspose

Change-Id: I68f65b6dea7889d71b4a10021f59e6f0ab82903b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145590
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 3c45ab3..70235a2 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -53,7 +53,10 @@
     // the kernel to write back output values.
     // As the relation between input and output is transposed window.y() is
     // used for x anchor and window.x() for y anchor.
-    anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+    if(_info->dimension(0) > 1)
+    {
+        anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+    }
     anchor.set(1, std::max<int>(window.x().start() * _scale_y, anchor[0] + border_size.left) + _y);
 
     // End of the valid region is equal to the start of the last write of the
@@ -66,8 +69,11 @@
     // a size of the region.
     // As the relation between input and output is transposed window.y() is
     // used for x shape and window.x() for y shape.
-    shape.set(0, std::min<int>((old_anchor[1] + old_shape[1]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
-    shape.set(1, std::min<int>((old_anchor[0] + old_shape[0]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+    if(_info->dimension(0) > 1)
+    {
+        shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+    }
+    shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
 
     // For higher dimensions use the intersection of the window size and the
     // valid region of the input
@@ -192,9 +198,9 @@
     ARM_COMPUTE_ERROR_ON(window.x().step() == 0);
 
     const int min_x = window.y().start() * _scale_x + _x;
-    const int max_x = window.y().end() * _scale_x + _x;
+    const int max_x = (window.y().end() - window.y().step()) * _scale_x + _x + _width;
     const int min_y = window.x().start() * _scale_y + _y;
-    const int max_y = window.x().end() * _scale_y + _y;
+    const int max_y = (window.x().end() - window.x().step()) * _scale_y + _y + _height;
 
     const TensorShape &shape = _info->tensor_shape();
 
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 8530ed2..a3cf18a 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 5b29905..aa1b92a 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 695bdf7..ccf22ea 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
@@ -86,9 +87,7 @@
 
     if(output->total_size() != 0)
     {
-        // TODO (COMPMID-708): Replace AccessWindowStatic with AccessWindowTranspose
-        AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration), ceil_to_multiple(output->dimension(1),
-                                         num_elems_processed_per_iteration));
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
 
         window_changed = window_changed || update_window_and_padding(win, output_access);
 
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 09e4acd..94b438c 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 92ee8d5..e8fb8cd 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 2d17c23..921582a 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 22a2cf8..77ab5ad 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 0ca2474..f182fb2 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 4d3ec46..46b7913 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 32a5acd..870d2c9 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -101,14 +102,12 @@
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    AccessWindowStatic input_access(input, 0, 0, input->dimension(0), input->dimension(1));
-
-    bool window_changed = update_window_and_padding(win, input_access);
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    bool                  window_changed = update_window_and_padding(win, input_access);
 
     if(output->total_size() != 0)
     {
-        // TODO (COMPMID-708): Replace AccessWindowStatic with AccessWindowTranspose
-        AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration_y, num_elems_processed_per_iteration_x);
 
         window_changed = window_changed || update_window_and_padding(win, output_access);
 
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 57bb9d0..4d75a16 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -45,7 +45,10 @@
     Small1DShapes()
         : ShapeDataset("Shape",
     {
-        TensorShape{ 256U }
+        TensorShape{ 128U },
+                     TensorShape{ 256U },
+                     TensorShape{ 512U },
+                     TensorShape{ 1024U }
     })
     {
     }
diff --git a/tests/validation/NEON/Transpose.cpp b/tests/validation/NEON/Transpose.cpp
index f2ef716..f7c5280 100644
--- a/tests/validation/NEON/Transpose.cpp
+++ b/tests/validation/NEON/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,19 +48,19 @@
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
     framework::dataset::make("InputInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::U8),  // Input not a multiple of 8
                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::U16), // Invalid shape
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::U32),
+                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::U32), // Window shrink
                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::U8),  // Wrong data type
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::U16),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::U32),
+                                            TensorInfo(TensorShape(20U, 16U), 1, DataType::U16),
+                                            TensorInfo(TensorShape(20U, 16U), 1, DataType::U32),
                                           }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(13U, 21U), 1, DataType::U8),
                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::U16),
                                             TensorInfo(TensorShape(13U, 20U), 1, DataType::U32),
                                             TensorInfo(TensorShape(31U, 20U), 1, DataType::U16),
-                                            TensorInfo(TensorShape(13U, 20U), 1, DataType::U16),
-                                            TensorInfo(TensorShape(13U, 20U), 1, DataType::U32),
+                                            TensorInfo(TensorShape(16U, 20U), 1, DataType::U16),
+                                            TensorInfo(TensorShape(16U, 20U), 1, DataType::U32),
                                            })),
-    framework::dataset::make("Expected", { true, false, true, false, true, true })),
+    framework::dataset::make("Expected", { false, false, false, false, true, true })),
     a_info, output_info, expected)
 {
     // Lock tensors
@@ -90,9 +90,17 @@
     validate(dst.info()->valid_region(), valid_region);
 
     // Validate padding
-    const PaddingSize padding(0, 0);
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
+    const unsigned int num_elems_processed_per_iteration_x = 1;
+    const unsigned int num_elems_processed_per_iteration_y = std::max(4, static_cast<int>(8 / src.info()->element_size()));
+    const unsigned int max_in_x                            = ceil_to_multiple(shape[0], num_elems_processed_per_iteration_x);
+    const unsigned int max_in_y                            = ceil_to_multiple(shape[1], num_elems_processed_per_iteration_y);
+    const unsigned int max_out_x                           = ceil_to_multiple(output_shape[0], num_elems_processed_per_iteration_y);
+    const unsigned int max_out_y                           = ceil_to_multiple(output_shape[1], num_elems_processed_per_iteration_x);
+
+    const PaddingSize in_padding(0, max_in_x - shape[0], max_in_y - shape[1], 0);
+    const PaddingSize out_padding(0, max_out_x - output_shape[0], max_out_y - output_shape[1], 0);
+    validate(src.info()->padding(), in_padding);
+    validate(dst.info()->padding(), out_padding);
 }
 
 template <typename T>