Optimize CpuReshapeKernel

Select the copy strategy once, at prepare() time, based on the memory
layout of the actual tensors: use a single memcpy over a squashed 1D
window when src and dst are both contiguous, a row-wise memcpy when
both are hole-free in X and share the same row size, and fall back to
the generic element-wise copy otherwise.

Resolves COMPMID-5279

Change-Id: Id9b007eed62c200702bbfcc83b94dab7b5de1714
Signed-off-by: Anitha Raj <anitha.raj@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9962
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index b8d4157..c8c8726 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -46,6 +46,8 @@
    - Add support for input data type U64/S64 in CLCast and NECast.
    - Add support for output data type S64 in NEArgMinMaxLayer and CLArgMinMaxLayer
  - Update OpenCL™ API headers to v2023.04.17.
+ - Performance optimizations:
+   - Optimize @ref CpuReshape
 
 v23.08 Public major release
  - Deprecate the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 326dc96..641d536 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2020-2021 Arm Limited.
+* Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 #define SRC_CORE_HELPERS_UTILS_H
 
 #include "arm_compute/core/ITensorInfo.h"
-
 namespace arm_compute
 {
 /** Create a strides object based on the provided strides and the tensor dimensions.
@@ -38,7 +37,7 @@
  *         calculated based on the tensor shape and the strides of lower dimensions.
  */
 template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixed_strides)
 {
     const TensorShape &shape = info.tensor_shape();
 
@@ -92,6 +91,33 @@
 
     return x;
 }
+
+/** Check if the tensor has any holes.
+ *
+ * @param[in] info      Tensor info object defining the shape of the input tensor.
+ * @param[in] dimension Highest dimension to check.
+ *
+ * @note This function checks for holes in all dimensions up to and including the highest dimension.
+ *
+ * @return true if the tensor has a hole in any of the checked dimensions, false otherwise.
+ */
+inline bool has_holes(const ITensorInfo &info, size_t dimension)
+{
+    const auto &shape          = info.tensor_shape();
+    const auto &strides        = info.strides_in_bytes();
+    size_t      squashed_bytes = info.element_size();
+
+    for(size_t dim = 0; dim <= dimension; ++dim)
+    {
+        if(strides[dim] != squashed_bytes)
+        {
+            return true;
+        }
+        squashed_bytes *= shape[dim];
+    }
+    return false;
+}
+
 } // namespace arm_compute
 
 #endif /* SRC_CORE_HELPERS_UTILS_H */
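
For reference, the helper above flags a hole as soon as any stride up to the
requested dimension exceeds the tightly packed stride implied by the shape and
element size. A standalone sketch of the same check on plain arrays
(hypothetical shapes and strides, not part of the patch):

    #include <cstddef>
    #include <cstdio>

    // Same logic as has_holes() above, but on plain arrays instead of
    // ITensorInfo, so it compiles and runs in isolation.
    static bool has_holes_sketch(const size_t *shape, const size_t *strides_in_bytes,
                                 size_t element_size, size_t dimension)
    {
        size_t squashed_bytes = element_size;
        for(size_t dim = 0; dim <= dimension; ++dim)
        {
            if(strides_in_bytes[dim] != squashed_bytes)
            {
                return true; // stride jumps past the packed layout: a hole
            }
            squashed_bytes *= shape[dim];
        }
        return false;
    }

    int main()
    {
        const size_t shape[]  = { 4, 3 };  // 4x3 F32 tensor
        const size_t packed[] = { 4, 16 }; // tightly packed strides in bytes
        const size_t padded[] = { 4, 20 }; // one element of right padding per row
        std::printf("%d %d\n", has_holes_sketch(shape, packed, 4, 1),  // prints 0
                               has_holes_sketch(shape, padded, 4, 1)); // prints 1
        return 0;
    }
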
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index 068ff07..a9672a8 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,11 +29,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include <cstdint>
 
 /** [NEReshapeLayerKernel Kernel] **/
@@ -61,21 +59,109 @@
     return Status{};
 }
 
+
 template <typename T>
-inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst)
+void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
 {
     const TensorShape &src_shape = src->info()->tensor_shape();
     const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+    Iterator dst_it(dst, window);
+
+    execute_window_loop(window, [&](const Coordinates & dst_coord)
+    {
+        Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+        const auto output_ptr = dst->ptr_to_element(dst_coord);
+        const auto input_ptr  = src->ptr_to_element(src_coord);
+
+        *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+    },
+    dst_it);
+}
+
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
+{
+    switch(src->info()->data_type())
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+            reshape_tensor_per_element<uint8_t>(window, src, dst);
+            break;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::F16:
+            reshape_tensor_per_element<uint16_t>(window, src, dst);
+            break;
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            reshape_tensor_per_element<uint32_t>(window, src, dst);
+            break;
+        case DataType::U64:
+        case DataType::S64:
+        case DataType::F64:
+            reshape_tensor_per_element<uint64_t>(window, src, dst);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type!");
+    }
+}
+
+void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst)
+{
+    const TensorShape &src_shape = src->info()->tensor_shape();
+    const TensorShape &dst_shape = dst->info()->tensor_shape();
+    Coordinates        src_coord{};
     Coordinates        dst_coord{};
 
-    Iterator src_it(src, window);
+    const auto element_size      = dst->info()->element_size();
+    const auto window_start_x    = static_cast<int>(window.x().start());
+    const auto window_end_x      = static_cast<int>(window.x().end());
+    const auto src_row_size      = static_cast<int>(src_shape[0]);
+    const auto row_size_in_bytes = src_row_size * element_size;
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    auto output_ptr = dst->ptr_to_element(dst_coord);
+    auto input_ptr  = src->ptr_to_element(src_coord);
+
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator dst_it(dst, win);
+    execute_window_loop(win, [&](Coordinates & id)
     {
-        dst_coord                                              = index2coords(dst_shape, coords2index(src_shape, id));
-        *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr());
+        dst_coord = id;
+
+        for(int x = window_start_x; x < window_end_x; x += src_row_size)
+        {
+            src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+            output_ptr = dst->ptr_to_element(dst_coord);
+            input_ptr  = src->ptr_to_element(src_coord);
+
+            std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+            dst_coord.increment(Window::DimX, src_row_size);
+        }
     },
-    src_it);
+    dst_it);
+}
+
+void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
+{
+    Iterator src_it(src, window);
+    Iterator dst_it(dst, window);
+
+    const size_t element_size         = dst->info()->element_size();
+    const auto   window_size          = window.x().end() - window.x().start();
+    const auto   window_size_in_bytes = window_size * element_size;
+
+    const auto input_ptr  = src_it.ptr();
+    const auto output_ptr = dst_it.ptr();
+
+    std::memcpy(output_ptr, input_ptr, window_size_in_bytes);
 }
 } // namespace
 
@@ -83,10 +169,11 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-    ARM_COMPUTE_UNUSED(dst);
+    ARM_COMPUTE_UNUSED(src);
 
+    _reshape_tensor_fn = reshape_tensor_per_element_selector;
     // Configure kernel window
-    Window win = calculate_max_window(*src);
+    Window win = calculate_max_window(*dst);
 
     ICpuKernel::configure(win);
 }
@@ -94,7 +181,6 @@
 Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
     return Status{};
 }
 
@@ -106,28 +192,7 @@
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-
-    switch(src->info()->data_type())
-    {
-        case DataType::U8:
-        case DataType::S8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-            reshape_tensor<uint8_t>(window, src, dst);
-            break;
-        case DataType::U16:
-        case DataType::S16:
-        case DataType::F16:
-            reshape_tensor<uint16_t>(window, src, dst);
-            break;
-        case DataType::U32:
-        case DataType::S32:
-        case DataType::F32:
-            reshape_tensor<uint32_t>(window, src, dst);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type!");
-    }
+    _reshape_tensor_fn(window, src, dst);
 }
 
 const char *CpuReshapeKernel::name() const
@@ -143,6 +208,58 @@
     return ICPPKernel::default_mws;
 }
 
+void CpuReshapeKernel::prepare(ITensorPack &tensors)
+{
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+
+    const ITensorInfo* src_info = src->info();
+    const ITensorInfo* dst_info = dst->info();
+
+    // Calculate kernel window based on the padding info
+    Window win;
+
+    const bool src_has_holes      = has_holes(*src_info, src_info->num_dimensions() - 1);
+    const bool dst_has_holes      = has_holes(*dst_info, dst_info->num_dimensions() - 1);
+    const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX);
+    const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX);
+    const auto src_row_size       = static_cast<int>(src_info->tensor_shape()[0]);
+    const auto dst_row_size       = static_cast<int>(dst_info->tensor_shape()[0]);
+
+    if(!src_has_holes && !dst_has_holes)
+    {
+        std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
+        /*
+            Copy the tensor per window. If the src and dst tensors
+            are contiguous memory allocations without any holes or
+            padding, the window is squashed to 1D and a single
+            memcpy call can copy the whole window in
+            reshape_tensor_per_window()
+        */
+        _reshape_tensor_fn = reshape_tensor_per_window;
+    }
+    else
+    {
+        win = calculate_max_window(*dst_info);
+        /*
+            Copy the tensor row by row if src and dst have no holes in
+            the X dimension and their rows have the same number of elements
+        */
+        if (!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size))
+        {
+            _reshape_tensor_fn = reshape_tensor_per_row;
+        }
+        else
+        {
+            /*
+                Fall back to the element-wise copy
+            */
+            _reshape_tensor_fn = reshape_tensor_per_element_selector;
+        }
+    }
+
+    ICPPKernel::configure(win);
+}
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
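
The prepare() logic above boils down to a three-way dispatch on the layout of
the run-time tensors. A condensed sketch of that decision (an editorial
summary; the names mirror the patch but this function is not part of it):

    // Editorial condensation of the dispatch in CpuReshapeKernel::prepare().
    enum class ReshapeStrategy
    {
        PerWindow,  // contiguous src and dst: squash to 1D, one memcpy per window
        PerRow,     // X hole-free on both sides and equal row sizes: memcpy rows
        PerElement  // generic fallback: coordinate translation per element
    };

    static ReshapeStrategy select_strategy(bool src_has_holes, bool dst_has_holes,
                                           bool src_has_holes_in_x, bool dst_has_holes_in_x,
                                           int src_row_size, int dst_row_size)
    {
        if(!src_has_holes && !dst_has_holes)
        {
            return ReshapeStrategy::PerWindow;
        }
        if(!src_has_holes_in_x && !dst_has_holes_in_x && src_row_size == dst_row_size)
        {
            return ReshapeStrategy::PerRow;
        }
        return ReshapeStrategy::PerElement;
    }
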
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index 17302c6..eddbbf7 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,6 +58,13 @@
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
+    /** Prepare the reshape kernel for execution (executed only once) by calculating the max or squashed window and selecting _reshape_tensor_fn based on the presence of holes
+     *
+     * @param[in] tensors Pack of input and output tensors
+     *
+     */
+    void prepare(ITensorPack &tensors);
+
     /** Return minimum workload size of the relevant kernel
      *
      * @param[in] platform     The CPU platform used to create the context.
@@ -66,6 +73,21 @@
      * @return[out] small_network_mws          Minimum workload size for requsted configuration.
      */
     size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+    /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+     *
+     * @return The split dimension.
+     */
+    size_t get_split_dimension() const
+    {
+        return _split_dimension;
+    }
+
+private:
+    size_t _split_dimension{ Window::DimY };
+
+    std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index 79e7b8f..e6892a2 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 
 #include "src/common/utils/Log.h"
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -43,5 +45,17 @@
 {
     return kernels::CpuReshapeKernel::validate(src, dst);
 }
+
+void CpuReshape::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    if(!_is_prepared)
+    {
+        static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
+        _is_prepared = true;
+    }
+    const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension();
+    NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
 } // namespace cpu
 } // namespace arm_compute
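
CpuReshape::run() now follows a prepare-once pattern: the first call inspects
the concrete tensors (padding is only final after allocation) and caches the
copy strategy and window. A hedged usage sketch of this internal operator
(shapes are invented; assumes the usual arm_compute runtime headers):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuReshape.h"
    using namespace arm_compute;

    void reshape_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

        cpu::CpuReshape reshape;
        reshape.configure(src.info(), dst.info());

        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack{ { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
        reshape.run(pack); // first call prepares; later calls reuse the cached window
    }
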
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 92dcb09..9bc43e7 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_RESHAPE_H
 
 #include "src/cpu/ICpuOperator.h"
+#include "arm_compute/core/Window.h"
 
 namespace arm_compute
 {
@@ -47,6 +48,12 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    bool _is_prepared{ false };
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReshapeLayer.cpp b/tests/validation/NEON/ReshapeLayer.cpp
index bf39c39..e9f114d 100644
--- a/tests/validation/NEON/ReshapeLayer.cpp
+++ b/tests/validation/NEON/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,9 @@
 template <typename T>
 using NEReshapeLayerFixture = ReshapeLayerValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
 
+template <typename T>
+using NEReshapeLayerPaddedFixture = ReshapeLayerPaddedValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
@@ -84,8 +87,8 @@
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S8)
@@ -94,7 +97,7 @@
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() //S8
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
@@ -102,11 +105,41 @@
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE(Padded)
+TEST_SUITE(Float)
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
+
+TEST_SUITE(Integer)
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int8_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S8
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+TEST_SUITE_END() //Padded
+
+TEST_SUITE_END() //ReshapeLayer
+TEST_SUITE_END() //NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/ReshapeLayerFixture.h b/tests/validation/fixtures/ReshapeLayerFixture.h
index b4c3a9f..8182350 100644
--- a/tests/validation/fixtures/ReshapeLayerFixture.h
+++ b/tests/validation/fixtures/ReshapeLayerFixture.h
@@ -31,6 +31,7 @@
 #include "tests/IAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/reference/ReshapeLayer.h"
 
 namespace arm_compute
@@ -41,12 +42,12 @@
 {
 /** [ReshapeLayer fixture] **/
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ReshapeLayerValidationFixture : public framework::Fixture
+class ReshapeLayerGenericValidationFixture : public framework::Fixture
 {
 public:
-    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type, bool add_x_padding = false)
     {
-        _target    = compute_target(input_shape, output_shape, data_type);
+        _target    = compute_target(input_shape, output_shape, data_type, add_x_padding);
         _reference = compute_reference(input_shape, output_shape, data_type);
     }
 
@@ -57,7 +58,7 @@
         library->fill_tensor_uniform(tensor, i);
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type, bool add_x_padding = false)
     {
         // Check if indeed the input shape can be reshape to the output one
         ARM_COMPUTE_ASSERT(input_shape.total_size() == output_shape.total_size());
@@ -74,6 +75,12 @@
         ARM_COMPUTE_ASSERT(src.info()->is_resizable());
         ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
+        if(add_x_padding)
+        {
+            // Add random padding in x dimension
+            add_padding_x({ &src, &dst });
+        }
+
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
@@ -104,6 +111,25 @@
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ReshapeLayerValidationFixture : public ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    {
+        ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, output_shape, data_type);
+    }
+};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ReshapeLayerPaddedValidationFixture : public ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    {
+        ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, output_shape, data_type, true /* add_x_padding */);
+    }
+};
 /** [ReshapeLayer fixture] **/
 } // namespace validation
 } // namespace test
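
The padded fixtures matter because padding shows up only in the strides: right
padding in X makes stride_y exceed shape_x * element_size, so has_holes(info,
Window::DimX) becomes true and prepare() must take the element-wise path. A
hypothetical illustration of that effect (values invented; assumes
arm_compute/core/TensorInfo.h):

    #include "arm_compute/core/TensorInfo.h"
    using namespace arm_compute;

    void padding_makes_holes_example()
    {
        // 8x4 F32 tensor: tightly packed strides are { 4, 32 } bytes.
        TensorInfo info(TensorShape(8U, 4U), 1, DataType::F32);

        // Two elements of right padding per row: stride_y becomes 40, which no
        // longer equals shape_x * element_size, i.e. a hole in the X dimension.
        info.extend_padding(PaddingSize(0 /* top */, 2 /* right */, 0 /* bottom */, 0 /* left */));
    }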