COMPMID-719: NEWinogradLayer reordering using NEPermute.

Input reordering from NCHW to NHWC
Output reordering from NHWC to NCHW
Weights reordering from [Ofm x Ifm x Height x Width] to [Height x Width x Ifm x Ofm]
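
The reorderings are done with NEPermute. As an illustrative sketch of the
permutation vectors involved (tensor names below are placeholders and tensor
setup/allocation is omitted):

    // NCHW -> NHWC: bring the channel dimension (axis 2) to the front
    NEPermute permute_input;
    permute_input.configure(&input_nchw, &input_nhwc, PermutationVector(2U, 0U, 1U));

    // [Ofm x Ifm x Height x Width] -> [Height x Width x Ifm x Ofm] (4D weights)
    NEPermute permute_weights;
    permute_weights.configure(&weights_oihw, &weights_hwio, PermutationVector(3U, 2U, 0U, 1U));

    // NHWC -> NCHW: move the channel dimension back to axis 2
    NEPermute permute_output;
    permute_output.configure(&output_nhwc, &output_nchw, PermutationVector(1U, 2U, 0U));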

Change-Id: I85aabedb1f9c13700bc4919eb3130f4d4bd0b465
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/113631
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index 3251de4..800153e 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -43,7 +43,8 @@
 namespace arm_compute
 {
 NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _weights_workspace(), _workspace(), _kernel_storage(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _permute_input(), _permute_weights(), _permute_output(), _workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
+      _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
 {
 } /* arm_compute */
 
@@ -71,9 +72,8 @@
     ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
 
     // Get convolved dimensions
-    auto      padding     = PADDING_VALID;
-    const int in_channels = input->info()->dimension(2);
-
+    auto      padding        = PADDING_VALID;
+    const int in_channels    = input->info()->dimension(2);
     const int out_channels   = output->info()->dimension(2);
     const int weights_width  = weights->info()->dimension(0);
     const int weights_height = weights->info()->dimension(1);
@@ -88,25 +88,45 @@
     _memory_group.manage(&_kernel_storage);
 
     // Get workbench size and allocate memory
+
     constexpr size_t wspace_alignment = 64;
     const size_t     ws_size          = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
     _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
     _memory_group.manage(&_workspace);
-
-    // Workspace for weights transform
-    const size_t weights_transform_size = NEWinogradLayerKernel::get_kernel_transform_working_size(kernel_shape);
-    _weights_workspace.allocator()->init(TensorInfo(TensorShape{ (weights_transform_size + wspace_alignment - 1) }, 1, DataType::U8));
-    _memory_group.manage(&_weights_workspace);
-
+    _memory_group.manage(&_input_nhwc);
     _kernel_storage.allocator()->allocate();
     _workspace.allocator()->allocate();
-    _weights_workspace.allocator()->allocate();
 
     // Create Winograd operator object
     _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
 
     // Configure the kernel, padding not needed so it's safe to call configure after allocate
-    _winograd_kernel.configure(output, _conv.get());
+    _winograd_kernel.configure(_conv.get());
+
+    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+    switch(weights->info()->num_dimensions())
+    {
+        case 3:
+        {
+            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+            break;
+        }
+        case 4:
+        {
+            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+        }
+    }
+    // Configure the kernel to transform the input tensor from NCHW -> NHWC
+    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+
+    _weights_hwio.allocator()->allocate();
+    _input_nhwc.allocator()->allocate();
 }
 
 void NEWinogradLayer::run()
@@ -115,29 +135,42 @@
     _memory_group.acquire();
     if(!_reshaped_kernel)
     {
-        _conv->transform_weights(reinterpret_cast<const float *>(_weights->buffer()), reinterpret_cast<float *>(_weights_workspace.buffer()));
         _reshaped_kernel = true;
+        _permute_weights.run();
+        _conv->transform_weights(reinterpret_cast<const float *>(_weights_hwio.buffer()), nullptr);
     }
     const Tensor4DShape in_shape(internal_get_input_shape(_input));
     auto                padding = PADDING_VALID;
 
     //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
-    _conv->nchw2nhwc(in_shape, padding, _workspace.buffer(), reinterpret_cast<const float *>(_input->buffer()));
+    _permute_input.run();
 
     //Get ptrs into the workspace
     std::pair<void *, void *> nhwc_ptrs = _conv->get_nhwc_ptrs(in_shape, padding, _workspace.buffer());
 
     //Set up matrix ptrs and transform the input tensor to the appropriate form before running GEMM.
-    _conv->reshape_input(in_shape, padding, nhwc_ptrs.second, _workspace.buffer());
+    _conv->reshape_input(in_shape, padding, reinterpret_cast<float *>(_input_nhwc.buffer()), _workspace.buffer());
 
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    NEScheduler::get().schedule(&_winograd_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_winograd_kernel, Window::DimX);
 
     //Transform the output to the appropriate form
     _conv->reshape_output(in_shape, padding, nhwc_ptrs.first);
 
-    //Transform back to NCHW
-    _conv->nhwc2nchw(in_shape, padding, _workspace.buffer(), reinterpret_cast<float *>(_output->buffer()));
+    const unsigned int out_width    = _output->info()->dimension(0);
+    const unsigned int out_height   = _output->info()->dimension(1);
+    const unsigned int out_channels = _output->info()->dimension(2);
+    const unsigned int out_batches  = _output->info()->dimension(3);
+
+    // We create a temporary tensor over the results in the workspace so that we can run a function to reorder from NHWC -> NCHW
+    Tensor     output_nhwc;
+    TensorInfo info(TensorShape(out_channels, out_width, out_height, out_batches), 1, _output->info()->data_type());
+    output_nhwc.allocator()->init(info);
+    output_nhwc.allocator()->import_memory(Memory(static_cast<uint8_t *>(nhwc_ptrs.first)));
+
+    // Reorder the convolved output to ACL's NCHW ordering
+    _permute_output.configure(&output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+    _permute_output.run();
 
     _memory_group.release();
 #else  /* __aarch64__ */