Avoid over-allocation of temporary buffers within CpuWinogradConv2d

The sizes reported by the Winograd transform kernels are byte counts,
so drop the division by sizeof(T) in the storage/working-space getters
and back the kernel workspaces with plain U8 tensors. Request the
transformed input/output buffers and their workspaces with Temporary
lifetime instead of Persistent so their backing memory does not stay
allocated between runs, keep the permuted weights only for the prepare
stage, and configure the intermediate NHWC output tensor only when the
data layout is NCHW. CpuAuxTensorHandler now accepts a bypass_alloc
flag to wrap a tensor without allocating backing memory and logs when
an auxiliary allocation actually takes place.

Resolves: COMPMID-4716

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ie036d2bb7a243301a62f089b3920ebee0f409190
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6028
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
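
Illustration only, not part of the patch: the new bypass_alloc flag is
meant for callers that want a CpuAuxTensorHandler to wrap a buffer that
is managed elsewhere without triggering an allocation. A minimal sketch
of how run() could create such a handler, assuming the tensor pack
already carries memory for the InputTransformed slot:

    // Sketch: wrap the transformed-input buffer described by
    // _input_transformed. With bypass_alloc == true the handler does
    // not allocate backing memory itself, even if the pack does not
    // already provide a large-enough tensor for this slot;
    // pack_inject == true exposes the handler's tensor to downstream
    // kernels through the pack.
    CpuAuxTensorHandler input_transformed(offset_int_vec(InputTransformed),
                                          _input_transformed, tensors,
                                          true /* pack_inject */,
                                          true /* bypass_alloc */);
    ITensorPack gemm_pack = tensors;
    gemm_pack.add_const_tensor(ACL_SRC_0, input_transformed.get());
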
diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
index 74b031b..5620d36 100644
--- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
+++ b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -195,8 +195,7 @@
 {
     const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels);
     return static_cast<unsigned int>(
-               // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
-               WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T));
+               WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels));
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -298,14 +297,13 @@
     // Construct shapes for the input and kernel tensors.
     const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels);
     const KernelShape   kern_shape(1, KernelRows, KernelCols, num_channels);
-    // Return the size, converted into units of TIn
-    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T));
+    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding));
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 unsigned int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
 {
-    return _transform->get_working_space_size(num_threads) / sizeof(T);
+    return _transform->get_working_space_size(num_threads);
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -434,9 +432,8 @@
     // Construct shapes for the input and kernel tensors.
     const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1);
     const KernelShape   kern_shape(num_output_channels, KernelRows, KernelCols, 1);
-    // Return the size, converted into units of TOut
     return static_cast<unsigned int>(
-               WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T));
+               WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels));
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -448,7 +445,7 @@
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 unsigned int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
 {
-    return _transform->get_working_space_size(num_threads) / sizeof(T);
+    return _transform->get_working_space_size(num_threads);
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
index a734e17..ca7b004 100644
--- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
@@ -549,12 +549,6 @@
     _kernel_storage     = b_info;
     _output_transformed = d_info;
 
-    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-    TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
-                                dst->dimension(1), dst->dimension(3)),
-                    1, dst->data_type());
-    _output_nhwc = info;
-
     const ITensorInfo *input_to_use  = src;
     ITensorInfo       *output_to_use = dst;
     PermutationVector  weights_permutation_vector(3U, 0U, 1U, 2U);
@@ -573,7 +567,7 @@
     transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
                                       &_input_transformed, input_matrix_stride, &_input_workspace);
     const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
-    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, src->data_type());
+    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
     _input_workspace = input_workspace_info;
 
     // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
@@ -587,6 +581,11 @@
     // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
     if(_data_layout == DataLayout::NCHW)
     {
+        // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+        TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
+                                    dst->dimension(1), dst->dimension(3)),
+                        1, dst->data_type());
+        _output_nhwc  = info;
         output_to_use = &_output_nhwc;
     }
     const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
@@ -603,7 +602,7 @@
                                        activation);
 
     const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
-    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, dst->data_type());
+    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
     _output_workspace = output_workspace_info;
 
     // Reorder the convoluted output to ACL's ordering NCHW
@@ -631,20 +630,12 @@
     _aux_mem[TransposedRHS]  = asm_mem_req[TransposedRHS];
     _aux_mem[TempResult]     = asm_mem_req[TempResult];
 
-    _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Persistent, input_storage_size, storage_alignment);
-    _aux_mem[InputWorkspace]   = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Persistent, input_workspace_size);
-    if(_aux_mem[Pretranspose].size > 0)
-    {
-        // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
-    }
-    else
-    {
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, _weights_hwio.total_size());
-    }
+    _aux_mem[InputTransformed]   = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
+    _aux_mem[InputWorkspace]     = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Temporary, input_workspace_size);
+    _aux_mem[PermutedWeights]    = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
     _aux_mem[WeightsTransformed] = MemoryInfo(offset_int_vec(WeightsTransformed), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
-    _aux_mem[OutputTransformed]  = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Persistent, output_storage_size, storage_alignment);
-    _aux_mem[OutputWorkspace]    = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Persistent, output_workspace_size);
+    _aux_mem[OutputTransformed]  = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
+    _aux_mem[OutputWorkspace]    = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Temporary, output_workspace_size);
 }
 
 Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
@@ -829,10 +820,7 @@
         ITensorPack         transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
         NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
 
-        CpuAuxTensorHandler input_transformed(offset_int_vec(InputTransformed), _input_transformed, tensors, true);
-        CpuAuxTensorHandler output_transformed(offset_int_vec(OutputTransformed), _output_transformed, tensors, true);
-        ITensorPack         gemm_pack = tensors;
-        gemm_pack.add_const_tensor(ACL_SRC_0, input_transformed.get());
+        ITensorPack gemm_pack = tensors;
         gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
         _gemm_function->prepare(gemm_pack);
 
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
index 0d1c927..ae1cffb 100644
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -38,7 +39,7 @@
 class CpuAuxTensorHandler
 {
 public:
-    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
+    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
         : _tensor()
     {
         if(info.total_size() == 0)
@@ -50,7 +51,12 @@
         ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
         if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
         {
-            _tensor.allocator()->allocate();
+            if(!bypass_alloc)
+            {
+                _tensor.allocator()->allocate();
+                ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+            }
+
             if(pack_inject)
             {
                 pack.add_tensor(slot_id, &_tensor);