COMPMID-1043: Rework GCGEMMMatrixMultiplyKernel interface and allow auto initialization of the tensors

This patch also:
- removes support for already reshaped weights in GCConvolutionLayer
- makes GCConvolutionLayer similar to CLGEMMConvolutionLayer
- uses the GCGEMM function in GCConvolutionLayer instead of calling the GEMM
  kernels directly

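The reworked kernel interface and the output auto-initialization can be
exercised roughly as below. This is a minimal, hypothetical sketch (tensor
names, shapes and the surrounding runtime setup are illustrative and assume
the usual GCTensor/GCScheduler helpers; only the configure()/validate()
signatures are taken from this patch):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
    #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
    #include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"

    using namespace arm_compute;

    void gemm_mm_example()
    {
        GCScheduler::get().default_init(); // needs a valid EGL/GLES context

        // A: 32x64 (MxK), B: 64x16 (KxN), both FP32 and not reshaped.
        GCTensor a{}, b{}, dst{};
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
        // dst is deliberately left empty: configure() auto-initializes it.

        // m/n/k are only inspected by the kernel when is_interleaved_transposed is true.
        const GEMMReshapeInfo reshape_info(32 /*m*/, 16 /*n*/, 64 /*k*/);

        // Static validation first; no GPU resources are touched here.
        ARM_COMPUTE_ERROR_THROW_ON(GCGEMMMatrixMultiplyKernel::validate(
            a.info(), b.info(), dst.info(), 1.f, false /*is_interleaved_transposed*/,
            reshape_info, GPUTarget::UNKNOWN));

        GCGEMMMatrixMultiplyKernel mm_kernel;
        mm_kernel.configure(&a, &b, &dst, 1.f, false /*is_interleaved_transposed*/, reshape_info);

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate(); // shape/type were auto-initialized by configure()

        // ... map and fill a and b, then dispatch the kernel:
        GCScheduler::get().dispatch(mm_kernel);
    }

Splitting validate() from configure() mirrors the CL kernels and lets callers
such as GCGEMM check shapes up front without allocating GPU resources.
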
Change-Id: I3e4a64335555e86e18585d38d8fda4bfdb44e265
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127696
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a5f09e8..b4bb547 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,37 +31,180 @@
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
 #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <set>
 #include <string>
 
 using namespace arm_compute;
 using namespace arm_compute::gles_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+{
+    ARM_COMPUTE_UNUSED(reshape_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+    if(!is_interleaved_transposed)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+            ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+        }
+    }
+    else
+    {
+        const int m                         = reshape_info.m();
+        const int n                         = reshape_info.n();
+        const int k                         = reshape_info.k();
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
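+        // Reconstruct the un-reshaped MxK and KxN shapes from reshape_info, re-derive the expected
+        // interleaved/transpose1xW shapes and check them against the shapes of the reshaped inputs.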
+        TensorShape tensor_shape0{ input0->tensor_shape() };
+        tensor_shape0.set(0, k);
+        tensor_shape0.set(1, m);
+
+        TensorShape tensor_shape1{ input1->tensor_shape() };
+        tensor_shape1.set(0, n);
+        tensor_shape1.set(1, k);
+
+        const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+        const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+        }
+    }
+
+    return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
+                                                               bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+                                                               GPUTarget gpu_target, ElementsProcessed &num_elements_processed)
+{
+    ARM_COMPUTE_UNUSED(gpu_target);
+
+    // Output tensor auto initialization if not yet initialized
+    TensorShape tensor_shape{ input0->tensor_shape() };
+    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0));
+    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1));
+
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape));
+
+    bool   window_changed = false;
+    Window win{};
+
+    const DataType data_type                           = input0->data_type();
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+    if(is_interleaved_transposed)
+    {
+        // Configure kernel window
+        num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
+        num_elems_processed_per_iteration_y = 4;
+
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    else // The input tensors have not been reshaped
+    {
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+
+        switch(data_type)
+        {
+            case DataType::F16:
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+                break;
+
+            case DataType::F32:
+                num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowStatic    input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic    input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr)
 {
 }
 
-void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed)
+void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
-    if(!is_interleaved_transposed)
-    {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
-    }
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
     _input0 = input0;
     _input1 = input1;
     _output = output;
 
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, GPUTarget::UNKNOWN, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IGCKernel::configure(win_config.second);
+
+    // Create build options
     std::set<std::string> build_opts;
+    std::string           kernel_name;
-    Window                win;
 
     build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
@@ -74,6 +217,12 @@
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
     if(is_interleaved_transposed)
     {
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width));
+        build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height));
+
         switch(input0->info()->data_type())
         {
             case DataType::F16:
@@ -91,56 +240,20 @@
 
         build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
 
-        // Create kernel
-        _kernel = GCKernelLibrary::get().create_kernel(("gemm_mm_interleaved_transposed"), build_opts);
-
-        // Configure window kernel
-        const unsigned int     num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
-        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
-        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-        update_window_and_padding(win, input0_access, input1_access, output_access);
-
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+        kernel_name = "gemm_mm_interleaved_transposed";
     }
     else
     {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
-
         // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
-        unsigned int num_elems_processed_per_iteration_x;
-        unsigned int num_elems_processed_per_iteration_y;
 
         switch(input0->info()->data_type())
         {
             case DataType::F16:
                 build_opts.emplace("#define DATA_TYPE_FP16");
-
-#define MM_PROCESS_4X_OPTIMIZED
-
-#if defined(MM_PROCESS_4X)
-                num_elems_processed_per_iteration_x = 4;
-                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-                build_opts.emplace("#define MM_PROCESS_4X");
-#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
-                num_elems_processed_per_iteration_x = 4;
-                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
                 build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
-#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
-                num_elems_processed_per_iteration_x = 8;
-                num_elems_processed_per_iteration_y = 1;
-                build_opts.emplace("#define MM_PROCESS_8X");
-#endif                                 /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
-                num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
-                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
                 build_opts.emplace("#define DATA_TYPE_FP32");
                 break;
 
@@ -150,31 +263,31 @@
         }
 
         build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
-        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x));
-        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));
 
-        // Create kernel
-        _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts);
-
-        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-#if defined(MM_PROCESS_4X_OPTIMIZED)
-        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
-#else  /* MM_PROCESS_4X_OPTIMIZED */
-        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
-                                         num_elems_processed_per_iteration_y));
-#endif /* MM_PROCESS_4X_OPTIMIZED */
-        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-        update_window_and_padding(win, input0_access, input1_access, output_access);
-
-        Coordinates coord;
-        coord.set_num_dimensions(output->info()->num_dimensions());
-        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+        kernel_name = "gemm_mm_floating_point";
     }
 
-    IGCKernel::configure(win);
+    // Create kernel
+    _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
+}
+
+Status GCGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
+                                            const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
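+    // Pass clones so that validate_and_configure_window() can auto-initialize the output
+    // without modifying the caller's tensor infos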
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              is_interleaved_transposed,
+                                                              reshape_info,
+                                                              gpu_target,
+                                                              num_elements_processed)
+                                .first);
+    return Status{};
 }
 
 void GCGEMMMatrixMultiplyKernel::run(const Window &window)