COMPMID-3322: Add cl_image support for GEMMReshapedOnlyRHS NT
COMPMID-3323: Add cl_image support for GEMMReshapedOnlyRHS T

- Added support for cl_image in CLGEMMMatrixMultiplyReshapedInlyRHSKernel (both NT and T kernels)
- Extended the tests for the validating rhs_info.export_to_cl_image = true
- Updated doxygen documentation in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h

Change-Id: If253794323aac072d84a4d8680b9a2339ab7ad92
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3437
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index 22bde63..d6ee0b0 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -38,6 +38,7 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLUtils.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
@@ -380,29 +381,14 @@
 
     const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
 
-    cl_mem      cl_image;
-    cl_int      err = CL_SUCCESS;
     cl::Image2D input1_image2d;
 
     if(_export_to_cl_image)
     {
-        // Create OpenCL image object from OpenCL buffer
-        const cl_image_format format = { CL_RGBA, CL_FLOAT };
+        const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
+        const size_t      image_row_pitch = _input1->info()->strides_in_bytes()[1];
 
-        cl_image_desc desc;
-        memset(&desc, 0, sizeof(desc));
-        desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
-        desc.mem_object      = _input1->cl_buffer()();
-        desc.image_row_pitch = _input1->info()->strides_in_bytes()[1];
-        desc.image_width     = _input1->info()->dimension(0) / 4;
-        desc.image_height    = _input1->info()->dimension(1) * _input1->info()->dimension(2);
-
-        cl_image = clCreateImage(CLKernelLibrary::get().context()(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
-
-        ARM_COMPUTE_UNUSED(err);
-        ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer");
-
-        input1_image2d = cl::Image2D(cl_image);
+        input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch);
     }
 
     do
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 8e194d5..deeb491 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLUtils.h"
 #include "support/StringSupport.h"
 
 #include <tuple>
@@ -65,6 +66,23 @@
                                     "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
 
+    if(rhs_info.export_to_cl_image)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() != DataType::F32, "Export to cl_image only supported with F32 data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment");
+
+        // Check the width and height of the output tensor.
+        // Since we cannot create a 3d image from a buffer, the third dimension is collapsed with the second dimension
+        size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+        size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->tensor_shape()[1] * input1->tensor_shape()[2] > max_image_h, "Not supported height for cl_image");
+    }
+
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
     const unsigned int k = gemm_info.k;
@@ -204,7 +222,7 @@
 
 CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false)
+      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false)
 {
 }
 
@@ -234,6 +252,7 @@
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = gemm_info.broadcast_bias;
+    _export_to_cl_image       = rhs_info.export_to_cl_image;
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -276,6 +295,8 @@
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
     build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
@@ -289,6 +310,7 @@
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
+    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -358,36 +380,17 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 4;
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 3;
-        }
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
+    const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom;
+    const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom;
 
-    if(_reinterpret_output_as_3d)
+    cl::Image2D input1_image2d;
+
+    if(_export_to_cl_image)
     {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        }
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+        const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
+        const size_t      image_row_pitch = _input1->info()->strides_in_bytes()[1];
+
+        input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch);
     }
 
     do
@@ -401,17 +404,53 @@
         }
 
         unsigned int idx = 0;
+
+        // LHS buffer
         add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument_if((_add_bias), idx, _input2, slice);
+
+        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+        if(_export_to_cl_image)
+        {
+            _kernel.setArg(idx++, input1_image2d);
+        }
+        else
+        {
+            add_2D_tensor_argument(idx, _input1, slice_b);
+        }
+
+        // Bias buffer (_add_bias == true)
+        add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
+
+        // Output buffer
         add_2D_tensor_argument(idx, _output, slice);
+
+        // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+
+        // RHS stride_z (not used if _export_to_cl_image == true)
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+
+        // Bias stride_z (if _add_bias == true)
         if(_add_bias)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
         }
+
+        // Output stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+
+        // Cross-plan padding (if _reinterpret_input_as_3d = true)
+        if(_reinterpret_input_as_3d)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
+        }
+
+        // Cross-plan padding (if _reinterpret_output_as_3d = true)
+        if(_reinterpret_output_as_3d)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
+        }
+
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));