COMPMID-1687: Optimize CLGEMMMatrixMultiplyKernel for Mali-G76 - Part1

The current implementation is limited to FP32 only.

Change-Id: I185ab57e483e879d7c301e9cc3033efc8b41e244
Reviewed-on: https://review.mlplatform.org/389
Reviewed-by: Anthony Barbier <Anthony.barbier@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 54ef23f..e3d8df5 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -192,11 +192,15 @@
         num_elems_processed_per_iteration = 2;
         is_padding_required_nchw          = false;
 
-        // Only the 3x3 case is optimized for NHWC
+        // Only the 3x3 and 9x9 cases are optimized for NHWC
         if(kernel_dims == Size2D(3U, 3U))
         {
             kernel_name = "im2col3x3_";
         }
+        else if(kernel_dims == Size2D(9U, 9U))
+        {
+            kernel_name = "im2col9x9_";
+        }
 
         build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
         build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));