Force CL kernel compilation with 64 registers

* For DDK version 30 and higher, force the CL compiler to use
  64 registers for NHWC direct convolution.

Resolves: COMPMID-5508
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I7d9ecc3b5a4eceaff44542cd26f6f05e30ab2c1f
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8351
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index c4b70ca..722c802 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -292,6 +292,11 @@
             build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
             build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
         }
+
+        if(compile_context.get_ddk_version() >= 30)
+        {
+            build_options.add_option("-fregister-allocation=64");
+        }
     }
     else
     {