Force CL kernel compilation with 64 registers

* For DDK version 30 and higher, force the CL compiler to use
  64 registers for NHWC direct convolution.

Resolves: COMPMID-5508
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I7d9ecc3b5a4eceaff44542cd26f6f05e30ab2c1f
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8351
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 81eb748..fce8798 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -270,16 +270,9 @@
         ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
     }
 
-    if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD)
+    if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
     {
-        const std::string device_vers = _device.device_version();
-        const std::regex  ddk_regex("r([0-9]*)p[0-9]");
-        std::smatch       ddk_match;
-
-        if(std::regex_search(device_vers, ddk_match, ddk_regex) && std::stoi(ddk_match[1]) >= 11)
-        {
-            concat_str += " -DUNROLL_WITH_PRAGMA ";
-        }
+        concat_str += " -DUNROLL_WITH_PRAGMA ";
     }
 
     std::string build_options = stringify_set(build_options_set, kernel_path) + concat_str;
@@ -392,4 +385,29 @@
 {
     return _device.compute_units();
 }
+
+/** Extract the DDK release number from the device version string.
+ *
+ * Searches the string returned by _device.device_version() for the first
+ * token of the form "rNpM" (N: 1 to 9 decimal digits, M: a decimal digit).
+ *
+ * @return The release number N, or -1 if the version string contains no such token.
+ */
+int32_t CLCompileContext::get_ddk_version() const
+{
+    const std::string device_version = _device.device_version();
+    // Require at least one digit after 'r': with "[0-9]*" a stray "rp" followed by a digit
+    // (e.g. inside "Corp1") matches with an empty capture and std::stoi("") throws.
+    // Capping the run at 9 digits keeps the value within int32_t so std::stoi cannot overflow.
+    const std::regex  ddk_regex("r([0-9]{1,9})p[0-9]");
+    std::smatch       ddk_match;
+
+    if(std::regex_search(device_version, ddk_match, ddk_regex))
+    {
+        return std::stoi(ddk_match[1]);
+    }
+
+    // No DDK token found: -1 is below any real release number, so checks such as ">= 11" fail.
+    return -1;
+}
 } // namespace arm_compute