Force CL kernel compilation with 64 registers
* For DDK version 30 and higher, force the CL compiler to use
64 registers for NHWC direct convolution.
Resolves: COMPMID-5508
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I7d9ecc3b5a4eceaff44542cd26f6f05e30ab2c1f
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8351
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 81eb748..fce8798 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -270,16 +270,9 @@
ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
}
- if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD)
+ if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
{
- const std::string device_vers = _device.device_version();
- const std::regex ddk_regex("r([0-9]*)p[0-9]");
- std::smatch ddk_match;
-
- if(std::regex_search(device_vers, ddk_match, ddk_regex) && std::stoi(ddk_match[1]) >= 11)
- {
- concat_str += " -DUNROLL_WITH_PRAGMA ";
- }
+ concat_str += " -DUNROLL_WITH_PRAGMA ";
}
std::string build_options = stringify_set(build_options_set, kernel_path) + concat_str;
@@ -392,4 +385,18 @@
{
return _device.compute_units();
}
+
+// Returns N from the first "r<N>p<digit>" token of the device version string, or -1 if none.
+int32_t CLCompileContext::get_ddk_version() const
+{
+    // '+' rather than '*': a digit-less "rp0" must not match and hand std::stoi an empty string.
+    const std::regex  ddk_regex("r([0-9]+)p[0-9]");
+    const std::string device_version = _device.device_version();
+    std::smatch       ddk_match;
+    if(!std::regex_search(device_version, ddk_match, ddk_regex))
+    {
+        return -1; // No release token found; callers comparing with ">=" treat this as "too old".
+    }
+    return std::stoi(ddk_match[1].str());
+}
} // namespace arm_compute