COMPMID-1182: printf doesn't work

Change-Id: I013d57f6e2becbd6d2d7700ce5fbbeca670443c4
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/133735
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 21a0e68..7f1667a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -741,6 +741,11 @@
     }
     std::string concat_str;
 
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+    // Enable debug properties in CL kernels
+    concat_str += " -DARM_COMPUTE_DEBUG_ENABLED";
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+
     if(fp16_supported())
     {
         concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index a67e5b2..0ea3254 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -38,7 +38,6 @@
 GraphManager::GraphManager()
     : _workloads()
 {
-    detail::default_initialize_backends();
 }
 
 void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index 7f2be67..b235c3a 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -66,7 +66,7 @@
 static const std::string tuner_data_filename = "acl_tuner.csv";
 
 CLDeviceBackend::CLDeviceBackend()
-    : _tuner(), _allocator(cl::Context::getDefault())
+    : _initialized(false), _tuner(), _allocator(nullptr)
 {
 }
 
@@ -96,11 +96,18 @@
     CLScheduler::get().default_init(&_tuner);
 
     // Create allocator with new context
-    _allocator = CLBufferAllocator();
+    _allocator = support::cpp14::make_unique<CLBufferAllocator>();
 }
 
 void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
 {
+    // Force backend initialization
+    if(!_initialized)
+    {
+        initialize_backend();
+        _initialized = true;
+    }
+
     // Setup tuner
     set_kernel_tuning(ctx.config().use_tuner);
 
@@ -124,7 +131,7 @@
 
 IAllocator *CLDeviceBackend::backend_allocator()
 {
-    return &_allocator;
+    return _allocator.get();
 }
 
 std::unique_ptr<ITensorHandle> CLDeviceBackend::create_tensor(const Tensor &tensor)
@@ -180,7 +187,7 @@
     auto pool_mgr     = std::make_shared<PoolManager>();
     auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
 
-    mm->set_allocator(&_allocator);
+    mm->set_allocator(_allocator.get());
 
     return mm;
 }
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
index 770cca5..bfac31a 100644
--- a/src/graph/backends/GLES/GCDeviceBackend.cpp
+++ b/src/graph/backends/GLES/GCDeviceBackend.cpp
@@ -53,7 +53,7 @@
 static detail::BackendRegistrar<GCDeviceBackend> GCDeviceBackend_registrar(Target::GC);
 
 GCDeviceBackend::GCDeviceBackend()
-    : _allocator()
+    : _initialized(false), _allocator()
 {
 }
 
@@ -65,6 +65,13 @@
 
 void GCDeviceBackend::setup_backend_context(GraphContext &ctx)
 {
+    // Force backend initialization
+    if(!_initialized)
+    {
+        initialize_backend();
+        _initialized = true;
+    }
+
     // Setup a management backend
     if(ctx.memory_management_ctx(Target::GC) == nullptr)
     {
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index c370fdf..d68092a 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -35,14 +35,6 @@
 {
 namespace detail
 {
-void default_initialize_backends()
-{
-    for(const auto &backend : backends::BackendRegistry::get().backends())
-    {
-        backend.second->initialize_backend();
-    }
-}
-
 void validate_all_nodes(Graph &g)
 {
     auto &nodes = g.nodes();
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 534c4f9..bbc513d 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -61,7 +61,7 @@
 
 void CLMemory::create_empty_region()
 {
-    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
     _region       = _region_owned.get();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index fdae615..c348dfa 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -28,6 +28,16 @@
 
 using namespace arm_compute;
 
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+namespace
+{
+void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
+{ // Writes the first len characters of buffer to stdout; complete and user_data are not used here.
+    printf("%.*s", static_cast<int>(len), buffer); // the '*' precision argument of printf must be an int
+}
+} // namespace
+#endif /* defined(ARM_COMPUTE_DEBUG_ENABLED) */
+
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
@@ -42,6 +52,44 @@
     return scheduler;
 }
 
+// Initialise the scheduler with the default CL context and device; if already initialised, only the tuner is replaced.
+void CLScheduler::default_init(ICLTuner *cl_tuner)
+{
+    if(!_is_initialised)
+    {
+        cl::Context ctx              = cl::Context::getDefault();
+        auto        queue_properties = cl::CommandQueue::getDefault().getInfo<CL_QUEUE_PROPERTIES>(nullptr);
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+        // Query the default platform's devices for cl_arm_printf support
+        std::vector<cl::Device> def_platform_devices;
+        cl::Platform::getDefault().getDevices(CL_DEVICE_TYPE_DEFAULT, &def_platform_devices);
+
+        if(!def_platform_devices.empty() && device_supports_extension(def_platform_devices[0], "cl_arm_printf"))
+        {
+            // Replace the default context with one that has a printf_callback and a user specified buffer size.
+            cl_context_properties properties[] =
+            {
+                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(cl::Platform::get()()),
+                // Enable a printf callback function for this context.
+                CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
+                // Request a minimum printf buffer size of 4KB (0x1000 bytes) for devices in the context that support this extension.
+                CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
+                0
+            };
+            ctx = cl::Context(CL_DEVICE_TYPE_DEFAULT, properties);
+        }
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+
+        cl::CommandQueue queue = cl::CommandQueue(ctx, cl::Device::getDefault(), queue_properties);
+        CLKernelLibrary::get().init("./cl_kernels/", ctx, cl::Device::getDefault());
+        init(ctx, queue, cl::Device::getDefault(), cl_tuner);
+    }
+    else
+    {
+        _cl_tuner = cl_tuner;
+    }
+}
+
 void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,