COMPMID-2443: CL examples use the program cache by default.

Change-Id: I9db5cf4ce98e86f7488f4041f0d0247d3d0cd663
Signed-off-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1528
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: VidhyaSudhan Loganathan <vidhyasudhan.loganathan@arm.com>
diff --git a/examples/cl_cache.cpp b/examples/cl_cache.cpp
index 87a3058..998c468 100644
--- a/examples/cl_cache.cpp
+++ b/examples/cl_cache.cpp
@@ -35,69 +35,6 @@
 
 namespace
 {
-/** This function loads prebuilt opencl kernels from a file
- *
- * @param[in] filename Name of the file to be used to load the kernels
- */
-void restore_program_cache_from_file(const std::string &filename = "cache.bin")
-{
-    std::cout << "Loading kernels from file " << filename << std::endl;
-    std::ifstream cache_file(filename, std::ios::binary);
-    if(cache_file.is_open())
-    {
-        while(!cache_file.eof())
-        {
-            size_t name_len   = 0;
-            size_t binary_len = 0;
-            cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t));
-            cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t));
-            if(name_len == 0 || binary_len == 0)
-            {
-                break;
-            }
-            std::vector<char>          tmp(name_len);
-            std::vector<unsigned char> binary(binary_len);
-            std::string                name;
-            cache_file.read(tmp.data(), name_len);
-            name.assign(tmp.data(), name_len);
-            tmp.resize(binary_len);
-            cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len);
-            cl::Context             context = arm_compute::CLScheduler::get().context();
-            cl::Program::Binaries   binaries{ binary };
-            std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
-            cl::Program             program(context, devices, binaries);
-            program.build();
-            CLKernelLibrary::get().add_built_program(name, program);
-        }
-        cache_file.close();
-    }
-}
-
-/** This function saves opencl kernels library to a file
- *
- * @param[in] filename Name of the file to be used to save the library
- */
-void save_program_cache_to_file(const std::string &filename = "cache.bin")
-{
-    std::cout << "Saving opencl kernels to " << filename << std::endl;
-    std::ofstream cache_file(filename, std::ios::binary);
-    if(cache_file.is_open())
-    {
-        for(const auto &it : CLKernelLibrary::get().get_built_programs())
-        {
-            std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>();
-            ARM_COMPUTE_ERROR_ON(binaries.size() != 1);
-            const std::string kernel_name      = it.first;
-            size_t            kernel_name_size = kernel_name.length();
-            size_t            binary_size      = binaries[0].size();
-            cache_file.write(reinterpret_cast<char *>(&kernel_name_size), sizeof(size_t));
-            cache_file.write(reinterpret_cast<char *>(&binary_size), sizeof(size_t));
-            cache_file.write(kernel_name.c_str(), kernel_name_size);
-            cache_file.write(reinterpret_cast<const char *>(binaries[0].data()), binaries[0].size());
-        }
-        cache_file.close();
-    }
-}
 } // namespace
 
 class CLCacheExample : public Example
diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp
index a785dea..f8b25a1 100644
--- a/examples/graph_alexnet.cpp
+++ b/examples/graph_alexnet.cpp
@@ -27,6 +27,8 @@
 #include "utils/GraphUtils.h"
 #include "utils/Utils.h"
 
+#include <chrono>
+
 using namespace arm_compute::utils;
 using namespace arm_compute::graph::frontend;
 using namespace arm_compute::graph_utils;
@@ -148,13 +150,34 @@
 
         // Finalize graph
         GraphConfig config;
+
         config.num_threads = common_params.threads;
         config.use_tuner   = common_params.enable_tuner;
         config.tuner_mode  = common_params.tuner_mode;
         config.tuner_file  = common_params.tuner_file;
 
+        const auto config_start_time = std::chrono::high_resolution_clock::now();
+
+        // Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed
+        // compilation won't be required.
+        if(common_params.enable_cl_cache)
+        {
+            restore_program_cache_from_file();
+        }
+
         graph.finalize(common_params.target, config);
 
+        const auto config_end_time = std::chrono::high_resolution_clock::now();
+        const auto time_elapsed    = config_end_time - config_start_time;
+        const auto time_elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(time_elapsed).count();
+        std::cout << "Configuration time " << time_elapsed_ms << " ms " << std::endl;
+
+        // Save the opencl kernels to a file; gated on the same resolved flag as the restore above
+        if(common_params.enable_cl_cache)
+        {
+            save_program_cache_to_file();
+        }
+
         return true;
     }
     void do_run() override
diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp
index 3ea2b2f..15fd049 100644
--- a/examples/graph_inception_v4.cpp
+++ b/examples/graph_inception_v4.cpp
@@ -27,6 +27,8 @@
 #include "utils/GraphUtils.h"
 #include "utils/Utils.h"
 
+#include <chrono>
+
 using namespace arm_compute::utils;
 using namespace arm_compute::graph::frontend;
 using namespace arm_compute::graph_utils;
@@ -154,8 +156,28 @@
         config.tuner_mode  = common_params.tuner_mode;
         config.tuner_file  = common_params.tuner_file;
 
+        const auto config_start_time = std::chrono::high_resolution_clock::now();
+
+        // Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed
+        // compilation won't be required.
+        if(common_params.enable_cl_cache)
+        {
+            restore_program_cache_from_file();
+        }
+
         graph.finalize(common_params.target, config);
 
+        const auto config_end_time = std::chrono::high_resolution_clock::now();
+        const auto time_elapsed    = config_end_time - config_start_time;
+        const auto time_elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(time_elapsed).count();
+        std::cout << "Configuration time " << time_elapsed_ms << " ms " << std::endl;
+
+        // Save the opencl kernels to a file; gated on the same resolved flag as the restore above
+        if(common_params.enable_cl_cache)
+        {
+            save_program_cache_to_file();
+        }
+
         return true;
     }
 
diff --git a/utils/CommonGraphOptions.cpp b/utils/CommonGraphOptions.cpp
index e2ca98a..280ad4e 100644
--- a/utils/CommonGraphOptions.cpp
+++ b/utils/CommonGraphOptions.cpp
@@ -83,6 +83,7 @@
     os << "Data type : " << common_params.data_type << std::endl;
     os << "Data layout : " << common_params.data_layout << std::endl;
     os << "Tuner enabled? : " << (common_params.enable_tuner ? true_str : false_str) << std::endl;
+    os << "Cache enabled? : " << (common_params.enable_cl_cache ? true_str : false_str) << std::endl;
     os << "Tuner mode : " << common_params.tuner_mode << std::endl;
     os << "Tuner file : " << common_params.tuner_file << std::endl;
     os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::Enabled ? true_str : false_str) << std::endl;
@@ -118,6 +119,7 @@
       data_type(),
       data_layout(),
       enable_tuner(parser.add_option<ToggleOption>("enable-tuner")),
+      enable_cl_cache(parser.add_option<ToggleOption>("enable-cl-cache")),
       tuner_mode(),
       fast_math_hint(parser.add_option<ToggleOption>("fast-math")),
       data_path(parser.add_option<SimpleOption<std::string>>("data")),
@@ -166,6 +168,7 @@
     data_type->set_help("Data type to use");
     data_layout->set_help("Data layout to use");
     enable_tuner->set_help("Enable OpenCL dynamic tuner");
+    enable_cl_cache->set_help("Enable OpenCL program caches");
     tuner_mode->set_help("Configures the time taken by the tuner to tune. Slow tuner produces the most performant LWS configuration");
     fast_math_hint->set_help("Enable fast math");
     data_path->set_help("Path where graph parameters reside");
@@ -192,6 +195,7 @@
         common_params.data_layout = options.data_layout->value();
     }
     common_params.enable_tuner           = options.enable_tuner->is_set() ? options.enable_tuner->value() : false;
+    common_params.enable_cl_cache        = common_params.target == arm_compute::graph::Target::CL ? (options.enable_cl_cache->is_set() ? options.enable_cl_cache->value() : true) : false;
     common_params.tuner_mode             = options.tuner_mode->value();
     common_params.fast_math_hint         = options.fast_math_hint->is_set() ? fast_math_hint_value : FastMathHint::Disabled;
     common_params.data_path              = options.data_path->value();
diff --git a/utils/CommonGraphOptions.h b/utils/CommonGraphOptions.h
index 826cca1..3666462 100644
--- a/utils/CommonGraphOptions.h
+++ b/utils/CommonGraphOptions.h
@@ -44,6 +44,7 @@
  * --type             : Data type to be used by the examples. Supported data type options: QASYMM8, F16, F32.
  * --layout           : Data layout to be used by the examples. Supported data layout options : NCHW, NHWC.
  * --enable-tuner     : Toggle option to enable the OpenCL dynamic tuner.
+ * --enable-cl-cache  : Toggle option to load the prebuilt opencl kernels from a cache file and save them back after configuration. Enabled by default on the CL target.
  * --fast-math        : Toggle option to enable the fast math option.
  * --data             : Path that contains the trainable parameter files of graph layers.
  * --image            : Image to load and operate on. Image types supported: PPM, JPEG, NPY.
@@ -94,6 +95,7 @@
     arm_compute::DataType            data_type{ DataType::F32 };
     arm_compute::DataLayout          data_layout{ DataLayout::NHWC };
     bool                             enable_tuner{ false };
+    bool                             enable_cl_cache{ false };
     arm_compute::CLTunerMode         tuner_mode{ CLTunerMode::NORMAL };
     arm_compute::graph::FastMathHint fast_math_hint{ arm_compute::graph::FastMathHint::Disabled };
     std::string                      data_path{};
@@ -149,6 +151,7 @@
     EnumOption<arm_compute::DataType>      *data_type;        /**< Graph data type */
     EnumOption<arm_compute::DataLayout>    *data_layout;      /**< Graph data layout */
     ToggleOption                           *enable_tuner;     /**< Enable tuner */
+    ToggleOption                           *enable_cl_cache;  /**< Enable opencl kernels cache */
     SimpleOption<arm_compute::CLTunerMode> *tuner_mode;       /**< Tuner mode */
     ToggleOption                           *fast_math_hint;   /**< Fast math hint */
     SimpleOption<std::string>              *data_path;        /**< Trainable parameters path */
diff --git a/utils/Utils.cpp b/utils/Utils.cpp
index 1d08676..47ec259 100644
--- a/utils/Utils.cpp
+++ b/utils/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,8 @@
  */
 #include "Utils.h"
 
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include <cctype>
 #include <cerrno>
 #include <iomanip>
@@ -267,5 +269,79 @@
     // Nothing found or an error during opening the file
     return 0;
 }
+
+/** Load prebuilt OpenCL program binaries from a file into the CLKernelLibrary
+ *
+ * A file that cannot be opened is ignored; loading stops at the first empty or truncated record.
+ *
+ * @param[in] filename Name of the file to be used to load the kernels
+ */
+void restore_program_cache_from_file(const std::string &filename)
+{
+    std::ifstream cache_file(filename, std::ios::binary);
+    if(cache_file.is_open())
+    {
+        if(!CLScheduler::get().is_initialised())
+        {
+            arm_compute::CLScheduler::get().default_init();
+        }
+        // Hoisted out of the loop: the loop body never reconfigures the scheduler
+        cl::Context                   context = arm_compute::CLScheduler::get().context();
+        const std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+        while(cache_file.good())
+        {
+            size_t name_len   = 0;
+            size_t binary_len = 0;
+            cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t));
+            cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t));
+            // A short read leaves the stream failed: stop instead of trusting a partially read header
+            if(!cache_file || name_len == 0 || binary_len == 0)
+            {
+                break;
+            }
+            std::string                name(name_len, '\0');
+            std::vector<unsigned char> binary(binary_len);
+            if(!cache_file.read(&name[0], name_len) || !cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len))
+            {
+                break; // Truncated payload: never build a program from partial data
+            }
+            cl::Program program(context, devices, cl::Program::Binaries{ binary });
+            program.build();
+            CLKernelLibrary::get().add_built_program(name, program);
+        }
+    }
+}
+
+/** Write every program built by the CLKernelLibrary to a file as [name length][binary length][name][binary] records
+ *
+ * @param[in] filename Name of the file to be used to save the library
+ */
+void save_program_cache_to_file(const std::string &filename)
+{
+    if(!CLScheduler::get().is_initialised())
+    {
+        return; // CL scheduler was never initialised: nothing is written
+    }
+    std::ofstream out(filename, std::ios::binary);
+    if(!out.is_open())
+    {
+        ARM_COMPUTE_ERROR("Cannot open cache file");
+    }
+    else
+    {
+        for(const auto &built : CLKernelLibrary::get().get_built_programs())
+        {
+            const std::vector<std::vector<unsigned char>> blobs = built.second.getInfo<CL_PROGRAM_BINARIES>();
+            ARM_COMPUTE_ERROR_ON(blobs.size() != 1);
+            const size_t name_bytes = built.first.length();
+            const size_t blob_bytes = blobs[0].size();
+            out.write(reinterpret_cast<const char *>(&name_bytes), sizeof(size_t));
+            out.write(reinterpret_cast<const char *>(&blob_bytes), sizeof(size_t));
+            out.write(built.first.c_str(), name_bytes);
+            out.write(reinterpret_cast<const char *>(blobs[0].data()), blob_bytes);
+        }
+        out.close();
+    }
+}
 } // namespace utils
 } // namespace arm_compute
diff --git a/utils/Utils.h b/utils/Utils.h
index eec6972..ba10d7c 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -823,6 +823,18 @@
 
     return num_mismatches;
 }
+
+/** Save the programs built so far by the CLKernelLibrary to a binary file (nothing is written if the CL scheduler is not initialised)
+ *
+ * @param[in] filename Name of the file to be used to save the library. Defaults to "cache.bin"; failure to open it is reported through ARM_COMPUTE_ERROR
+ */
+void save_program_cache_to_file(const std::string &filename = "cache.bin");
+
+/** Load prebuilt OpenCL programs from a binary file, build them and register them with the CLKernelLibrary
+ *
+ * @param[in] filename Name of the file to be used to load the kernels. Defaults to "cache.bin"; a file that cannot be opened is silently ignored
+ */
+void restore_program_cache_from_file(const std::string &filename = "cache.bin");
 } // namespace utils
 } // namespace arm_compute
 #endif /* __UTILS_UTILS_H__*/