Revert "Revert "IVGCVSW-6700 Enable import aligned host memory in android-nn-driver""

This reverts commit 8069603dc44b7673b356f66517cd8b25af8080f0.

 * Reason for revert: Retry enabling import of aligned host memory in android-nn-driver
 * Added a check to ArmnnDriverImpl.cpp so that ExecuteWithDummyInputs is only run when GpuAcc is in the backend list
 * Added new android-nn-driver DriverOptions (-I/--enableImport, -E/--enableExport) to enable / disable Import and Export (see the example below)
 * Import is disabled by default for now due to conv2d issues
 * Export is enabled by default
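 * Example (a sketch only: it assumes the service entry point forwards its
   command line to DriverOptions, and the service binary name is deliberately
   omitted). The new cxxopts flags can be set explicitly as
       <nn-driver-service> --enableImport=true --enableExport=false
   or via the short forms -I and -E.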

!armnn:7147

Change-Id: I91110c58ebb3931d1c458e3774944e55c1250dd8
Signed-off-by: David Monahan <David.Monahan@arm.com>
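
For reviewers, the non-async execution change is the same in every
ArmnnPreparedModel* variant; in outline (a commented paraphrase of the
hunks below, not extra code to be applied):

    // Empty id vectors mean EnqueueWorkload copies the corresponding
    // tensors as before, instead of using the caller's buffers in place.
    std::vector<armnn::ImportedInputId> importedInputIds;
    if (m_EnableImport)
    {
        // Zero-copy import of the malloc-backed input buffers.
        importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors,
                                                   armnn::MemorySource::Malloc);
    }
    std::vector<armnn::ImportedOutputId> importedOutputIds;
    if (m_EnableExport)
    {
        // Likewise for the output buffers ("export" in the driver options).
        importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors,
                                                     armnn::MemorySource::Malloc);
    }
    status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
                                        importedInputIds, importedOutputIds);
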
diff --git a/1.2/ArmnnDriverImpl.cpp b/1.2/ArmnnDriverImpl.cpp
index 3274a8a..1c31384 100644
--- a/1.2/ArmnnDriverImpl.cpp
+++ b/1.2/ArmnnDriverImpl.cpp
@@ -267,7 +267,9 @@
                     options.GetRequestInputsAndOutputsDumpDir(),
                     options.IsGpuProfilingEnabled(),
                     options.isAsyncModelExecutionEnabled(),
-                    options.getNoOfArmnnThreads()));
+                    options.getNoOfArmnnThreads(),
+                    options.isImportEnabled(),
+                    options.isExportEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
@@ -630,6 +632,8 @@
                     options.IsGpuProfilingEnabled(),
                     options.isAsyncModelExecutionEnabled(),
                     options.getNoOfArmnnThreads(),
+                    options.isImportEnabled(),
+                    options.isExportEnabled(),
                     true));
 
     NotifyCallbackAndCheck(cb, V1_0::ErrorStatus::NONE, preparedModel.release());
diff --git a/1.3/ArmnnDriverImpl.cpp b/1.3/ArmnnDriverImpl.cpp
index c8b1d96..474e1c1 100644
--- a/1.3/ArmnnDriverImpl.cpp
+++ b/1.3/ArmnnDriverImpl.cpp
@@ -281,7 +281,9 @@
                     options.IsGpuProfilingEnabled(),
                     priority,
                     options.isAsyncModelExecutionEnabled(),
-                    options.getNoOfArmnnThreads()));
+                    options.getNoOfArmnnThreads(),
+                    options.isImportEnabled(),
+                    options.isExportEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
@@ -645,6 +647,8 @@
                                                            V1_3::Priority::MEDIUM,
                                                            options.isAsyncModelExecutionEnabled(),
                                                            options.getNoOfArmnnThreads(),
+                                                           options.isImportEnabled(),
+                                                           options.isExportEnabled(),
                                                            true));
 
     NotifyCallbackAndCheck(cb, V1_3::ErrorStatus::NONE, preparedModel.release());
diff --git a/ArmnnDriverImpl.cpp b/ArmnnDriverImpl.cpp
index 0b3b919..89fa54f 100644
--- a/ArmnnDriverImpl.cpp
+++ b/ArmnnDriverImpl.cpp
@@ -202,30 +202,36 @@
                     options.GetRequestInputsAndOutputsDumpDir(),
                     options.IsGpuProfilingEnabled(),
                     options.isAsyncModelExecutionEnabled(),
-                    options.getNoOfArmnnThreads()));
+                    options.getNoOfArmnnThreads(),
+                    options.isImportEnabled(),
+                    options.isExportEnabled()));
 
-    // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
-    // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
-    if (!preparedModel->ExecuteWithDummyInputs())
+    if (std::find(options.GetBackends().begin(),
+                  options.GetBackends().end(),
+                  armnn::Compute::GpuAcc) != options.GetBackends().end())
     {
-        return FailPrepareModel(V1_0::ErrorStatus::GENERAL_FAILURE, "Network could not be executed", cb);
-    }
-
-    if (clTunedParameters &&
-        options.GetClTunedParametersMode() == armnn::IGpuAccTunedParameters::Mode::UpdateTunedParameters)
-    {
-        // Now that we've done one inference the CL kernel parameters will have been tuned, so save the updated file.
-        try
+        // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
+        // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
+        if (!preparedModel->ExecuteWithDummyInputs())
         {
-            clTunedParameters->Save(options.GetClTunedParametersFile().c_str());
+            return FailPrepareModel(V1_0::ErrorStatus::GENERAL_FAILURE, "Network could not be executed", cb);
         }
-        catch (std::exception& error)
+
+        if (clTunedParameters &&
+            options.GetClTunedParametersMode() == armnn::IGpuAccTunedParameters::Mode::UpdateTunedParameters)
         {
-            ALOGE("ArmnnDriverImpl::prepareModel: Failed to save CL tuned parameters file '%s': %s",
-                  options.GetClTunedParametersFile().c_str(), error.what());
+            // Now that we've done one inference the CL kernel parameters will have been tuned, so save the updated file
+            try
+            {
+                clTunedParameters->Save(options.GetClTunedParametersFile().c_str());
+            }
+            catch (std::exception& error)
+            {
+                ALOGE("ArmnnDriverImpl::prepareModel: Failed to save CL tuned parameters file '%s': %s",
+                      options.GetClTunedParametersFile().c_str(), error.what());
+            }
         }
     }
-
     NotifyCallbackAndCheck(cb, V1_0::ErrorStatus::NONE, preparedModel);
 
     return V1_0::ErrorStatus::NONE;
diff --git a/ArmnnPreparedModel.cpp b/ArmnnPreparedModel.cpp
index 38f1bc2..326351c 100644
--- a/ArmnnPreparedModel.cpp
+++ b/ArmnnPreparedModel.cpp
@@ -8,6 +8,8 @@
 #include "ArmnnPreparedModel.hpp"
 #include "Utils.hpp"
 
+#include <armnn/Types.hpp>
+
 #include <log/log.h>
 #include <OperationsUtils.h>
 #include <ValidateHal.h>
@@ -116,7 +118,9 @@
                                                    const std::string& requestInputsAndOutputsDumpDir,
                                                    const bool gpuProfilingEnabled,
                                                    const bool asyncModelExecutionEnabled,
-                                                   const unsigned int numberOfThreads)
+                                                   const unsigned int numberOfThreads,
+                                                   const bool importEnabled,
+                                                   const bool exportEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -124,6 +128,8 @@
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
 {
     // Enable profiling if required.
     m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
@@ -308,7 +314,19 @@
         else
         {
             ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false");
-            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+            std::vector<armnn::ImportedInputId> importedInputIds;
+            if (m_EnableImport)
+            {
+                importedInputIds =  m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+            }
+            std::vector<armnn::ImportedOutputId> importedOutputIds;
+            if (m_EnableExport)
+            {
+                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+            }
+            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                                importedInputIds, importedOutputIds);
         }
 
         if (status != armnn::Status::Success)
@@ -389,7 +407,19 @@
         else
         {
             ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false");
-            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+            std::vector<armnn::ImportedInputId> importedInputIds;
+            if (m_EnableImport)
+            {
+                importedInputIds =  m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+            }
+            std::vector<armnn::ImportedOutputId> importedOutputIds;
+            if (m_EnableExport)
+            {
+                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+            }
+            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                                importedInputIds, importedOutputIds);
         }
         if (status != armnn::Status::Success)
         {
diff --git a/ArmnnPreparedModel.hpp b/ArmnnPreparedModel.hpp
index 685d950..0d19c07 100644
--- a/ArmnnPreparedModel.hpp
+++ b/ArmnnPreparedModel.hpp
@@ -41,7 +41,9 @@
                        const std::string& requestInputsAndOutputsDumpDir,
                        const bool gpuProfilingEnabled,
                        const bool asyncModelExecutionEnabled = false,
-                       const unsigned int numberOfThreads = 1);
+                       const unsigned int numberOfThreads = 1,
+                       const bool importEnabled = false,
+                       const bool exportEnabled = true);
 
     virtual ~ArmnnPreparedModel();
 
@@ -111,6 +113,8 @@
     static std::unique_ptr<armnn::Threadpool> m_Threadpool;
     std::shared_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
     const bool m_AsyncModelExecutionEnabled;
+    const bool m_EnableImport;
+    const bool m_EnableExport;
 };
 
 }
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp
index 7f35e60..37bc3a4 100644
--- a/ArmnnPreparedModel_1_2.cpp
+++ b/ArmnnPreparedModel_1_2.cpp
@@ -9,6 +9,8 @@
 
 #include "Utils.hpp"
 
+#include <armnn/Types.hpp>
+
 #include <log/log.h>
 #include <OperationsUtils.h>
 #include <ExecutionBurstServer.h>
@@ -151,7 +153,9 @@
                                                            const std::string& requestInputsAndOutputsDumpDir,
                                                            const bool gpuProfilingEnabled,
                                                            const bool asyncModelExecutionEnabled,
-                                                           const unsigned int numberOfThreads)
+                                                           const unsigned int numberOfThreads,
+                                                           const bool importEnabled,
+                                                           const bool exportEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -159,6 +163,8 @@
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(false)
 {
     // Enable profiling if required.
@@ -192,6 +198,8 @@
                                                            const bool gpuProfilingEnabled,
                                                            const bool asyncModelExecutionEnabled,
                                                            const unsigned int numberOfThreads,
+                                                           const bool importEnabled,
+                                                           const bool exportEnabled,
                                                            const bool preparedFromCache)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
@@ -199,6 +207,8 @@
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(preparedFromCache)
 {
     // Enable profiling if required.
@@ -531,7 +541,20 @@
         else
         {
             ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false");
-            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+
+            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+            std::vector<armnn::ImportedInputId> importedInputIds;
+            if (m_EnableImport)
+            {
+                importedInputIds =  m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+            }
+            std::vector<armnn::ImportedOutputId> importedOutputIds;
+            if (m_EnableExport)
+            {
+                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+            }
+            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                                importedInputIds, importedOutputIds);
         }
 
         if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
diff --git a/ArmnnPreparedModel_1_2.hpp b/ArmnnPreparedModel_1_2.hpp
index 255fc18..6afcbbe 100644
--- a/ArmnnPreparedModel_1_2.hpp
+++ b/ArmnnPreparedModel_1_2.hpp
@@ -47,7 +47,9 @@
                            const std::string& requestInputsAndOutputsDumpDir,
                            const bool gpuProfilingEnabled,
                            const bool asyncModelExecutionEnabled = false,
-                           const unsigned int numberOfThreads = 1);
+                           const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true);
 
     ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                            armnn::IRuntime* runtime,
@@ -55,6 +57,8 @@
                            const bool gpuProfilingEnabled,
                            const bool asyncModelExecutionEnabled = false,
                            const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true,
                            const bool preparedFromCache = false);
 
     virtual ~ArmnnPreparedModel_1_2();
@@ -164,6 +168,8 @@
     static std::unique_ptr<armnn::Threadpool> m_Threadpool;
     std::shared_ptr<IWorkingMemHandle>        m_WorkingMemHandle;
     const bool                                m_AsyncModelExecutionEnabled;
+    const bool                                m_EnableImport;
+    const bool                                m_EnableExport;
     const bool                                m_PreparedFromCache;
 };
 
diff --git a/ArmnnPreparedModel_1_3.cpp b/ArmnnPreparedModel_1_3.cpp
index 36575b8..34c42ec 100644
--- a/ArmnnPreparedModel_1_3.cpp
+++ b/ArmnnPreparedModel_1_3.cpp
@@ -12,6 +12,8 @@
 #include "ArmnnPreparedModel_1_3.hpp"
 #include "Utils.hpp"
 
+#include <armnn/Types.hpp>
+
 #include <Utils.h>
 #include <android/sync.h>
 #include <log/log.h>
@@ -172,7 +174,9 @@
                                                            const bool gpuProfilingEnabled,
                                                            V1_3::Priority priority,
                                                            const bool asyncModelExecutionEnabled,
-                                                           const unsigned int numberOfThreads)
+                                                           const unsigned int numberOfThreads,
+                                                           const bool importEnabled,
+                                                           const bool exportEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -181,6 +185,8 @@
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_ModelPriority(priority)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(false)
 {
     // Enable profiling if required.
@@ -215,6 +221,8 @@
                                                            V1_3::Priority priority,
                                                            const bool asyncModelExecutionEnabled,
                                                            const unsigned int numberOfThreads,
+                                                           const bool importEnabled,
+                                                           const bool exportEnabled,
                                                            const bool preparedFromCache)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
@@ -223,6 +231,8 @@
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_ModelPriority(priority)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(preparedFromCache)
 {
     // Enable profiling if required.
@@ -824,7 +834,19 @@
         else
         {
             ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled false");
-            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+            std::vector<armnn::ImportedInputId> importedInputIds;
+            if (m_EnableImport)
+            {
+                importedInputIds =  m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+            }
+            std::vector<armnn::ImportedOutputId> importedOutputIds;
+            if (m_EnableExport)
+            {
+                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+            }
+            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                                importedInputIds, importedOutputIds);
         }
 
         if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
diff --git a/ArmnnPreparedModel_1_3.hpp b/ArmnnPreparedModel_1_3.hpp
index cd5fc0e..cb3c768 100644
--- a/ArmnnPreparedModel_1_3.hpp
+++ b/ArmnnPreparedModel_1_3.hpp
@@ -55,7 +55,9 @@
                            const bool gpuProfilingEnabled,
                            V1_3::Priority priority = V1_3::Priority::MEDIUM,
                            const bool asyncModelExecutionEnabled = false,
-                           const unsigned int numberOfThreads = 1);
+                           const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true);
 
     ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                            armnn::IRuntime* runtime,
@@ -64,6 +66,8 @@
                            V1_3::Priority priority = V1_3::Priority::MEDIUM,
                            const bool asyncModelExecutionEnabled = false,
                            const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true,
                            const bool preparedFromCache = false);
 
     virtual ~ArmnnPreparedModel_1_3();
@@ -201,6 +205,8 @@
     static std::unique_ptr<armnn::Threadpool>      m_Threadpool;
     std::shared_ptr<IWorkingMemHandle>             m_WorkingMemHandle;
     const bool                                     m_AsyncModelExecutionEnabled;
+    const bool                                     m_EnableImport;
+    const bool                                     m_EnableExport;
     const bool                                     m_PreparedFromCache;
 };
 
diff --git a/DriverOptions.cpp b/DriverOptions.cpp
index 8fd5c47..0018d97 100644
--- a/DriverOptions.cpp
+++ b/DriverOptions.cpp
@@ -40,6 +40,8 @@
     , m_NumberOfThreads(0)
     , m_EnableAsyncModelExecution(false)
     , m_ArmnnNumberOfThreads(1)
+    , m_EnableImport(false)
+    , m_EnableExport(true)
 {
 }
 
@@ -56,6 +58,8 @@
     , m_NumberOfThreads(0)
     , m_EnableAsyncModelExecution(false)
     , m_ArmnnNumberOfThreads(1)
+    , m_EnableImport(false)
+    , m_EnableExport(true)
 {
 }
 
@@ -71,6 +75,8 @@
     , m_NumberOfThreads(0)
     , m_EnableAsyncModelExecution(false)
     , m_ArmnnNumberOfThreads(1)
+    , m_EnableImport(false)
+    , m_EnableExport(true)
 {
     std::string unsupportedOperationsAsString;
     std::string clTunedParametersModeAsString;
@@ -168,7 +174,13 @@
          "Assign the number of threads used by ArmNN. "
          "Input value must be at least 1. "
          "Default is set to 1.",
-         cxxopts::value<unsigned int>(m_ArmnnNumberOfThreads)->default_value("1"));
+         cxxopts::value<unsigned int>(m_ArmnnNumberOfThreads)->default_value("1"))
+
+        ("I,enableImport", "Enable Importing of input buffers",
+         cxxopts::value<bool>(m_EnableImport)->default_value("false"))
+
+        ("E,enableExport", "Enable Exporting of output buffers",
+         cxxopts::value<bool>(m_EnableExport)->default_value("true"));
     }
     catch (const std::exception& e)
     {
diff --git a/DriverOptions.hpp b/DriverOptions.hpp
index e1d25c4..ee68a94 100644
--- a/DriverOptions.hpp
+++ b/DriverOptions.hpp
@@ -42,6 +42,8 @@
     unsigned int GetNumberOfThreads() const { return m_NumberOfThreads; }
     bool isAsyncModelExecutionEnabled() const { return m_EnableAsyncModelExecution; };
     unsigned int getNoOfArmnnThreads() const { return m_ArmnnNumberOfThreads; };
+    bool isImportEnabled() const { return m_EnableImport; };
+    bool isExportEnabled() const { return m_EnableExport; };
 
 private:
     std::vector<armnn::BackendId> m_Backends;
@@ -63,6 +65,8 @@
     unsigned int m_NumberOfThreads;
     bool m_EnableAsyncModelExecution;
     unsigned int m_ArmnnNumberOfThreads;
+    bool m_EnableImport;
+    bool m_EnableExport;
 };
 
 } // namespace armnn_driver
diff --git a/test/Concurrent.cpp b/test/Concurrent.cpp
index 4113a8d..71119cd 100644
--- a/test/Concurrent.cpp
+++ b/test/Concurrent.cpp
@@ -82,15 +82,16 @@
 
     // build the requests
     V1_0::Request requests[maxRequests];
+    android::sp<IMemory> inMemory[maxRequests];
     android::sp<IMemory> outMemory[maxRequests];
+    float indata[] = {2, 32, 16};
     float* outdata[maxRequests];
     for (size_t i = 0; i < maxRequests; ++i)
     {
         requests[i].inputs  = hidl_vec<RequestArgument>{input};
         requests[i].outputs = hidl_vec<RequestArgument>{output};
         // set the input data (matching source test)
-        float indata[] = {2, 32, 16};
-        AddPoolAndSetData<float>(3, requests[i], indata);
+        inMemory[i] = AddPoolAndSetData<float>(3, requests[i], indata);
         // add memory for the output
         outMemory[i] = AddPoolAndGetData<float>(1, requests[i]);
         outdata[i] = static_cast<float*>(static_cast<void*>(outMemory[i]->getPointer()));
diff --git a/test/DriverTestHelpers.hpp b/test/DriverTestHelpers.hpp
index 383c8fc..98be090 100644
--- a/test/DriverTestHelpers.hpp
+++ b/test/DriverTestHelpers.hpp
@@ -186,13 +186,15 @@
 }
 
 template<typename T>
-void AddPoolAndSetData(uint32_t size, V1_0::Request& request, const T* data)
+android::sp<IMemory> AddPoolAndSetData(uint32_t size, V1_0::Request& request, const T* data)
 {
     android::sp<IMemory> memory = AddPoolAndGetData<T>(size, request);
 
     T* dst = static_cast<T*>(static_cast<void*>(memory->getPointer()));
 
     memcpy(dst, data, size * sizeof(T));
+
+    return memory;
 }
 
 template<typename HalPolicy,