COMPMID-2853: VGG16 regression for fp32

* Caps the number of windows created by dynamic scheduling at a configurable threshold

Change-Id: I35a9239bc9984dbc1b416c40c4c1b4ac7f5808bd
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2223
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 14acf04..1c3e3e7 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,9 +53,10 @@
          *
          * @param[in] split_dimension Dimension along which to split the kernel's execution window.
          * @param[in] strategy        (Optional) Split strategy.
+         * @param[in] threshold       (Optional) Cap on the number of windows created by dynamic scheduling; a value <= 0 leaves the choice to the scheduler.
          */
-        Hints(unsigned int split_dimension, StrategyHint strategy = StrategyHint::STATIC)
-            : _split_dimension(split_dimension), _strategy(strategy)
+        Hints(unsigned int split_dimension, StrategyHint strategy = StrategyHint::STATIC, int threshold = 0)
+            : _split_dimension(split_dimension), _strategy(strategy), _threshold(threshold)
         {
         }
         /** Set the split_dimension hint
@@ -97,10 +98,19 @@
         {
             return _strategy;
         }
+        /** Return the granule capping threshold to be used by dynamic scheduling.
+         *
+         * @return The capping threshold given to the constructor (0 when none was supplied)
+         */
+        int threshold() const
+        {
+            return _threshold;
+        }
 
     private:
         unsigned int _split_dimension;
         StrategyHint _strategy;
+        int          _threshold;
     };
     /** Signature for the workloads to execute */
     using Workload = std::function<void(const ThreadInfo &)>;
@@ -165,5 +175,5 @@
 private:
     unsigned int _num_threads_hint = {};
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 5849218..e684eee 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -338,9 +338,9 @@
                 break;
             case StrategyHint::DYNAMIC:
             {
+                const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
                 // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
-                const unsigned int max_iterations = static_cast<unsigned int>(_impl->_num_threads) * 3;
-                num_windows                       = num_iterations > max_iterations ? max_iterations : num_iterations;
+                num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
                 break;
             }
             default:
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 43e5315..88e0601 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -201,6 +201,8 @@
     IWeightsManager *_weights_manager{ nullptr };
     /** Weights transform object */
     FallbackTransform<TypeInput, TypeOutput> _weights_transform{};
+    /** GEMM kernel description */
+    arm_gemm::KernelDescription _kernel_info{};
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -208,12 +210,12 @@
                                                              arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
                                                              MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
 {
-    arm_gemm::GemmConfig              gemm_cfg;
-    const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
-    _weights_manager                                   = weights_manager;
-    if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+    arm_gemm::GemmConfig gemm_cfg;
+    _kernel_info     = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
+    _weights_manager = weights_manager;
+    if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
     {
-        gemm_cfg.filter = gemm_kernel_info.name;
+        gemm_cfg.filter = _kernel_info.name;
         args._cfg       = &gemm_cfg;
     }
     _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
@@ -387,7 +389,13 @@
                                  bias, 0);
 
     // Schedule assembly kernel
-    NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
+    IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
+    if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED)
+    {
+        constexpr int granule_threshold = 200;
+        scheduling_hint                 = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+    }
+    NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }
 
 template <typename TypeInput, typename TypeOutput>
diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp
index 98c9b87..9e8bba2 100644
--- a/tests/framework/instruments/SchedulerTimer.cpp
+++ b/tests/framework/instruments/SchedulerTimer.cpp
@@ -76,7 +76,7 @@
     void schedule(ICPPKernel *kernel, const Hints &hints) override
     {
         _timer.start();
-        _real_scheduler.schedule(kernel, hints.split_dimension());
+        _real_scheduler.schedule(kernel, hints);
         _timer.stop();
 
         typename SchedulerClock<output_timestamps>::kernel_info info;