COMPMID-882 - Optimizing GEMMLowp on OpenCL by reshaping matrices

This new optimization achieves 36.3% MAC utilisation on Mate 9 @ 1GHz.
The performance results have been reported here:
https://confluence.arm.com/display/MLENG/GEMMLowp+performance%3A+ACL+18.02

Change-Id: I71b6a217068763dfdc11bbf3574ee0eb94f93679
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118531
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index a09849a..f02eb16 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -50,7 +50,7 @@
         if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
         {
             const float scale = k < 1024 ? 2.0f : 2.5f;
-            flag              = scale * n > 1.66f * n + 38.4f;
+            flag              = (scale * n) > ((1.66f * n) + 38.4f);
         }
         else
         {
@@ -122,6 +122,10 @@
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
         // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
         // Configure interleave kernel
@@ -129,10 +133,6 @@
 
         // Configure transpose kernel
         _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
     }
 
     _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));