COMPMID-751 Processing 8 elements makes computation up to 80us faster on MobileNet QASYMM8 dwc layers

Change-Id: I30eaea3f3625086e311ad201ef73a8f06a01e382
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116521
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>

commit: 944d3f79baef6878916c1ec082a71768f0bf3409 [log] [tgz]
author: Giorgio Arena <giorgio.arena@arm.com> Tue Jan 16 15:38:35 2018 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> Fri Nov 02 16:43:42 2018 +0000
tree: 1dc18a46876aedfbe23ec18f9c43dc28ece97d47
parent: 200b6e38b2f66a87dd0e73b6833554d1cab20b26 [diff] [blame]
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index f9229ba..1c0fe99 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,9 +158,9 @@
     }
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 2;
-    const unsigned int num_elems_written_per_iteration   = 2;
-    const unsigned int num_elems_read_per_iteration      = 3 + _conv_stride_x;
+    const unsigned int num_elems_processed_per_iteration = 8 / data_size_from_type(input->info()->data_type());
+    const unsigned int num_elems_written_per_iteration   = num_elems_processed_per_iteration;
+    const unsigned int num_elems_read_per_iteration      = 3 + (num_elems_processed_per_iteration - 1) * _conv_stride_x;
     const unsigned int num_rows_read_per_iteration       = 3;
 
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
commit	944d3f79baef6878916c1ec082a71768f0bf3409	[log] [tgz]
author	Giorgio Arena <giorgio.arena@arm.com>	Tue Jan 16 15:38:35 2018 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	Fri Nov 02 16:43:42 2018 +0000
tree	1dc18a46876aedfbe23ec18f9c43dc28ece97d47
parent	200b6e38b2f66a87dd0e73b6833554d1cab20b26 [diff] [blame]