APPBROWSER-371: Rewrite the direct_convolution3x3.cs with the new common code
Change-Id: I82a3ec133193433ba9ed3efcb49c51a2b95b16c0
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114962
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Zhenglin Li <zhenglin.li@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index ab78fb9..06f9bce 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -119,44 +119,44 @@
{
switch(input->info()->data_type())
{
- // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-
case DataType::F16:
-#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
+ // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
+#define PROCESS_4X_3Y_1Z
+
+#if defined(PROCESS_8X_3Y_1Z)
+ options.emplace("#define PROCESS_8X_3Y_1Z");
num_elems_read_per_iteration_x = 16;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 8;
num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_1Z)
+ options.emplace("#define PROCESS_4X_3Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
+#elif defined(PROCESS_4X_4Y_1Z)
+ options.emplace("#define PROCESS_4X_4Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 6;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 4;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_2Z)
+ options.emplace("#define PROCESS_4X_3Y_2Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 3;
num_elems_written_per_iteration_z = 2;
-#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
-#undef PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_8X_3Y_1Z
+#undef PROCESS_4X_3Y_1Z
+#undef PROCESS_4X_4Y_1Z
+#undef PROCESS_4X_3Y_2Z
break;
case DataType::F32:
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
+ options.emplace("#define PROCESS_4X_3Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
@@ -174,33 +174,33 @@
switch(input->info()->data_type())
{
case DataType::F16:
- options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
+ options.emplace("#define PROCESS_4X_1Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_written_per_iteration_x = 4;
break;
case DataType::F32:
// TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_4_ELEMENT
+#define PROCESS_4X_1Y_1Z
-#if defined(PROCESS_1_ELEMENT)
- options.emplace("#define PROCESS_1_ELEMENT");
+#if defined(PROCESS_1X_1Y_1Z)
+ options.emplace("#define PROCESS_1X_1Y_1Z");
num_elems_read_per_iteration_x = 3;
num_elems_written_per_iteration_x = 1;
-#elif defined(PROCESS_4_ELEMENT)
- options.emplace("#define PROCESS_4_ELEMENT");
+#elif defined(PROCESS_4X_1Y_1Z)
+ options.emplace("#define PROCESS_4X_1Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_written_per_iteration_x = 4;
-#elif defined(PROCESS_8_ELEMENT)
- options.emplace("#define PROCESS_8_ELEMENT");
+#elif defined(PROCESS_8X_1Y_1Z)
+ options.emplace("#define PROCESS_8X_1Y_1Z");
num_elems_read_per_iteration_x = 12;
num_elems_written_per_iteration_x = 8;
-#else /* PROCESS_1_ELEMENT */
+#else /* PROCESS_nX_nY_nZ */
#error Have to declare how many elements to process in one thread.
-#endif /* PROCESS_1_ELEMENT */
-#undef PROCESS_1_ELEMENT
-#undef PROCESS_4_ELEMENT
-#undef PROCESS_8_ELEMENT
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_1X_1Y_1Z
+#undef PROCESS_4X_1Y_1Z
+#undef PROCESS_8X_1Y_1Z
break;
default: