APPBROWSER-371: Rewrite direct_convolution3x3.cs using the new common code

Change-Id: I82a3ec133193433ba9ed3efcb49c51a2b95b16c0
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114962
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Zhenglin Li <zhenglin.li@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index ab78fb9..06f9bce 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -119,44 +119,44 @@
         {
             switch(input->info()->data_type())
             {
-                    // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-
                 case DataType::F16:
-#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
-                    options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
+                    // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
+#define PROCESS_4X_3Y_1Z
+
+#if defined(PROCESS_8X_3Y_1Z)
+                    options.emplace("#define PROCESS_8X_3Y_1Z");
                     num_elems_read_per_iteration_x    = 16;
                     num_elems_read_per_iteration_y    = 5;
                     num_elems_written_per_iteration_x = 8;
                     num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
-                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_1Z)
+                    options.emplace("#define PROCESS_4X_3Y_1Z");
                     num_elems_read_per_iteration_x    = 8;
                     num_elems_read_per_iteration_y    = 5;
                     num_elems_written_per_iteration_x = 4;
                     num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
-                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
+#elif defined(PROCESS_4X_4Y_1Z)
+                    options.emplace("#define PROCESS_4X_4Y_1Z");
                     num_elems_read_per_iteration_x    = 8;
                     num_elems_read_per_iteration_y    = 6;
                     num_elems_written_per_iteration_x = 4;
                     num_elems_written_per_iteration_y = 4;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
-                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_2Z)
+                    options.emplace("#define PROCESS_4X_3Y_2Z");
                     num_elems_read_per_iteration_x    = 8;
                     num_elems_read_per_iteration_y    = 5;
                     num_elems_written_per_iteration_x = 4;
                     num_elems_written_per_iteration_y = 3;
                     num_elems_written_per_iteration_z = 2;
-#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
-#undef PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_8X_3Y_1Z
+#undef PROCESS_4X_3Y_1Z
+#undef PROCESS_4X_4Y_1Z
+#undef PROCESS_4X_3Y_2Z
                     break;
 
                 case DataType::F32:
-                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
+                    options.emplace("#define PROCESS_4X_3Y_1Z");
                     num_elems_read_per_iteration_x    = 8;
                     num_elems_read_per_iteration_y    = 5;
                     num_elems_written_per_iteration_x = 4;
@@ -174,33 +174,33 @@
             switch(input->info()->data_type())
             {
                 case DataType::F16:
-                    options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
+                    options.emplace("#define PROCESS_4X_1Y_1Z");
                     num_elems_read_per_iteration_x    = 8;
                     num_elems_written_per_iteration_x = 4;
                     break;
 
                 case DataType::F32:
                     // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_4_ELEMENT
+#define PROCESS_4X_1Y_1Z
 
-#if defined(PROCESS_1_ELEMENT)
-                    options.emplace("#define PROCESS_1_ELEMENT");
+#if defined(PROCESS_1X_1Y_1Z)
+                    options.emplace("#define PROCESS_1X_1Y_1Z");
                     num_elems_read_per_iteration_x    = 3;
                     num_elems_written_per_iteration_x = 1;
-#elif defined(PROCESS_4_ELEMENT)
-                    options.emplace("#define PROCESS_4_ELEMENT");
+#elif defined(PROCESS_4X_1Y_1Z)
+                    options.emplace("#define PROCESS_4X_1Y_1Z");
                     num_elems_read_per_iteration_x    = 8;
                     num_elems_written_per_iteration_x = 4;
-#elif defined(PROCESS_8_ELEMENT)
-                    options.emplace("#define PROCESS_8_ELEMENT");
+#elif defined(PROCESS_8X_1Y_1Z)
+                    options.emplace("#define PROCESS_8X_1Y_1Z");
                     num_elems_read_per_iteration_x    = 12;
                     num_elems_written_per_iteration_x = 8;
-#else /* PROCESS_1_ELEMENT */
+#else /* PROCESS_nX_nY_nZ */
 #error Have to declare how many elements to process in one thread.
-#endif /* PROCESS_1_ELEMENT */
-#undef PROCESS_1_ELEMENT
-#undef PROCESS_4_ELEMENT
-#undef PROCESS_8_ELEMENT
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_1X_1Y_1Z
+#undef PROCESS_4X_1Y_1Z
+#undef PROCESS_8X_1Y_1Z
                     break;
 
                 default: