APPBROWSER-298, APPBROWSER-306: Reimplement the common code of compute shader

The new common code of compute shader is in file helpers_cs.h
Rewrite the direct_convolution1x1.cs and softmax_layer.cs to use the new common code.

It will also remove the dependence of the token pasting operator (##).
We'll remove the "##" support after we rewrite all of the compute shader code.

Change-Id: Icd8553ef6b61ad484a8507590ac8ed499bd47061
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95455
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Frank Lei <frank.lei@arm.com>
(cherry picked from commit 0a4f83570d261f839d9866b68979efe8d7a95883)
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95601
Reviewed-by: Jim He <jim.he@arm.com>
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index 3a31cb8..071c185 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -24,107 +24,88 @@
 
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 
-#include "helpers.h"
+#include "helpers_cs.h"
 
-layout(std140) uniform shader_params
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr          Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_attrs        The attributes of the source tensor
+ * @param[out] dst_ptr          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs        The attributes of the destination tensor
+ * @param[in]  weights_ptr      Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_attrs    The attributes of the weights tensor
+ * @param[in]  biases_ptr       Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_attrs     The attributes of the weights tensor
+ * @param[in]  weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth    The third dimensions of the weights tensors
+ */
+SHADER_PARAMS_DECLARATION
 {
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    Tensor3DAttributes weights_attrs;
 #ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
+    VectorAttributes biases_attrs;
 #endif /* BIAS */
     uint weights_stride_w;
     uint weights_depth;
 };
 
 #if defined(DATA_TYPE_FP32)
-precision highp float;
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
 #ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
 #endif /* BIAS */
 
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
 void main()
 {
-    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
 #ifdef BIAS
-    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
-    float pixels  = CONVERT(0, float);
+    float pixels  = 0.f;
     uint  z_index = gl_GlobalInvocationID.z;
-    weights.current_offset += z_index * weights_stride_w >> 2;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
     float temp;
     float temp_weight;
-
     for(int d = 0; d < int(weights_depth); ++d)
     {
-        temp        = LOAD4(src, CURRENT_OFFSET(src));
-        temp_weight = LOAD4(weights, CURRENT_OFFSET(weights));
+        temp        = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+        temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
         pixels += temp * temp_weight;
 
-        src.current_offset += (src_stride_z >> 2);
-        weights.current_offset += (weights_stride_z >> 2);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
     }
 
 #ifdef BIAS
-    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+    pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
-    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 #elif defined(DATA_TYPE_FP16)
-precision mediump float;
 
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
 #ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
 #endif /* BIAS */
 
 #if STRIDE_X == 2
@@ -135,15 +116,10 @@
 #error STRIDE_X larger than 2 is not supported
 #endif /* STRIDE_X == 2 */
 
-vec4[2] convolve_stride1(Image src, float w)
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
 {
-    uvec4 packed_s;
-    vec4  s[2];
-
-    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
-
-    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
-    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+    vec4 s[2];
+    s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
 
     s[0] *= w;
     s[1] *= w;
@@ -151,22 +127,14 @@
     return s;
 }
 
-vec4[2] convolve_stride2(Image src, float w)
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
 {
-    uvec4 packed_s;
-    vec4  s[2];
-    vec4  r[2];
+    vec4 s[2];
+    vec4 r[2];
 
-    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
-    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
-    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
-
+    s    = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
     r[0] = vec4(s[0].xz, s[1].xz);
-
-    GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0);
-    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
-    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
-
+    s    = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
     r[1] = vec4(s[0].xz, s[1].xz);
 
     r[0] *= w;
@@ -175,51 +143,14 @@
     return r;
 }
 
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
 void main()
 {
-    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
 #ifdef BIAS
-    Vector   biases  = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
     vec4 pixels[2];
@@ -227,48 +158,41 @@
     pixels[1] = vec4(0.f);
 
     uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
 
-    weights.current_offset += z_index * weights_stride_w;
-
-    uint  packed_w;
     float w;
-
     for(int d = 0; d < int(weights_depth); ++d)
     {
-        GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
-        w = unpackHalf2x16(packed_w).x;
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
 
-        vec4 r[2] = CONVOLVE(src, w);
+        vec4 r[2] = CONVOLVE(src_iter, w);
         pixels[0] += r[0];
         pixels[1] += r[1];
 
-        src.current_offset += src_stride_z;
-        weights.current_offset += weights_stride_z;
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
     }
 
 #ifdef BIAS
-    uint  packed_b;
+    vec2  vec2_b;
     float b;
 
-    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 
     if(z_index % uint(2) == uint(0))
     {
-        b = unpackHalf2x16(packed_b).x;
+        b = vec2_b.x;
     }
     else
     {
-        b = unpackHalf2x16(packed_b).y;
+        b = vec2_b.y;
     }
 
-    pixels[0] += vec4(b);
-    pixels[1] += vec4(b);
+    pixels[0] += b;
+    pixels[1] += b;
 #endif /* BIAS */
 
-    uvec4 packed_d;
-    packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw),
-                     packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
-    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 #else  /* DATA_TYPE_FP32 */
 #error Data type not supported