Reorganize the kernels into nhwc, nchw and common folders

The following kernels have been split into nchw/nhwc kernel files:

- batchnormalization_layer
- batch_to_space
- channel_shuffle
- depth_to_space
- dequantization_layer
- im2col
- normalization_layer
- normalize_planar_yuv_layer
- normalize_planar_yuv_layer_quantized
- pooling_layer
- pooling_layer_quantized
- remap
- reorg_layer
- scale
- scale_quantized
- space_to_batch
- space_to_depth
- upsample_layer
- winograd_filter_transform
- winograd_input_transform
- winograd_output_transform

The following kernels have been moved to nchw folder:
- direct_convolution1x1
- direct_convolution3x3
- direct_convolution5x5
- direct_convolution_quantized
- prior_box_layer

The following kernels have been moved to nhwc folder:
- direct_convolution
- dwc_native_fp_nhwc
- dwc_native_quantized_nhwc

The following kernels have been removed:
- sobel_filter

The rest of the kernels have been moved to the common folder.

Partially resolves COMPMID-4453

Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: Ic327ac935687ec351c610c65a3c6357f364a5a58
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5919
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/SConscript b/SConscript
index 886ad08..682f553 100644
--- a/SConscript
+++ b/SConscript
@@ -228,10 +228,143 @@
 # Generate embed files
 generate_embed = [ version_file ]
 if env['opencl'] and env['embed_kernels']:
-    cl_files = Glob('src/core/CL/cl_kernels/*.cl')
-    cl_files += Glob('src/core/CL/cl_kernels/*.h')
+    
+    # Header files
+    cl_helper_files = [ 'src/core/CL/cl_kernels/activation_float_helpers.h',
+                        'src/core/CL/cl_kernels/activation_quant_helpers.h',
+                        'src/core/CL/cl_kernels/gemm_helpers.h',
+                        'src/core/CL/cl_kernels/helpers_asymm.h',
+                        'src/core/CL/cl_kernels/helpers.h',
+                        'src/core/CL/cl_kernels/load_store_utility.h',
+                        'src/core/CL/cl_kernels/repeat.h',
+                        'src/core/CL/cl_kernels/tile_helpers.h',
+                        'src/core/CL/cl_kernels/types.h',
+                        'src/core/CL/cl_kernels/warp_helpers_quantized.h',
+                        'src/core/CL/cl_kernels/warp_helpers.h'
+                    ]
 
-    embed_files = [ f.get_path()+"embed" for f in cl_files ]
+    # Common kernels
+    cl_files_common = ['src/core/CL/cl_kernels/common/activation_layer.cl',
+                       'src/core/CL/cl_kernels/common/activation_layer_quant.cl',
+                       'src/core/CL/cl_kernels/common/arg_min_max.cl',
+                       'src/core/CL/cl_kernels/common/batchnormalization_layer.cl',
+                       'src/core/CL/cl_kernels/common/bounding_box_transform.cl',
+                       'src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl',
+                       'src/core/CL/cl_kernels/common/bitwise_op.cl',
+                       'src/core/CL/cl_kernels/common/cast.cl',
+                       'src/core/CL/cl_kernels/common/comparisons.cl',
+                       'src/core/CL/cl_kernels/common/concatenate.cl',
+                       'src/core/CL/cl_kernels/common/col2im.cl',
+                       'src/core/CL/cl_kernels/common/convert_fc_weights.cl',
+                       'src/core/CL/cl_kernels/common/copy_tensor.cl',
+                       'src/core/CL/cl_kernels/common/crop_tensor.cl',
+                       'src/core/CL/cl_kernels/common/deconvolution_layer.cl',
+                       'src/core/CL/cl_kernels/common/dequantization_layer.cl',
+                       'src/core/CL/cl_kernels/common/elementwise_operation.cl',
+                       'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl',
+                       'src/core/CL/cl_kernels/common/elementwise_unary.cl',
+                       'src/core/CL/cl_kernels/common/fft_digit_reverse.cl',
+                       'src/core/CL/cl_kernels/common/fft.cl',
+                       'src/core/CL/cl_kernels/common/fft_scale.cl',
+                       'src/core/CL/cl_kernels/common/fill_border.cl',
+                       'src/core/CL/cl_kernels/common/floor.cl',
+                       'src/core/CL/cl_kernels/common/gather.cl',
+                       'src/core/CL/cl_kernels/common/gemm.cl',
+                       'src/core/CL/cl_kernels/common/gemv.cl',
+                       'src/core/CL/cl_kernels/common/gemm_v1.cl',
+                       'src/core/CL/cl_kernels/common/gemmlowp.cl',
+                       'src/core/CL/cl_kernels/common/generate_proposals.cl',
+                       'src/core/CL/cl_kernels/common/generate_proposals_quantized.cl',
+                       'src/core/CL/cl_kernels/common/instance_normalization.cl',
+                       'src/core/CL/cl_kernels/common/l2_normalize.cl',
+                       'src/core/CL/cl_kernels/common/mean_stddev_normalization.cl',
+                       'src/core/CL/cl_kernels/common/unpooling_layer.cl',
+                       'src/core/CL/cl_kernels/common/memset.cl',
+                       'src/core/CL/cl_kernels/common/nonmax.cl',
+                       'src/core/CL/cl_kernels/common/minmax_layer.cl',
+                       'src/core/CL/cl_kernels/common/pad_layer.cl',
+                       'src/core/CL/cl_kernels/common/permute.cl',
+                       'src/core/CL/cl_kernels/common/pixelwise_mul_float.cl',
+                       'src/core/CL/cl_kernels/common/pixelwise_mul_int.cl',
+                       'src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl',
+                       'src/core/CL/cl_kernels/common/quantization_layer.cl',
+                       'src/core/CL/cl_kernels/common/range.cl',
+                       'src/core/CL/cl_kernels/common/reduction_operation.cl',
+                       'src/core/CL/cl_kernels/common/pooling_layer.cl',
+                       'src/core/CL/cl_kernels/common/reshape_layer.cl',
+                       'src/core/CL/cl_kernels/common/convolution_layer.cl',
+                       'src/core/CL/cl_kernels/common/reverse.cl',
+                       'src/core/CL/cl_kernels/common/roi_align_layer.cl',
+                       'src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl',
+                       'src/core/CL/cl_kernels/common/roi_pooling_layer.cl',
+                       'src/core/CL/cl_kernels/common/select.cl',
+                       'src/core/CL/cl_kernels/common/softmax_layer.cl',
+                       'src/core/CL/cl_kernels/common/softmax_layer_quantized.cl',
+                       'src/core/CL/cl_kernels/common/stack_layer.cl',
+                       'src/core/CL/cl_kernels/common/slice_ops.cl',
+                       'src/core/CL/cl_kernels/common/tile.cl',
+                       'src/core/CL/cl_kernels/common/transpose.cl'
+                    ]
+
+    # NCHW kernels
+    cl_files_nchw = ['src/core/CL/cl_kernels/nchw/batch_to_space.cl',
+                    'src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/channel_shuffle.cl',
+                    'src/core/CL/cl_kernels/nchw/depth_to_space.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl',
+                    'src/core/CL/cl_kernels/nchw/dequantization_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/im2col.cl',
+                    'src/core/CL/cl_kernels/nchw/normalization_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/pooling_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/prior_box_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/remap.cl',
+                    'src/core/CL/cl_kernels/nchw/reorg_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/scale.cl',
+                    'src/core/CL/cl_kernels/nchw/scale_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/space_to_batch.cl',
+                    'src/core/CL/cl_kernels/nchw/space_to_depth.cl',
+                    'src/core/CL/cl_kernels/nchw/upsample_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl',
+                    'src/core/CL/cl_kernels/nchw/winograd_input_transform.cl',
+                    'src/core/CL/cl_kernels/nchw/winograd_output_transform.cl'
+                ]
+
+    # NHWC kernels
+    cl_files_nhwc = ['src/core/CL/cl_kernels/nhwc/batch_to_space.cl',
+                    'src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/channel_shuffle.cl',
+                    'src/core/CL/cl_kernels/nhwc/direct_convolution.cl',
+                    'src/core/CL/cl_kernels/nhwc/depth_to_space.cl',
+                    'src/core/CL/cl_kernels/nhwc/dequantization_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl',
+                    'src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl',
+                    'src/core/CL/cl_kernels/nhwc/im2col.cl',
+                    'src/core/CL/cl_kernels/nhwc/normalization_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nhwc/pooling_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nhwc/remap.cl',
+                    'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/scale.cl',
+                    'src/core/CL/cl_kernels/nhwc/scale_quantized.cl',
+                    'src/core/CL/cl_kernels/nhwc/space_to_batch.cl',
+                    'src/core/CL/cl_kernels/nhwc/space_to_depth.cl',
+                    'src/core/CL/cl_kernels/nhwc/upsample_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl',
+                    'src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl',
+                    'src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl'
+                ]
+
+    cl_files = cl_helper_files + cl_files_common + cl_files_nchw + cl_files_nhwc
+
+    embed_files = [ f+"embed" for f in cl_files ]
     arm_compute_env.Append(CPPPATH =[Dir("./src/core/CL/").path] )
 
     generate_embed.append(arm_compute_env.Command(embed_files, cl_files, action=resolve_includes))