IVGBENCH-1661: Segfault on FP16 for NEON

Failures were caused due to integer overflows as mixed calculation
between int32_t and uint32_t were taking place.

Change-Id: I72efb331c7b3093a71cf83639eb7e89f1c2c29fc
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2356
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 58fa2d6..aaeb33f 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -977,8 +977,8 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
-                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
+                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
 
                     // Get power of 2 in case of l2 pooling and accumulate
                     if(pooling_type == PoolingType::L2)
@@ -994,7 +994,8 @@
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                    float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
+                                                                           + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
 
                     // Get power of 2 in case of l2 pooling
                     if(pooling_type == PoolingType::L2)
@@ -1026,16 +1027,17 @@
                 int x = 0;
                 for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
-                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
+                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
                     vres                   = vmaxq_f16(vres, data);
                 }
 
                 // Leftover for loop
                 for(; x < pool_size_x; ++x)
                 {
-                    const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
-                    res                  = std::max(res, data);
+                    const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
+                                                                                 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+                    res = std::max(res, data);
                 }
             }
 
@@ -1111,8 +1113,8 @@
             {
                 for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
-                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
+                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
 
                     // Get power of 2 in case of l2 pooling and accumulate
                     if(pooling_type == PoolingType::L2)
@@ -1136,8 +1138,8 @@
             {
                 for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
-                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
+                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
                     vres                   = vmaxq_f16(vres, data);
                 }
             }