COMPMID-1293: Handle aligned allocations

Change-Id: I6e642c8cd968240f883c327464519e57e5d0c3e3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/136088
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
index 6103e43..bb708f0 100644
--- a/arm_compute/runtime/ITensorAllocator.h
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,9 +50,10 @@
 
     /** Initialize a tensor based on the passed @ref TensorInfo.
      *
-     * @param[in] input TensorInfo object containing the description of the tensor to initialize.
+     * @param[in] input     TensorInfo object containing the description of the tensor to initialize.
+     * @param[in] alignment Alignment in bytes that the underlying base pointer should comply with.
      */
-    void init(const TensorInfo &input);
+    void init(const TensorInfo &input, size_t alignment = 0);
     /** Return a reference to the tensor's metadata
      *
      * @return Reference to the tensor's metadata.
@@ -63,6 +64,11 @@
      * @return Constant reference to the tensor's metadata.
      */
     const TensorInfo &info() const;
+    /** Return the underlying tensor buffer's alignment
+     *
+     * @return Tensor buffer alignment in bytes
+     */
+    size_t alignment() const;
 
     /** Interface to be implemented by the child class to allocate the tensor.
      *
@@ -87,7 +93,8 @@
     virtual void unlock() = 0;
 
 private:
-    TensorInfo _info; /**< Tensor's metadata. */
+    TensorInfo _info;      /**< Tensor's metadata. */
+    size_t     _alignment; /**< Tensor's alignment in bytes */
 };
 }
 #endif /*__ARM_COMPUTE_ITENSORALLOCATOR_H__ */
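
The new `alignment` argument is just a request recorded by the allocator: callers pass it through `init()` and can read it back via `alignment()`. A minimal usage sketch, assuming a `Tensor` backed by the default `TensorAllocator`; the shape, data type and 64-byte alignment below are illustrative values, not part of this patch:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void make_aligned_tensor()
{
    Tensor tensor;
    // Request a base pointer aligned to 64 bytes (0 means "no specific alignment").
    tensor.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::F32), 64);
    tensor.allocator()->allocate();
    // tensor.buffer() is now expected to start on a 64-byte boundary.
}
```
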
diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h
index bf4e171..481b20d 100644
--- a/arm_compute/runtime/MemoryRegion.h
+++ b/arm_compute/runtime/MemoryRegion.h
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/IMemoryRegion.h"
 
 #include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
@@ -38,18 +39,28 @@
 public:
     /** Default constructor
      *
-     * @param[in] size Region size
+     * @param[in] size      Region size
+     * @param[in] alignment Alignment in bytes of the base pointer. Defaults to 0
      */
-    MemoryRegion(size_t size)
-        : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+    MemoryRegion(size_t size, size_t alignment = 0)
+        : IMemoryRegion(size), _mem(nullptr), _alignment(alignment), _offset(0)
     {
         if(size != 0)
         {
-            _mem = std::shared_ptr<uint8_t>(new uint8_t[size](), [](uint8_t *ptr)
+            // Allocate backing memory
+            size_t space = size + alignment;
+            _mem         = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr)
             {
                 delete[] ptr;
             });
-            _ptr = _mem.get();
+
+            // Calculate alignment offset
+            if(alignment != 0)
+            {
+                void *aligned_ptr = _mem.get();
+                support::cpp11::align(alignment, size, aligned_ptr, space);
+                _offset = reinterpret_cast<uintptr_t>(aligned_ptr) - reinterpret_cast<uintptr_t>(_mem.get());
+            }
         }
     }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -64,11 +75,12 @@
     // Inherited methods overridden :
     void *buffer() final
     {
-        return _mem.get();
+        return reinterpret_cast<void *>(_mem.get() + _offset);
     }
     void *buffer() const final
     {
-        return _mem.get();
+        // FIXME (COMPMID-1088) : Remove handle() and _offset when done
+        return reinterpret_cast<void *>(_mem.get() + _offset);
     }
     void **handle() final
     {
@@ -77,7 +89,8 @@
 
 protected:
     std::shared_ptr<uint8_t> _mem;
-    uint8_t                 *_ptr;
+    size_t                   _alignment;
+    size_t                   _offset;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_RUNTIME_MEMORY_REGION_H__ */
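
The pattern above is the whole mechanism: over-allocate by `alignment` bytes and remember the distance from the raw allocation to the first suitably aligned byte, so that `buffer()` can hand out `_mem.get() + _offset`. A self-contained sketch of the same arithmetic, written against the standard `std::align` instead of the library's `support::cpp11::align` wrapper; the `AlignedRegion` type is purely illustrative:

```cpp
#include <cstddef>
#include <cstdint>
#include <memory>

// Illustrative stand-in for MemoryRegion's alignment handling.
struct AlignedRegion
{
    AlignedRegion(size_t size, size_t alignment = 0)
        : _mem(size != 0 ? new uint8_t[size + alignment]() : nullptr), _offset(0)
    {
        if(_mem != nullptr && alignment != 0)
        {
            // std::align bumps the pointer to the next multiple of 'alignment' that
            // still leaves 'size' usable bytes; the distance moved is the offset.
            void  *aligned = _mem.get();
            size_t space   = size + alignment;
            std::align(alignment, size, aligned, space);
            _offset = static_cast<uint8_t *>(aligned) - _mem.get();
        }
    }

    // The aligned pointer handed out to users of the region.
    void *buffer() const
    {
        return _mem.get() + _offset;
    }

    std::unique_ptr<uint8_t[]> _mem;
    size_t                     _offset;
};
```
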
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index c4ba1a5..5801c42 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -89,13 +89,8 @@
             const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer());
             const int  multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
 
-            // Forcing 128-byte alignment (required by 32-bit kernels)
-            const unsigned int alignment   = 128;
-            void              *raw_ptr     = reinterpret_cast<void *>(_pretranspose->buffer());
-            size_t             space       = _pretranspose->info()->total_size();
-            void              *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
             ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
-            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
+            _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
             _b->mark_as_unused();
         }
 
@@ -169,7 +164,7 @@
 inline void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryGroup *memory_group, size_t alignment, unsigned int num_threads)
 {
     ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
-    workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment - 1) * num_threads }, 1, DataType::S8));
+    workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) * num_threads }, 1, DataType::S8), alignment);
     if(memory_group != nullptr)
     {
         memory_group->manage(&workspace);