COMPMID-2336: Add clang-tidy checks in assembly subfolder

Change-Id: I57fc21cfb8e2751e1ebb59f9106764775d09a00a
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1222
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 9b5fafb..a5d1cb5 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -33,9 +33,6 @@
     for i in range(0, len(lines)):
         line = lines[i]
 
-        if "/assembly/" in line:
-            continue
-
         if "/arm_gemm/" in line:
             continue
 
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
index 7b1f3e7..97c20db 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
@@ -58,6 +58,7 @@
 template <typename To, typename Tr>
 Window NEGEMMNativeWrapperKernel<To, Tr>::configure_internal(float alpha, float beta)
 {
+    ARM_COMPUTE_UNUSED(alpha);
     using strategy = typename Kernel<To, Tr>::strategy;
 
     _beta = beta;
@@ -107,6 +108,7 @@
 
     auto on_new_row_size = [&](unsigned int start, unsigned int end)
     {
+        ARM_COMPUTE_UNUSED(start);
         m_end = std::min(end, _params.M);
     };
 
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index e207ab0..20aa149 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -45,7 +45,7 @@
     static constexpr unsigned int NUM_BUFFERS = 3;
 
     explicit BufferManagerMultipleThreads(unsigned int max_num_users)
-        : _max_num_users(max_num_users)
+        : _buffers(), _max_num_users(max_num_users)
     {
     }
     unsigned int num_buffers() const override
@@ -106,11 +106,15 @@
         ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
         // Check if it's already ready to use:
         if(buf.state == State::IN_USE)
+        {
             return;
+        }
         std::unique_lock<std::mutex> lock(buf.mutex);
         //Double check it didn't change while we were acquiring the lock:
         if(buf.state == State::IN_USE)
+        {
             return;
+        }
         buf.sem.wait(lock);
     }
     /* Mark the buffer at the given index as not used by this thread anymore.
@@ -143,7 +147,8 @@
         State                   state{ State::FREE };
         std::mutex              mutex{};
         std::condition_variable sem{};
-    } _buffers[NUM_BUFFERS];
+    };
+    std::array<struct Buffer, NUM_BUFFERS> _buffers;
     Buffer &get_buffer_from_index(unsigned int index)
     {
         return _buffers[index % NUM_BUFFERS];
@@ -161,6 +166,7 @@
     }
     bool lock_to_reshape_if_needed(unsigned int index) override
     {
+        ARM_COMPUTE_UNUSED(index);
         return true;
     }
     void mark_as_reshaped(unsigned int index) override
@@ -231,10 +237,10 @@
                 {
                     //For each block of rows in "M"
                     auto workload_mm = this->_mm_workloads.begin();
-                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                    for(auto &workload_a : this->_a_workloads)
                     {
                         // Transform one k_block from A:
-                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
                         // Then perform the matrix multiplication for each x block along N:
                         for(unsigned int i = 0; i < num_x_blocks; i++)
                         {
@@ -243,7 +249,7 @@
                         }
                     }
                 };
-                _workloads.push_back(workload);
+                _workloads.emplace_back(workload);
             }
             else
             {
@@ -255,10 +261,10 @@
                     //If there is only one thread then only reshape the B blocks as you need them:
                     unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
 
-                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                    for(auto &workload_a : this->_a_workloads)
                     {
                         // Transform one k_block from A:
-                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
                         // Then perform the matrix multiplication for each x block along N:
                         for(unsigned int i = 0; i < num_x_blocks; i++)
                         {
@@ -287,7 +293,7 @@
                         }
                     }
                 };
-                _workloads.push_back(workload);
+                _workloads.emplace_back(workload);
             }
         }
         if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
@@ -325,7 +331,7 @@
                         workload_b++;
                     }
                 };
-                _workloads.push_back(workload);
+                _workloads.emplace_back(workload);
             }
         }