COMPMID-1040: Added support for nullptr bias tensor in NEWinogradLayer

Change-Id: Ie624ee17c63dede711d913a82819e128954a57c9
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124861
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
index b2e44f8..fcd1594 100644
--- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
@@ -299,12 +299,11 @@
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer());
     ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
 
     OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
-                                     reinterpret_cast<T *>(_biases->buffer()), _output,
+                                     (_biases ? reinterpret_cast<T *>(_biases->buffer()) : nullptr), _output,
                                      _n_batches, _n_rows, _n_cols, _n_channels);
 
     // The code below cannot be moved to configure because biases hasn't been allocated at that point
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index a95ce0e..3b3cda0 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -86,148 +86,288 @@
   const float *inptr = matrix_base;
   const float *bptr = biases;
 
-  // For each channel of the output
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
+  if (bptr)
   {
-    // Matrices used and computed during this transform
-    float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
-    // Read a 4x4 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 4; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 4; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Load the bias vector
+      b = vld1q_f32(bptr);
+      bptr += 4;
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr += 4;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 4; i++)
-    {
-      // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-      FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
-      // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-      FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-      f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-      // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-      f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-    }
-
-    // Load the bias vector
-    b = vld1q_f32(bptr);
-    bptr += 4;
-
-    // Write out the output tile
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
-      {
-        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-        outptrs[i][j] += 4;
-      }
-    }
-  }
 #endif  // __aarch64__
 #ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used and computed during this transform
-    float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
-    // Read a 4x4 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 4; i++)
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      for (int j = 0; j < 4; j++, m++)
+      // Matrices used and computed during this transform
+      float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Load the bias vector
+      b = vld1_f32(bptr);
+      bptr += 2;
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+          outptrs[i][j] += 2;
+        }
       }
     }
-    inptr += 2;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 4; i++)
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
     {
-      // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-      FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+      // Matrices used and computed during this transform
+      float F[4][4], FZ[4][2], f[2][2], b;
 
-      // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-      FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-      f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-      // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-      f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-    }
-
-    // Load the bias vector
-    b = vld1_f32(bptr);
-    bptr += 2;
-
-    // Write out the output tile
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-        outptrs[i][j] += 2;
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+      }
+
+      // Load the bias
+      b = *(bptr++);
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j] + b;
+        }
       }
     }
   }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
+  else
   {
-    // Matrices used and computed during this transform
-    float F[4][4], FZ[4][2], f[2][2], b;
-
-    // Read a 4x4 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 4; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 4; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[4][4], FZ[4][2], f[2][2];
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        F[i][j] = *(inptr + m*matrix_stride);
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr++;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 4; i++)
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-      FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-    }
+      // Matrices used and computed during this transform
+      float32x2_t F[4][4], FZ[4][2], f[2][2];
 
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-      f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-    }
-
-    // Load the bias
-    b = *(bptr++);
-
-    // Write out the output tile
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        *(outptrs[i][j]++) = f[i][j] + b;
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 2;
+        }
+      }
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed during this transform
+      float F[4][4], FZ[4][2], f[2][2];
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
+      {
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j];
+        }
       }
     }
   }
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index 262f711..cafce95 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -35,6 +35,7 @@
 template <>
 int Transform::ops_performed(const Tensor4DShape &shape)
 {
+  (void) shape;
   return 0;  // TODO
 }
 
@@ -83,142 +84,282 @@
   const float *inptr = matrix_base;
   const float *bptr = biases;
 
-  // For each channel of the output
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
+  if (bptr)
   {
-    // Matrices used and computed during this transform
-    float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Load the bias vector, then write out the output tile with the bias added
+      b = vld1q_f32(bptr);
+      bptr += 4;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr += 4;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
-    {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-      FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-      f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1q_f32(bptr);
-    bptr += 4;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
-      {
-        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-        outptrs[i][j] += 4;
-      }
-    }
-  }
 #endif  // __aarch64__
 #ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used and computed during this transform
-    float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Load the bias vector, then write out the output tile with the bias added
+      b = vld1_f32(bptr);
+      bptr += 2;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+          outptrs[i][j] += 2;
+        }
       }
     }
-    inptr += 2;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
     {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][2], f[2][2], b;
 
-      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-      FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-      f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1_f32(bptr);
-    bptr += 2;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-        outptrs[i][j] += 2;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Load the bias, then write out the output tile with the bias added
+      b = *(bptr++);
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j] + b;
+        }
       }
     }
   }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
+  else
   {
-    // Matrices used and computed during this transform
-    float F[6][6], FZ[6][2], f[2][2], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][2], f[2][2];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = *(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr++;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-    }
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][2], f[2][2];
 
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-    }
-
-    // Write out the output tile
-    b = *(bptr++);
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        *(outptrs[i][j]++) = f[i][j] + b;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 2;
+        }
+      }
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][2], f[2][2];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j];
+        }
       }
     }
   }
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index 609823b..cd3bdef 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -100,170 +100,338 @@
   const float *inptr = matrix_base;
   const float *bptr = biases;
 
-  // For each channel of the output
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
+  if (bptr)
   {
-    // Matrices used and computed during this transform
-    float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][4], f[4][4], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      b = vld1q_f32(bptr);
+      bptr += 4;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr += 4;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
-    {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-      FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
-      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-      FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
-      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-      FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 4; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-      f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-      f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-      f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1q_f32(bptr);
-    bptr += 4;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
-      {
-        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-        outptrs[i][j] += 4;
-      }
-    }
-  }
 #endif  // __aarch64__
 #ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used and computed during this transform
-    float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      b = vld1_f32(bptr);
+      bptr += 2;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+          outptrs[i][j] += 2;
+        }
       }
     }
-    inptr += 2;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
     {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][4], f[4][4], b;
 
-      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-      FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
-      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-      FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
-      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-      FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 4; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-      f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-      f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-      f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1_f32(bptr);
-    bptr += 2;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-        outptrs[i][j] += 2;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      b = *(bptr++);
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j] + b;
+        }
       }
     }
   }
-#endif
-  for (; channels_remaining; channels_remaining--)
+  else
   {
-    // Matrices used and computed during this transform
-    float F[6][6], FZ[6][4], f[4][4], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][4], f[4][4];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = *(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr++;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-      FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-      FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-    }
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][4], f[4][4];
 
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 4; j++)
-    {
-      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-      f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-      f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-    }
-
-    // Write out the output tile
-    b = *(bptr++);
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        *(outptrs[i][j]++) = f[i][j] + b;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 2;
+        }
+      }
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][4], f[4][4];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j];
+        }
       }
     }
   }