blob: 96defbc9c931d7d031122e66bd33dadefd152f2b [file] [log] [blame]
Michalis Spyroub7b31532017-11-23 12:10:21 +00001/*
Georgios Pinitasddb93bb2020-10-02 16:38:59 +01002 * Copyright (c) 2017-2020 Arm Limited.
Michalis Spyroub7b31532017-11-23 12:10:21 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
Michalis Spyrouf4643372019-11-29 16:17:13 +000025#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
26#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
Michalis Spyroub7b31532017-11-23 12:10:21 +000027
28#include <arm_neon.h>
29
30namespace arm_compute
31{
32namespace detail
33{
34inline float32x4x3_t load_matrix_row(const float *ptr)
35{
36 const float32x4x3_t r =
37 {
38 {
39 vld1q_dup_f32(ptr),
40 vld1q_dup_f32(1 + ptr),
41 vld1q_dup_f32(2 + ptr)
42 }
43 };
44 return r;
45}
46
47template <unsigned int stridex>
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010048float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
Michalis Spyroub7b31532017-11-23 12:10:21 +000049
50template <>
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010051inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
Michalis Spyroub7b31532017-11-23 12:10:21 +000052{
Michalis Spyroub7b31532017-11-23 12:10:21 +000053 const float32x4x3_t vtop =
54 {
55 {
56 vld1q_f32(in_top),
57 vld1q_f32(in_top + 4),
58 vld1q_f32(in_top + 8)
59 }
60 };
61 const float32x4x3_t vmid =
62 {
63 {
64 vld1q_f32(in_mid),
65 vld1q_f32(in_mid + 4),
66 vld1q_f32(in_mid + 8)
67 }
68 };
69 const float32x4x3_t vlow =
70 {
71 {
72 vld1q_f32(in_low),
73 vld1q_f32(in_low + 4),
74 vld1q_f32(in_low + 8)
75 }
76 };
77 float32x4x2_t out =
78 {
79 {
80 vmulq_f32(vtop.val[0], m0.val[0]),
81 vmulq_f32(vtop.val[1], m0.val[0])
82 }
83 };
84 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
85 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
86
87 out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
88 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
89 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
90
91 out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
92 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
93 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
94
95 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
96 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
97
98 out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
99 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
100 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
101
102 out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
103 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
104 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
105 return out;
106}
107
108template <>
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100109inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
Michalis Spyroub7b31532017-11-23 12:10:21 +0000110{
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100111 float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
Michalis Spyroub7b31532017-11-23 12:10:21 +0000112 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
113 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
114 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
115 return out;
116}
117
118template <>
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100119inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
Michalis Spyroub7b31532017-11-23 12:10:21 +0000120{
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100121 float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
Michalis Spyroub7b31532017-11-23 12:10:21 +0000122 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
123 return out;
124}
125
126template <unsigned int stridex>
127void store_results(float *buffer, const float32x4x2_t &values);
128
129template <>
130void store_results<1>(float *buffer, const float32x4x2_t &values)
131{
132 vst1q_f32(buffer, values.val[0]);
133 vst1q_f32(buffer + 4, values.val[1]);
134}
135
136template <>
137void store_results<2>(float *buffer, const float32x4x2_t &values)
138{
139 vst1q_f32(buffer, values.val[0]);
140}
141
142template <>
143void store_results<3>(float *buffer, const float32x4x2_t &values)
144{
145 vst1_f32(buffer, vget_low_f32(values.val[0]));
146}
147
148template <unsigned int stridex>
149int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
150
151template <>
152int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
153{
154 return num_elems_written_per_iteration;
155}
156
157template <>
158int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
159{
160 return num_elems_written_per_iteration << 1;
161}
162
163template <>
164int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
165{
166 return num_elems_written_per_iteration * 3;
167}
168}
169} // namespace arm_compute
Michalis Spyrouf4643372019-11-29 16:17:13 +0000170#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */