Blame - src/cpu/kernels/add/generic/neon/qasymm8.cpp - ml/ComputeLibrary

const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);

90

const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);

91

const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);

92

const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);

93

94

const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;

95

96

// Compute window_step_x elements per iteration

97

int x = window_start_x;

98

for(; x <= (window_end_x - window_step_x); x += window_step_x)

99

{

100

const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);

101

const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);

102

const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);

103

const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);

104

const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);

int32x4_t rf_0{};

int32x4_t rf_1{};

int32x4_t rf_2{};

int32x4_t rf_3{};

#ifdef __aarch64__

rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));

113

rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));

114

rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));

115

rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));

116

#else //__aarch64__

117

rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));

118

rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));

119

rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));

120

rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));

121

#endif //__aarch64__

122

123

const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));

124

const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));

125

vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));

126

}

127

128

// Compute left-over elements

129

for(; x < window_end_x; ++x)

130

{

131

const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;

132

*(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);

133

}

134

},

135

broadcast_input, non_broadcast_input, output);

}

else

{

// Clear the X dimension on the input execution windows, as X is iterated manually below

140

input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

141

input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

142

Sheri Zhang

6124390

2021-01-12 18:25:16 +0000

[diff] [blame]

143

Iterator input1(src0, input1_win);

144

Iterator input2(src1, input2_win);

145

Iterator output(dst, win);

Michalis Spyrou

a3c9a3b

2020-12-08 21:02:16 +0000

[diff] [blame]

146

147

const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);

148

const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);

149

const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);

150

const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);

151

152

execute_window_loop(win, [&](const Coordinates &)

153

{

154

const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());

155

const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());

156

const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

157

158

// Compute window_step_x elements per iteration

159

int x = window_start_x;

160

for(; x <= (window_end_x - window_step_x); x += window_step_x)

161

{

162

const uint8x16_t a = vld1q_u8(input1_ptr + x);

163

const uint8x16_t b = vld1q_u8(input2_ptr + x);

164

165

const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);

166

const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);

167

const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);

168

const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);

169

170

const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);

171

const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);

172

const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);

173

const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);

int32x4_t rf_0{};

int32x4_t rf_1{};

int32x4_t rf_2{};

int32x4_t rf_3{};

#ifdef __aarch64__

rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));

182

rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));

183

rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));

184

rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));

185

#else //__aarch64__

186

rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));

187

rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));

188

rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));

189

rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));

190

#endif //__aarch64__

191

192

const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));

193

const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));

194

vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));

195

}

196

197

// Compute left-over elements

198

for(; x < window_end_x; ++x)

199

{

200

const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;

201

const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;

Giorgio Arena

433ea49

2021-05-26 15:32:50 +0100

[diff] [blame]

202

*(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);

Michalis Spyrou

a3c9a3b

2020-12-08 21:02:16 +0000

[diff] [blame]

203

}

204

},

205

input1, input2, output);

}

}

} // namespace cpu

} // namespace arm_compute