/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
namespace cpu
{
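// Element-wise subtraction of two QASYMM8_SIGNED tensors. Each input is
// dequantized to float with its own quantization parameters, subtracted,
// then re-quantized with the destination parameters; the final narrowing
// saturates, so the ConvertPolicy argument is unused.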
void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    ARM_COMPUTE_UNUSED(policy);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int window_step_x = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();

    const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();

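    // Reciprocal scale and offset of the destination, used to re-quantize the float result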
    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
    const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);

    if(is_broadcast_across_x)
    {
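        // The broadcast input is the one whose X window step was collapsed to 0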
        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
        const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
        const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
        const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
        const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
        const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

            const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(static_cast<int8_t>(broadcast_value), wrapper::traits::vector_128_tag{});

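            // Dequantize the broadcast value once per row: widen s8 -> s16 -> s32,
            // subtract its zero-point offset and scale to float (four float32x4 lanes)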
            const float32x4x4_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                }
            };

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);

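                // Dequantize 16 elements of the non-broadcast input in the same way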
                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

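                // Re-quantize the float difference: multiply by 1/output_scale and add the
                // output offset; vcvtnq rounds to nearest on AArch64, vcvtq truncates elsewhere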
                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

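                // Narrow back to int8 with saturation and store 16 results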
                const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
                const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
                *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info());
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
        const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
        const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
        const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src0, input1_win);
        Iterator input2(src1, input2_win);
        Iterator output(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(input1_ptr + x);
                const auto b = wrapper::vloadq(input2_ptr + x);

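                // Dequantize 16 elements of each input with its own offset and scale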
                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const float32x4x4_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                    }
                };

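                // Re-quantize the difference with the output scale and offset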
                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

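                // Narrow back to int8 with saturation and store 16 results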
                const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;

                *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}
} // namespace cpu
} // namespace arm_compute