BUILTIN(__builtin_ia32_pmuldq128, "V2LLiV4iV4i", "")
BUILTIN(__builtin_ia32_pmulld128, "V4iV4iV4i", "")
BUILTIN(__builtin_ia32_roundps, "V4fV4fi", "")
-BUILTIN(__builtin_ia32_roundss, "V4fV4fi", "")
-BUILTIN(__builtin_ia32_roundsd, "V2dV2di", "")
+BUILTIN(__builtin_ia32_roundss, "V4fV4fV4fi", "")
+BUILTIN(__builtin_ia32_roundsd, "V2dV2dV2di", "")
BUILTIN(__builtin_ia32_roundpd, "V2dV2di", "")
tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i);
tmp_V4i = __builtin_ia32_pmulld128(tmp_V4i, tmp_V4i);
tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16);
- // tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
- // tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
+ tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
+ tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16);
tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, tmp_i);
#endif