0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
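+// Lower the generic rounding nodes ffloor/fnearbyint/fceil/frint/ftrunc to
+// VRNDSCALE for one vector type, covering register, load and broadcast-load
+// operands along with their merge-masked and zero-masked variants. In the
+// imm8 operand, bits 1:0 select the rounding mode (01 = toward -inf,
+// 10 = toward +inf, 11 = truncate), bit 2 selects the MXCSR rounding mode
+// instead, and bit 3 suppresses precision (inexact) exceptions; the scale
+// bits 7:4 are zero, so the result is rounded to an integer. Hence
+// 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC = nearbyint (MXCSR mode,
+// exceptions suppressed) and 0x4 = rint (MXCSR mode, inexact allowed).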
+multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> {
+ // Register
+ def : Pat<(_.VT (ffloor _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc _.RC:$src)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
+ _.RC:$src, (i32 0xB))>;
+
+ // Merge-masking
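+ // The vselect's false operand becomes the passthru operand of the masked
+ // instruction, so elements where the mask is clear keep the value of $dst.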
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
+
+ // Zero-masking
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
+ _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
+
+ // Load
+ def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
+ addr:$src, (i32 0xB))>;
+
+ // Merge-masking + load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Zero-masking + load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Broadcast load
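+ // X86VBroadcast of a scalar load folds into the embedded-broadcast form
+ // of the instruction, printed as "(mem){1toN}" in assembly.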
+ def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
+ addr:$src, (i32 0xB))>;
+
+ // Merge-masking + broadcast load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.RC:$dst)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
+ _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+
+ // Zero-masking + broadcast load
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x9))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xC))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xA))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0x4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
+ _.KRCWM:$mask, addr:$src, (i32 0xB))>;
+}
+
let Predicates = [HasAVX512] in {
-def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
-def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), VR512:$dst)),
- (VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0x9))>;
-def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), v16f32_info.ImmAllZerosV)),
- (VRNDSCALEPSZrrikz VK16WM:$mask, VR512:$src, (i32 0x9))>;
-def : Pat<(v16f32 (fnearbyint VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
-def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
-def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), VR512:$dst)),
- (VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0xA))>;
-def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), v16f32_info.ImmAllZerosV)),
- (VRNDSCALEPSZrrikz VK16WM:$mask, VR512:$src, (i32 0xA))>;
-def : Pat<(v16f32 (frint VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
-def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
-
-def : Pat<(v16f32 (ffloor (loadv16f32 addr:$src))),
- (VRNDSCALEPSZrmi addr:$src, (i32 0x9))>;
-def : Pat<(v16f32 (fnearbyint (loadv16f32 addr:$src))),
- (VRNDSCALEPSZrmi addr:$src, (i32 0xC))>;
-def : Pat<(v16f32 (fceil (loadv16f32 addr:$src))),
- (VRNDSCALEPSZrmi addr:$src, (i32 0xA))>;
-def : Pat<(v16f32 (frint (loadv16f32 addr:$src))),
- (VRNDSCALEPSZrmi addr:$src, (i32 0x4))>;
-def : Pat<(v16f32 (ftrunc (loadv16f32 addr:$src))),
- (VRNDSCALEPSZrmi addr:$src, (i32 0xB))>;
-
-def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
-def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), VR512:$dst)),
- (VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0x9))>;
-def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), v8f64_info.ImmAllZerosV)),
- (VRNDSCALEPDZrrikz VK8WM:$mask, VR512:$src, (i32 0x9))>;
-def : Pat<(v8f64 (fnearbyint VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
-def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
-def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), VR512:$dst)),
- (VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0xA))>;
-def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), v8f64_info.ImmAllZerosV)),
- (VRNDSCALEPDZrrikz VK8WM:$mask, VR512:$src, (i32 0xA))>;
-def : Pat<(v8f64 (frint VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
-def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
-
-def : Pat<(v8f64 (ffloor (loadv8f64 addr:$src))),
- (VRNDSCALEPDZrmi addr:$src, (i32 0x9))>;
-def : Pat<(v8f64 (fnearbyint (loadv8f64 addr:$src))),
- (VRNDSCALEPDZrmi addr:$src, (i32 0xC))>;
-def : Pat<(v8f64 (fceil (loadv8f64 addr:$src))),
- (VRNDSCALEPDZrmi addr:$src, (i32 0xA))>;
-def : Pat<(v8f64 (frint (loadv8f64 addr:$src))),
- (VRNDSCALEPDZrmi addr:$src, (i32 0x4))>;
-def : Pat<(v8f64 (ftrunc (loadv8f64 addr:$src))),
- (VRNDSCALEPDZrmi addr:$src, (i32 0xB))>;
+ defm : AVX512_rndscale_lowering<v16f32_info, "PS">;
+ defm : AVX512_rndscale_lowering<v8f64_info, "PD">;
}
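+// The 128-bit and 256-bit forms additionally require AVX512VL.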
let Predicates = [HasVLX] in {
-def : Pat<(v4f32 (ffloor VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>;
-def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), VR128X:$dst)),
- (VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0x9))>;
-def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), v4f32x_info.ImmAllZerosV)),
- (VRNDSCALEPSZ128rrikz VK4WM:$mask, VR128X:$src, (i32 0x9))>;
-def : Pat<(v4f32 (fnearbyint VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>;
-def : Pat<(v4f32 (fceil VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>;
-def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), VR128X:$dst)),
- (VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0xA))>;
-def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), v4f32x_info.ImmAllZerosV)),
- (VRNDSCALEPSZ128rrikz VK4WM:$mask, VR128X:$src, (i32 0xA))>;
-def : Pat<(v4f32 (frint VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>;
-def : Pat<(v4f32 (ftrunc VR128X:$src)),
- (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xB))>;
-
-def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
- (VRNDSCALEPSZ128rmi addr:$src, (i32 0x9))>;
-def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
- (VRNDSCALEPSZ128rmi addr:$src, (i32 0xC))>;
-def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
- (VRNDSCALEPSZ128rmi addr:$src, (i32 0xA))>;
-def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
- (VRNDSCALEPSZ128rmi addr:$src, (i32 0x4))>;
-def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
- (VRNDSCALEPSZ128rmi addr:$src, (i32 0xB))>;
-
-def : Pat<(v2f64 (ffloor VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>;
-def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), VR128X:$dst)),
- (VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0x9))>;
-def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), v2f64x_info.ImmAllZerosV)),
- (VRNDSCALEPDZ128rrikz VK2WM:$mask, VR128X:$src, (i32 0x9))>;
-def : Pat<(v2f64 (fnearbyint VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>;
-def : Pat<(v2f64 (fceil VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>;
-def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), VR128X:$dst)),
- (VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0xA))>;
-def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), v2f64x_info.ImmAllZerosV)),
- (VRNDSCALEPDZ128rrikz VK2WM:$mask, VR128X:$src, (i32 0xA))>;
-def : Pat<(v2f64 (frint VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>;
-def : Pat<(v2f64 (ftrunc VR128X:$src)),
- (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xB))>;
-
-def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
- (VRNDSCALEPDZ128rmi addr:$src, (i32 0x9))>;
-def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
- (VRNDSCALEPDZ128rmi addr:$src, (i32 0xC))>;
-def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
- (VRNDSCALEPDZ128rmi addr:$src, (i32 0xA))>;
-def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
- (VRNDSCALEPDZ128rmi addr:$src, (i32 0x4))>;
-def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
- (VRNDSCALEPDZ128rmi addr:$src, (i32 0xB))>;
-
-def : Pat<(v8f32 (ffloor VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>;
-def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), VR256X:$dst)),
- (VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0x9))>;
-def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), v8f32x_info.ImmAllZerosV)),
- (VRNDSCALEPSZ256rrikz VK8WM:$mask, VR256X:$src, (i32 0x9))>;
-def : Pat<(v8f32 (fnearbyint VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>;
-def : Pat<(v8f32 (fceil VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>;
-def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), VR256X:$dst)),
- (VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0xA))>;
-def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), v8f32x_info.ImmAllZerosV)),
- (VRNDSCALEPSZ256rrikz VK8WM:$mask, VR256X:$src, (i32 0xA))>;
-def : Pat<(v8f32 (frint VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>;
-def : Pat<(v8f32 (ftrunc VR256X:$src)),
- (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xB))>;
-
-def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
- (VRNDSCALEPSZ256rmi addr:$src, (i32 0x9))>;
-def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
- (VRNDSCALEPSZ256rmi addr:$src, (i32 0xC))>;
-def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
- (VRNDSCALEPSZ256rmi addr:$src, (i32 0xA))>;
-def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
- (VRNDSCALEPSZ256rmi addr:$src, (i32 0x4))>;
-def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
- (VRNDSCALEPSZ256rmi addr:$src, (i32 0xB))>;
-
-def : Pat<(v4f64 (ffloor VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>;
-def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), VR256X:$dst)),
- (VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0x9))>;
-def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), v4f64x_info.ImmAllZerosV)),
- (VRNDSCALEPDZ256rrikz VK4WM:$mask, VR256X:$src, (i32 0x9))>;
-def : Pat<(v4f64 (fnearbyint VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>;
-def : Pat<(v4f64 (fceil VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>;
-def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), VR256X:$dst)),
- (VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0xA))>;
-def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), v4f64x_info.ImmAllZerosV)),
- (VRNDSCALEPDZ256rrikz VK4WM:$mask, VR256X:$src, (i32 0xA))>;
-def : Pat<(v4f64 (frint VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>;
-def : Pat<(v4f64 (ftrunc VR256X:$src)),
- (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xB))>;
-
-def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
- (VRNDSCALEPDZ256rmi addr:$src, (i32 0x9))>;
-def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
- (VRNDSCALEPDZ256rmi addr:$src, (i32 0xC))>;
-def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
- (VRNDSCALEPDZ256rmi addr:$src, (i32 0xA))>;
-def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
- (VRNDSCALEPDZ256rmi addr:$src, (i32 0x4))>;
-def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
- (VRNDSCALEPDZ256rmi addr:$src, (i32 0xB))>;
+ defm : AVX512_rndscale_lowering<v8f32x_info, "PS">;
+ defm : AVX512_rndscale_lowering<v4f64x_info, "PD">;
+ defm : AVX512_rndscale_lowering<v4f32x_info, "PS">;
+ defm : AVX512_rndscale_lowering<v2f64x_info, "PD">;
}
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
; CHECK-LABEL: floor_v2f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
-; CHECK-NEXT: vrndscalepd $9, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $9, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: floor_v4f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vrndscaleps $9, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $9, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: floor_v4f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vrndscalepd $9, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $9, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: floor_v8f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vrndscaleps $9, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $9, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: floor_v8f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $9, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $9, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: floor_v16f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $9, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $9, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
; CHECK-LABEL: floor_v2f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $9, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: floor_v4f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $9, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: floor_v4f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $9, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: floor_v8f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $9, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: floor_v8f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $9, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: floor_v16f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $9, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
define <2 x double> @floor_v2f64_broadcast(double* %ptr) {
; CHECK-LABEL: floor_v2f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $9, %xmm0, %xmm0
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to2}, %xmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <2 x double> undef, double %ps, i32 0
define <4 x float> @floor_v4f32_broadcast(float* %ptr) {
; CHECK-LABEL: floor_v4f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $9, %xmm0, %xmm0
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to4}, %xmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <4 x float> undef, float %ps, i32 0
define <4 x double> @floor_v4f64_broadcast(double* %ptr){
; CHECK-LABEL: floor_v4f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $9, %ymm0, %ymm0
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to4}, %ymm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <4 x double> undef, double %ps, i32 0
define <8 x float> @floor_v8f32_broadcast(float* %ptr) {
; CHECK-LABEL: floor_v8f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $9, %ymm0, %ymm0
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to8}, %ymm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <8 x float> undef, float %ps, i32 0
define <8 x double> @floor_v8f64_broadcast(double* %ptr){
; CHECK-LABEL: floor_v8f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $9, %zmm0, %zmm0
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to8}, %zmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <8 x double> undef, double %ps, i32 0
define <16 x float> @floor_v16f32_broadcast(float* %ptr) {
; CHECK-LABEL: floor_v16f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $9, %zmm0, %zmm0
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to16}, %zmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <16 x float> undef, float %ps, i32 0
; CHECK-LABEL: floor_v2f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; CHECK-NEXT: vrndscalepd $9, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to2}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: floor_v4f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm1
-; CHECK-NEXT: vrndscaleps $9, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to4}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: floor_v4f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm1
-; CHECK-NEXT: vrndscalepd $9, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to4}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: floor_v8f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm1
-; CHECK-NEXT: vrndscaleps $9, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to8}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: floor_v8f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $9, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to8}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: floor_v16f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $9, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to16}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: floor_v2f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to2}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: floor_v4f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to4}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: floor_v4f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to4}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: floor_v8f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to8}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: floor_v8f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $9, (%rdi){1to8}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: floor_v16f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $9, (%rdi){1to16}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: ceil_v2f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
-; CHECK-NEXT: vrndscalepd $10, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $10, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: ceil_v4f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vrndscaleps $10, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $10, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: ceil_v4f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vrndscalepd $10, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $10, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: ceil_v8f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vrndscaleps $10, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $10, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: ceil_v8f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $10, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $10, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: ceil_v16f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $10, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $10, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
; CHECK-LABEL: ceil_v2f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $10, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: ceil_v4f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $10, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: ceil_v4f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vrndscalepd $10, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $10, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: ceil_v8f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $10, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: ceil_v8f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $10, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $10, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: ceil_v16f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $10, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $10, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
define <2 x double> @ceil_v2f64_broadcast(double* %ptr) {
; CHECK-LABEL: ceil_v2f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $10, %xmm0, %xmm0
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to2}, %xmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <2 x double> undef, double %ps, i32 0
define <4 x float> @ceil_v4f32_broadcast(float* %ptr) {
; CHECK-LABEL: ceil_v4f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $10, %xmm0, %xmm0
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to4}, %xmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <4 x float> undef, float %ps, i32 0
define <4 x double> @ceil_v4f64_broadcast(double* %ptr){
; CHECK-LABEL: ceil_v4f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $10, %ymm0, %ymm0
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to4}, %ymm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <4 x double> undef, double %ps, i32 0
define <8 x float> @ceil_v8f32_broadcast(float* %ptr) {
; CHECK-LABEL: ceil_v8f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $10, %ymm0, %ymm0
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to8}, %ymm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <8 x float> undef, float %ps, i32 0
define <8 x double> @ceil_v8f64_broadcast(double* %ptr){
; CHECK-LABEL: ceil_v8f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $10, %zmm0, %zmm0
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to8}, %zmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <8 x double> undef, double %ps, i32 0
define <16 x float> @ceil_v16f32_broadcast(float* %ptr) {
; CHECK-LABEL: ceil_v16f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $10, %zmm0, %zmm0
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to16}, %zmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <16 x float> undef, float %ps, i32 0
; CHECK-LABEL: ceil_v2f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; CHECK-NEXT: vrndscalepd $10, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to2}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: ceil_v4f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm1
-; CHECK-NEXT: vrndscaleps $10, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to4}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: ceil_v4f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm1
-; CHECK-NEXT: vrndscalepd $10, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to4}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: ceil_v8f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm1
-; CHECK-NEXT: vrndscaleps $10, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to8}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: ceil_v8f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $10, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to8}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: ceil_v16f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $10, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to16}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: ceil_v2f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to2}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: ceil_v4f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to4}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: ceil_v4f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vrndscalepd $10, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to4}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: ceil_v8f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to8}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: ceil_v8f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $10, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $10, (%rdi){1to8}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: ceil_v16f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $10, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $10, (%rdi){1to16}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: trunc_v2f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vroundpd $11, %xmm0, %xmm0
-; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
; CHECK-LABEL: trunc_v4f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
-; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
; CHECK-LABEL: trunc_v4f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vroundpd $11, %ymm0, %ymm0
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
; CHECK-LABEL: trunc_v8f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vroundps $11, %ymm0, %ymm0
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
; CHECK-LABEL: trunc_v8f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
-; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
; CHECK-LABEL: trunc_v16f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
-; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
; CHECK-LABEL: trunc_v2f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundpd $11, %xmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
; CHECK-LABEL: trunc_v4f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
; CHECK-LABEL: trunc_v4f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundpd $11, %ymm0, %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
; CHECK-LABEL: trunc_v8f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundps $11, %ymm0, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
; CHECK-LABEL: trunc_v8f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
; CHECK-LABEL: trunc_v16f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
; CHECK-LABEL: trunc_v2f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundpd $11, (%rdi), %xmm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: trunc_v4f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundps $11, (%rdi), %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: trunc_v4f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundpd $11, (%rdi), %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: trunc_v8f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundps $11, (%rdi), %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: trunc_v8f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscalepd $11, (%rdi), %zmm1
-; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: trunc_v16f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscaleps $11, (%rdi), %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
; CHECK-LABEL: trunc_v2f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vroundpd $11, (%rdi), %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: trunc_v4f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vroundps $11, (%rdi), %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: trunc_v4f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vroundpd $11, (%rdi), %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: trunc_v8f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vroundps $11, (%rdi), %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: trunc_v8f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vrndscalepd $11, (%rdi), %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: trunc_v16f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vrndscaleps $11, (%rdi), %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
define <2 x double> @trunc_v2f64_broadcast(double* %ptr) {
; CHECK-LABEL: trunc_v2f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $11, %xmm0, %xmm0
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to2}, %xmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <2 x double> undef, double %ps, i32 0
define <4 x float> @trunc_v4f32_broadcast(float* %ptr) {
; CHECK-LABEL: trunc_v4f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to4}, %xmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <4 x float> undef, float %ps, i32 0
define <4 x double> @trunc_v4f64_broadcast(double* %ptr){
; CHECK-LABEL: trunc_v4f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $11, %ymm0, %ymm0
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to4}, %ymm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <4 x double> undef, double %ps, i32 0
define <8 x float> @trunc_v8f32_broadcast(float* %ptr) {
; CHECK-LABEL: trunc_v8f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $11, %ymm0, %ymm0
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to8}, %ymm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <8 x float> undef, float %ps, i32 0
define <8 x double> @trunc_v8f64_broadcast(double* %ptr){
; CHECK-LABEL: trunc_v8f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to8}, %zmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <8 x double> undef, double %ps, i32 0
define <16 x float> @trunc_v16f32_broadcast(float* %ptr) {
; CHECK-LABEL: trunc_v16f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to16}, %zmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <16 x float> undef, float %ps, i32 0
; CHECK-LABEL: trunc_v2f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; CHECK-NEXT: vroundpd $11, %xmm1, %xmm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to2}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: trunc_v4f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm1
-; CHECK-NEXT: vroundps $11, %xmm1, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to4}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: trunc_v4f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm1
-; CHECK-NEXT: vroundpd $11, %ymm1, %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to4}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: trunc_v8f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm1
-; CHECK-NEXT: vroundps $11, %ymm1, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to8}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: trunc_v8f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $11, %zmm1, %zmm1
-; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to8}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: trunc_v16f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $11, %zmm1, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to16}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: trunc_v2f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $11, %xmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to2}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: trunc_v4f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to4}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: trunc_v4f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $11, %ymm0, %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to4}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: trunc_v8f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $11, %ymm0, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to8}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: trunc_v8f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $11, (%rdi){1to8}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: trunc_v16f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $11, (%rdi){1to16}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: rint_v2f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vroundpd $4, %xmm0, %xmm0
-; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
; CHECK-LABEL: rint_v4f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vroundps $4, %xmm0, %xmm0
-; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
; CHECK-LABEL: rint_v4f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
; CHECK-LABEL: rint_v8f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vroundps $4, %ymm0, %ymm0
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
; CHECK-LABEL: rint_v8f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; CHECK-NEXT: vrndscalepd $4, %zmm0, %zmm0
-; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
; CHECK-LABEL: rint_v16f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; CHECK-NEXT: vrndscaleps $4, %zmm0, %zmm0
-; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
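; Zero-masked rint: the trailing zeroing vmovap* is absorbed into a
; {z}-masked VRNDSCALE.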
; CHECK-LABEL: rint_v2f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundpd $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
; CHECK-LABEL: rint_v4f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundps $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
; CHECK-LABEL: rint_v4f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
; CHECK-LABEL: rint_v8f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundps $4, %ymm0, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
; CHECK-LABEL: rint_v8f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscalepd $4, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
; CHECK-LABEL: rint_v16f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscaleps $4, %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
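; Merge-masked rint of a full-vector load: the load folds into the masked
; VRNDSCALE memory form.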
; CHECK-LABEL: rint_v2f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundpd $4, (%rdi), %xmm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: rint_v4f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundps $4, (%rdi), %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: rint_v4f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundpd $4, (%rdi), %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: rint_v8f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundps $4, (%rdi), %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: rint_v8f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscalepd $4, (%rdi), %zmm1
-; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: rint_v16f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscaleps $4, (%rdi), %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
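; Zero-masked rint of a full-vector load.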
; CHECK-LABEL: rint_v2f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vroundpd $4, (%rdi), %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: rint_v4f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vroundps $4, (%rdi), %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: rint_v4f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vroundpd $4, (%rdi), %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: rint_v8f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vroundps $4, (%rdi), %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: rint_v8f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vrndscalepd $4, (%rdi), %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: rint_v16f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vrndscaleps $4, (%rdi), %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
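; Unmasked rint of a scalar broadcast: expect a {1toN} broadcast memory
; operand instead of a separate vbroadcasts*/vmovddup followed by the round.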
define <2 x double> @rint_v2f64_broadcast(double* %ptr) {
; CHECK-LABEL: rint_v2f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $4, %xmm0, %xmm0
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to2}, %xmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <2 x double> undef, double %ps, i32 0
define <4 x float> @rint_v4f32_broadcast(float* %ptr) {
; CHECK-LABEL: rint_v4f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $4, %xmm0, %xmm0
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to4}, %xmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <4 x float> undef, float %ps, i32 0
define <4 x double> @rint_v4f64_broadcast(double* %ptr) {
; CHECK-LABEL: rint_v4f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to4}, %ymm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <4 x double> undef, double %ps, i32 0
define <8 x float> @rint_v8f32_broadcast(float* %ptr) {
; CHECK-LABEL: rint_v8f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $4, %ymm0, %ymm0
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to8}, %ymm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <8 x float> undef, float %ps, i32 0
define <8 x double> @rint_v8f64_broadcast(double* %ptr) {
; CHECK-LABEL: rint_v8f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $4, %zmm0, %zmm0
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to8}, %zmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <8 x double> undef, double %ps, i32 0
define <16 x float> @rint_v16f32_broadcast(float* %ptr) {
; CHECK-LABEL: rint_v16f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $4, %zmm0, %zmm0
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to16}, %zmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <16 x float> undef, float %ps, i32 0
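; Merge-masked rint of a scalar broadcast.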
; CHECK-LABEL: rint_v2f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; CHECK-NEXT: vroundpd $4, %xmm1, %xmm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to2}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: rint_v4f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm1
-; CHECK-NEXT: vroundps $4, %xmm1, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to4}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: rint_v4f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm1
-; CHECK-NEXT: vroundpd $4, %ymm1, %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to4}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: rint_v8f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm1
-; CHECK-NEXT: vroundps $4, %ymm1, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to8}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: rint_v8f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $4, %zmm1, %zmm1
-; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to8}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: rint_v16f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $4, %zmm1, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to16}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
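; Zero-masked rint of a scalar broadcast.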
; CHECK-LABEL: rint_v2f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to2}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: rint_v4f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to4}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: rint_v4f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to4}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: rint_v8f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $4, %ymm0, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to8}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: rint_v8f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $4, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $4, (%rdi){1to8}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: rint_v16f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $4, %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $4, (%rdi){1to16}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
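; The nearbyint tests mirror the rint ones, but with imm 12 (current MXCSR
; rounding mode, precision exceptions suppressed). Merge-masked register
; forms first.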
; CHECK-LABEL: nearbyint_v2f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vroundpd $12, %xmm0, %xmm0
-; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
; CHECK-LABEL: nearbyint_v4f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vroundps $12, %xmm0, %xmm0
-; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
; CHECK-LABEL: nearbyint_v4f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vroundpd $12, %ymm0, %ymm0
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
; CHECK-LABEL: nearbyint_v8f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vroundps $12, %ymm0, %ymm0
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
; CHECK-LABEL: nearbyint_v8f64_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
-; CHECK-NEXT: vrndscalepd $12, %zmm0, %zmm0
-; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
; CHECK-LABEL: nearbyint_v16f32_mask:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
-; CHECK-NEXT: vrndscaleps $12, %zmm0, %zmm0
-; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
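; Zero-masked nearbyint.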
; CHECK-LABEL: nearbyint_v2f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundpd $12, %xmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
; CHECK-LABEL: nearbyint_v4f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundps $12, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
; CHECK-LABEL: nearbyint_v4f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundpd $12, %ymm0, %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
; CHECK-LABEL: nearbyint_v8f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundps $12, %ymm0, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
; CHECK-LABEL: nearbyint_v8f64_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscalepd $12, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
; CHECK-LABEL: nearbyint_v16f32_maskz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscaleps $12, %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
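; Merge-masked nearbyint of a full-vector load.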
; CHECK-LABEL: nearbyint_v2f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundpd $12, (%rdi), %xmm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: nearbyint_v4f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vroundps $12, (%rdi), %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: nearbyint_v4f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundpd $12, (%rdi), %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: nearbyint_v8f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vroundps $12, (%rdi), %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: nearbyint_v8f64_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscalepd $12, (%rdi), %zmm1
-; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: nearbyint_v16f32_mask_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vrndscaleps $12, (%rdi), %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
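; Zero-masked nearbyint of a full-vector load.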
; CHECK-LABEL: nearbyint_v2f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vroundpd $12, (%rdi), %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%p = load <2 x double>, <2 x double>* %ptr
; CHECK-LABEL: nearbyint_v4f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vroundps $12, (%rdi), %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%p = load <4 x float>, <4 x float>* %ptr
; CHECK-LABEL: nearbyint_v4f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vroundpd $12, (%rdi), %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%p = load <4 x double>, <4 x double>* %ptr
; CHECK-LABEL: nearbyint_v8f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vroundps $12, (%rdi), %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%p = load <8 x float>, <8 x float>* %ptr
; CHECK-LABEL: nearbyint_v8f64_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vrndscalepd $12, (%rdi), %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%p = load <8 x double>, <8 x double>* %ptr
; CHECK-LABEL: nearbyint_v16f32_maskz_load:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vrndscaleps $12, (%rdi), %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%p = load <16 x float>, <16 x float>* %ptr
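; Unmasked nearbyint of a scalar broadcast.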
define <2 x double> @nearbyint_v2f64_broadcast(double* %ptr) {
; CHECK-LABEL: nearbyint_v2f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $12, %xmm0, %xmm0
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to2}, %xmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <2 x double> undef, double %ps, i32 0
define <4 x float> @nearbyint_v4f32_broadcast(float* %ptr) {
; CHECK-LABEL: nearbyint_v4f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $12, %xmm0, %xmm0
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to4}, %xmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <4 x float> undef, float %ps, i32 0
define <4 x double> @nearbyint_v4f64_broadcast(double* %ptr) {
; CHECK-LABEL: nearbyint_v4f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $12, %ymm0, %ymm0
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to4}, %ymm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <4 x double> undef, double %ps, i32 0
define <8 x float> @nearbyint_v8f32_broadcast(float* %ptr) {
; CHECK-LABEL: nearbyint_v8f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $12, %ymm0, %ymm0
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to8}, %ymm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <8 x float> undef, float %ps, i32 0
define <8 x double> @nearbyint_v8f64_broadcast(double* %ptr) {
; CHECK-LABEL: nearbyint_v8f64_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $12, %zmm0, %zmm0
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to8}, %zmm0
; CHECK-NEXT: retq
%ps = load double, double* %ptr
%pins = insertelement <8 x double> undef, double %ps, i32 0
define <16 x float> @nearbyint_v16f32_broadcast(float* %ptr) {
; CHECK-LABEL: nearbyint_v16f32_broadcast:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $12, %zmm0, %zmm0
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to16}, %zmm0
; CHECK-NEXT: retq
%ps = load float, float* %ptr
%pins = insertelement <16 x float> undef, float %ps, i32 0
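; Merge-masked nearbyint of a scalar broadcast.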
; CHECK-LABEL: nearbyint_v2f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; CHECK-NEXT: vroundpd $12, %xmm1, %xmm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to2}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: nearbyint_v4f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm1
-; CHECK-NEXT: vroundps $12, %xmm1, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to4}, %xmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: nearbyint_v4f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm1
-; CHECK-NEXT: vroundpd $12, %ymm1, %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to4}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: nearbyint_v8f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm1
-; CHECK-NEXT: vroundps $12, %ymm1, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to8}, %ymm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: nearbyint_v8f64_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm1
-; CHECK-NEXT: vrndscalepd $12, %zmm1, %zmm1
-; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to8}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: nearbyint_v16f32_mask_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm1
-; CHECK-NEXT: vrndscaleps $12, %zmm1, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to16}, %zmm0 {%k1}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
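; Zero-masked nearbyint of a scalar broadcast.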
; CHECK-LABEL: nearbyint_v2f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: vroundpd $12, %xmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to2}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <2 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: nearbyint_v4f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: vroundps $12, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to4}, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: nearbyint_v4f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: vroundpd $12, %ymm0, %ymm0
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to4}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <4 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: nearbyint_v8f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: vroundps $12, %ymm0, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to8}, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr
; CHECK-LABEL: nearbyint_v8f64_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT: vrndscalepd $12, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscalepd $12, (%rdi){1to8}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <8 x i64> %cmp, zeroinitializer
%ps = load double, double* %ptr
; CHECK-LABEL: nearbyint_v16f32_maskz_broadcast:
; CHECK: ## %bb.0:
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT: vrndscaleps $12, %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vrndscaleps $12, (%rdi){1to16}, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%c = icmp eq <16 x i32> %cmp, zeroinitializer
%ps = load float, float* %ptr