From 592ca89b48ccab8b48e7e6fd9b13114ad3c515d8 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 15 Apr 2019 17:17:35 +0000
Subject: [PATCH] [X86] Restore the pavg intrinsics.

The pattern we replaced these with may be too hard to match, as
demonstrated by PR41496 and PR41316. This patch restores the
intrinsics so that we can then focus on optimizing them.

I've mostly reverted the original patch that removed them, though I
modified the avx512 intrinsics to not have masking built in.

Differential Revision: https://reviews.llvm.org/D60674

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td              |  18 +++
 lib/IR/AutoUpgrade.cpp                        |  36 +++---
 lib/Target/X86/X86IntrinsicsInfo.h            |   6 +
 test/CodeGen/X86/avx2-intrinsics-fast-isel.ll |  16 +--
 .../X86/avx2-intrinsics-x86-upgrade.ll        |  20 ----
 test/CodeGen/X86/avx2-intrinsics-x86.ll       |  32 ++++++
 test/CodeGen/X86/avx512bw-intrinsics.ll       |  52 +++++++++
 test/CodeGen/X86/avx512bwvl-intrinsics.ll     | 105 ++++++++++++++++++
 test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |  38 +------
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  40 -------
 test/CodeGen/X86/sse2-intrinsics-x86.ll       |  42 +++++++
 11 files changed, 280 insertions(+), 125 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index e4f2acdfa99..06b603a788b 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -366,6 +366,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">,
+              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
+                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">,
+              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                         llvm_v8i16_ty], [IntrNoMem, Commutative]>;
   def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">,
               Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty,
                          llvm_v16i8_ty], [IntrNoMem, Commutative]>;
@@ -1330,6 +1336,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">,
+              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
   def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
                          llvm_v32i8_ty], [IntrNoMem, Commutative]>;
@@ -3533,6 +3545,12 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512">,
               Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
                          llvm_v32i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx512_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512">,
+              Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
+                        [IntrNoMem]>;
   def int_x86_avx512_pmaddw_d_512 : GCCBuiltin<"__builtin_ia32_pmaddwd512">,
               Intrinsic<[llvm_v16i32_ty], [llvm_v32i16_ty,
                          llvm_v32i16_ty], [IntrNoMem, Commutative]>;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 4f0a32efcae..84f250afa9a 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -386,8 +386,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.prol.") || // Added in 8.0
       Name.startswith("avx512.ptestm") || //Added in 6.0
       Name.startswith("avx512.ptestnm") || //Added in 6.0
-      Name.startswith("sse2.pavg") || // Added in 6.0
-      Name.startswith("avx2.pavg") || // Added in 6.0
      Name.startswith("avx512.mask.pavg")) // Added in 6.0
    return true;
@@ -1547,6 +1545,21 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
      IID = Intrinsic::x86_avx512_conflict_q_512;
    else
      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("pavg.")) {
+    if (Name[5] == 'b' && VecWidth == 128)
+      IID = Intrinsic::x86_sse2_pavg_b;
+    else if (Name[5] == 'b' && VecWidth == 256)
+      IID = Intrinsic::x86_avx2_pavg_b;
+    else if (Name[5] == 'b' && VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pavg_b_512;
+    else if (Name[5] == 'w' && VecWidth == 128)
+      IID = Intrinsic::x86_sse2_pavg_w;
+    else if (Name[5] == 'w' && VecWidth == 256)
+      IID = Intrinsic::x86_avx2_pavg_w;
+    else if (Name[5] == 'w' && VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pavg_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
   } else
     return false;
@@ -2974,25 +2987,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     LoadInst *LI = Builder.CreateAlignedLoad(VTy, BC, VTy->getBitWidth() / 8);
     LI->setMetadata(M->getMDKindID("nontemporal"), Node);
     Rep = LI;
-  } else if (IsX86 &&
-             (Name.startswith("sse2.pavg") || Name.startswith("avx2.pavg") ||
-              Name.startswith("avx512.mask.pavg"))) {
-    // llvm.x86.sse2.pavg.b/w, llvm.x86.avx2.pavg.b/w,
-    // llvm.x86.avx512.mask.pavg.b/w
-    Value *A = CI->getArgOperand(0);
-    Value *B = CI->getArgOperand(1);
-    VectorType *ZextType = VectorType::getExtendedElementVectorType(
-        cast<VectorType>(A->getType()));
-    Value *ExtendedA = Builder.CreateZExt(A, ZextType);
-    Value *ExtendedB = Builder.CreateZExt(B, ZextType);
-    Value *Sum = Builder.CreateAdd(ExtendedA, ExtendedB);
-    Value *AddOne = Builder.CreateAdd(Sum, ConstantInt::get(ZextType, 1));
-    Value *ShiftR =
-        Builder.CreateLShr(AddOne, ConstantInt::get(ZextType, 1));
-    Rep = Builder.CreateTrunc(ShiftR, A->getType());
-    if (CI->getNumArgOperands() > 2) {
-      Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
-                          CI->getArgOperand(2));
-    }
   } else if (IsX86 && (Name.startswith("fma.vfmadd.") ||
                        Name.startswith("fma.vfmsub.") ||
                        Name.startswith("fma.vfnmadd.") ||
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 8243c27b2ec..54168762c7a 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -370,6 +370,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -819,6 +821,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1030,6 +1034,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index 8720c86edff..f29891e6f8a 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -195,15 +195,11 @@ define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %zext0 = zext <32 x i8> %arg0 to <32 x i16>
-  %zext1 = zext <32 x i8> %arg1 to <32 x i16>
-  %add = add <32 x i16> %zext0, %zext1
-  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %res = trunc <32 x i16> %lshr to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
   %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_avg_epu16:
@@ -212,15 +208,11 @@ define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %zext0 = zext <16 x i16> %arg0 to <16 x i32>
-  %zext1 = zext <16 x i16> %arg1 to <16 x i32>
-  %add = add <16 x i32> %zext0, %zext1
-  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %res = trunc <16 x i32> %lshr to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
   %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_blend_epi16:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index b340a66def9..fc6929ce205 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -529,26 +529,6 @@ define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
 }
 declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; CHECK-LABEL: mm256_avg_epu8:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: mm256_avg_epu16:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
-
 define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
 ; CHECK-LABEL: test_x86_avx2_pabs_b:
 ; CHECK:       ## %bb.0:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index de20c07d413..aa68cfd70a8 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -151,6 +151,38 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() {
 }
 
 
+define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
+; AVX2-LABEL: test_x86_avx2_pavg_b:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xe0,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_b:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX2-LABEL: test_x86_avx2_pavg_w:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xe3,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_w:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-LABEL: test_x86_avx2_pmadd_wd:
 ; AVX2:       # %bb.0:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 71cfd891bb1..613bbd6633d 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -763,6 +763,58 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   ret <32 x i16> %res2
 }
 
+declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
+
+define <64 x i8> @test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1)
+  %2 = bitcast i64 %x3 to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2
+  %4 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1)
+  %res2 = add <64 x i8> %3, %4
+  ret <64 x i8> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
+  %4 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1)
+  %res2 = add <32 x i16> %3, %4
+  ret <32 x i16> %res2
+}
+
 declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
 
 define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index b9937603f16..bec461abed8 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1290,6 +1290,111 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   ret <16 x i16> %res2
 }
 
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_b_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; X86-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_b_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; X64-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %x0, <16 x i8> %x1)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2
+  %4 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %x0, <16 x i8> %x1)
+  %res2 = add <16 x i8> %3, %4
+  ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>)
+
+define <32 x i8> @test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_b_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; X86-NEXT:    vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_b_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; X64-NEXT:    vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %x0, <32 x i8> %x1)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2
+  %4 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %x0, <32 x i8> %x1)
+  %res2 = add <32 x i8> %3, %4
+  ret <32 x i8> %res2
+}
+
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_w_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_w_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %x0, <8 x i16> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
+  %4 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %x0, <8 x i16> %x1)
+  %res2 = add <8 x i16> %3, %4
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>)
+
+define <16 x i16> @test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_w_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_w_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %x0, <16 x i16> %x1)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
+  %4 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %x0, <16 x i16> %x1)
+  %res2 = add <16 x i16> %3, %4
+  ret <16 x i16> %res2
+}
+
 declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>)
 
 define <8 x i16> @test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index beb895537ee..92516b78760 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -333,28 +333,15 @@ define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: test_mm_avg_epu8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc0]
-; AVX512-NEXT:    # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc9]
-; AVX512-NEXT:    # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
-; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9]
-; AVX512-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1]
-; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x01]
-; AVX512-NEXT:    vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
-; AVX512-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
-  %zext0 = zext <16 x i8> %arg0 to <16 x i16>
-  %zext1 = zext <16 x i8> %arg1 to <16 x i16>
-  %add = add <16 x i16> %zext0, %zext1
-  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %res = trunc <16 x i16> %lshr to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
   %bc = bitcast <16 x i8> %res to <2 x i64>
   ret <2 x i64> %bc
 }
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
 
 define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE-LABEL: test_mm_avg_epu16:
@@ -369,28 +356,15 @@ define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: test_mm_avg_epu16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc0]
-; AVX512-NEXT:    # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vpmovzxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc9]
-; AVX512-NEXT:    # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
-; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9]
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1]
-; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x01]
-; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
-; AVX512-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %zext0 = zext <8 x i16> %arg0 to <8 x i32>
-  %zext1 = zext <8 x i16> %arg1 to <8 x i32>
-  %add = add <8 x i32> %zext0, %zext1
-  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %res = trunc <8 x i32> %lshr to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
   %bc = bitcast <8 x i16> %res to <2 x i64>
   ret <2 x i64> %bc
 }
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
 ; SSE-LABEL: test_mm_bslli_si128:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 7e4703f6957..d57778507e8 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -630,46 +630,6 @@ define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
 }
 declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
 
-define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: mm_avg_epu8:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: mm_avg_epu8:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: mm_avg_epu8:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: mm_avg_epu16:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: mm_avg_epu16:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: mm_avg_epu16:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-
 define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: test_x86_sse2_pmulu_dq:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 44585b77ce0..0ae2c9e3ccf 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -946,6 +946,48 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
 }
 
 
+define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_pavg_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX1-LABEL: test_x86_sse2_pavg_b:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse2_pavg_b:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_pavg_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX1-LABEL: test_x86_sse2_pavg_w:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse2_pavg_w:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
 define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: test_x86_sse2_pmadd_wd:
 ; SSE:       ## %bb.0:
-- 
2.50.1
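
For reference, the semantics at issue: pavg is a rounding unsigned average,
avg(a, b) = (a + b + 1) >> 1 computed in a widened type. Below is a minimal
LLVM IR sketch (the function names are illustrative, not part of the patch)
showing the restored intrinsic next to the generic pattern that the deleted
AutoUpgrade expansion used to emit, and that PR41496 and PR41316 show is hard
to re-match in the backend:

  declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)

  ; Rounding unsigned average via the restored intrinsic.
  define <16 x i8> @avg_intrinsic(<16 x i8> %a, <16 x i8> %b) {
    %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a, <16 x i8> %b)
    ret <16 x i8> %res
  }

  ; The equivalent generic pattern: widen so the sum cannot overflow, add,
  ; add 1 to round up, shift right by 1, then narrow back.
  define <16 x i8> @avg_pattern(<16 x i8> %a, <16 x i8> %b) {
    %za = zext <16 x i8> %a to <16 x i16>
    %zb = zext <16 x i8> %b to <16 x i16>
    %sum = add <16 x i16> %za, %zb
    %rnd = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %shr = lshr <16 x i16> %rnd, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %res = trunc <16 x i16> %shr to <16 x i8>
    ret <16 x i8> %res
  }

Since the AVX512 variants are restored without built-in masking, masked uses
are expressed with an explicit select, as in the new tests: bitcast the
integer mask to <N x i1> and select between the intrinsic result and the
passthru operand.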