From 592ca89b48ccab8b48e7e6fd9b13114ad3c515d8 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 15 Apr 2019 17:17:35 +0000
Subject: [PATCH] [X86] Restore the pavg intrinsics.

The pattern we replaced these with may be too hard to match, as
demonstrated by PR41496 and PR41316. This patch restores the
intrinsics so that we can then focus on optimizing them.

I've mostly reverted the original patch that removed them, though I
modified the avx512 intrinsics to not have masking built in.

Differential Revision: https://reviews.llvm.org/D60674

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td              |  18 +++
 lib/IR/AutoUpgrade.cpp                        |  36 +++---
 lib/Target/X86/X86IntrinsicsInfo.h            |   6 +
 test/CodeGen/X86/avx2-intrinsics-fast-isel.ll |  16 +--
 .../X86/avx2-intrinsics-x86-upgrade.ll        |  20 ----
 test/CodeGen/X86/avx2-intrinsics-x86.ll       |  32 ++++++
 test/CodeGen/X86/avx512bw-intrinsics.ll       |  52 +++++++++
 test/CodeGen/X86/avx512bwvl-intrinsics.ll     | 105 ++++++++++++++++++
 test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |  38 +------
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  40 -------
 test/CodeGen/X86/sse2-intrinsics-x86.ll       |  42 +++++++
 11 files changed, 280 insertions(+), 125 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index e4f2acdfa99..06b603a788b 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -366,6 +366,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">,
+              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
+                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">,
+              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                         llvm_v8i16_ty], [IntrNoMem, Commutative]>;
   def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">,
               Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty,
                          llvm_v16i8_ty], [IntrNoMem, Commutative]>;
@@ -1330,6 +1336,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">,
+              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
   def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
                          llvm_v32i8_ty], [IntrNoMem, Commutative]>;
@@ -3533,6 +3545,12 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512">,
               Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
                          llvm_v32i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx512_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512">,
+              Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
+                        [IntrNoMem]>;
   def int_x86_avx512_pmaddw_d_512 : GCCBuiltin<"__builtin_ia32_pmaddwd512">,
               Intrinsic<[llvm_v16i32_ty], [llvm_v32i16_ty,
                          llvm_v32i16_ty], [IntrNoMem, Commutative]>;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 4f0a32efcae..84f250afa9a 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -386,8 +386,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.prol.") || // Added in 8.0
       Name.startswith("avx512.ptestm") || //Added in 6.0
       Name.startswith("avx512.ptestnm") || //Added in 6.0
-      Name.startswith("sse2.pavg") || // Added in 6.0
-      Name.startswith("avx2.pavg") || // Added in 6.0
      Name.startswith("avx512.mask.pavg")) // Added in 6.0
    return true;
@@ -1547,6 +1545,21 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
      IID = Intrinsic::x86_avx512_conflict_q_512;
    else
      llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("pavg.")) {
+    if (Name[5] == 'b' && VecWidth == 128)
+      IID = Intrinsic::x86_sse2_pavg_b;
+    else if (Name[5] == 'b' && VecWidth == 256)
+      IID = Intrinsic::x86_avx2_pavg_b;
+    else if (Name[5] == 'b' && VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pavg_b_512;
+    else if (Name[5] == 'w' && VecWidth == 128)
+      IID = Intrinsic::x86_sse2_pavg_w;
+    else if (Name[5] == 'w' && VecWidth == 256)
+      IID = Intrinsic::x86_avx2_pavg_w;
+    else if (Name[5] == 'w' && VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pavg_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
   } else
     return false;
@@ -2974,25 +2987,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     LoadInst *LI = Builder.CreateAlignedLoad(VTy, BC, VTy->getBitWidth() / 8);
     LI->setMetadata(M->getMDKindID("nontemporal"), Node);
     Rep = LI;
-  } else if (IsX86 &&
-             (Name.startswith("sse2.pavg") || Name.startswith("avx2.pavg") ||
-              Name.startswith("avx512.mask.pavg"))) {
-    // llvm.x86.sse2.pavg.b/w, llvm.x86.avx2.pavg.b/w,
-    // llvm.x86.avx512.mask.pavg.b/w
-    Value *A = CI->getArgOperand(0);
-    Value *B = CI->getArgOperand(1);
-    VectorType *ZextType = VectorType::getExtendedElementVectorType(
-        cast<VectorType>(A->getType()));
-    Value *ExtendedA = Builder.CreateZExt(A, ZextType);
-    Value *ExtendedB = Builder.CreateZExt(B, ZextType);
-    Value *Sum = Builder.CreateAdd(ExtendedA, ExtendedB);
-    Value *AddOne = Builder.CreateAdd(Sum, ConstantInt::get(ZextType, 1));
-    Value *ShiftR =
-        Builder.CreateLShr(AddOne, ConstantInt::get(ZextType, 1));
-    Rep = Builder.CreateTrunc(ShiftR, A->getType());
-    if (CI->getNumArgOperands() > 2) {
-      Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
-                          CI->getArgOperand(2));
-    }
   } else if (IsX86 && (Name.startswith("fma.vfmadd.") ||
                        Name.startswith("fma.vfmsub.") ||
                        Name.startswith("fma.vfnmadd.") ||
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 8243c27b2ec..54168762c7a 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -370,6 +370,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -819,6 +821,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1030,6 +1034,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index 8720c86edff..f29891e6f8a 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -195,15 +195,11 @@ define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %zext0 = zext <32 x i8> %arg0 to <32 x i16>
-  %zext1 = zext <32 x i8> %arg1 to <32 x i16>
-  %add = add <32 x i16> %zext0, %zext1
-  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %res = trunc <32 x i16> %lshr to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
   %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_avg_epu16:
@@ -212,15 +208,11 @@ define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %zext0 = zext <16 x i16> %arg0 to <16 x i32>
-  %zext1 = zext <16 x i16> %arg1 to <16 x i32>
-  %add = add <16 x i32> %zext0, %zext1
-  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %res = trunc <16 x i32> %lshr to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
   %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_blend_epi16:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index b340a66def9..fc6929ce205 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -529,26 +529,6 @@ define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
 }
 declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; CHECK-LABEL: mm256_avg_epu8:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: mm256_avg_epu16:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
-
 define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
 ; CHECK-LABEL: test_x86_avx2_pabs_b:
 ; CHECK:       ## %bb.0:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index de20c07d413..aa68cfd70a8 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -151,6 +151,38 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() {
 }
 
 
+define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
+; AVX2-LABEL: test_x86_avx2_pavg_b:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xe0,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_b:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX2-LABEL: test_x86_avx2_pavg_w:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xe3,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_w:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-LABEL: test_x86_avx2_pmadd_wd:
 ; AVX2:       # %bb.0:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 71cfd891bb1..613bbd6633d 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -763,6 +763,58 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   ret <32 x i16> %res2
 }
 
+declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
+
+define <64 x i8> @test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1)
+  %2 = bitcast i64 %x3 to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2
+  %4 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1)
+  %res2 = add <64 x i8> %3, %4
+  ret <64 x i8> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
+  %4 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1)
+  %res2 = add <32 x i16> %3, %4
+  ret <32 x i16> %res2
+}
+
 declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
 
 define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index b9937603f16..bec461abed8 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1290,6 +1290,111 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   ret <16 x i16> %res2
 }
 
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_b_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; X86-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_b_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; X64-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %x0, <16 x i8> %x1)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2
+  %4 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %x0, <16 x i8> %x1)
+  %res2 = add <16 x i8> %3, %4
+  ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>)
+
+define <32 x i8> @test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_b_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; X86-NEXT:    vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_b_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; X64-NEXT:    vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %x0, <32 x i8> %x1)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2
+  %4 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %x0, <32 x i8> %x1)
+  %res2 = add <32 x i8> %3, %4
+  ret <32 x i8> %res2
+}
+
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_w_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_w_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %x0, <8 x i16> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
+  %4 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %x0, <8 x i16> %x1)
+  %res2 = add <8 x i16> %3, %4
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>)
+
+define <16 x i16> @test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pavg_w_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pavg_w_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %x0, <16 x i16> %x1)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
+  %4 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %x0, <16 x i16> %x1)
+  %res2 = add <16 x i16> %3, %4
+  ret <16 x i16> %res2
+}
+
 declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>)
 
 define <8 x i16> @test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index beb895537ee..92516b78760 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -333,28 +333,15 @@ define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: test_mm_avg_epu8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc0]
-; AVX512-NEXT:    # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc9]
-; AVX512-NEXT:    # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
-; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9]
-; AVX512-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1]
-; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x01]
-; AVX512-NEXT:    vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
-; AVX512-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
-  %zext0 = zext <16 x i8> %arg0 to <16 x i16>
-  %zext1 = zext <16 x i8> %arg1 to <16 x i16>
-  %add = add <16 x i16> %zext0, %zext1
-  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %res = trunc <16 x i16> %lshr to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
   %bc = bitcast <16 x i8> %res to <2 x i64>
   ret <2 x i64> %bc
 }
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
 
 define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE-LABEL: test_mm_avg_epu16:
@@ -369,28 +356,15 @@ define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: test_mm_avg_epu16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc0]
-; AVX512-NEXT:    # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vpmovzxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc9]
-; AVX512-NEXT:    # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
-; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9]
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1]
-; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x01]
-; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
-; AVX512-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %zext0 = zext <8 x i16> %arg0 to <8 x i32>
-  %zext1 = zext <8 x i16> %arg1 to <8 x i32>
-  %add = add <8 x i32> %zext0, %zext1
-  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %res = trunc <8 x i32> %lshr to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
   %bc = bitcast <8 x i16> %res to <2 x i64>
   ret <2 x i64> %bc
 }
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
 ; SSE-LABEL: test_mm_bslli_si128:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 7e4703f6957..d57778507e8 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -630,46 +630,6 @@ define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
 }
 declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
 
-define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: mm_avg_epu8:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: mm_avg_epu8:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: mm_avg_epu8:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: mm_avg_epu16:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: mm_avg_epu16:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: mm_avg_epu16:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-
 define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: test_x86_sse2_pmulu_dq:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 44585b77ce0..0ae2c9e3ccf 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -946,6 +946,48 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
 }
 
 
+define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_pavg_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX1-LABEL: test_x86_sse2_pavg_b:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse2_pavg_b:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_pavg_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX1-LABEL: test_x86_sse2_pavg_w:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse2_pavg_w:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
 define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: test_x86_sse2_pmadd_wd:
 ; SSE:       ## %bb.0:
-- 
2.50.1
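
For reference, the semantics at issue: pavg is a rounding unsigned average,
avg(a, b) = (a + b + 1) >> 1 computed in a widened type. Below is a minimal
LLVM IR sketch (the function names are illustrative, not part of the patch)
showing the restored intrinsic next to the generic pattern that the deleted
AutoUpgrade expansion used to emit, and that PR41496 and PR41316 show is hard
to re-match in the backend:

  declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)

  ; Rounding unsigned average via the restored intrinsic.
  define <16 x i8> @avg_intrinsic(<16 x i8> %a, <16 x i8> %b) {
    %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a, <16 x i8> %b)
    ret <16 x i8> %res
  }

  ; The equivalent generic pattern: widen so the sum cannot overflow, add,
  ; add 1 to round up, shift right by 1, then narrow back.
  define <16 x i8> @avg_pattern(<16 x i8> %a, <16 x i8> %b) {
    %za = zext <16 x i8> %a to <16 x i16>
    %zb = zext <16 x i8> %b to <16 x i16>
    %sum = add <16 x i16> %za, %zb
    %rnd = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %shr = lshr <16 x i16> %rnd, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %res = trunc <16 x i16> %shr to <16 x i8>
    ret <16 x i8> %res
  }

Since the AVX512 variants are restored without built-in masking, masked uses
are expressed with an explicit select, as in the new tests: bitcast the
integer mask to <N x i1> and select between the intrinsic result and the
passthru operand.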