From c4d3d32435b0867c3e518acd4004249b7cd63b71 Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Wed, 18 Apr 2018 21:51:48 +0000
Subject: [PATCH] [NVPTX, CUDA] Added support for m8n32k16 and m32n8k16 variants of wmma instructions.

The new instructions were added for sm_70+ GPUs in CUDA-9.1.

Differential Revision: https://reviews.llvm.org/D45068

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@330296 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/Basic/BuiltinsNVPTX.def |  60 +++-
 lib/CodeGen/CGBuiltin.cpp             | 150 +++++++-
 lib/Driver/ToolChains/Cuda.cpp        |  22 +-
 test/CodeGen/builtins-nvptx-sm_70.cu  | 489 +++++++++++++++++++++-----
 4 files changed, 588 insertions(+), 133 deletions(-)

diff --git a/include/clang/Basic/BuiltinsNVPTX.def b/include/clang/Basic/BuiltinsNVPTX.def
index ae33315d89..4e5d049930 100644
--- a/include/clang/Basic/BuiltinsNVPTX.def
+++ b/include/clang/Basic/BuiltinsNVPTX.def
@@ -18,11 +18,18 @@
 # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
 #endif

+#pragma push_macro("SM_70")
+#define SM_70 "sm_70|sm_71"
 #pragma push_macro("SM_60")
-#define SM_60 "sm_60|sm_61|sm_62|sm_70|sm_71"
+#define SM_60 "sm_60|sm_61|sm_62|" SM_70
+#pragma push_macro("PTX61")
+#define PTX61 "ptx61"
 #pragma push_macro("PTX60")
-#define PTX60 "ptx60|ptx61"
+#define PTX60 "ptx60|" PTX61
+
+#pragma push_macro("AND")
+#define AND(a, b) a "," b

 // Special Registers

@@ -698,19 +705,46 @@ BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "")
 BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "")

 // Builtins to support WMMA instructions on sm_70
-TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", PTX60)
-
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", PTX60)
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", PTX60)
+TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX60))
+TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX60))
+TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX60))
+TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX60))
+TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX60))
+TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX60))
+
+TARGET_BUILTIN(__hmma_m32n8k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m32n8k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m32n8k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m32n8k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX61))
+
+TARGET_BUILTIN(__hmma_m8n32k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m8n32k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61))
+TARGET_BUILTIN(__hmma_m8n32k16_st_c_f16,
"vi*i*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX61)) + +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX60)) + +TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) + +TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) #undef BUILTIN #undef TARGET_BUILTIN +#pragma pop_macro("AND") #pragma pop_macro("SM_60") +#pragma pop_macro("SM_70") #pragma pop_macro("PTX60") +#pragma pop_macro("PTX61") diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index fffc2429fb..6a2f2b0a4a 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -10715,7 +10715,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__hmma_m16n16k16_ld_a: case NVPTX::BI__hmma_m16n16k16_ld_b: case NVPTX::BI__hmma_m16n16k16_ld_c_f16: - case NVPTX::BI__hmma_m16n16k16_ld_c_f32: { + case NVPTX::BI__hmma_m16n16k16_ld_c_f32: + case NVPTX::BI__hmma_m32n8k16_ld_a: + case NVPTX::BI__hmma_m32n8k16_ld_b: + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + case NVPTX::BI__hmma_m8n32k16_ld_a: + case NVPTX::BI__hmma_m8n32k16_ld_b: + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Value *Src = EmitScalarExpr(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -10746,6 +10754,46 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride; NumResults = 8; break; + case NVPTX::BI__hmma_m32n8k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_b: + IID = isColMajor ? 
Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride; + NumResults = 8; + break; default: llvm_unreachable("Unexpected builtin ID."); } @@ -10764,7 +10812,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, } case NVPTX::BI__hmma_m16n16k16_st_c_f16: - case NVPTX::BI__hmma_m16n16k16_st_c_f32: { + case NVPTX::BI__hmma_m16n16k16_st_c_f32: + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + case NVPTX::BI__hmma_m8n32k16_st_c_f32: { Value *Dst = EmitScalarExpr(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -10786,6 +10838,24 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride; break; + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride; + break; + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m8n32k16_st_c_f32: + IID = isColMajor ? 
Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride; + break; default: llvm_unreachable("Unexpected builtin ID."); } @@ -10808,7 +10878,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__hmma_m16n16k16_mma_f16f16: case NVPTX::BI__hmma_m16n16k16_mma_f32f16: case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - case NVPTX::BI__hmma_m16n16k16_mma_f16f32: { + case NVPTX::BI__hmma_m16n16k16_mma_f16f32: + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + case NVPTX::BI__hmma_m32n8k16_mma_f32f32: + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Address SrcA = EmitPointerWithAlignment(E->getArg(1)); Address SrcB = EmitPointerWithAlignment(E->getArg(2)); @@ -10825,15 +10903,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, bool Satf = SatfArg.getSExtValue(); // clang-format off -#define MMA_VARIANTS(type) {{ \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_##type##_satfinite, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_row_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_col_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_col_##type##_satfinite \ +#define MMA_VARIANTS(geom, type) {{ \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \ }} // clang-format on @@ -10847,22 +10925,62 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, unsigned NumEltsD; switch (BuiltinID) { case NVPTX::BI__hmma_m16n16k16_mma_f16f16: - IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16)); NumEltsC = 4; NumEltsD = 4; break; case NVPTX::BI__hmma_m16n16k16_mma_f32f16: - IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16)); NumEltsC = 4; NumEltsD = 8; break; case NVPTX::BI__hmma_m16n16k16_mma_f16f32: - IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32)); NumEltsC = 8; NumEltsD = 4; break; case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case 
NVPTX::BI__hmma_m32n8k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32)); NumEltsC = 8; NumEltsD = 8; break; diff --git a/lib/Driver/ToolChains/Cuda.cpp b/lib/Driver/ToolChains/Cuda.cpp index fc74a8ebbf..fdd62fec39 100644 --- a/lib/Driver/ToolChains/Cuda.cpp +++ b/lib/Driver/ToolChains/Cuda.cpp @@ -622,17 +622,19 @@ void CudaToolChain::addClangTargetOptions( CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); - if (CudaInstallation.version() >= CudaVersion::CUDA_90) { - // CUDA-9 uses new instructions that are only available in PTX6.0 - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+ptx60"); - } else { - // Libdevice in CUDA-7.0 requires PTX version that's more recent - // than LLVM defaults to. Use PTX4.2 which is the PTX version that - // came with CUDA-7.0. - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+ptx42"); + // Libdevice in CUDA-7.0 requires PTX version that's more recent than LLVM + // defaults to. Use PTX4.2 by default, which is the PTX version that came with + // CUDA-7.0. + const char *PtxFeature = "+ptx42"; + if (CudaInstallation.version() >= CudaVersion::CUDA_91) { + // CUDA-9.1 uses new instructions that are only available in PTX6.1+ + PtxFeature = "+ptx61"; + } else if (CudaInstallation.version() >= CudaVersion::CUDA_90) { + // CUDA-9.0 uses new instructions that are only available in PTX6.0+ + PtxFeature = "+ptx60"; } + CC1Args.push_back("-target-feature"); + CC1Args.push_back(PtxFeature); if (DeviceOffloadingKind == Action::OFK_OpenMP) { SmallVector LibraryPaths; diff --git a/test/CodeGen/builtins-nvptx-sm_70.cu b/test/CodeGen/builtins-nvptx-sm_70.cu index 1e9133b3ac..66fa1b5d63 100644 --- a/test/CodeGen/builtins-nvptx-sm_70.cu +++ b/test/CodeGen/builtins-nvptx-sm_70.cu @@ -1,9 +1,16 @@ // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \ // RUN: -fcuda-is-device -target-feature +ptx60 \ // RUN: -S -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CHECK %s +// RUN: | FileCheck -check-prefix=CHECK_M16 %s +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \ +// RUN: -fcuda-is-device -target-feature +ptx61 -DPTX61 \ +// RUN: -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefixes=CHECK_M16,CHECK_M32_M8 %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \ -// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s +// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-sm_70 %s +// RUN: %clang_cc1 -triple nvptx-unknown-unknown \ +// RUN: -target-cpu sm_70 -target-feature +ptx60 \ +// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-ptx61 %s #if !defined(CUDA_VERSION) #define __device__ __attribute__((device)) @@ -18,149 +25,443 @@ typedef unsigned long long uint64_t; // that encounters an error, so -verify will not be able to find errors in // subsequent functions. 
-// CHECK-LABEL: nvvm_wmma -__device__ void nvvm_wmma(int *src, int *dst, - float *fsrc, float *fdst, - int ldm) { - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}} +// CHECK-LABEL: nvvm_wmma_m16n16k16 +__device__ void nvvm_wmma_m16n16k16(int *src, int *dst, + float *fsrc, float *fdst, + int ldm) { + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_a(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_a(dst, src+1, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_b(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_b(dst, src+2, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature 
sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_st_c_f16(dst, src, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature 
sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32 + // pre-sm_70-error-re@+1 
{{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} 
@llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32 + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite + // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); } + +#ifdef PTX61 +// CHECK-LABEL: nvvm_wmma_m32n8k16 +__device__ void nvvm_wmma_m32n8k16(int *src, int *dst, + float *fsrc, 
float *fdst, + int ldm) { + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_a(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_a(dst, src+1, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_b(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_b(dst, src+2, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite + 
// pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 
0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); + + + // m8n32k16 variants. 
+ + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_a(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_a(dst, src+1, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_b(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_b(dst, src+2, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite + // 
pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 
1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32 + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite + // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); +} +#endif -- 2.40.0
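
Note (editor's addition, not part of the patch): a minimal usage sketch of the new m32n8k16 builtins, assuming a compiler built with this change and the same mode the test uses (CUDA device compilation for sm_70 with +ptx61). The function name, pointer names, and local fragment arrays are hypothetical; the fragment sizes (8 ints for A/B, 8 floats for the f32 accumulator) follow the NumResults/NumEltsC/NumEltsD values in the CGBuiltin.cpp hunk above.

  #define __device__ __attribute__((device))  // as the test file does when CUDA headers are absent

  __device__ void sketch_m32n8k16(const int *a, const int *b,
                                  const float *c, float *d, unsigned ldm) {
    // Per-thread fragments; the builtins read and write them through raw pointers.
    int a_frag[8], b_frag[8];
    float c_frag[8], d_frag[8];

    __hmma_m32n8k16_ld_a(a_frag, a, ldm, 0);      // last argument 0 selects row-major
    __hmma_m32n8k16_ld_b(b_frag, b, ldm, 0);
    __hmma_m32n8k16_ld_c_f32(c_frag, c, ldm, 0);

    // D = A * B + C; layout index 0 maps to the row.row intrinsic, final 0 means no .satfinite.
    __hmma_m32n8k16_mma_f32f32(d_frag, a_frag, b_frag, c_frag, 0, 0);

    __hmma_m32n8k16_st_c_f32(d, d_frag, ldm, 0);
  }

The m8n32k16 builtins follow the same pattern with the same fragment sizes; only the geometry in the generated @llvm.nvvm.wmma.* intrinsic names changes.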