[OpenCL] Add '-cl-uniform-work-group-size' compile option

author Alexey Sotkin <alexey.sotkin@intel.com>

Thu, 22 Feb 2018 11:54:14 +0000 (11:54 +0000)

committer Alexey Sotkin <alexey.sotkin@intel.com>

Thu, 22 Feb 2018 11:54:14 +0000 (11:54 +0000)
author Alexey Sotkin <alexey.sotkin@intel.com>
Thu, 22 Feb 2018 11:54:14 +0000 (11:54 +0000)
committer Alexey Sotkin <alexey.sotkin@intel.com>
Thu, 22 Feb 2018 11:54:14 +0000 (11:54 +0000)
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td

index a1b9810c5475abca3114a9666839d86e272b8a85..50865f22f867f80705bd454b4cbdb006f5dc42cc 100644 (file)
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -518,6 +518,8 @@ def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group
    HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
  def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
    HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
+def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group<opencl_Group>, Flags<[CC1Option]>,
+  HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">;
  def client__name : JoinedOrSeparate<["-"], "client_name">;
  def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
  def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
diff --git a/include/clang/Frontend/CodeGenOptions.def b/include/clang/Frontend/CodeGenOptions.def

index 599b57c189f1196d8d23e5134fc3ff1906a343c3..e987252b2ffbebfb1d7de185c4cb5b98d105c798 100644 (file)
--- a/include/clang/Frontend/CodeGenOptions.def
+++ b/include/clang/Frontend/CodeGenOptions.def
@@ -128,6 +128,7 @@ CODEGENOPT(NoTrappingMath    , 1, 0) ///< Set when -fno-trapping-math is enabled
  CODEGENOPT(NoNaNsFPMath      , 1, 0) ///< Assume FP arguments, results not NaN.
  CODEGENOPT(FlushDenorm       , 1, 0) ///< Allow FP denorm numbers to be flushed to zero
  CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
+CODEGENOPT(UniformWGSize     , 1, 0) ///< -cl-uniform-work-group-size
  CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss.
  /// \brief Method of Objective-C dispatch to use.
  ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy)
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp

index d62b080e31cd655984094715db4edc225f1f835d..959df69f22eb0190aedbda5794fcc11f907e6e2c 100644 (file)
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -1870,6 +1870,21 @@ void CodeGenModule::ConstructAttributeList(
      }
    }
  
+  if (TargetDecl && TargetDecl->hasAttr<OpenCLKernelAttr>()) {
+    if (getLangOpts().OpenCLVersion <= 120) {
+      // OpenCL v1.2 Work groups are always uniform
+      FuncAttrs.addAttribute("uniform-work-group-size", "true");
+    } else {
+      // OpenCL v2.0 Work groups may be whether uniform or not.
+      // '-cl-uniform-work-group-size' compile option gets a hint
+      // to the compiler that the global work-size be a multiple of
+      // the work-group size specified to clEnqueueNDRangeKernel
+      // (i.e. work groups are uniform).
+      FuncAttrs.addAttribute("uniform-work-group-size",
+                             llvm::toStringRef(CodeGenOpts.UniformWGSize));
+    }
+  }
+
    if (!AttrOnCallSite) {
      bool DisableTailCalls =
          CodeGenOpts.DisableTailCalls ||
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp

index 64bea9e56ab677ba55cc4b825d6ee0f7940d7c49..cc5cf0f042f0755465ba80bc32581c02aa6457ff 100644 (file)
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -2379,6 +2379,7 @@ static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs) {
        options::OPT_cl_no_signed_zeros,
        options::OPT_cl_denorms_are_zero,
        options::OPT_cl_fp32_correctly_rounded_divide_sqrt,
+      options::OPT_cl_uniform_work_group_size
    };
  
    if (Arg *A = Args.getLastArg(options::OPT_cl_std_EQ)) {
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp

index 3bdc116b9633c334447af04fe379e817a3d3ab51..5be02c96826ba5b93cb077cdaae7bc9af3665bdb 100644 (file)
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -659,6 +659,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
    Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
    Opts.CorrectlyRoundedDivSqrt =
        Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
+  Opts.UniformWGSize =
+      Args.hasArg(OPT_cl_uniform_work_group_size);
    Opts.Reciprocals = Args.getAllArgValues(OPT_mrecip_EQ);
    Opts.ReciprocalMath = Args.hasArg(OPT_freciprocal_math);
    Opts.NoTrappingMath = Args.hasArg(OPT_fno_trapping_math);
diff --git a/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl

index 1027fd740c524334b01a4ff4ff1abd3ce5d21ffb..aec00e76014ec2d4967ae45f313dba34a28b29be 100644 (file)
--- a/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -425,7 +425,7 @@ struct_char_arr32 func_ret_struct_char_arr32()
    return s;
  }
  
-// CHECK: define i32 @func_transparent_union_ret() local_unnamed_addr #0 {
+// CHECK: define i32 @func_transparent_union_ret() local_unnamed_addr #1 {
  // CHECK: ret i32 0
  transparent_u func_transparent_union_ret()
  {
diff --git a/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/test/CodeGenOpenCL/cl-uniform-wg-size.cl

new file mode 100644 (file)

index 0000000..76ace5d
--- /dev/null
+++ b/test/CodeGenOpenCL/cl-uniform-wg-size.cl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL1.2 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-NONUNIFORM
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -cl-uniform-work-group-size -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
+
+kernel void ker() {};
+// CHECK: define{{.*}}@ker() #0
+
+void foo() {};
+// CHECK: define{{.*}}@foo() #1
+
+// CHECK-LABEL: attributes #0
+// CHECK-UNIFORM: "uniform-work-group-size"="true"
+// CHECK-NONUNIFORM: "uniform-work-group-size"="false"
+
+// CHECK-LABEL: attributes #1
+// CHECK-NOT: uniform-work-group-size
diff --git a/test/CodeGenOpenCL/convergent.cl b/test/CodeGenOpenCL/convergent.cl

index 285b637ca687957cdff8c9e97c5674d2a9db0b46..a011920761f3606764eabd1072b2a0c7dc23879c 100644 (file)
--- a/test/CodeGenOpenCL/convergent.cl
+++ b/test/CodeGenOpenCL/convergent.cl
@@ -127,7 +127,7 @@ void test_not_unroll() {
  // CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
  
  // CHECK-LABEL: @assume_convergent_asm
-// CHECK: tail call void asm sideeffect "s_barrier", ""() #4
+// CHECK: tail call void asm sideeffect "s_barrier", ""() #5
  kernel void assume_convergent_asm()
  {
    __asm__ volatile("s_barrier");
@@ -138,4 +138,5 @@ kernel void assume_convergent_asm()
  // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} }
  // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
  // CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #5 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
+// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} }
+// CHECK: attributes #6 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
diff --git a/test/Driver/opencl.cl b/test/Driver/opencl.cl

index d68d424b6e362a6edd2ce090da89e89ea0e90944..8c421beeefebfd8f69707c327241d7adb073309b 100644 (file)
--- a/test/Driver/opencl.cl
+++ b/test/Driver/opencl.cl
@@ -13,6 +13,7 @@
  // RUN: %clang -S -### -cl-no-signed-zeros %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SIGNED-ZEROS %s
  // RUN: %clang -S -### -cl-denorms-are-zero %s 2>&1 | FileCheck --check-prefix=CHECK-DENORMS-ARE-ZERO %s
  // RUN: %clang -S -### -cl-fp32-correctly-rounded-divide-sqrt %s 2>&1 | FileCheck --check-prefix=CHECK-ROUND-DIV %s
+// RUN: %clang -S -### -cl-uniform-work-group-size %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s
  // RUN: not %clang -cl-std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
  // RUN: not %clang -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
  
@@ -31,6 +32,7 @@
  // CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros"
  // CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero"
  // CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt"
+// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size"
  // CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
  // CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
author	Alexey Sotkin <alexey.sotkin@intel.com>
	Thu, 22 Feb 2018 11:54:14 +0000 (11:54 +0000)
committer	Alexey Sotkin <alexey.sotkin@intel.com>
	Thu, 22 Feb 2018 11:54:14 +0000 (11:54 +0000)
include/clang/Driver/Options.td		patch \| blob \| history
include/clang/Frontend/CodeGenOptions.def		patch \| blob \| history
lib/CodeGen/CGCall.cpp		patch \| blob \| history
lib/Driver/ToolChains/Clang.cpp		patch \| blob \| history
lib/Frontend/CompilerInvocation.cpp		patch \| blob \| history
test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl		patch \| blob \| history
test/CodeGenOpenCL/cl-uniform-wg-size.cl	[new file with mode: 0644]	patch \| blob
test/CodeGenOpenCL/convergent.cl		patch \| blob \| history
test/Driver/opencl.cl		patch \| blob \| history