[OG9, amdgcn, committed] Detect the actual number of hardware CUs


[OG9, amdgcn, committed] Detect the actual number of hardware CUs

Andrew Stubbs-4
This patch improves out-of-the-box benchmark results by ensuring that we
don't launch 64 gangs on a device that only has 60 compute units, such
as consumer Vega 20.

It's not suitable for upstream mainline yet because we need to update
hsa.h with definitions from the Radeon Open Compute Runtime (ROCr), and
there are license issues with copying those headers. We could instead
extract the definitions from the public documentation, but doing so is
still on my TODO list.

Andrew

Detect number of GPU compute units.

2019-09-10  Andrew Stubbs  <[hidden email]>

        libgomp/
        * plugin/plugin-gcn.c (HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT): Define.
        (dump_hsa_agent_info): Dump compute unit count.
        (get_cu_count): New function.
        (parse_target_attributes): Use get_cu_count for default gdims.
        (gcn_exec): Likewise.

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 6c00c81b588..9d03e4f9f5b 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -44,6 +44,11 @@
 #include "oacc-int.h"
 #include <assert.h>
 
+/* Additional definitions not in HSA 1.1.
+   FIXME: this needs to be updated in hsa.h for upstream, but the only source
+          right now is the ROCr source which may cause license issues.  */
+#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
+
 /* These probably won't be in elf.h for a while.  */
 #define R_AMDGPU_NONE 0
 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF  */
@@ -845,6 +850,14 @@ dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
   else
     HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: FAILED\n");
 
+  uint32_t cu_count;
+  status = hsa_fns.hsa_agent_get_info_fn
+    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
+  else
+    HSA_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
+
   uint32_t size;
   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
   &size);
@@ -2449,6 +2462,18 @@ init_kernel (struct kernel_info *kernel)
        "mutex");
 }
 
+static int
+get_cu_count (struct agent_info *agent)
+{
+  uint32_t cu_count;
+  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
+    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  if (status == HSA_STATUS_SUCCESS)
+    return cu_count;
+  else
+    return 64;  /* The usual number for older devices.  */
+}
+
 /* Calculate the maximum grid size for OMP threads / OACC workers.
    This depends on the kernel's resource usage levels.  */
 
@@ -2527,8 +2552,8 @@ parse_target_attributes (void **input,
  }
 
       def->ndim = 3;
-      /* Fiji has 64 CUs.  */
-      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : 64;
+      /* Fiji has 64 CUs, but Vega20 has 60.  */
+      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
       /* Each thread is 64 work items wide.  */
       def->gdims[1] = 64;
       /* A work group can have 16 wavefronts.  */
@@ -3308,7 +3333,7 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
      problem size, so let's do a reasonable number of single-worker gangs.
      64 gangs matches a typical Fiji device.  */
 
-  if (dims[0] == 0) dims[0] = 64; /* Gangs.  */
+  if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
   if (dims[1] == 0) dims[1] = 16; /* Workers.  */
 
   /* The incoming dimensions are expressed in terms of gangs, workers, and