changeset 11842:8d8f63069f58

PTX warp limiter to available GPU processors
author Morris Meyer <morris.meyer@oracle.com>
date Mon, 30 Sep 2013 13:03:47 -0400
parents 6157a71e0a36
children 372bacc13022
files graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTestBase.java graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXTargetMethodAssembler.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java src/gpu/ptx/vm/gpu_ptx.cpp src/gpu/ptx/vm/gpu_ptx.hpp src/share/vm/graal/graalCompilerToGPU.cpp src/share/vm/runtime/gpu.cpp src/share/vm/runtime/gpu.hpp
diffstat 9 files changed, 109 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTestBase.java	Mon Sep 30 17:03:14 2013 +0200
+++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTestBase.java	Mon Sep 30 13:03:47 2013 -0400
@@ -30,6 +30,7 @@
 import com.oracle.graal.api.runtime.Graal;
 import com.oracle.graal.compiler.GraalCompiler;
 import com.oracle.graal.compiler.ptx.PTXBackend;
+import com.oracle.graal.compiler.ptx.PTXTargetMethodAssembler;
 import com.oracle.graal.compiler.test.GraalCompilerTest;
 import com.oracle.graal.debug.Debug;
 import com.oracle.graal.hotspot.meta.HotSpotNmethod;
@@ -138,6 +139,11 @@
             }
             Object r;
             if (dimensionX != 1 || dimensionY != 1 || dimensionZ != 1) {
+                /*
+                 * For now, assert that the warp array block is strictly smaller than the number of physical GPU cores.
+                 */
+                assert dimensionX * dimensionY * dimensionZ < PTXTargetMethodAssembler.getAvailableProcessors();
+
                 r = ((HotSpotNmethod) installedCode).executeParallel(dimensionX, dimensionY, dimensionZ, executeArgs);
             } else {
                 r = installedCode.executeVarargs(executeArgs);
--- a/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXTargetMethodAssembler.java	Mon Sep 30 17:03:14 2013 +0200
+++ b/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXTargetMethodAssembler.java	Mon Sep 30 13:03:47 2013 -0400
@@ -36,6 +36,12 @@
     private static CompilerToGPU toGPU = HotSpotGraalRuntime.graalRuntime().getCompilerToGPU();
     private static boolean validDevice = toGPU.deviceInit();
 
+    private static final int totalProcessors = (validDevice ? toGPU.availableProcessors() : 0);
+    /** Returns the GPU processor count cached at class initialization; 0 if deviceInit() reported no valid device. */
+    public static int getAvailableProcessors() {
+        return totalProcessors;
+    }
+
     // detach ??
 
     public PTXTargetMethodAssembler(TargetDescription target, CodeCacheProvider runtime, FrameMap frameMap,
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java	Mon Sep 30 17:03:14 2013 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java	Mon Sep 30 13:03:47 2013 -0400
@@ -45,6 +45,8 @@
      */
     boolean deviceDetach();
 
+    /** Returns the number of processors the VM reports for the GPU device. */
+    int availableProcessors();
     /**
      * Attempts to generate and return a bound function to the
      * loaded method kernel on the GPU.
@@ -56,6 +58,7 @@
 
     Object executeExternalMethodVarargs(Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
 
+
     Object executeParallelMethodVarargs(int dimX, int dimY, int dimZ,
                                         Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
 }
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java	Mon Sep 30 17:03:14 2013 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java	Mon Sep 30 13:03:47 2013 -0400
@@ -38,8 +38,12 @@
 
     public native boolean deviceDetach();
 
-    public native Object executeExternalMethodVarargs(Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
+    public native int availableProcessors();
+
+    public native Object executeExternalMethodVarargs(Object[] args,
+                                                      HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
 
     public native Object executeParallelMethodVarargs(int dimX, int dimY, int dimZ,
-                                                      Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
+                                                      Object[] args,
+                                                      HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
 }
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Mon Sep 30 17:03:14 2013 +0200
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Mon Sep 30 13:03:47 2013 -0400
@@ -50,6 +50,28 @@
 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh;
 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree;
 
+
+/*
+ * Cores per multiprocessor by compute capability; see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs
+ */
+int ncores(int major, int minor) {
+    int device_type = (major << 4) + minor;  // 0xMm key; parenthesized because '+' binds tighter than '<<'
+
+    switch (device_type) {
+        case 0x10: return 8;
+        case 0x11: return 8;
+        case 0x12: return 8;
+        case 0x13: return 8;
+        case 0x20: return 32;
+        case 0x21: return 48;
+        case 0x30: return 192;
+        case 0x35: return 192;
+    default:  // capability not in the table: warn and report 0 cores
+        tty->print_cr("[CUDA] Warning: Unhandled device %x", device_type);
+        return 0;
+    }
+}
+
 bool gpu::Ptx::initialize_gpu() {
 
   /* Initialize CUDA driver API */
@@ -95,24 +117,7 @@
   }
 
   /* Get device attributes */
-  int minor, major, unified_addressing;
-  status = _cuda_cu_device_get_attribute(&minor, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _cu_device);
-
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", _cu_device);
-    return false;
-  }
-
-  status = _cuda_cu_device_get_attribute(&major, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _cu_device);
-
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to get major attribute of device: %d", _cu_device);
-    return false;
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor);
-  }
+  int unified_addressing;
 
   status = _cuda_cu_device_get_attribute(&unified_addressing, GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, _cu_device);
 
@@ -139,9 +144,50 @@
     tty->print_cr("[CUDA] Using %s", device_name);
   }
 
+
   return true;
 }
 
+// Returns multiprocessor count * ncores(major, minor) for _cu_device, or 0 if any
+// device attribute query fails (the failure is reported on tty).
+unsigned int gpu::Ptx::total_cores() {
+    int minor, major, nmp;
+    int status = _cuda_cu_device_get_attribute(&minor,
+                                               GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                                               _cu_device);
+
+    if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", _cu_device);
+        return 0;
+    }
+
+    status = _cuda_cu_device_get_attribute(&major,
+                                           GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                                           _cu_device);
+
+    if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] Failed to get major attribute of device: %d", _cu_device);
+        return 0;
+    }
+
+    status = _cuda_cu_device_get_attribute(&nmp,
+                                           GRAAL_CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                                           _cu_device);
+
+    if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] Failed to get number of MPs on device: %d", _cu_device);
+        return 0;
+    }
+
+    int total = nmp * ncores(major, minor);
+
+    if (TraceGPUInteraction) {
+        tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor);
+        tty->print_cr("[CUDA] Number of cores: %d", total);
+    }
+    return total;
+}
+
 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) {
 
   struct CUmod_st * cu_module;
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Mon Sep 30 17:03:14 2013 +0200
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Mon Sep 30 13:03:47 2013 -0400
@@ -34,6 +34,7 @@
 #define GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING        41
 #define GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR  75
 #define GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR  76
+#define GRAAL_CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT      16
 #define GRAAL_CU_JIT_MAX_REGISTERS                           0
 #define GRAAL_CU_JIT_THREADS_PER_BLOCK                       1
 #define GRAAL_CU_JIT_INFO_LOG_BUFFER                         3
@@ -73,6 +74,7 @@
  protected:
   static bool probe_linkage();
   static bool initialize_gpu();
+  static unsigned int total_cores();
   static void * generate_kernel(unsigned char *code, int code_len, const char *name);
   static bool execute_warp(int dimX, int dimY, int dimZ, address kernel, PTXKernelArguments & ka, JavaValue &ret);
   static bool execute_kernel(address kernel, PTXKernelArguments & ka, JavaValue &ret);
--- a/src/share/vm/graal/graalCompilerToGPU.cpp	Mon Sep 30 17:03:14 2013 +0200
+++ b/src/share/vm/graal/graalCompilerToGPU.cpp	Mon Sep 30 13:03:47 2013 -0400
@@ -111,7 +111,7 @@
   HandleMark hm;
 
   if (gpu::is_available() == false || gpu::has_gpu_linkage() == false && gpu::is_initialized()) {
-    tty->print_cr("executeExternalMethodVarargs - not available / no linkage / not initialized");
+    tty->print_cr("executeParallelMethodVarargs - not available / no linkage / not initialized");
     return NULL;
   }
   jlong nmethodValue = HotSpotInstalledCode::codeBlob(hotspotInstalledCode);
@@ -155,6 +155,14 @@
   return gpu::is_initialized();
 C2V_END
 
+C2V_VMENTRY(jint, availableProcessors, (JNIEnv *env, jobject))  // gpu::available_processors(), or 0 without a usable GPU
+  if (gpu::is_available() == false || gpu::has_gpu_linkage() == false) {
+    tty->print_cr("availableProcessors - not available / no linkage");
+    return 0;  // jint result, so report zero processors rather than 'false'
+  }
+  return gpu::available_processors();
+C2V_END
+
 C2V_VMENTRY(jboolean, deviceDetach, (JNIEnv *env, jobject))
 return true;
 C2V_END
@@ -199,6 +207,7 @@
   {CC"generateKernel",                CC"([B" STRING ")"GPUSPACE_METHOD,          FN_PTR(generateKernel)},
   {CC"deviceInit",                    CC"()Z",                                    FN_PTR(deviceInit)},
   {CC"deviceDetach",                  CC"()Z",                                    FN_PTR(deviceDetach)},
+  {CC"availableProcessors",           CC"()I",                                    FN_PTR(availableProcessors)},
   {CC"executeExternalMethodVarargs",  CC"(["OBJECT HS_INSTALLED_CODE")"OBJECT,    FN_PTR(executeExternalMethodVarargs)},
   {CC"executeParallelMethodVarargs",  CC"(III["OBJECT HS_INSTALLED_CODE")"OBJECT, FN_PTR(executeParallelMethodVarargs)},
 };
--- a/src/share/vm/runtime/gpu.cpp	Mon Sep 30 17:03:14 2013 +0200
+++ b/src/share/vm/runtime/gpu.cpp	Mon Sep 30 13:03:47 2013 -0400
@@ -81,3 +81,13 @@
     return false;
 }
 
+// Number of processors reported by the GPU backend: gpu::Ptx::total_cores() when
+// linkage exists and the target IL is PTX, otherwise 0.
+int gpu::available_processors() {
+    if (!gpu::has_gpu_linkage()) {
+        return 0;
+    }
+    // Only PTX is handled; support for other GPU targets would be added here.
+    return (gpu::get_target_il_type() == gpu::PTX) ? gpu::Ptx::total_cores() : 0;
+}
+
--- a/src/share/vm/runtime/gpu.hpp	Mon Sep 30 17:03:14 2013 +0200
+++ b/src/share/vm/runtime/gpu.hpp	Mon Sep 30 13:03:47 2013 -0400
@@ -43,6 +43,8 @@
   static void probe_gpu();
 
   static void initialize_gpu();
+
+  static int available_processors();
   
   static void * generate_kernel(unsigned char *code, int code_len, const char *name);