changeset 13648:27acedac70b7

added support for the @ParallelOver annotation in the PTX kernel call wrapper
author Doug Simon <doug.simon@oracle.com>
date Wed, 15 Jan 2014 20:24:13 +0100
parents 8edc6b0779f0
children 36d4faef2c56
files graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java src/gpu/ptx/vm/gpu_ptx.cpp src/gpu/ptx/vm/gpu_ptx.hpp src/share/vm/graal/graalCompilerToGPU.cpp
diffstat 4 files changed, 108 insertions(+), 77 deletions(-)
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java	Wed Jan 15 20:21:53 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java	Wed Jan 15 20:24:13 2014 +0100
@@ -24,9 +24,12 @@
 
 import static com.oracle.graal.api.meta.DeoptimizationReason.*;
 import static com.oracle.graal.api.meta.LocationIdentity.*;
+import static com.oracle.graal.api.meta.MetaUtil.*;
 import static com.oracle.graal.asm.NumUtil.*;
 import static com.oracle.graal.hotspot.ptx.PTXHotSpotBackend.*;
+import static com.oracle.graal.hotspot.ptx.PTXLaunchKernelGraphKit.LaunchArg.*;
 import static com.oracle.graal.hotspot.replacements.HotSpotReplacementsUtil.*;
+import static com.oracle.graal.nodes.ConstantNode.*;
 import static java.lang.reflect.Modifier.*;
 
 import java.util.*;
@@ -38,11 +41,14 @@
 import com.oracle.graal.hotspot.nodes.*;
 import com.oracle.graal.hotspot.stubs.*;
 import com.oracle.graal.java.*;
+import com.oracle.graal.lir.ptx.*;
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.HeapAccess.BarrierType;
 import com.oracle.graal.nodes.calc.*;
 import com.oracle.graal.nodes.extended.*;
+import com.oracle.graal.nodes.java.*;
 import com.oracle.graal.nodes.type.*;
+import com.oracle.graal.replacements.*;
 import com.oracle.graal.replacements.nodes.*;
 import com.oracle.graal.word.*;
 
@@ -52,9 +58,9 @@
  * 
  * <pre>
  *     jlong kernel(p0, p1, ..., pN) {
- *         jint kernelParamsBufSize = SIZE_OF_ALIGNED_PARAMS_WITH_PADDING(p0, p1, ..., pN);
- *         jbyte kernelParamsBuf[kernelParamsBufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
- *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, kernelParamsBuf, kernelParamsBuf);
+ *         jint bufSize = SIZE_OF_ALIGNED_PARAMS_AND_RETURN_VALUE_WITH_PADDING(p0, p1, ..., pN);
+ *         jbyte buf[bufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
+ *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, dimX, dimY, dimZ, buf, bufSize, encodedReturnTypeSize);
  *         return result;
  *     }
  * </pre>
@@ -68,10 +74,11 @@
 
     /**
      * The size of the buffer holding the parameters and the extra word for storing the pointer to
-     * device memory for the return value. This will be the same as
-     * PTXKernelArguments::device_argument_buffer_size().
+     * device memory for the return value.
+     * 
+     * @see LaunchArg#ParametersAndReturnValueBufferSize
      */
-    int kernelParametersAndReturnValueBufferSize;
+    int bufSize;
 
     /**
      * Offsets of each Java argument in the parameters buffer.
@@ -79,6 +86,13 @@
     int[] javaParameterOffsetsInKernelParametersBuffer;
 
     /**
+     * Constants denoting the arguments to {@link PTXHotSpotBackend#LAUNCH_KERNEL}.
+     */
+    enum LaunchArg {
+        Thread, Kernel, DimX, DimY, DimZ, ParametersAndReturnValueBuffer, ParametersAndReturnValueBufferSize, EncodedReturnTypeSize
+    }
+
+    /**
      * Creates a graph implementing the transition from Java to the native routine that launches
      * some compiled PTX code.
      * 
@@ -99,36 +113,18 @@
 
         BitSet objects = new BitSet();
         if (!isStatic) {
-            javaParameters[javaParametersIndex] = unique(new ParameterNode(javaParametersIndex, StampFactory.declaredNonNull(kernelMethod.getDeclaringClass())));
-            kernelParametersAndReturnValueBufferSize += wordSize;
-            javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex++] = 0;
-            objects.set(0);
+            doParameter(wordSize, Kind.Object, javaParametersIndex++, objects);
         }
-        for (int i = 0; i < sigCount; i++) {
-            Kind kind = sig.getParameterKind(i);
-            int kindByteSize = kind.getBitCount() / Byte.SIZE;
-            while ((kernelParametersAndReturnValueBufferSize % kindByteSize) != 0) {
-                kernelParametersAndReturnValueBufferSize++;
-            }
-            javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = kernelParametersAndReturnValueBufferSize;
-            Stamp stamp;
-            if (kind == Kind.Object) {
-                stamp = StampFactory.object();
-                int slot = kernelParametersAndReturnValueBufferSize / wordSize;
-                objects.set(slot);
-            } else {
-                stamp = StampFactory.forKind(kind);
-            }
-            ParameterNode param = unique(new ParameterNode(javaParametersIndex, stamp));
-            javaParameters[javaParametersIndex++] = param;
-            kernelParametersAndReturnValueBufferSize += kindByteSize;
+        for (int sigIndex = 0; sigIndex < sigCount; sigIndex++) {
+            Kind kind = sig.getParameterKind(sigIndex);
+            doParameter(wordSize, kind, javaParametersIndex++, objects);
         }
-        kernelParametersAndReturnValueBufferSize = roundUp(kernelParametersAndReturnValueBufferSize, wordSize);
+        bufSize = roundUp(bufSize, wordSize);
 
         // Add slot for holding pointer to device memory storing return value
         int encodedReturnTypeSize = 0;
         if (returnKind != Kind.Void) {
-            kernelParametersAndReturnValueBufferSize += wordSize;
+            bufSize += wordSize;
             if (returnKind == Kind.Object) {
                 encodedReturnTypeSize = -wordSize;
             } else {
@@ -136,33 +132,41 @@
             }
         }
 
-        ReadRegisterNode threadArg = append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false));
-        ConstantNode kernelAddressArg = ConstantNode.forLong(kernelAddress, getGraph());
-        AllocaNode kernelParametersAndReturnValueBufferArg = append(new AllocaNode(kernelParametersAndReturnValueBufferSize / wordSize, objects));
-        ConstantNode kernelParametersAndReturnValueBufferSizeArg = ConstantNode.forInt(kernelParametersAndReturnValueBufferSize, getGraph());
-        ConstantNode encodedReturnTypeSizeArg = ConstantNode.forInt(encodedReturnTypeSize, getGraph());
+        AllocaNode buf = append(new AllocaNode(bufSize / wordSize, objects));
 
+        Map<LaunchArg, ValueNode> args = new EnumMap<>(LaunchArg.class);
+        args.put(Thread, append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false)));
+        args.put(Kernel, ConstantNode.forLong(kernelAddress, getGraph()));
+        args.put(DimX, forInt(1, getGraph()));
+        args.put(DimY, forInt(1, getGraph()));
+        args.put(DimZ, forInt(1, getGraph()));
+        args.put(ParametersAndReturnValueBuffer, buf);
+        args.put(ParametersAndReturnValueBufferSize, forInt(bufSize, getGraph()));
+        args.put(EncodedReturnTypeSize, forInt(encodedReturnTypeSize, getGraph()));
+
+        int sigIndex = isStatic ? 0 : -1;
         for (javaParametersIndex = 0; javaParametersIndex < javaParameters.length; javaParametersIndex++) {
             ParameterNode javaParameter = javaParameters[javaParametersIndex];
             int javaParameterOffset = javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex];
             LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, javaParameter.kind(), javaParameterOffset, getGraph());
-            append(new WriteNode(kernelParametersAndReturnValueBufferArg, javaParameter, location, BarrierType.NONE, false, false));
+            append(new WriteNode(buf, javaParameter, location, BarrierType.NONE, false, false));
+            updateDimArg(kernelMethod, providers, sig, sigIndex++, args, javaParameter);
         }
         if (returnKind != Kind.Void) {
-            LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, kernelParametersAndReturnValueBufferSize - wordSize, getGraph());
-            append(new WriteNode(kernelParametersAndReturnValueBufferArg, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false));
+            LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, bufSize - wordSize, getGraph());
+            append(new WriteNode(buf, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false));
         }
 
         FrameStateBuilder fsb = new FrameStateBuilder(kernelMethod, getGraph(), true);
         FrameState fs = fsb.create(0);
         getGraph().start().setStateAfter(fs);
 
-        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, threadArg, kernelAddressArg, kernelParametersAndReturnValueBufferArg,
-                        kernelParametersAndReturnValueBufferSizeArg, encodedReturnTypeSizeArg));
+        ValueNode[] launchArgsArray = args.values().toArray(new ValueNode[args.size()]);
+        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, launchArgsArray));
         result.setDeoptimizationState(fs);
 
         ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph());
-        InvokeNode handlePendingException = createInvoke(getClass(), "handlePendingException", threadArg, isObjectResultArg);
+        InvokeNode handlePendingException = createInvoke(getClass(), "handlePendingException", args.get(Thread), isObjectResultArg);
         handlePendingException.setStateAfter(fs);
         InvokeNode getObjectResult = null;
 
@@ -186,7 +190,7 @@
                 returnValue = unique(new ReinterpretNode(returnKind, result));
                 break;
             case Object:
-                getObjectResult = createInvoke(getClass(), "getObjectResult", threadArg);
+                getObjectResult = createInvoke(getClass(), "getObjectResult", args.get(Thread));
                 returnValue = append(getObjectResult);
                 break;
             default:
@@ -207,7 +211,45 @@
         }
     }
 
-    public static void handlePendingException(Word thread, boolean isObjectResult) {
+    private void doParameter(int wordSize, Kind kind, int javaParametersIndex, BitSet objects) {
+        int kindByteSize = kind == Kind.Object ? wordSize : kind.getBitCount() / Byte.SIZE;
+        bufSize = roundUp(bufSize, kindByteSize);
+        javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = bufSize;
+        Stamp stamp;
+        if (kind == Kind.Object) {
+            stamp = StampFactory.object();
+            int slot = bufSize / wordSize;
+            objects.set(slot);
+        } else {
+            stamp = StampFactory.forKind(kind);
+        }
+        javaParameters[javaParametersIndex] = unique(new ParameterNode(javaParametersIndex, stamp));
+        bufSize += kindByteSize;
+    }
+
+    /**
+     * Updates the {@code dimX}, {@code dimY} or {@code dimZ} launch argument if
+     * {@code javaParameter} is annotated with {@link ParallelOver}.
+     */
+    private void updateDimArg(ResolvedJavaMethod method, HotSpotProviders providers, Signature sig, int sigIndex, Map<LaunchArg, ValueNode> launchArgs, ParameterNode javaParameter) {
+        if (sigIndex >= 0) {
+            ParallelOver parallelOver = getParameterAnnotation(ParallelOver.class, sigIndex, method);
+            if (parallelOver != null && sig.getParameterType(sigIndex, method.getDeclaringClass()).equals(providers.getMetaAccess().lookupJavaType(int[].class))) {
+                ArrayLengthNode dimension = append(new ArrayLengthNode(javaParameter));
+                LaunchArg argKey = LaunchArg.valueOf("Dim" + parallelOver.dimension());
+                ValueNode existing = launchArgs.put(argKey, dimension);
+                if (existing instanceof ArrayLengthNode) {
+                    throw new GraalInternalError("@" + ParallelOver.class.getSimpleName() + " with dimension=" + parallelOver.dimension() + " applied to multiple parameters");
+                }
+            }
+        }
+    }
+
+    /**
+     * Snippet invoked upon return from the kernel to handle any pending exceptions.
+     */
+    @Snippet
+    private static void handlePendingException(Word thread, boolean isObjectResult) {
         if (clearPendingException(thread)) {
             if (isObjectResult) {
                 getAndClearObjectResult(thread);
@@ -216,7 +258,12 @@
         }
     }
 
-    public static Object getObjectResult(Word thread) {
+    /**
+     * Snippet invoked upon return from the kernel to retrieve an object return value from the
+     * thread-local slot used for communicating object results from VM calls.
+     */
+    @Snippet
+    private static Object getObjectResult(Word thread) {
         return getAndClearObjectResult(thread);
     }
 }
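
A minimal sketch of the kind of kernel method the new updateDimArg logic targets. The declarations below are stand-ins for illustration only: the real annotation lives in com.oracle.graal.lir.ptx and its exact shape may differ; all updateDimArg relies on is an int[] parameter carrying @ParallelOver whose dimension() prints as X, Y or Z, matching the LaunchArg.valueOf("Dim" + ...) lookup above.

    import java.lang.annotation.ElementType;
    import java.lang.annotation.Retention;
    import java.lang.annotation.RetentionPolicy;
    import java.lang.annotation.Target;

    // Stand-in for com.oracle.graal.lir.ptx.ParallelOver (hypothetical shape).
    enum Dimension { X, Y, Z }

    @Retention(RetentionPolicy.RUNTIME)
    @Target(ElementType.PARAMETER)
    @interface ParallelOver {
        Dimension dimension();
    }

    class KernelSketch {
        // Because 'data' is an int[] annotated @ParallelOver over dimension X,
        // updateDimArg replaces the constant dimX launch argument (1) with an
        // ArrayLengthNode, i.e. one kernel thread per element of 'data'.
        static void scale(@ParallelOver(dimension = Dimension.X) int[] data, int factor) {
            for (int i = 0; i < data.length; i++) {
                data[i] *= factor;
            }
        }
    }
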
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Wed Jan 15 20:21:53 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Wed Jan 15 20:24:13 2014 +0100
@@ -337,40 +337,20 @@
   return cu_function;
 }
 
-JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, int encodedReturnTypeSize))
-  tty->print_cr("*** gpu::Ptx::execute_kernel_from_vm(kernel=%p, parametersAndReturnValueBuffer=%p, parametersAndReturnValueBufferSize=%d, encodedReturnTypeSize=%d)",
-      kernel, parametersAndReturnValueBuffer, parametersAndReturnValueBufferSize, encodedReturnTypeSize);
-  tty->print("  buffer as bytes: ");
-  for (int i = 0; i < parametersAndReturnValueBufferSize; i++) {
-    tty->print(" 0x%02x", ((jbyte*) (address) parametersAndReturnValueBuffer)[i] & 0xFF);
-  }
-  tty->cr();
-  tty->print("  buffer as ints: ");
-  for (int i = 0; i < (parametersAndReturnValueBufferSize / 4); i++) {
-    tty->print(" %d", ((jint*) (address) parametersAndReturnValueBuffer)[i]);
-  }
-  tty->cr();
-  tty->print("  buffer as words: ");
-  for (unsigned i = 0; i < (parametersAndReturnValueBufferSize / sizeof(void*)); i++) {
-    tty->print(" "INTPTR_FORMAT, ((void**) (address) parametersAndReturnValueBuffer)[i]);
-  }
-  tty->cr();
+JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
+                                                  jlong parametersAndReturnValueBuffer,
+                                                  jint parametersAndReturnValueBufferSize,
+                                                  int encodedReturnTypeSize))
   if (kernel == 0L) {
     SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL);
     return 0L;
   }
 
-
   // grid dimensionality
   unsigned int gridX = 1;
   unsigned int gridY = 1;
   unsigned int gridZ = 1;
 
-  // thread dimensionality
-  unsigned int blockX = 1;
-  unsigned int blockY = 1;
-  unsigned int blockZ = 1;
-
   struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel;
 
   void * config[5] = {
@@ -391,7 +371,7 @@
     status = _cuda_cu_memalloc(&device_return_value, returnTypeSize);
     if (status != GRAAL_CUDA_SUCCESS) {
       tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to allocate memory for return value pointer on device");
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to allocate memory for return value pointer on device");
       return 0L;
     }
     // Push device_return_value to kernelParams
@@ -401,24 +381,24 @@
 
   status = _cuda_cu_launch_kernel(cu_function,
                                       gridX, gridY, gridZ,
-                                      blockX, blockY, blockZ,
+                                      dimX, dimY, dimZ,
                                       0, NULL, NULL, (void **) &config);
 
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] Failed to launch kernel");
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to launch kernel");
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to launch kernel");
     return 0L;
   }
 
   if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", blockX, blockY, blockZ);
+    tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", dimX, dimY, dimZ);
   }
 
   status = _cuda_cu_ctx_synchronize();
 
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to synchronize launched kernel");
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to synchronize launched kernel");
     return 0L;
   }
 
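
The Java kit builds these launch arguments in an EnumMap and flattens them with args.values().toArray(...), and the native entry point above depends on that array matching the parameter order of execute_kernel_from_vm exactly. That holds because EnumMap iterates in enum declaration order regardless of insertion order, which is why LaunchArg declares its constants in signature order. A standalone check:

    import java.util.EnumMap;
    import java.util.Map;

    public class EnumMapOrder {
        enum LaunchArg {
            Thread, Kernel, DimX, DimY, DimZ,
            ParametersAndReturnValueBuffer, ParametersAndReturnValueBufferSize, EncodedReturnTypeSize
        }

        public static void main(String[] args) {
            Map<LaunchArg, String> map = new EnumMap<>(LaunchArg.class);
            // Insert deliberately out of declaration order.
            map.put(LaunchArg.DimZ, "dimZ");
            map.put(LaunchArg.Thread, "thread");
            map.put(LaunchArg.Kernel, "kernel");
            // EnumMap iterates in declaration order of the constants, so the
            // flattened argument array always matches the native signature.
            System.out.println(map.values()); // prints [thread, kernel, dimZ]
        }
    }
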
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Wed Jan 15 20:21:53 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Wed Jan 15 20:24:13 2014 +0100
@@ -105,7 +105,10 @@
 
 typedef int CUdevice;     /* CUDA device */
 
-  static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, int encodedReturnTypeSize);
+  static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
+                                      jlong parametersAndReturnValueBuffer,
+                                      jint parametersAndReturnValueBufferSize,
+                                      int encodedReturnTypeSize);
 
 private:
   typedef int (*cuda_cu_init_func_t)(unsigned int);
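
The parametersAndReturnValueBuffer declared above is laid out on the Java side by doParameter: each parameter is aligned to its own byte size (a machine word for object references), the buffer is rounded up to a word boundary, and one trailing word is reserved for a non-void return value. A minimal sketch of that layout arithmetic, assuming a 64-bit word; roundUp here is a stand-in for NumUtil.roundUp:

    public class ParamBufferLayout {
        // Stand-in for com.oracle.graal.asm.NumUtil.roundUp (positive ints only).
        static int roundUp(int value, int alignment) {
            return ((value + alignment - 1) / alignment) * alignment;
        }

        public static void main(String[] args) {
            int wordSize = 8;
            int[] paramSizes = {4, 8, 2, 8};       // e.g. int, long, short, Object
            int bufSize = 0;
            for (int size : paramSizes) {
                bufSize = roundUp(bufSize, size);  // insert alignment padding
                System.out.println("offset " + bufSize + " size " + size);
                bufSize += size;
            }
            bufSize = roundUp(bufSize, wordSize);  // pad to word boundary
            bufSize += wordSize;                   // slot for return value pointer
            System.out.println("bufSize = " + bufSize); // 40 for this signature
        }
    }
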
--- a/src/share/vm/graal/graalCompilerToGPU.cpp	Wed Jan 15 20:21:53 2014 +0100
+++ b/src/share/vm/graal/graalCompilerToGPU.cpp	Wed Jan 15 20:24:13 2014 +0100
@@ -48,7 +48,9 @@
 
 C2V_ENTRY(jlong, generateKernel, (JNIEnv *env, jobject, jbyteArray code, jstring name))
   if (gpu::is_available() == false || gpu::has_gpu_linkage() == false && gpu::is_initialized()) {
-    tty->print_cr("generateKernel - not available / no linkage / not initialized");
+    if (TraceGPUInteraction) {
+      tty->print_cr("generateKernel - not available / no linkage / not initialized");
+    }
     return 0;
   }
   jboolean is_copy;
@@ -58,8 +60,7 @@
   void *kernel = gpu::generate_kernel((unsigned char *)bytes, len, namestr);
   if (kernel == NULL) {
     tty->print_cr("[CUDA] *** Error: Failed to compile kernel");
-  }
-  else if (TraceGPUInteraction) {
+  } else if (TraceGPUInteraction) {
     tty->print_cr("[CUDA] Generated kernel");
   }
   env->ReleaseByteArrayElements(code, bytes, 0);
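
One caveat when reading the guard at the top of generateKernel (a context line unchanged by this diff): && binds tighter than ||, so the condition parses as !available || (!linkage && initialized), which does not match the "not available / no linkage / not initialized" message. A minimal demonstration of the divergence:

    public class PrecedenceCheck {
        public static void main(String[] args) {
            boolean available = true, linkage = true, initialized = false;
            // As written in generateKernel: !available || (!linkage && initialized)
            boolean asWritten = available == false || linkage == false && initialized;
            // What the error message suggests was intended
            boolean asMessaged = !available || !linkage || !initialized;
            System.out.println(asWritten + " vs " + asMessaged); // false vs true
        }
    }
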