changeset 13753:80cd5c3b8827

partially fixed passing of object parameters to PTX kernels; use a C++ object for managing resource allocation and cleanup around a PTX kernel execution
author Doug Simon <doug.simon@oracle.com>
date Fri, 24 Jan 2014 18:01:36 +0100
parents 34ab58984118
children 785b2be9c85a
files graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java src/gpu/ptx/vm/gpu_ptx.cpp src/gpu/ptx/vm/gpu_ptx.hpp
diffstat 4 files changed, 294 insertions(+), 127 deletions(-) [+]
line wrap: on
line diff
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Fri Jan 24 17:59:46 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Fri Jan 24 18:01:36 2014 +0100
@@ -67,7 +67,7 @@
 public class PTXHotSpotBackend extends HotSpotBackend {
 
     /**
-     * Descriptor for the PTX runtime method for launching a kernel. The C++ signature is:
+     * Descriptor for the PTX runtime method for calling a kernel. The C++ signature is:
      * 
      * <pre>
      *     jlong (JavaThread* thread,
@@ -77,11 +77,14 @@
      *            jint dimZ,
      *            jlong parametersAndReturnValueBuffer,
      *            jint parametersAndReturnValueBufferSize,
+     *            jint objectParametersCount,
+     *            jlong objectParametersOffsets,
+     *            jlong pinnedObjects,
      *            jint encodedReturnTypeSize)
      * </pre>
      */
     // @formatter:off
-    public static final ForeignCallDescriptor LAUNCH_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class,
+    public static final ForeignCallDescriptor CALL_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class,
                     Word.class, // thread
                     long.class, // kernel
                     int.class,  // dimX
@@ -89,6 +92,9 @@
                     int.class,  // dimZ
                     long.class, // parametersAndReturnValueBuffer
                     int.class,  // parametersAndReturnValueBufferSize
+                    int.class,  // objectParameterCount
+                    long.class, // objectParameterOffsets
+                    long.class, // pinnedObjects
                     int.class); // encodedReturnTypeSize
     // @formatter:on
 
@@ -114,7 +120,7 @@
         CompilerToGPU compilerToGPU = getRuntime().getCompilerToGPU();
         if (deviceInitialized) {
             long launchKernel = compilerToGPU.getLaunchKernelAddress();
-            hostForeignCalls.registerForeignCall(LAUNCH_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION);
+            hostForeignCalls.registerForeignCall(CALL_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION);
         }
         super.completeInitialization();
     }
@@ -171,6 +177,7 @@
                         new SpeculationLog(), suites, true, new ExternalCompilationResult(), CompilationResultBuilderFactory.Default);
         if (makeBinary) {
             try (Scope ds = Debug.scope("GeneratingKernelBinary")) {
+                assert ptxCode.getTargetCode() != null;
                 long kernel = getRuntime().getCompilerToGPU().generateKernel(ptxCode.getTargetCode(), method.getName());
                 ptxCode.setEntryPoint(kernel);
             } catch (Throwable e) {
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java	Fri Jan 24 17:59:46 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java	Fri Jan 24 18:01:36 2014 +0100
@@ -55,21 +55,62 @@
 
 /**
  * Utility for building a graph that "wraps" a compiled PTX kernel. Such a wrapper handles the
- * transition from the host CPU to the GPU and back. The graph created is something like the
- * following pseudo code with UPPER CASE denoting compile-time constants:
+ * transition from the host CPU to the GPU and back. The wrapper allocates 3 on-stack buffers:
+ * <ul>
+ * <li>PARAMS: a buffer for the kernel parameters and one word for the on-device address of the
+ * return value (if any).</li>
+ * <li>PINNED: a buffer into which the address of pinned objects is saved.</li>
+ * <li>OBJECT_OFFSETS: the offsets of the object values in PARAMS.</li>
+ * </ul>
+ * 
+ * 
+ * The PARAMS buffer is the {@code CU_LAUNCH_PARAM_BUFFER_POINTER} buffer passed in the
+ * {@code extra} argument to the {@code cuLaunchKernel} function. This buffer contains the
+ * parameters to the call. The buffer is word aligned and each parameter is aligned in the buffer
+ * according to its data size. The wrapper copies the incoming arguments into the buffer as is. The
+ * native {@link PTXHotSpotBackend#CALL_KERNEL callKernel} function will pin the memory for each
+ * object parameter (using {@code cuMemHostRegister}) and then replace the object pointer in PARAMS
+ * with an on-device pointer to the object's memory (see {@code cuMemHostGetDevicePointer}). The
+ * function saves each pinned object pointer into PINNED so that it can be unpinned once the kernel returns.
+ * The object pointers in PARAMS are specified by OBJECT_OFFSETS.
+ * <p>
+ * As a concrete example, for a kernel whose Java method signature is:
  * 
  * <pre>
- *     T kernel(p0, p1, ..., pN) {
- *         jint bufSize = SIZE_OF_ALIGNED_PARAMS_AND_RETURN_VALUE_WITH_PADDING(p0, p1, ..., pN);
- *         jbyte buf[bufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
- *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, dimX, dimY, dimZ, buf, bufSize, encodedReturnTypeSize);
- *         return convert(result);
+ *     static int kernel(int p1, short p2, Object p3, long p4)
+ * </pre>
+ * 
+ * the graph created is shown below as pseudo-code:
+ * 
+ * <pre>
+ *     int kernel_wrapper(int p1, short p2, oop p3, long p4) {
+ *         address kernelAddr = kernel.start;
+ *         if (kernelAddr == 0) {
+ *             deopt(InvalidateRecompile, RuntimeConstraint);
+ *         }
+ *         byte PARAMS[32];
+ *         word PINNED[1]; // note: no refmap
+ *         int OBJECT_OFFSETS[1] = {8};
+ *         ((int*) PARAMS)[0] = p1;
+ *         ((short*) PARAMS)[2] = p2;
+ *         ((word*) PARAMS)[1] = p3;
+ *         ((long*) PARAMS)[2] = p4;
+ *         int result = CALL_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, 1, 1, 1, PARAMS, 32, 1, OBJECT_OFFSETS, PINNED, 4);
+ *         if (clearPendingException(thread)) {
+ *             deopt(None, RuntimeConstraint);
+ *         }
+ *         return result;
  *     }
  * </pre>
  * <p>
  * The generated graph includes a reference to the {@link HotSpotNmethod} for the kernel. There must
  * be another reference to the same {@link HotSpotNmethod} object to ensure that the nmethod is not
  * unloaded by the next full GC.
+ * <p>
+ * TODO: Only the memory for objects passed as parameters is pinned. Surely the memory for other
+ * objects accessed in the kernel that are reachable from the parameter objects needs to be pinned as well?
+ * <p>
+ * TODO: Object references within kernels are currently completely hidden from GC.
  */
 public class PTXWrapperBuilder extends GraphKit {
 
@@ -92,11 +133,23 @@
     int[] javaParameterOffsetsInKernelParametersBuffer;
 
     /**
-     * Constants denoting the arguments to {@link PTXHotSpotBackend#LAUNCH_KERNEL}.
+     * Constants denoting the arguments to {@link PTXHotSpotBackend#CALL_KERNEL}.
      */
+    // @formatter:off
     enum LaunchArg {
-        Thread, Kernel, DimX, DimY, DimZ, ParametersAndReturnValueBuffer, ParametersAndReturnValueBufferSize, EncodedReturnTypeSize
+        Thread,
+        Kernel,
+        DimX,
+        DimY,
+        DimZ,
+        ParametersAndReturnValueBuffer,
+        ParametersAndReturnValueBufferSize,
+        ObjectParametersCount,
+        ObjectParametersOffsets,
+        PinnedObjects,
+        EncodedReturnTypeSize
     }
+    // @formatter:on
 
     /**
      * Creates the graph implementing the CPU to GPU transition.
@@ -108,6 +161,7 @@
     public PTXWrapperBuilder(ResolvedJavaMethod method, HotSpotNmethod kernel, HotSpotProviders providers) {
         super(new StructuredGraph(method), providers);
         int wordSize = providers.getCodeCache().getTarget().wordSize;
+        int intSize = Integer.SIZE / Byte.SIZE;
         Kind wordKind = providers.getCodeCache().getTarget().wordKind;
         Signature sig = method.getSignature();
         boolean isStatic = isStatic(method.getModifiers());
@@ -117,17 +171,18 @@
         int javaParametersIndex = 0;
         Kind returnKind = sig.getReturnKind();
 
-        BitSet objects = new BitSet();
+        List<Integer> objectSlots = new ArrayList<>(javaParameters.length);
         if (!isStatic) {
-            allocateParameter(Kind.Object, javaParametersIndex++, objects, wordSize);
+            allocateParameter(Kind.Object, javaParametersIndex++, objectSlots, wordSize);
         }
         for (int sigIndex = 0; sigIndex < sigCount; sigIndex++) {
             Kind kind = sig.getParameterKind(sigIndex);
-            allocateParameter(kind, javaParametersIndex++, objects, wordSize);
+            allocateParameter(kind, javaParametersIndex++, objectSlots, wordSize);
         }
         bufSize = roundUp(bufSize, wordSize);
 
-        // Add slot for holding pointer to device memory storing return value
+        // Add slot for the device memory pointer. The kernel writes a
+        // pointer in this slot that points to the return value.
         int encodedReturnTypeSize = 0;
         if (returnKind != Kind.Void) {
             bufSize += wordSize;
@@ -140,7 +195,29 @@
 
         InvokeNode kernelStart = createInvoke(getClass(), "getKernelStart", ConstantNode.forObject(kernel, providers.getMetaAccess(), getGraph()));
 
-        AllocaNode buf = append(new AllocaNode(bufSize / wordSize, objects));
+        AllocaNode buf = append(new AllocaNode(bufSize / wordSize, new BitSet()));
+        ValueNode objectParametersOffsets;
+        ValueNode pinnedObjects;
+        ConstantNode nullWord = ConstantNode.forIntegerKind(wordKind, 0L, getGraph());
+        if (objectSlots.isEmpty()) {
+            objectParametersOffsets = ConstantNode.forLong(0, getGraph());
+            pinnedObjects = ConstantNode.forLong(0, getGraph());
+        } else {
+            int intsPerWord = wordSize / intSize;
+            int slots = roundUp(objectSlots.size(), intsPerWord);
+            objectParametersOffsets = append(new AllocaNode(slots, new BitSet()));
+            // No refmap for pinned objects list since kernel execution is (currently) GC unsafe
+            pinnedObjects = append(new AllocaNode(objectSlots.size(), new BitSet()));
+
+            // Initialize the object parameter offsets array
+            int index = 0;
+            for (int slot : objectSlots) {
+                int offset = slot * wordSize;
+                LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, Kind.Int, index * intSize, getGraph());
+                append(new WriteNode(objectParametersOffsets, ConstantNode.forInt(offset, getGraph()), location, BarrierType.NONE, false, false));
+                index++;
+            }
+        }
 
         Map<LaunchArg, ValueNode> args = new EnumMap<>(LaunchArg.class);
         args.put(Thread, append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false)));
@@ -150,6 +227,9 @@
         args.put(DimZ, forInt(1, getGraph()));
         args.put(ParametersAndReturnValueBuffer, buf);
         args.put(ParametersAndReturnValueBufferSize, forInt(bufSize, getGraph()));
+        args.put(ObjectParametersCount, forInt(objectSlots.size(), getGraph()));
+        args.put(ObjectParametersOffsets, objectParametersOffsets);
+        args.put(PinnedObjects, pinnedObjects);
         args.put(EncodedReturnTypeSize, forInt(encodedReturnTypeSize, getGraph()));
 
         int sigIndex = isStatic ? 0 : -1;
@@ -162,7 +242,7 @@
         }
         if (returnKind != Kind.Void) {
             LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, bufSize - wordSize, getGraph());
-            append(new WriteNode(buf, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false));
+            append(new WriteNode(buf, nullWord, location, BarrierType.NONE, false, false));
         }
 
         FrameStateBuilder fsb = new FrameStateBuilder(method, getGraph(), true);
@@ -170,7 +250,7 @@
         getGraph().start().setStateAfter(fs);
 
         ValueNode[] launchArgsArray = args.values().toArray(new ValueNode[args.size()]);
-        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, launchArgsArray));
+        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), CALL_KERNEL, launchArgsArray));
         result.setDeoptimizationState(fs);
 
         ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph());
@@ -193,7 +273,11 @@
             case Long:
                 returnValue = result;
                 break;
-            case Float:
+            case Float: {
+                ValueNode asInt = unique(new ConvertNode(Kind.Long, Kind.Int, result));
+                returnValue = unique(new ReinterpretNode(Kind.Float, asInt));
+                break;
+            }
             case Double:
                 returnValue = unique(new ReinterpretNode(returnKind, result));
                 break;
@@ -220,12 +304,12 @@
     }
 
     /**
-     * Allocates a slot in the kernel parameters' buffer for a Java parameter.
+     * Computes offset and size of space in PARAMS for a Java parameter.
      * 
      * @param kind the kind of the parameter
      * @param javaParametersIndex the index of the Java parameter
      */
-    private void allocateParameter(Kind kind, int javaParametersIndex, BitSet objects, int wordSize) {
+    private void allocateParameter(Kind kind, int javaParametersIndex, List<Integer> objectSlots, int wordSize) {
         int kindByteSize = kind == Kind.Object ? wordSize : kind.getBitCount() / Byte.SIZE;
         bufSize = roundUp(bufSize, kindByteSize);
         javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = bufSize;
@@ -233,7 +317,7 @@
         if (kind == Kind.Object) {
             stamp = StampFactory.object();
             int slot = bufSize / wordSize;
-            objects.set(slot);
+            objectSlots.add(slot);
         } else {
             stamp = StampFactory.forKind(kind);
         }
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Jan 24 17:59:46 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Jan 24 18:01:36 2014 +0100
@@ -39,6 +39,7 @@
 gpu::Ptx::cuda_cu_ctx_create_func_t gpu::Ptx::_cuda_cu_ctx_create;
 gpu::Ptx::cuda_cu_ctx_destroy_func_t gpu::Ptx::_cuda_cu_ctx_destroy;
 gpu::Ptx::cuda_cu_ctx_synchronize_func_t gpu::Ptx::_cuda_cu_ctx_synchronize;
+gpu::Ptx::cuda_cu_ctx_get_current_func_t gpu::Ptx::_cuda_cu_ctx_get_current;
 gpu::Ptx::cuda_cu_ctx_set_current_func_t gpu::Ptx::_cuda_cu_ctx_set_current;
 gpu::Ptx::cuda_cu_device_get_count_func_t gpu::Ptx::_cuda_cu_device_get_count;
 gpu::Ptx::cuda_cu_device_get_name_func_t gpu::Ptx::_cuda_cu_device_get_name;
@@ -337,123 +338,189 @@
   return cu_function;
 }
 
+// A PtxCall is used to manage executing a GPU kernel. In addition to launching
+// the kernel, this class releases resources allocated for the execution.
+class PtxCall: StackObj {
+ private:
+  JavaThread*  _thread;        // the thread on which this call is made
+  address      _buffer;        // buffer containing parameters and _return_value
+  int          _buffer_size;   // size (in bytes) of _buffer
+  oop*         _pinned;        // objects that have been pinned with cuMemHostRegister
+  int          _pinned_length; // length of _pinned
+  gpu::Ptx::CUdeviceptr  _ret_value;     // pointer to slot in GPU memory holding the return value
+  int          _ret_type_size; // size of the return type value
+  bool         _ret_is_object; // specifies if the return type is Object
+
+  bool check(int status, const char *action) {
+    if (status != GRAAL_CUDA_SUCCESS) {
+      Thread* THREAD = _thread;
+      char* message = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, O_BUFLEN + 1);
+      jio_snprintf(message, O_BUFLEN, "[CUDA] *** Error (status=%d): %s", status, action);
+      if (TraceGPUInteraction || HAS_PENDING_EXCEPTION) {
+        tty->print_cr(message);
+      }
+      if (!HAS_PENDING_EXCEPTION) {
+        SharedRuntime::throw_and_post_jvmti_exception(_thread, vmSymbols::java_lang_RuntimeException(), message);
+      }
+      return false;
+    }
+    if (TraceGPUInteraction) {
+      tty->print_cr("[CUDA] Success: %s", action);
+    }
+    return true;
+  }
+
+ public:
+  PtxCall(JavaThread* thread, address buffer, int buffer_size, oop* pinned, int encodedReturnTypeSize) : _thread(thread),
+      _buffer(buffer), _buffer_size(buffer_size), _pinned(pinned), _pinned_length(0), _ret_value(0), _ret_is_object(encodedReturnTypeSize < 0) {
+    _ret_type_size = _ret_is_object ? -encodedReturnTypeSize : encodedReturnTypeSize;
+  }
+
+  bool is_object_return() { return _ret_is_object; }
+
+  void alloc_return_value() {
+    if (_ret_type_size != 0) {
+      if (check(gpu::Ptx::_cuda_cu_memalloc(&_ret_value, _ret_type_size), "Allocate device memory for return value")) {
+        gpu::Ptx::CUdeviceptr* retValuePtr = (gpu::Ptx::CUdeviceptr*) ((_buffer + _buffer_size) - sizeof(_ret_value));
+        *retValuePtr = _ret_value;
+      }
+    }
+  }
+
+  void pin_objects(int count, int* objectOffsets) {
+    if (count == 0) {
+      return;
+    }
+    for (int i = 0; i < count; i++) {
+      int offset = objectOffsets[i];
+      oop* argPtr = (oop*) (_buffer + offset);
+      oop obj = *argPtr;
+      if (obj != NULL) {
+        // Size (in bytes) of object
+        int objSize = obj->size() * HeapWordSize;
+        //tty->print_cr("Pinning object %d at offset %d: %p", i, offset, obj);
+        if (!check(gpu::Ptx::_cuda_cu_mem_host_register(obj, objSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP), "Pin object")) {
+          return;
+        }
+
+        // Record original oop so that its memory can be unpinned
+        _pinned[_pinned_length++] = obj;
+
+        // Replace host pointer to object with device pointer
+        // to object in kernel parameters buffer
+        if (!check(gpu::Ptx::_cuda_cu_mem_host_get_device_pointer((gpu::Ptx::CUdeviceptr*) argPtr, obj, 0), "Get device pointer for pinned object")) {
+          return;
+        }
+      }
+    }
+  }
+
+  void launch(address kernel, jint dimX, jint dimY, jint dimZ) {
+    // grid dimensionality
+    unsigned int gridX = 1;
+    unsigned int gridY = 1;
+    unsigned int gridZ = 1;
+    void * config[] = {
+      GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) _buffer,
+      GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &_buffer_size,
+      GRAAL_CU_LAUNCH_PARAM_END
+    };
+    if (check(gpu::Ptx::_cuda_cu_launch_kernel((struct CUfunc_st*) (address) kernel,
+                                      gridX, gridY, gridZ,
+                                      dimX, dimY, dimZ,
+                                      0, NULL, NULL, (void**) &config), "Launch kernel")) {
+    }
+  }
+
+  void synchronize() {
+    check(gpu::Ptx::_cuda_cu_ctx_synchronize(), "Synchronize kernel");
+  }
+
+  void unpin_objects() {
+    while (_pinned_length > 0) {
+      oop obj = _pinned[--_pinned_length];
+      assert(obj != NULL, "npe");
+      //tty->print_cr("Unpinning object %d: %p", _pinned_length, obj);
+      if (!check(gpu::Ptx::_cuda_cu_mem_host_unregister(obj), "Unpin object")) {
+        return;
+      }
+    }
+  }
+
+  oop get_object_return_value() {
+    oop return_val;
+    check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, T_OBJECT_BYTE_SIZE), "Copy return value from device");
+    return return_val;
+  }
+
+  jlong get_primitive_return_value() {
+    jlong return_val;
+    check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, _ret_type_size), "Copy return value from device");
+    return return_val;
+  }
+
+  void free_return_value() {
+    if (_ret_value != 0) {
+      check(gpu::Ptx::_cuda_cu_memfree(_ret_value), "Free device memory");
+      _ret_value = 0;
+    }
+  }
+
+  void destroy_context() {
+    if (gpu::Ptx::_device_context != NULL) {
+      check(gpu::Ptx::_cuda_cu_ctx_destroy(gpu::Ptx::_device_context), "Destroy context");
+      gpu::Ptx::_device_context = NULL;
+    }
+  }
+
+  ~PtxCall() {
+    unpin_objects();
+    free_return_value();
+    destroy_context();
+  }
+};
+
+
 JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
-                                                  jlong parametersAndReturnValueBuffer,
-                                                  jint parametersAndReturnValueBufferSize,
+                                                  jlong buffer,
+                                                  jint bufferSize,
+                                                  jint objectParametersCount,
+                                                  jlong objectParametersOffsets,
+                                                  jlong pinnedObjects,
                                                   int encodedReturnTypeSize))
   if (kernel == 0L) {
     SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL);
     return 0L;
   }
 
-  // grid dimensionality
-  unsigned int gridX = 1;
-  unsigned int gridY = 1;
-  unsigned int gridZ = 1;
-
-  struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel;
+  PtxCall call(thread, (address) buffer, bufferSize, (oop*) (address) pinnedObjects, encodedReturnTypeSize);
 
-  void * config[5] = {
-    GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) parametersAndReturnValueBuffer,
-    GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &parametersAndReturnValueBufferSize,
-    GRAAL_CU_LAUNCH_PARAM_END
-  };
+#define TRY(action) do { \
+  action; \
+  if (HAS_PENDING_EXCEPTION) return 0L; \
+} while (0)
 
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] launching kernel");
-  }
+  TRY(call.alloc_return_value());
 
-  bool isObjectReturn = encodedReturnTypeSize < 0;
-  int returnTypeSize = encodedReturnTypeSize < 0 ? -encodedReturnTypeSize : encodedReturnTypeSize;
-  gpu::Ptx::CUdeviceptr device_return_value;
-  int status;
-  if (returnTypeSize != 0) {
-    status = _cuda_cu_memalloc(&device_return_value, returnTypeSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to allocate memory for return value pointer on device");
-      return 0L;
-    }
-    // Push device_return_value to kernelParams
-    gpu::Ptx::CUdeviceptr* returnValuePtr = (gpu::Ptx::CUdeviceptr*)
-                                               ((address) parametersAndReturnValueBuffer +
-                                                parametersAndReturnValueBufferSize - sizeof(device_return_value));
-    *returnValuePtr = device_return_value;
-  }
+  TRY(call.pin_objects(objectParametersCount, (int*) (address) objectParametersOffsets));
+
+  TRY(call.launch((address) kernel, dimX, dimY, dimZ));
 
-  status = _cuda_cu_launch_kernel(cu_function,
-                                      gridX, gridY, gridZ,
-                                      dimX, dimY, dimZ,
-                                      0, NULL, NULL, (void **) &config);
+  TRY(call.synchronize());
 
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to launch kernel");
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to launch kernel");
+  if (call.is_object_return()) {
+    oop return_val;
+    TRY(return_val = call.get_object_return_value());
+    thread->set_vm_result(return_val);
     return 0L;
   }
 
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", dimX, dimY, dimZ);
-  }
-
-  status = _cuda_cu_ctx_synchronize();
-
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to synchronize launched kernel");
-    return 0L;
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Synchronized launch kernel");
-  }
+  jlong return_val;
+  TRY(return_val = call.get_primitive_return_value());
+  return return_val;
 
-  jlong primitiveReturnValue = 0L;
-  if (isObjectReturn) {
-    oop return_val;
-    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, device_return_value, T_OBJECT_BYTE_SIZE);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
-      return 0L;
-    }
-    thread->set_vm_result(return_val);
-  } else if (returnTypeSize > 0) {
-    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&primitiveReturnValue, device_return_value, returnTypeSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
-      return 0L;
-    }
-  }
+#undef TRY
 
-  // Free device memory allocated for result
-  if (returnTypeSize != 0) {
-    status = gpu::Ptx::_cuda_cu_memfree(device_return_value);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to free device memory of return value");
-      return 0L;
-    }
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Freed device memory of return value");
-  }
-
-  // Destroy context
-  status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context);
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status);
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to destroy context");
-    return 0L;
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Destroy context");
-  }
-
-  return primitiveReturnValue;
 JRT_END
 
 bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
@@ -620,6 +687,7 @@
     if (handle != NULL) {
       LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init);
       LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize);
+      LOOKUP_CUDA_FUNCTION(cuCtxGetCurrent, cuda_cu_ctx_get_current);
       LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current);
       LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count);
       LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name);
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Fri Jan 24 17:59:46 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Fri Jan 24 18:01:36 2014 +0100
@@ -84,16 +84,19 @@
  * Context creation flags
  */
 
-#define GRAAL_CU_CTX_MAP_HOST 0x08
+#define GRAAL_CU_CTX_MAP_HOST            0x08
+#define GRAAL_CU_CTX_SCHED_BLOCKING_SYNC 0x04
 
 class Ptx {
   friend class gpu;
+  friend class PtxCall;
 
  protected:
   static bool probe_linkage();
   static bool initialize_gpu();
   static unsigned int total_cores();
-  static void * generate_kernel(unsigned char *code, int code_len, const char *name);
+  static void* get_context();
+  static void* generate_kernel(unsigned char *code, int code_len, const char *name);
   static bool execute_warp(int dimX, int dimY, int dimZ, address kernel, PTXKernelArguments & ka, JavaValue &ret);
   static bool execute_kernel(address kernel, PTXKernelArguments & ka, JavaValue &ret);
 public:
@@ -106,8 +109,11 @@
 typedef int CUdevice;     /* CUDA device */
 
   static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
-                                      jlong parametersAndReturnValueBuffer,
-                                      jint parametersAndReturnValueBufferSize,
+                                      jlong buffer,
+                                      jint bufferSize,
+                                      jint objectParametersCount,
+                                      jlong objectParametersOffsets,
+                                      jlong pinnedObjects,
                                       int encodedReturnTypeSize);
 
 private:
@@ -115,6 +121,7 @@
   typedef int (*cuda_cu_ctx_create_func_t)(void*, unsigned int, CUdevice);
   typedef int (*cuda_cu_ctx_destroy_func_t)(void*);
   typedef int (*cuda_cu_ctx_synchronize_func_t)(void);
+  typedef int (*cuda_cu_ctx_get_current_func_t)(void*);
   typedef int (*cuda_cu_ctx_set_current_func_t)(void*);
   typedef int (*cuda_cu_device_get_count_func_t)(int*);
   typedef int (*cuda_cu_device_get_name_func_t)(char*, int, int);
@@ -152,6 +159,7 @@
   static cuda_cu_memfree_func_t                   _cuda_cu_memfree;
   static cuda_cu_memcpy_htod_func_t               _cuda_cu_memcpy_htod;
   static cuda_cu_memcpy_dtoh_func_t               _cuda_cu_memcpy_dtoh;
+  static cuda_cu_ctx_get_current_func_t           _cuda_cu_ctx_get_current;
   static cuda_cu_ctx_set_current_func_t           _cuda_cu_ctx_set_current;
   static cuda_cu_mem_host_register_func_t         _cuda_cu_mem_host_register;
   static cuda_cu_mem_host_get_device_pointer_func_t _cuda_cu_mem_host_get_device_pointer;