# HG changeset patch # User Doug Simon # Date 1390582896 -3600 # Node ID 80cd5c3b8827e342841c28f416fb0b7210c32bb9 # Parent 34ab58984118e3d476bcfe83aa083027477b912a partially fixed passing of object parameters to PTX kernels; use a C++ object for managing resource allocation and cleanup around a PTX kernel execution diff -r 34ab58984118 -r 80cd5c3b8827 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java --- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Fri Jan 24 17:59:46 2014 +0100 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Fri Jan 24 18:01:36 2014 +0100 @@ -67,7 +67,7 @@ public class PTXHotSpotBackend extends HotSpotBackend { /** - * Descriptor for the PTX runtime method for launching a kernel. The C++ signature is: + * Descriptor for the PTX runtime method for calling a kernel. The C++ signature is: * *
      *     jlong (JavaThread* thread,
@@ -77,11 +77,14 @@
      *            jint dimZ,
      *            jlong parametersAndReturnValueBuffer,
      *            jint parametersAndReturnValueBufferSize,
+     *            jint objectParametersCount,
+     *            jlong objectParametersOffsets,
+     *            jlong pinnedObjects,
      *            jint encodedReturnTypeSize)
      * 
*/ // @formatter:off - public static final ForeignCallDescriptor LAUNCH_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class, + public static final ForeignCallDescriptor CALL_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class, Word.class, // thread long.class, // kernel int.class, // dimX @@ -89,6 +92,9 @@ int.class, // dimZ long.class, // parametersAndReturnValueBuffer int.class, // parametersAndReturnValueBufferSize + int.class, // objectParameterCount + long.class, // objectParameterOffsets + long.class, // pinnedObjects int.class); // encodedReturnTypeSize // @formatter:on @@ -114,7 +120,7 @@ CompilerToGPU compilerToGPU = getRuntime().getCompilerToGPU(); if (deviceInitialized) { long launchKernel = compilerToGPU.getLaunchKernelAddress(); - hostForeignCalls.registerForeignCall(LAUNCH_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION); + hostForeignCalls.registerForeignCall(CALL_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION); } super.completeInitialization(); } @@ -171,6 +177,7 @@ new SpeculationLog(), suites, true, new ExternalCompilationResult(), CompilationResultBuilderFactory.Default); if (makeBinary) { try (Scope ds = Debug.scope("GeneratingKernelBinary")) { + assert ptxCode.getTargetCode() != null; long kernel = getRuntime().getCompilerToGPU().generateKernel(ptxCode.getTargetCode(), method.getName()); ptxCode.setEntryPoint(kernel); } catch (Throwable e) { diff -r 34ab58984118 -r 80cd5c3b8827 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java --- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java Fri Jan 24 17:59:46 2014 +0100 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java Fri Jan 24 18:01:36 2014 +0100 @@ -55,21 +55,62 @@ /** * Utility for building a graph that "wraps" a compiled PTX kernel. 
Such a wrapper handles the - * transition from the host CPU to the GPU and back. The graph created is something like the - * following pseudo code with UPPER CASE denoting compile-time constants: + * transition from the host CPU to the GPU and back. The wrapper allocates 3 on-stack buffers: + * + * + * + * The PARAMS buffer is the {@code CU_LAUNCH_PARAM_BUFFER_POINTER} buffer passed in the + * {@code extra} argument to the {@code cuLaunchKernel} function. This buffer contains the + * parameters to the call. The buffer is word aligned and each parameter is aligned in the buffer + * according to its data size. The wrapper copies the incoming arguments into the buffer as is. The + * native {@link PTXHotSpotBackend#CALL_KERNEL callKernel} function will pin the memory for each + * object parameter (using {@code cuMemHostRegister}) and then replace the object pointer in PARAMS + * with an on-device pointer to the object's memory (see {@code cuMemHostGetDevicePointer}). The + * function saves each pinned object pointer into PINNED so that it can be unpinned once the kernel returns. + * The offsets of the object pointers within PARAMS are specified by OBJECT_OFFSETS. + *

+ * As a concrete example, for a kernel whose Java method signature is: * *

- *     T kernel(p0, p1, ..., pN) {
- *         jint bufSize = SIZE_OF_ALIGNED_PARAMS_AND_RETURN_VALUE_WITH_PADDING(p0, p1, ..., pN);
- *         jbyte buf[bufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
- *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, dimX, dimY, dimZ, buf, bufSize, encodedReturnTypeSize);
- *         return convert(result);
+ *     static int kernel(int p1, short p2, Object p3, long p4)
+ * 
+ * + * the graph created is shown below as pseudo-code: + * + *
+ *     int kernel_wrapper(int p1, short p2, oop p3, long p4) {
+ *         address kernelAddr = kernel.start;
+ *         if (kernelAddr == 0) {
+ *             deopt(InvalidateRecompile, RuntimeConstraint);
+ *         }
+ *         byte PARAMS[32];
+ *         word PINNED[1]; // note: no refmap
+ *         int OBJECT_OFFSETS[1] = {8};
+ *         ((int*) PARAMS)[0] = p1;
+ *         ((short*) PARAMS)[2] = p2;
+ *         ((word*) PARAMS)[1] = p3;
+ *         ((long*) PARAMS)[2] = p4;
+ *         int result = CALL_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, 1, 1, 1, PARAMS, 32, 1, OBJECT_OFFSETS, PINNED, 4);
+ *         if (clearPendingException(thread)) {
+ *             deopt(None, RuntimeConstraint);
+ *         }
+ *         return result;
  *     }
  * 
*

* The generated graph includes a reference to the {@link HotSpotNmethod} for the kernel. There must * be another reference to the same {@link HotSpotNmethod} object to ensure that the nmethod is not * unloaded by the next full GC. + *

+ * TODO: Only the memory for objects passed as parameters is pinned. Surely the memory for other + * objects accessed in the kernel reachable from the parameter objects needs to be pinned as well? + *

+ * TODO: Objects references within kernels are currently completely hidden from GC. */ public class PTXWrapperBuilder extends GraphKit { @@ -92,11 +133,23 @@ int[] javaParameterOffsetsInKernelParametersBuffer; /** - * Constants denoting the arguments to {@link PTXHotSpotBackend#LAUNCH_KERNEL}. + * Constants denoting the arguments to {@link PTXHotSpotBackend#CALL_KERNEL}. */ + // @formatter:off enum LaunchArg { - Thread, Kernel, DimX, DimY, DimZ, ParametersAndReturnValueBuffer, ParametersAndReturnValueBufferSize, EncodedReturnTypeSize + Thread, + Kernel, + DimX, + DimY, + DimZ, + ParametersAndReturnValueBuffer, + ParametersAndReturnValueBufferSize, + ObjectParametersCount, + ObjectParametersOffsets, + PinnedObjects, + EncodedReturnTypeSize } + // @formatter:on /** * Creates the graph implementing the CPU to GPU transition. @@ -108,6 +161,7 @@ public PTXWrapperBuilder(ResolvedJavaMethod method, HotSpotNmethod kernel, HotSpotProviders providers) { super(new StructuredGraph(method), providers); int wordSize = providers.getCodeCache().getTarget().wordSize; + int intSize = Integer.SIZE / Byte.SIZE; Kind wordKind = providers.getCodeCache().getTarget().wordKind; Signature sig = method.getSignature(); boolean isStatic = isStatic(method.getModifiers()); @@ -117,17 +171,18 @@ int javaParametersIndex = 0; Kind returnKind = sig.getReturnKind(); - BitSet objects = new BitSet(); + List objectSlots = new ArrayList<>(javaParameters.length); if (!isStatic) { - allocateParameter(Kind.Object, javaParametersIndex++, objects, wordSize); + allocateParameter(Kind.Object, javaParametersIndex++, objectSlots, wordSize); } for (int sigIndex = 0; sigIndex < sigCount; sigIndex++) { Kind kind = sig.getParameterKind(sigIndex); - allocateParameter(kind, javaParametersIndex++, objects, wordSize); + allocateParameter(kind, javaParametersIndex++, objectSlots, wordSize); } bufSize = roundUp(bufSize, wordSize); - // Add slot for holding pointer to device memory storing return value + // Add slot for 
the device memory pointer. The kernel writes a + // pointer in this slot that points to the return value. int encodedReturnTypeSize = 0; if (returnKind != Kind.Void) { bufSize += wordSize; @@ -140,7 +195,29 @@ InvokeNode kernelStart = createInvoke(getClass(), "getKernelStart", ConstantNode.forObject(kernel, providers.getMetaAccess(), getGraph())); - AllocaNode buf = append(new AllocaNode(bufSize / wordSize, objects)); + AllocaNode buf = append(new AllocaNode(bufSize / wordSize, new BitSet())); + ValueNode objectParametersOffsets; + ValueNode pinnedObjects; + ConstantNode nullWord = ConstantNode.forIntegerKind(wordKind, 0L, getGraph()); + if (objectSlots.isEmpty()) { + objectParametersOffsets = ConstantNode.forLong(0, getGraph()); + pinnedObjects = ConstantNode.forLong(0, getGraph()); + } else { + int intsPerWord = wordSize / intSize; + int slots = roundUp(objectSlots.size(), intsPerWord); + objectParametersOffsets = append(new AllocaNode(slots, new BitSet())); + // No refmap for pinned objects list since kernel execution is (currently) GC unsafe + pinnedObjects = append(new AllocaNode(objectSlots.size(), new BitSet())); + + // Initialize the object parameter offsets array + int index = 0; + for (int slot : objectSlots) { + int offset = slot * wordSize; + LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, Kind.Int, index * intSize, getGraph()); + append(new WriteNode(objectParametersOffsets, ConstantNode.forInt(offset, getGraph()), location, BarrierType.NONE, false, false)); + index++; + } + } Map args = new EnumMap<>(LaunchArg.class); args.put(Thread, append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false))); @@ -150,6 +227,9 @@ args.put(DimZ, forInt(1, getGraph())); args.put(ParametersAndReturnValueBuffer, buf); args.put(ParametersAndReturnValueBufferSize, forInt(bufSize, getGraph())); + args.put(ObjectParametersCount, forInt(objectSlots.size(), getGraph())); + args.put(ObjectParametersOffsets, 
objectParametersOffsets); + args.put(PinnedObjects, pinnedObjects); args.put(EncodedReturnTypeSize, forInt(encodedReturnTypeSize, getGraph())); int sigIndex = isStatic ? 0 : -1; @@ -162,7 +242,7 @@ } if (returnKind != Kind.Void) { LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, bufSize - wordSize, getGraph()); - append(new WriteNode(buf, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false)); + append(new WriteNode(buf, nullWord, location, BarrierType.NONE, false, false)); } FrameStateBuilder fsb = new FrameStateBuilder(method, getGraph(), true); @@ -170,7 +250,7 @@ getGraph().start().setStateAfter(fs); ValueNode[] launchArgsArray = args.values().toArray(new ValueNode[args.size()]); - ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, launchArgsArray)); + ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), CALL_KERNEL, launchArgsArray)); result.setDeoptimizationState(fs); ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph()); @@ -193,7 +273,11 @@ case Long: returnValue = result; break; - case Float: + case Float: { + ValueNode asInt = unique(new ConvertNode(Kind.Long, Kind.Int, result)); + returnValue = unique(new ReinterpretNode(Kind.Float, asInt)); + break; + } case Double: returnValue = unique(new ReinterpretNode(returnKind, result)); break; @@ -220,12 +304,12 @@ } /** - * Allocates a slot in the kernel parameters' buffer for a Java parameter. + * Computes offset and size of space in PARAMS for a Java parameter. * * @param kind the kind of the parameter * @param javaParametersIndex the index of the Java parameter */ - private void allocateParameter(Kind kind, int javaParametersIndex, BitSet objects, int wordSize) { + private void allocateParameter(Kind kind, int javaParametersIndex, List objectSlots, int wordSize) { int kindByteSize = kind == Kind.Object ? 
wordSize : kind.getBitCount() / Byte.SIZE; bufSize = roundUp(bufSize, kindByteSize); javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = bufSize; @@ -233,7 +317,7 @@ if (kind == Kind.Object) { stamp = StampFactory.object(); int slot = bufSize / wordSize; - objects.set(slot); + objectSlots.add(slot); } else { stamp = StampFactory.forKind(kind); } diff -r 34ab58984118 -r 80cd5c3b8827 src/gpu/ptx/vm/gpu_ptx.cpp --- a/src/gpu/ptx/vm/gpu_ptx.cpp Fri Jan 24 17:59:46 2014 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Fri Jan 24 18:01:36 2014 +0100 @@ -39,6 +39,7 @@ gpu::Ptx::cuda_cu_ctx_create_func_t gpu::Ptx::_cuda_cu_ctx_create; gpu::Ptx::cuda_cu_ctx_destroy_func_t gpu::Ptx::_cuda_cu_ctx_destroy; gpu::Ptx::cuda_cu_ctx_synchronize_func_t gpu::Ptx::_cuda_cu_ctx_synchronize; +gpu::Ptx::cuda_cu_ctx_get_current_func_t gpu::Ptx::_cuda_cu_ctx_get_current; gpu::Ptx::cuda_cu_ctx_set_current_func_t gpu::Ptx::_cuda_cu_ctx_set_current; gpu::Ptx::cuda_cu_device_get_count_func_t gpu::Ptx::_cuda_cu_device_get_count; gpu::Ptx::cuda_cu_device_get_name_func_t gpu::Ptx::_cuda_cu_device_get_name; @@ -337,123 +338,189 @@ return cu_function; } +// A PtxCall is used to manage executing a GPU kernel. In addition to launching +// the kernel, this class releases resources allocated for the execution. 
+class PtxCall: StackObj { + private: + JavaThread* _thread; // the thread on which this call is made + address _buffer; // buffer containing parameters and _return_value + int _buffer_size; // size (in bytes) of _buffer + oop* _pinned; // objects that have been pinned with cuMemHostRegister + int _pinned_length; // length of _pinned + gpu::Ptx::CUdeviceptr _ret_value; // pointer to slot in GPU memory holding the return value + int _ret_type_size; // size of the return type value + bool _ret_is_object; // specifies if the return type is Object + + bool check(int status, const char *action) { + if (status != GRAAL_CUDA_SUCCESS) { + Thread* THREAD = _thread; + char* message = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, O_BUFLEN + 1); + jio_snprintf(message, O_BUFLEN, "[CUDA] *** Error (status=%d): %s", status, action); + if (TraceGPUInteraction || HAS_PENDING_EXCEPTION) { + tty->print_cr(message); + } + if (!HAS_PENDING_EXCEPTION) { + SharedRuntime::throw_and_post_jvmti_exception(_thread, vmSymbols::java_lang_RuntimeException(), message); + } + return false; + } + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Success: %s", action); + } + return true; + } + + public: + PtxCall(JavaThread* thread, address buffer, int buffer_size, oop* pinned, int encodedReturnTypeSize) : _thread(thread), + _buffer(buffer), _buffer_size(buffer_size), _pinned(pinned), _pinned_length(0), _ret_value(0), _ret_is_object(encodedReturnTypeSize < 0) { + _ret_type_size = _ret_is_object ? 
-encodedReturnTypeSize : encodedReturnTypeSize; + } + + bool is_object_return() { return _ret_is_object; } + + void alloc_return_value() { + if (_ret_type_size != 0) { + if (check(gpu::Ptx::_cuda_cu_memalloc(&_ret_value, _ret_type_size), "Allocate device memory for return value")) { + gpu::Ptx::CUdeviceptr* retValuePtr = (gpu::Ptx::CUdeviceptr*) ((_buffer + _buffer_size) - sizeof(_ret_value)); + *retValuePtr = _ret_value; + } + } + } + + void pin_objects(int count, int* objectOffsets) { + if (count == 0) { + return; + } + for (int i = 0; i < count; i++) { + int offset = objectOffsets[i]; + oop* argPtr = (oop*) (_buffer + offset); + oop obj = *argPtr; + if (obj != NULL) { + // Size (in bytes) of object + int objSize = obj->size() * HeapWordSize; + //tty->print_cr("Pinning object %d at offset %d: %p", i, offset, obj); + if (!check(gpu::Ptx::_cuda_cu_mem_host_register(obj, objSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP), "Pin object")) { + return; + } + + // Record original oop so that its memory can be unpinned + _pinned[_pinned_length++] = obj; + + // Replace host pointer to object with device pointer + // to object in kernel parameters buffer + if (!check(gpu::Ptx::_cuda_cu_mem_host_get_device_pointer((gpu::Ptx::CUdeviceptr*) argPtr, obj, 0), "Get device pointer for pinned object")) { + return; + } + } + } + } + + void launch(address kernel, jint dimX, jint dimY, jint dimZ) { + // grid dimensionality + unsigned int gridX = 1; + unsigned int gridY = 1; + unsigned int gridZ = 1; + void * config[] = { + GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) _buffer, + GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &_buffer_size, + GRAAL_CU_LAUNCH_PARAM_END + }; + if (check(gpu::Ptx::_cuda_cu_launch_kernel((struct CUfunc_st*) (address) kernel, + gridX, gridY, gridZ, + dimX, dimY, dimZ, + 0, NULL, NULL, (void**) &config), "Launch kernel")) { + } + } + + void synchronize() { + check(gpu::Ptx::_cuda_cu_ctx_synchronize(), "Synchronize kernel"); + } + + void unpin_objects() { + while 
(_pinned_length > 0) { + oop obj = _pinned[--_pinned_length]; + assert(obj != NULL, "npe"); + //tty->print_cr("Unpinning object %d: %p", _pinned_length, obj); + if (!check(gpu::Ptx::_cuda_cu_mem_host_unregister(obj), "Unpin object")) { + return; + } + } + } + + oop get_object_return_value() { + oop return_val; + check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, T_OBJECT_BYTE_SIZE), "Copy return value from device"); + return return_val; + } + + jlong get_primitive_return_value() { + jlong return_val; + check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, _ret_type_size), "Copy return value from device"); + return return_val; + } + + void free_return_value() { + if (_ret_value != 0) { + check(gpu::Ptx::_cuda_cu_memfree(_ret_value), "Free device memory"); + _ret_value = 0; + } + } + + void destroy_context() { + if (gpu::Ptx::_device_context != NULL) { + check(gpu::Ptx::_cuda_cu_ctx_destroy(gpu::Ptx::_device_context), "Destroy context"); + gpu::Ptx::_device_context = NULL; + } + } + + ~PtxCall() { + unpin_objects(); + free_return_value(); + destroy_context(); + } +}; + + JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ, - jlong parametersAndReturnValueBuffer, - jint parametersAndReturnValueBufferSize, + jlong buffer, + jint bufferSize, + jint objectParametersCount, + jlong objectParametersOffsets, + jlong pinnedObjects, int encodedReturnTypeSize)) if (kernel == 0L) { SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL); return 0L; } - // grid dimensionality - unsigned int gridX = 1; - unsigned int gridY = 1; - unsigned int gridZ = 1; - - struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel; + PtxCall call(thread, (address) buffer, bufferSize, (oop*) (address) pinnedObjects, encodedReturnTypeSize); - void * config[5] = { - GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) parametersAndReturnValueBuffer, - 
GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, ¶metersAndReturnValueBufferSize, - GRAAL_CU_LAUNCH_PARAM_END - }; +#define TRY(action) do { \ + action; \ + if (HAS_PENDING_EXCEPTION) return 0L; \ +} while (0) - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] launching kernel"); - } + TRY(call.alloc_return_value()); - bool isObjectReturn = encodedReturnTypeSize < 0; - int returnTypeSize = encodedReturnTypeSize < 0 ? -encodedReturnTypeSize : encodedReturnTypeSize; - gpu::Ptx::CUdeviceptr device_return_value; - int status; - if (returnTypeSize != 0) { - status = _cuda_cu_memalloc(&device_return_value, returnTypeSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to allocate memory for return value pointer on device"); - return 0L; - } - // Push device_return_value to kernelParams - gpu::Ptx::CUdeviceptr* returnValuePtr = (gpu::Ptx::CUdeviceptr*) - ((address) parametersAndReturnValueBuffer + - parametersAndReturnValueBufferSize - sizeof(device_return_value)); - *returnValuePtr = device_return_value; - } + TRY(call.pin_objects(objectParametersCount, (int*) (address) objectParametersOffsets)); + + TRY(call.launch((address) kernel, dimX, dimY, dimZ)); - status = _cuda_cu_launch_kernel(cu_function, - gridX, gridY, gridZ, - dimX, dimY, dimZ, - 0, NULL, NULL, (void **) &config); + TRY(call.synchronize()); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to launch kernel"); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to launch kernel"); + if (call.is_object_return()) { + oop return_val; + TRY(return_val = call.get_object_return_value()); + thread->set_vm_result(return_val); return 0L; } - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", dimX, 
dimY, dimZ); - } - - status = _cuda_cu_ctx_synchronize(); - - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to synchronize launched kernel"); - return 0L; - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Synchronized launch kernel"); - } + jlong return_val; + TRY(return_val = call.get_primitive_return_value()); + return return_val; - jlong primitiveReturnValue = 0L; - if (isObjectReturn) { - oop return_val; - status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, device_return_value, T_OBJECT_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument"); - return 0L; - } - thread->set_vm_result(return_val); - } else if (returnTypeSize > 0) { - status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&primitiveReturnValue, device_return_value, returnTypeSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument"); - return 0L; - } - } +#undef TRY - // Free device memory allocated for result - if (returnTypeSize != 0) { - status = gpu::Ptx::_cuda_cu_memfree(device_return_value); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to free device memory of return value"); - return 0L; - } - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Freed device memory of 
return value"); - } - - // Destroy context - status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to destroy context"); - return 0L; - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Destroy context"); - } - - return primitiveReturnValue; JRT_END bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) { @@ -620,6 +687,7 @@ if (handle != NULL) { LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init); LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize); + LOOKUP_CUDA_FUNCTION(cuCtxGetCurrent, cuda_cu_ctx_get_current); LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current); LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count); LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name); diff -r 34ab58984118 -r 80cd5c3b8827 src/gpu/ptx/vm/gpu_ptx.hpp --- a/src/gpu/ptx/vm/gpu_ptx.hpp Fri Jan 24 17:59:46 2014 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.hpp Fri Jan 24 18:01:36 2014 +0100 @@ -84,16 +84,19 @@ * Context creation flags */ -#define GRAAL_CU_CTX_MAP_HOST 0x08 +#define GRAAL_CU_CTX_MAP_HOST 0x08 +#define GRAAL_CU_CTX_SCHED_BLOCKING_SYNC 0x04 class Ptx { friend class gpu; + friend class PtxCall; protected: static bool probe_linkage(); static bool initialize_gpu(); static unsigned int total_cores(); - static void * generate_kernel(unsigned char *code, int code_len, const char *name); + static void* get_context(); + static void* generate_kernel(unsigned char *code, int code_len, const char *name); static bool execute_warp(int dimX, int dimY, int dimZ, address kernel, PTXKernelArguments & ka, JavaValue &ret); static bool execute_kernel(address kernel, PTXKernelArguments & ka, JavaValue &ret); public: @@ -106,8 +109,11 @@ typedef int CUdevice; /* CUDA device */ static jlong 
execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ, - jlong parametersAndReturnValueBuffer, - jint parametersAndReturnValueBufferSize, + jlong buffer, + jint bufferSize, + jint objectParametersCount, + jlong objectParametersOffsets, + jlong pinnedObjects, int encodedReturnTypeSize); private: @@ -115,6 +121,7 @@ typedef int (*cuda_cu_ctx_create_func_t)(void*, unsigned int, CUdevice); typedef int (*cuda_cu_ctx_destroy_func_t)(void*); typedef int (*cuda_cu_ctx_synchronize_func_t)(void); + typedef int (*cuda_cu_ctx_get_current_func_t)(void*); typedef int (*cuda_cu_ctx_set_current_func_t)(void*); typedef int (*cuda_cu_device_get_count_func_t)(int*); typedef int (*cuda_cu_device_get_name_func_t)(char*, int, int); @@ -152,6 +159,7 @@ static cuda_cu_memfree_func_t _cuda_cu_memfree; static cuda_cu_memcpy_htod_func_t _cuda_cu_memcpy_htod; static cuda_cu_memcpy_dtoh_func_t _cuda_cu_memcpy_dtoh; + static cuda_cu_ctx_get_current_func_t _cuda_cu_ctx_get_current; static cuda_cu_ctx_set_current_func_t _cuda_cu_ctx_set_current; static cuda_cu_mem_host_register_func_t _cuda_cu_mem_host_register; static cuda_cu_mem_host_get_device_pointer_func_t _cuda_cu_mem_host_get_device_pointer;