# HG changeset patch # User S.Bharadwaj Yadavalli # Date 1383345243 14400 # Node ID 1a7e7011a3413e92146a3f6ceaf7e16d4bf6012e # Parent 0dd597c6c9c74df893c7a395d6acdd9860cfdd1d * PTX kernel argument buffer now has naturally aligned arguments as required by PTX JIT compiler. * Change dynamic loading of CUDA driver API functions to load 32-bit or 64-bit versions, depending on the host architecture. * Add ability to generate PTX kernels to be launched both on 32-bit and 64-bit hosts. * Use Unified Virtual Memory APIs to perform array argument marshalling. * PTX array storage test runs on the device and returns correct results. * More integer test failures on GPU fixed. diff -r 0dd597c6c9c7 -r 1a7e7011a341 graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java --- a/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java Fri Nov 01 13:07:22 2013 +0100 +++ b/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java Fri Nov 01 18:34:03 2013 -0400 @@ -105,6 +105,7 @@ protected Value source1; protected Value source2; private boolean logicInstruction = false; + private boolean ldRetAddrInstruction = false; public StandardFormat(Variable dst, Value src1, Value src2) { setDestination(dst); @@ -139,8 +140,18 @@ logicInstruction = b; } + public void setLdRetAddrInstruction(boolean b) { + ldRetAddrInstruction = b; + } + public String typeForKind(Kind k) { - if (logicInstruction) { + if (ldRetAddrInstruction) { + if (System.getProperty("os.arch").compareTo("amd64") == 0) { + return "u64"; + } else { + return "u32"; + } + } else if (logicInstruction) { switch (k.getTypeChar()) { case 's': return "b16"; @@ -658,16 +669,16 @@ } public static class Param extends SingleOperandFormat { - - private boolean lastParameter; + // Last parameter holds the return parameter. 
+ private boolean returnParameter; public Param(Variable d, boolean lastParam) { super(d, null); - setLastParameter(lastParam); + setReturnParameter(lastParam); } - public void setLastParameter(boolean value) { - lastParameter = value; + public void setReturnParameter(boolean value) { + returnParameter = value; } public String emitParameter(Variable v) { @@ -675,30 +686,38 @@ } public void emit(PTXAssembler asm) { - asm.emitString(".param ." + paramForKind(dest.getKind()) + emitParameter(dest) + (lastParameter ? "" : ",")); + asm.emitString(".param ." + paramForKind(dest.getKind()) + emitParameter(dest) + (returnParameter ? "" : ",")); } public String paramForKind(Kind k) { - switch (k.getTypeChar()) { - case 'z': - case 'f': - return "s32"; - case 'b': - return "s8"; - case 's': - return "s16"; - case 'c': - return "u16"; - case 'i': - return "s32"; - case 'j': - return "s64"; - case 'd': - return "f64"; - case 'a': + if (returnParameter) { + if (System.getProperty("os.arch").compareTo("amd64") == 0) { return "u64"; - default: - throw GraalInternalError.shouldNotReachHere(); + } else { + return "u32"; + } + } else { + switch (k.getTypeChar()) { + case 'z': + case 'f': + return "s32"; + case 'b': + return "s8"; + case 's': + return "s16"; + case 'c': + return "u16"; + case 'i': + return "s32"; + case 'j': + return "s64"; + case 'd': + return "f64"; + case 'a': + return "u64"; + default: + throw GraalInternalError.shouldNotReachHere(); + } } } diff -r 0dd597c6c9c7 -r 1a7e7011a341 graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java --- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java Fri Nov 01 13:07:22 2013 +0100 +++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java Fri Nov 01 18:34:03 2013 -0400 @@ -40,7 +40,11 @@ int[] array3 = {1, 2, 3, 4, 5, 6, 7, 8, 9}; invoke(compile("testStoreArray1I"), array1, 2); - 
printReport("testStoreArray1I: " + Arrays.toString(array1)); + if (array1[2] == 42) { + printReport("testStoreArray1I: " + Arrays.toString(array1) + " PASSED"); + } else { + printReport("testStoreArray1I: " + Arrays.toString(array1) + " FAILED"); + } invoke(compile("testStoreArrayWarp0"), array2, 2); printReport("testStoreArrayWarp0: " + Arrays.toString(array2)); diff -r 0dd597c6c9c7 -r 1a7e7011a341 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java --- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Fri Nov 01 13:07:22 2013 +0100 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Fri Nov 01 18:34:03 2013 -0400 @@ -44,6 +44,7 @@ import com.oracle.graal.lir.ptx.*; import com.oracle.graal.nodes.*; import com.oracle.graal.nodes.cfg.*; +import com.oracle.graal.lir.ptx.PTXMemOp.LoadReturnAddrOp; /** * HotSpot PTX specific backend. @@ -64,6 +65,84 @@ return new PTXFrameMap(getCodeCache()); } + static final class RegisterAnalysis extends ValueProcedure { + private final SortedSet unsigned64 = new TreeSet<>(); + private final SortedSet signed64 = new TreeSet<>(); + private final SortedSet float32 = new TreeSet<>(); + private final SortedSet signed32 = new TreeSet<>(); + private final SortedSet float64 = new TreeSet<>(); + + LIRInstruction op; + + void emitDeclarations(Buffer codeBuffer) { + for (Integer i : signed32) { + codeBuffer.emitString(".reg .s32 %r" + i.intValue() + ";"); + } + for (Integer i : signed64) { + codeBuffer.emitString(".reg .s64 %r" + i.intValue() + ";"); + } + for (Integer i : unsigned64) { + codeBuffer.emitString(".reg .u64 %r" + i.intValue() + ";"); + } + for (Integer i : float32) { + codeBuffer.emitString(".reg .f32 %r" + i.intValue() + ";"); + } + for (Integer i : float64) { + codeBuffer.emitString(".reg .f64 %r" + i.intValue() + ";"); + } + } + + @Override + public Value doValue(Value value, OperandMode mode, EnumSet 
flags) { + if (isVariable(value)) { + Variable regVal = (Variable) value; + Kind regKind = regVal.getKind(); + if ((op instanceof LoadReturnAddrOp) && (mode == OperandMode.DEF)) { + unsigned64.add(regVal.index); + } else { + switch (regKind) { + case Int: + // If the register was used as a wider signed type + // do not add it here + if (!signed64.contains(regVal.index)) { + signed32.add(regVal.index); + } + break; + case Long: + // If the register was used as a narrower signed type + // remove it from there and add it to wider type. + if (signed32.contains(regVal.index)) { + signed32.remove(regVal.index); + } + signed64.add(regVal.index); + break; + case Float: + // If the register was used as a wider signed type + // do not add it here + if (!float64.contains(regVal.index)) { + float32.add(regVal.index); + } + break; + case Double: + // If the register was used as a narrower signed type + // remove it from there and add it to wider type. + if (float32.contains(regVal.index)) { + float32.remove(regVal.index); + } + float64.add(regVal.index); + break; + case Object: + unsigned64.add(regVal.index); + break; + default: + throw GraalInternalError.shouldNotReachHere("unhandled register type " + value.toString()); + } + } + } + return value; + } + } + class PTXFrameContext implements FrameContext { @Override @@ -147,94 +226,27 @@ assert codeCacheOwner != null : lirGen.getGraph() + " is not associated with a method"; Buffer codeBuffer = tasm.asm.codeBuffer; - - final SortedSet signed32 = new TreeSet<>(); - final SortedSet signed64 = new TreeSet<>(); - final SortedSet unsigned64 = new TreeSet<>(); - final SortedSet float32 = new TreeSet<>(); - final SortedSet float64 = new TreeSet<>(); - - ValueProcedure trackRegisterKind = new ValueProcedure() { - - @Override - public Value doValue(Value value, OperandMode mode, EnumSet flags) { - if (isVariable(value)) { - Variable regVal = (Variable) value; - Kind regKind = regVal.getKind(); - switch (regKind) { - case Int: - // If the 
register was used as a wider signed type - // do not add it here - if (!signed64.contains(regVal.index)) { - signed32.add(regVal.index); - } - break; - case Long: - // If the register was used as a narrower signed type - // remove it from there and add it to wider type. - if (signed32.contains(regVal.index)) { - signed32.remove(regVal.index); - } - signed64.add(regVal.index); - break; - case Float: - // If the register was used as a wider signed type - // do not add it here - if (!float64.contains(regVal.index)) { - float32.add(regVal.index); - } - break; - case Double: - // If the register was used as a narrower signed type - // remove it from there and add it to wider type. - if (float32.contains(regVal.index)) { - float32.remove(regVal.index); - } - float64.add(regVal.index); - break; - case Object: - unsigned64.add(regVal.index); - break; - default: - throw GraalInternalError.shouldNotReachHere("unhandled register type " + value.toString()); - } - } - return value; - } - }; + RegisterAnalysis registerAnalysis = new RegisterAnalysis(); for (Block b : lirGen.lir.codeEmittingOrder()) { for (LIRInstruction op : lirGen.lir.lir(b)) { if (op instanceof LabelOp) { // Don't consider this as a definition } else { - op.forEachTemp(trackRegisterKind); - op.forEachOutput(trackRegisterKind); + registerAnalysis.op = op; + op.forEachTemp(registerAnalysis); + op.forEachOutput(registerAnalysis); } } } - for (Integer i : signed32) { - codeBuffer.emitString(".reg .s32 %r" + i.intValue() + ";"); - } - for (Integer i : signed64) { - codeBuffer.emitString(".reg .s64 %r" + i.intValue() + ";"); - } - for (Integer i : unsigned64) { - codeBuffer.emitString(".reg .u64 %r" + i.intValue() + ";"); - } - for (Integer i : float32) { - codeBuffer.emitString(".reg .f32 %r" + i.intValue() + ";"); - } - for (Integer i : float64) { - codeBuffer.emitString(".reg .f64 %r" + i.intValue() + ";"); - } + registerAnalysis.emitDeclarations(codeBuffer); + // emit predicate register declaration int 
maxPredRegNum = ((PTXLIRGenerator) lirGen).getNextPredRegNumber(); if (maxPredRegNum > 0) { codeBuffer.emitString(".reg .pred %p<" + maxPredRegNum + ">;"); } - codeBuffer.emitString(".reg .pred %r;"); // used for setp bool } @Override diff -r 0dd597c6c9c7 -r 1a7e7011a341 graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXMemOp.java --- a/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXMemOp.java Fri Nov 01 13:07:22 2013 +0100 +++ b/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXMemOp.java Fri Nov 01 18:34:03 2013 -0400 @@ -166,7 +166,9 @@ case Long: case Float: case Double: - new Ld(Parameter, result, addr.getBase(), Constant.forLong(addr.getDisplacement())).emit(masm); + Ld ldIns = new Ld(Parameter, result, addr.getBase(), Constant.forLong(addr.getDisplacement())); + ldIns.setLdRetAddrInstruction(true); + ldIns.emit(masm); break; default: throw GraalInternalError.shouldNotReachHere(); diff -r 0dd597c6c9c7 -r 1a7e7011a341 src/gpu/ptx/vm/gpu_ptx.cpp --- a/src/gpu/ptx/vm/gpu_ptx.cpp Fri Nov 01 13:07:22 2013 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Fri Nov 01 18:34:03 2013 -0400 @@ -49,7 +49,21 @@ gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex; gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh; gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree; +gpu::Ptx::cuda_cu_mem_host_register_func_t gpu::Ptx::_cuda_cu_mem_host_register; +gpu::Ptx::cuda_cu_mem_host_get_device_pointer_func_t gpu::Ptx::_cuda_cu_mem_host_get_device_pointer; +gpu::Ptx::cuda_cu_mem_host_unregister_func_t gpu::Ptx::_cuda_cu_mem_host_unregister; +#define STRINGIFY(x) #x + +#define LOOKUP_CUDA_FUNCTION(name, alias) \ + _##alias = \ + CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \ + if (_##alias == NULL) { \ + tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \ + return 0; \ + } \ + +#define LOOKUP_CUDA_V2_FUNCTION(name, alias) 
LOOKUP_CUDA_FUNCTION(name##_v2, alias) /* * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs @@ -199,7 +213,7 @@ tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device); return 0; } - + status = _cuda_cu_device_get_attribute(&async_engines, GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, _cu_device); @@ -234,7 +248,7 @@ tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size); } return (total); - + } void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) { @@ -262,7 +276,7 @@ jit_option_values[2] = (void *)(size_t)jit_register_count; /* Create CUDA context to compile and execute the kernel */ - int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device); + int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device); if (status != GRAAL_CUDA_SUCCESS) { tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status); @@ -443,9 +457,6 @@ tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type); } - // Copy all reference arguments from device to host memory. 
- ptxka.copyRefArgsFromDtoH(); - // Free device memory allocated for result status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value); if (status != GRAAL_CUDA_SUCCESS) { @@ -487,40 +498,36 @@ void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE); free(buffer); if (handle != NULL) { - _cuda_cu_init = - CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit")); - _cuda_cu_ctx_create = - CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate")); - _cuda_cu_ctx_destroy = - CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy")); - _cuda_cu_ctx_synchronize = - CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize")); - _cuda_cu_ctx_set_current = - CAST_TO_FN_PTR(cuda_cu_ctx_set_current_func_t, os::dll_lookup(handle, "cuCtxSetCurrent")); - _cuda_cu_device_get_count = - CAST_TO_FN_PTR(cuda_cu_device_get_count_func_t, os::dll_lookup(handle, "cuDeviceGetCount")); - _cuda_cu_device_get_name = - CAST_TO_FN_PTR(cuda_cu_device_get_name_func_t, os::dll_lookup(handle, "cuDeviceGetName")); - _cuda_cu_device_get = - CAST_TO_FN_PTR(cuda_cu_device_get_func_t, os::dll_lookup(handle, "cuDeviceGet")); - _cuda_cu_device_compute_capability = - CAST_TO_FN_PTR(cuda_cu_device_compute_capability_func_t, os::dll_lookup(handle, "cuDeviceComputeCapability")); - _cuda_cu_device_get_attribute = - CAST_TO_FN_PTR(cuda_cu_device_get_attribute_func_t, os::dll_lookup(handle, "cuDeviceGetAttribute")); - _cuda_cu_module_get_function = - CAST_TO_FN_PTR(cuda_cu_module_get_function_func_t, os::dll_lookup(handle, "cuModuleGetFunction")); - _cuda_cu_module_load_data_ex = - CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx")); - _cuda_cu_launch_kernel = - CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel")); - _cuda_cu_memalloc = - CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc")); - 
_cuda_cu_memfree = - CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree")); - _cuda_cu_memcpy_htod = - CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD")); - _cuda_cu_memcpy_dtoh = - CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH")); + LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init); + LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize); + LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current); + LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count); + LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name); + LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get); + LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability); + LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute); + LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function); + LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex); + LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel); + LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register); + LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister); +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) + LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create); + LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy); + LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc); + LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree); + LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod); + LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh); + LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer); +#else + LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create); + LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy); + LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc); + LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree); + LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, 
cuda_cu_memcpy_htod); + LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh); + LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer); +#endif if (TraceGPUInteraction) { tty->print_cr("[CUDA] Success: library linkage"); diff -r 0dd597c6c9c7 -r 1a7e7011a341 src/gpu/ptx/vm/gpu_ptx.hpp --- a/src/gpu/ptx/vm/gpu_ptx.hpp Fri Nov 01 13:07:22 2013 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.hpp Fri Nov 01 18:34:03 2013 -0400 @@ -46,6 +46,13 @@ #define GRAAL_CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES 4 #define GRAAL_CUDA_ERROR_NO_BINARY_FOR_GPU 209 +/* + * Flags for cuMemHostRegister + */ + +#define GRAAL_CU_MEMHOSTREGISTER_PORTABLE 1 +#define GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP 2 + /** * End of array terminator for the extra parameter to * ::cuLaunchKernel @@ -73,6 +80,12 @@ */ #define GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE ((void*) 0x02) +/* + * Context creation flags + */ + +#define GRAAL_CU_CTX_MAP_HOST 0x08 + class Ptx { friend class gpu; @@ -90,9 +103,11 @@ typedef unsigned int CUdeviceptr; #endif +typedef int CUdevice; /**< CUDA device */ + private: typedef int (*cuda_cu_init_func_t)(unsigned int); - typedef int (*cuda_cu_ctx_create_func_t)(void*, int, int); + typedef int (*cuda_cu_ctx_create_func_t)(void*, unsigned int, CUdevice); typedef int (*cuda_cu_ctx_destroy_func_t)(void*); typedef int (*cuda_cu_ctx_synchronize_func_t)(void); typedef int (*cuda_cu_ctx_set_current_func_t)(void*); @@ -107,10 +122,13 @@ unsigned int, void*, void**, void**); typedef int (*cuda_cu_module_get_function_func_t)(void*, void*, const char*); typedef int (*cuda_cu_module_load_data_ex_func_t)(void*, void*, unsigned int, void*, void**); - typedef int (*cuda_cu_memalloc_func_t)(void*, size_t); + typedef int (*cuda_cu_memalloc_func_t)(gpu::Ptx::CUdeviceptr*, size_t); typedef int (*cuda_cu_memfree_func_t)(gpu::Ptx::CUdeviceptr); typedef int (*cuda_cu_memcpy_htod_func_t)(gpu::Ptx::CUdeviceptr, const void*, unsigned int); typedef int (*cuda_cu_memcpy_dtoh_func_t)(const void*, 
gpu::Ptx::CUdeviceptr, unsigned int); + typedef int (*cuda_cu_mem_host_register_func_t)(void*, size_t, unsigned int); + typedef int (*cuda_cu_mem_host_get_device_pointer_func_t)(gpu::Ptx::CUdeviceptr*, void*, unsigned int); + typedef int (*cuda_cu_mem_host_unregister_func_t)(void*); public: static cuda_cu_init_func_t _cuda_cu_init; @@ -130,6 +148,9 @@ static cuda_cu_memcpy_htod_func_t _cuda_cu_memcpy_htod; static cuda_cu_memcpy_dtoh_func_t _cuda_cu_memcpy_dtoh; static cuda_cu_ctx_set_current_func_t _cuda_cu_ctx_set_current; + static cuda_cu_mem_host_register_func_t _cuda_cu_mem_host_register; + static cuda_cu_mem_host_get_device_pointer_func_t _cuda_cu_mem_host_get_device_pointer; + static cuda_cu_mem_host_unregister_func_t _cuda_cu_mem_host_unregister; protected: static void* _device_context; diff -r 0dd597c6c9c7 -r 1a7e7011a341 src/gpu/ptx/vm/ptxKernelArguments.cpp --- a/src/gpu/ptx/vm/ptxKernelArguments.cpp Fri Nov 01 13:07:22 2013 +0100 +++ b/src/gpu/ptx/vm/ptxKernelArguments.cpp Fri Nov 01 18:34:03 2013 -0400 @@ -38,20 +38,32 @@ return arg; } +/* + * Pad kernel argument buffer to naturally align for given size. + */ +void PTXKernelArguments::pad_kernel_argument_buffer(size_t dataSz) { + while ((_bufferOffset % dataSz) != 0) { + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (char) 0; + _bufferOffset += sizeof(char); + } + return; +} void PTXKernelArguments::do_int() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_INT return value pointer on device. 
Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_INT return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; _bufferOffset += sizeof(_dev_return_value); } else { // Get the next java argument and its value which should be a T_INT @@ -63,9 +75,13 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i; - } + + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(intval.i)); + // Push the int argument value to _kernelArgBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i; + // Advance _bufferOffset _bufferOffset += sizeof(intval.i); } @@ -75,17 +91,18 @@ void PTXKernelArguments::do_float() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_INT return value pointer on device. 
Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_FLOAT return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; // Advance _bufferOffset _bufferOffset += sizeof(_dev_return_value); } else { @@ -98,9 +115,11 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(floatval.f)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f; + // Advance _bufferOffset _bufferOffset += sizeof(floatval.f); } @@ -111,18 +130,19 @@ // If the parameter is a return value, jvalue doubleval; if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_INT return value pointer on device. 
Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_DOUBLE return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } - // Advance _bufferOffset + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Advance _bufferOffset. _bufferOffset += sizeof(doubleval.d); } else { // Get the next java argument and its value which should be a T_INT @@ -133,11 +153,16 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(doubleval.d)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d; + // Advance _bufferOffset _bufferOffset += sizeof(doubleval.d); + // For a 64-bit host, since size of double is 8, there is no need + // to pad the kernel argument buffer to ensure 8-byte alignment of + // the next potential argument to be pushed. 
} return; } @@ -145,17 +170,18 @@ void PTXKernelArguments::do_long() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_LONG return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_LONG return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; // Advance _bufferOffset _bufferOffset += sizeof(_dev_return_value); } else { @@ -168,11 +194,16 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. 
+ pad_kernel_argument_buffer(sizeof(val.j)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j; + // Advance _bufferOffset _bufferOffset += sizeof(val.j); + // For a 64-bit host, since size of long is 8, there is no need + // to pad the kernel argument buffer to ensure 8-byte alignment of + // the next potential argument to be pushed. } return; } @@ -180,17 +211,19 @@ void PTXKernelArguments::do_byte() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_BYTE return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_BYTE return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Advance _bufferOffset _bufferOffset += sizeof(_dev_return_value); } else { @@ -203,11 +236,16 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b; - } + // Kernel arguments are expected to be naturally aligned. 
+ // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(val.b)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b; + // Advance _bufferOffset _bufferOffset += sizeof(val.b); + // No trailing padding is inserted here: the next argument, if any, + // is padded to its own natural alignment by + // pad_kernel_argument_buffer() before it is pushed. } return; } @@ -215,32 +253,34 @@ void PTXKernelArguments::do_bool() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_BYTE return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_BOOLEAN return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } - // Advance _bufferOffset + // Kernel arguments are expected to be naturally aligned. 
+ pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; _bufferOffset += sizeof(_dev_return_value); } else { - // Get the next java argument and its value which should be a T_BYTE - oop arg = next_arg(T_BYTE); + // Get the next java argument and its value which should be a T_BOOLEAN + oop arg = next_arg(T_BOOLEAN); // Copy the java argument value to kernelArgBuffer jvalue val; if (java_lang_boxing_object::get_value(arg, &val) != T_BOOLEAN) { - tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE"); + tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BOOLEAN"); _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(val.z)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z; + // Advance _bufferOffset _bufferOffset += sizeof(val.z); } @@ -257,35 +297,28 @@ gpu::Ptx::CUdeviceptr arrayArgOnDev; int status; - if (is_kernel_arg_setup()) { - // Allocate device memory for array argument on device. Size in bytes - status = gpu::Ptx::_cuda_cu_memalloc(&arrayArgOnDev, argSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for array argument on device", - status); - _success = false; - return; - } - // Copy array argument to device - status = gpu::Ptx::_cuda_cu_memcpy_htod(arrayArgOnDev, arg, argSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument content to device memory", - status); - _success = false; - return; - } + // Register host memory for use by the device. 
Size in bytes + status = gpu::Ptx::_cuda_cu_mem_host_register(arg, argSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to register host memory for array argument on device", + status); + _success = false; + return; + } + // Get device pointer + status = gpu::Ptx::_cuda_cu_mem_host_get_device_pointer(&arrayArgOnDev, arg, 0); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to get device pointer of mapped pinned memory of array argument.", + status); + _success = false; + return; + } - // Push device array argument to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev; - } else { - arrayArgOnDev = *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]); - status = gpu::Ptx::_cuda_cu_memcpy_dtoh(arg, arrayArgOnDev, argSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument to host", status); - _success = false; - return; - } - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(arrayArgOnDev)); + // Push device array argument to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev; // Advance _bufferOffset _bufferOffset += sizeof(arrayArgOnDev); diff -r 0dd597c6c9c7 -r 1a7e7011a341 src/gpu/ptx/vm/ptxKernelArguments.hpp --- a/src/gpu/ptx/vm/ptxKernelArguments.hpp Fri Nov 01 13:07:22 2013 +0100 +++ b/src/gpu/ptx/vm/ptxKernelArguments.hpp Fri Nov 01 18:34:03 2013 -0400 @@ -45,10 +45,6 @@ // Device pointer holding return value gpu::Ptx::CUdeviceptr _dev_return_value; - // Indicates if signature iteration is being done during kernel - // setup i.e., java arguments are being copied to device pointers. 
-  bool _kernelArgSetup;
-
 private:
   // Array of java argument oops
   arrayOop _args;
@@ -68,8 +64,6 @@
     _success = true;
     _bufferOffset = 0;
     _dev_return_value = 0;
-    _kernelArgSetup = true;
-    //_dev_call_by_reference_args_index = 0;
     if (!is_static) {
       // TODO : Create a device argument for receiver object and add it to _kernelBuffer
       tty->print_cr("{CUDA] ****** TODO: Support for execution of non-static java methods not implemented yet.");
@@ -87,17 +81,6 @@
     return _bufferOffset;
   }
 
-  void copyRefArgsFromDtoH() {
-    _kernelArgSetup = false;
-    _bufferOffset = 0;
-    _index = 0;
-    iterate();
-  }
-
-  inline bool is_kernel_arg_setup() {
-    return _kernelArgSetup;
-  }
-
   // Get the return oop value
   oop get_return_oop();
 
@@ -106,6 +89,10 @@
     return _dev_return_value;
   }
 
+  /*
+   * Pad kernel argument buffer to naturally align for given size.
+   */
+  void pad_kernel_argument_buffer(size_t);
 
   void do_byte();
   void do_bool();
diff -r 0dd597c6c9c7 -r 1a7e7011a341 src/share/vm/graal/graalCompilerToGPU.cpp
--- a/src/share/vm/graal/graalCompilerToGPU.cpp	Fri Nov 01 13:07:22 2013 +0100
+++ b/src/share/vm/graal/graalCompilerToGPU.cpp	Fri Nov 01 18:34:03 2013 -0400
@@ -99,16 +99,20 @@
       if (TraceGPUInteraction) {
         switch (ptxka.get_ret_type()) {
           case T_INT:
-            tty->print_cr("GPU execution returned %d", result.get_jint());
+            tty->print_cr("GPU execution returned (int) %d", result.get_jint());
+            break;
+          case T_LONG:
+            // jlong is 64-bit on every host, but "%ld" is only 64-bit on LP64; JLONG_FORMAT selects the right length modifier.
+            tty->print_cr("GPU execution returned (long) " JLONG_FORMAT, result.get_jlong());
             break;
           case T_FLOAT:
-            tty->print_cr("GPU execution returned %f", result.get_jfloat());
+            tty->print_cr("GPU execution returned (float) %f", result.get_jfloat());
             break;
           case T_DOUBLE:
-            tty->print_cr("GPU execution returned %f", result.get_jdouble());
+            tty->print_cr("GPU execution returned (double) %f", result.get_jdouble());
             break;
           default:
-            tty->print_cr("GPU returned unhandled");
+            tty->print_cr("**** Value returned by GPU not yet handled");
             break;
         }
       }