Mercurial > hg > truffle
diff src/gpu/ptx/vm/ptxKernelArguments.cpp @ 12653:1a7e7011a341
* PTX kernel argument buffer now has naturally aligned arguments as required by PTX JIT compiler.
* Change dynamic loading of CUDA driver API functions to load the 32-bit or 64-bit versions depending on the host architecture.
* Add ability to generate PTX kernels to be launched both on 32-bit and 64-bit hosts.
* Use Unified Virtual Memory APIs to perform array argument marshalling.
* PTX array storage test runs on the device and returns correct results.
* More integer test failures on GPU fixed.
author | S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com> |
---|---|
date | Fri, 01 Nov 2013 18:34:03 -0400 |
parents | 11b086b1bae4 |
children |
line wrap: on
line diff
--- a/src/gpu/ptx/vm/ptxKernelArguments.cpp Fri Nov 01 13:07:22 2013 +0100 +++ b/src/gpu/ptx/vm/ptxKernelArguments.cpp Fri Nov 01 18:34:03 2013 -0400 @@ -38,20 +38,32 @@ return arg; } +/* + * Pad kernel argument buffer to naturally align for given size. + */ +void PTXKernelArguments::pad_kernel_argument_buffer(size_t dataSz) { + while ((_bufferOffset % dataSz) != 0) { + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (char) 0; + _bufferOffset += sizeof(char); + } + return; +} void PTXKernelArguments::do_int() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_INT return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_INT return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. 
+ pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; _bufferOffset += sizeof(_dev_return_value); } else { // Get the next java argument and its value which should be a T_INT @@ -63,9 +75,13 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i; - } + + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(intval.i)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i; + // Advance _bufferOffset _bufferOffset += sizeof(intval.i); } @@ -75,17 +91,18 @@ void PTXKernelArguments::do_float() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_INT return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_FLOAT return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. 
+ pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; // Advance _bufferOffset _bufferOffset += sizeof(_dev_return_value); } else { @@ -98,9 +115,11 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(floatval.f)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f; + // Advance _bufferOffset _bufferOffset += sizeof(floatval.f); } @@ -111,18 +130,19 @@ // If the parameter is a return value, jvalue doubleval; if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_INT return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_DOUBLE return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } - // Advance _bufferOffset + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. 
+ pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Advance _bufferOffset. _bufferOffset += sizeof(doubleval.d); } else { // Get the next java argument and its value which should be a T_INT @@ -133,11 +153,16 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(doubleval.d)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d; + // Advance _bufferOffset _bufferOffset += sizeof(doubleval.d); + // For a 64-bit host, since size of double is 8, there is no need + // to pad the kernel argument buffer to ensure 8-byte alignment of + // the next potential argument to be pushed. } return; } @@ -145,17 +170,18 @@ void PTXKernelArguments::do_long() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_LONG return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_LONG return value pointer on device. 
Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; // Advance _bufferOffset _bufferOffset += sizeof(_dev_return_value); } else { @@ -168,11 +194,16 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(val.j)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j; + // Advance _bufferOffset _bufferOffset += sizeof(val.j); + // For a 64-bit host, since size of long is 8, there is no need + // to pad the kernel argument buffer to ensure 8-byte alignment of + // the next potential argument to be pushed. } return; } @@ -180,17 +211,19 @@ void PTXKernelArguments::do_byte() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_BYTE return value pointer on device. 
Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_BYTE return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Advance _bufferOffset _bufferOffset += sizeof(_dev_return_value); } else { @@ -203,11 +236,16 @@ _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(val.b)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b; + // Advance _bufferOffset _bufferOffset += sizeof(val.b); + // For a 64-bit host, since size of T_BYTE is 8, there is no need + // to pad the kernel argument buffer to ensure 8-byte alignment of + // the next potential argument to be pushed. 
} return; } @@ -215,32 +253,34 @@ void PTXKernelArguments::do_bool() { // If the parameter is a return value, if (is_return_type()) { - if (is_kernel_arg_setup()) { - // Allocate device memory for T_BYTE return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; - } - // Push _dev_return_value to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; + // Allocate device memory for T_BYTE return value pointer on device. Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; } - // Advance _bufferOffset + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. 
+ pad_kernel_argument_buffer(sizeof(_dev_return_value)); + // Push _dev_return_value to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value; _bufferOffset += sizeof(_dev_return_value); } else { - // Get the next java argument and its value which should be a T_BYTE - oop arg = next_arg(T_BYTE); + // Get the next java argument and its value which should be a T_BOOLEAN + oop arg = next_arg(T_BOOLEAN); // Copy the java argument value to kernelArgBuffer jvalue val; if (java_lang_boxing_object::get_value(arg, &val) != T_BOOLEAN) { - tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE"); + tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BOOLEAN"); _success = false; return; } - if (is_kernel_arg_setup()) { - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z; - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(val.z)); + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z; + // Advance _bufferOffset _bufferOffset += sizeof(val.z); } @@ -257,35 +297,28 @@ gpu::Ptx::CUdeviceptr arrayArgOnDev; int status; - if (is_kernel_arg_setup()) { - // Allocate device memory for array argument on device. Size in bytes - status = gpu::Ptx::_cuda_cu_memalloc(&arrayArgOnDev, argSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for array argument on device", - status); - _success = false; - return; - } - // Copy array argument to device - status = gpu::Ptx::_cuda_cu_memcpy_htod(arrayArgOnDev, arg, argSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument content to device memory", - status); - _success = false; - return; - } + // Register host memory for use by the device. 
Size in bytes + status = gpu::Ptx::_cuda_cu_mem_host_register(arg, argSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to register host memory for array argument on device", + status); + _success = false; + return; + } + // Get device pointer + status = gpu::Ptx::_cuda_cu_mem_host_get_device_pointer(&arrayArgOnDev, arg, 0); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to get device pointer of mapped pinned memory of array argument.", + status); + _success = false; + return; + } - // Push device array argument to _kernelBuffer - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev; - } else { - arrayArgOnDev = *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]); - status = gpu::Ptx::_cuda_cu_memcpy_dtoh(arg, arrayArgOnDev, argSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument to host", status); - _success = false; - return; - } - } + // Kernel arguments are expected to be naturally aligned. + // Insert padding into kernel argument buffer, if needed. + pad_kernel_argument_buffer(sizeof(arrayArgOnDev)); + // Push device array argument to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev; // Advance _bufferOffset _bufferOffset += sizeof(arrayArgOnDev);