Mercurial > hg > truffle
diff src/gpu/ptx/vm/gpu_ptx.cpp @ 11485:49bb1bc983c6
Implement several missing PTX codegen features; return value capture and method args passing of java method executed on GPU.
author | bharadwaj |
---|---|
date | Fri, 30 Aug 2013 16:39:05 -0400 |
parents | d876002b98e6 |
children | c99e65785936 |
line wrap: on
line diff
--- a/src/gpu/ptx/vm/gpu_ptx.cpp Fri Aug 30 17:31:59 2013 +0200 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Fri Aug 30 16:39:05 2013 -0400 @@ -29,12 +29,14 @@ #include "utilities/ostream.hpp" #include "memory/allocation.hpp" #include "memory/allocation.inline.hpp" +#include "kernelArguments.hpp" void * gpu::Ptx::_device_context; +int gpu::Ptx::_cu_device = 0; gpu::Ptx::cuda_cu_init_func_t gpu::Ptx::_cuda_cu_init; gpu::Ptx::cuda_cu_ctx_create_func_t gpu::Ptx::_cuda_cu_ctx_create; -gpu::Ptx::cuda_cu_ctx_detach_func_t gpu::Ptx::_cuda_cu_ctx_detach; +gpu::Ptx::cuda_cu_ctx_destroy_func_t gpu::Ptx::_cuda_cu_ctx_destroy; gpu::Ptx::cuda_cu_ctx_synchronize_func_t gpu::Ptx::_cuda_cu_ctx_synchronize; gpu::Ptx::cuda_cu_device_get_count_func_t gpu::Ptx::_cuda_cu_device_get_count; gpu::Ptx::cuda_cu_device_get_name_func_t gpu::Ptx::_cuda_cu_device_get_name; @@ -44,6 +46,8 @@ gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel; gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function; gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex; +gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh; +gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree; void gpu::probe_linkage() { #if defined(__APPLE__) || defined(LINUX) @@ -67,9 +71,9 @@ } } -bool gpu::execute_kernel(address kernel, JavaCallArguments * jca) { +bool gpu::execute_kernel(address kernel, PTXKernelArguments & ptxka, JavaValue& ret) { if (gpu::has_gpu_linkage()) { - return (gpu::Ptx::execute_kernel(kernel, jca)); + return (gpu::Ptx::execute_kernel(kernel, ptxka, ret)); } else { return false; } @@ -108,8 +112,7 @@ /* Get the handle to the first compute device */ int device_id = 0; /* Compute-capable device handle */ - int cu_device = 0; - status = _cuda_cu_device_get(&cu_device, device_id); + status = _cuda_cu_device_get(&_cu_device, device_id); if (status != GRAAL_CUDA_SUCCESS) { tty->print_cr("[CUDA] Failed to get handle of first 
compute-capable device i.e., the one at ordinal: %d", device_id); @@ -122,42 +125,42 @@ /* Get device attributes */ int minor, major, unified_addressing; - status = _cuda_cu_device_get_attribute(&minor, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device); + status = _cuda_cu_device_get_attribute(&minor, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _cu_device); if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", cu_device); + tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", _cu_device); return false; } - status = _cuda_cu_device_get_attribute(&major, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device); + status = _cuda_cu_device_get_attribute(&major, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _cu_device); if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to get major attribute of device: %d", cu_device); + tty->print_cr("[CUDA] Failed to get major attribute of device: %d", _cu_device); return false; } if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", cu_device, major, minor); + tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor); } - status = _cuda_cu_device_get_attribute(&unified_addressing, GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device); + status = _cuda_cu_device_get_attribute(&unified_addressing, GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, _cu_device); if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to query unified addressing mode of device: %d", cu_device); + tty->print_cr("[CUDA] Failed to query unified addressing mode of device: %d", _cu_device); return false; } if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Unified addressing support on device %d: %d", cu_device, unified_addressing); + tty->print_cr("[CUDA] Unified addressing support on device %d: %d", _cu_device, unified_addressing); } /* Get device name */ char 
device_name[256]; - status = _cuda_cu_device_get_name(device_name, 256, cu_device); + status = _cuda_cu_device_get_name(device_name, 256, _cu_device); if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to get name of device: %d", cu_device); + tty->print_cr("[CUDA] Failed to get name of device: %d", _cu_device); return false; } @@ -165,18 +168,6 @@ tty->print_cr("[CUDA] Using %s", device_name); } - /* Create CUDA context */ - status = _cuda_cu_ctx_create(&_device_context, 0, cu_device); - - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to create CUDA context for device: %d", cu_device); - return false; - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Created context for device: %d", cu_device); - } - return true; } @@ -210,8 +201,20 @@ } + /* Create CUDA context to compile and execute the kernel */ + int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to create CUDA context for device: %d", _cu_device); + return NULL; + } + + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Success: Created context for device: %d", _cu_device); + } + /* Load module's data with compiler options */ - int status = _cuda_cu_module_load_data_ex(&cu_module, code, jit_num_options, + status = _cuda_cu_module_load_data_ex(&cu_module, (void*) code, jit_num_options, jit_options, (void **)jit_option_values); if (status != GRAAL_CUDA_SUCCESS) { if (status == GRAAL_CUDA_ERROR_NO_BINARY_FOR_GPU) { @@ -238,10 +241,11 @@ if (TraceGPUInteraction) { tty->print_cr("[CUDA] Got function handle for %s", name); } + return cu_function; } -bool gpu::Ptx::execute_kernel(address kernel, JavaCallArguments * jca) { +bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) { // grid dimensionality unsigned int gridX = 1; unsigned int gridY = 1; @@ -252,14 +256,11 @@ unsigned int blockY = 1; unsigned int blockZ = 1; - int *cu_function = 
(int *)kernel; + struct CUfunc_st* cu_function = (struct CUfunc_st*) kernel; - char * paramBuffer = (char *) jca->parameters(); - size_t paramBufferSz = (size_t) jca->size_of_parameters(); - - void * config[] = { - GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer, - GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &paramBufferSz, + void * config[5] = { + GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, ptxka._kernelArgBuffer, + GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &(ptxka._bufferOffset), GRAAL_CU_LAUNCH_PARAM_END }; @@ -270,10 +271,11 @@ if (TraceGPUInteraction) { tty->print_cr("[CUDA] launching kernel"); } + int status = _cuda_cu_launch_kernel(cu_function, gridX, gridY, gridZ, blockX, blockY, blockZ, - 0, NULL, NULL, config); + 0, NULL, NULL, (void **) &config); if (status != GRAAL_CUDA_SUCCESS) { tty->print_cr("[CUDA] Failed to launch kernel"); return false; @@ -282,7 +284,72 @@ if (TraceGPUInteraction) { tty->print_cr("[CUDA] Success: Kernel Launch"); } - return status == 0; // GRAAL_CUDA_SUCCESS + + status = _cuda_cu_ctx_synchronize(); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status); + return false; + } + + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Success: Synchronized launch kernel"); + } + + + // Get the result. 
TODO: Move this code to get_return_oop() + BasicType return_type = ptxka.get_ret_type(); + switch (return_type) { + case T_INT : + { + int return_val; + status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status); + return false; + } + ret.set_jint(return_val); + } + break; + case T_LONG : + { + long return_val; + status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_LONG_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status); + return false; + } + ret.set_jlong(return_val); + } + break; + default: + tty->print_cr("[CUDA] TODO *** Unhandled return type"); + } + + + // Free device memory allocated for result + status = gpu::Ptx::_cuda_cu_memfree(ptxka._return_value_ptr); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status); + return false; + } + + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Success: Freed device memory of return value"); + } + + // Destroy context + status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status); + return false; + } + + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Success: Destroy context"); + } + + return (status == GRAAL_CUDA_SUCCESS); } #if defined(LINUX) @@ -305,8 +372,8 @@ CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit")); _cuda_cu_ctx_create = CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate")); - _cuda_cu_ctx_detach = - CAST_TO_FN_PTR(cuda_cu_ctx_detach_func_t, os::dll_lookup(handle, "cuCtxDetach")); + _cuda_cu_ctx_destroy = + CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy")); 
_cuda_cu_ctx_synchronize = CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize")); _cuda_cu_device_get_count = @@ -325,6 +392,15 @@ CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx")); _cuda_cu_launch_kernel = CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel")); + _cuda_cu_memalloc = + CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc")); + _cuda_cu_memfree = + CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree")); + _cuda_cu_memcpy_htod = + CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD")); + _cuda_cu_memcpy_dtoh = + CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH")); + if (TraceGPUInteraction) { tty->print_cr("[CUDA] Success: library linkage"); }