Mercurial > hg > truffle
comparison src/gpu/ptx/vm/gpu_ptx.cpp @ 12653:1a7e7011a341
* PTX kernel argument buffer now has naturally aligned arguments, as required by the PTX JIT compiler.
* Change dynamic loading of CUDA driver API functions to load the 32-bit or 64-bit versions, depending on the host architecture.
* Add ability to generate PTX kernels to be launched both on 32-bit and 64-bit hosts.
* Use Unified Virtual Memory APIs to perform array argument marshalling.
* PTX array storage test runs on the device and returns correct results.
* More integer test failures on GPU fixed.
author | S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com> |
---|---|
date | Fri, 01 Nov 2013 18:34:03 -0400 |
parents | f020e149c1b6 |
children | 220ed109bf77 |
comparison
equal
deleted
inserted
replaced
12652:0dd597c6c9c7 | 12653:1a7e7011a341 |
---|---|
47 gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel; | 47 gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel; |
48 gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function; | 48 gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function; |
49 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex; | 49 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex; |
50 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh; | 50 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh; |
51 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree; | 51 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree; |
52 | 52 gpu::Ptx::cuda_cu_mem_host_register_func_t gpu::Ptx::_cuda_cu_mem_host_register; |
53 gpu::Ptx::cuda_cu_mem_host_get_device_pointer_func_t gpu::Ptx::_cuda_cu_mem_host_get_device_pointer; | |
54 gpu::Ptx::cuda_cu_mem_host_unregister_func_t gpu::Ptx::_cuda_cu_mem_host_unregister; | |
55 | |
56 #define STRINGIFY(x) #x | |
57 | |
58 #define LOOKUP_CUDA_FUNCTION(name, alias) \ | |
59 _##alias = \ | |
60 CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \ | |
61 if (_##alias == NULL) { \ | |
62 tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \ | |
63 return 0; \ | |
64 } \ | |
65 | |
66 #define LOOKUP_CUDA_V2_FUNCTION(name, alias) LOOKUP_CUDA_FUNCTION(name##_v2, alias) | |
53 | 67 |
54 /* | 68 /* |
55 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs | 69 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs |
56 */ | 70 */ |
57 int ncores(int major, int minor) { | 71 int ncores(int major, int minor) { |
197 | 211 |
198 if (status != GRAAL_CUDA_SUCCESS) { | 212 if (status != GRAAL_CUDA_SUCCESS) { |
199 tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device); | 213 tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device); |
200 return 0; | 214 return 0; |
201 } | 215 } |
202 | 216 |
203 status = _cuda_cu_device_get_attribute(&async_engines, | 217 status = _cuda_cu_device_get_attribute(&async_engines, |
204 GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, | 218 GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, |
205 _cu_device); | 219 _cu_device); |
206 | 220 |
207 if (status != GRAAL_CUDA_SUCCESS) { | 221 if (status != GRAAL_CUDA_SUCCESS) { |
232 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d", | 246 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d", |
233 total, async_engines, can_map_host_memory, concurrent_kernels); | 247 total, async_engines, can_map_host_memory, concurrent_kernels); |
234 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size); | 248 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size); |
235 } | 249 } |
236 return (total); | 250 return (total); |
237 | 251 |
238 } | 252 } |
239 | 253 |
240 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) { | 254 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) { |
241 | 255 |
242 struct CUmod_st * cu_module; | 256 struct CUmod_st * cu_module; |
260 int jit_register_count = 32; | 274 int jit_register_count = 32; |
261 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS; | 275 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS; |
262 jit_option_values[2] = (void *)(size_t)jit_register_count; | 276 jit_option_values[2] = (void *)(size_t)jit_register_count; |
263 | 277 |
264 /* Create CUDA context to compile and execute the kernel */ | 278 /* Create CUDA context to compile and execute the kernel */ |
265 int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device); | 279 int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device); |
266 | 280 |
267 if (status != GRAAL_CUDA_SUCCESS) { | 281 if (status != GRAAL_CUDA_SUCCESS) { |
268 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status); | 282 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status); |
269 return NULL; | 283 return NULL; |
270 } | 284 } |
441 break; | 455 break; |
442 default: | 456 default: |
443 tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type); | 457 tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type); |
444 } | 458 } |
445 | 459 |
446 // Copy all reference arguments from device to host memory. | |
447 ptxka.copyRefArgsFromDtoH(); | |
448 | |
449 // Free device memory allocated for result | 460 // Free device memory allocated for result |
450 status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value); | 461 status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value); |
451 if (status != GRAAL_CUDA_SUCCESS) { | 462 if (status != GRAAL_CUDA_SUCCESS) { |
452 tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status); | 463 tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status); |
453 return false; | 464 return false; |
485 if (cuda_library_name != NULL) { | 496 if (cuda_library_name != NULL) { |
486 char *buffer = (char*)malloc(STD_BUFFER_SIZE); | 497 char *buffer = (char*)malloc(STD_BUFFER_SIZE); |
487 void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE); | 498 void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE); |
488 free(buffer); | 499 free(buffer); |
489 if (handle != NULL) { | 500 if (handle != NULL) { |
490 _cuda_cu_init = | 501 LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init); |
491 CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit")); | 502 LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize); |
492 _cuda_cu_ctx_create = | 503 LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current); |
493 CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate")); | 504 LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count); |
494 _cuda_cu_ctx_destroy = | 505 LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name); |
495 CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy")); | 506 LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get); |
496 _cuda_cu_ctx_synchronize = | 507 LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability); |
497 CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize")); | 508 LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute); |
498 _cuda_cu_ctx_set_current = | 509 LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function); |
499 CAST_TO_FN_PTR(cuda_cu_ctx_set_current_func_t, os::dll_lookup(handle, "cuCtxSetCurrent")); | 510 LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex); |
500 _cuda_cu_device_get_count = | 511 LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel); |
501 CAST_TO_FN_PTR(cuda_cu_device_get_count_func_t, os::dll_lookup(handle, "cuDeviceGetCount")); | 512 LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register); |
502 _cuda_cu_device_get_name = | 513 LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister); |
503 CAST_TO_FN_PTR(cuda_cu_device_get_name_func_t, os::dll_lookup(handle, "cuDeviceGetName")); | 514 #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) |
504 _cuda_cu_device_get = | 515 LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create); |
505 CAST_TO_FN_PTR(cuda_cu_device_get_func_t, os::dll_lookup(handle, "cuDeviceGet")); | 516 LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy); |
506 _cuda_cu_device_compute_capability = | 517 LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc); |
507 CAST_TO_FN_PTR(cuda_cu_device_compute_capability_func_t, os::dll_lookup(handle, "cuDeviceComputeCapability")); | 518 LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree); |
508 _cuda_cu_device_get_attribute = | 519 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod); |
509 CAST_TO_FN_PTR(cuda_cu_device_get_attribute_func_t, os::dll_lookup(handle, "cuDeviceGetAttribute")); | 520 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh); |
510 _cuda_cu_module_get_function = | 521 LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer); |
511 CAST_TO_FN_PTR(cuda_cu_module_get_function_func_t, os::dll_lookup(handle, "cuModuleGetFunction")); | 522 #else |
512 _cuda_cu_module_load_data_ex = | 523 LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create); |
513 CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx")); | 524 LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy); |
514 _cuda_cu_launch_kernel = | 525 LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc); |
515 CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel")); | 526 LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree); |
516 _cuda_cu_memalloc = | 527 LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod); |
517 CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc")); | 528 LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh); |
518 _cuda_cu_memfree = | 529 LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer); |
519 CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree")); | 530 #endif |
520 _cuda_cu_memcpy_htod = | |
521 CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD")); | |
522 _cuda_cu_memcpy_dtoh = | |
523 CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH")); | |
524 | 531 |
525 if (TraceGPUInteraction) { | 532 if (TraceGPUInteraction) { |
526 tty->print_cr("[CUDA] Success: library linkage"); | 533 tty->print_cr("[CUDA] Success: library linkage"); |
527 } | 534 } |
528 return true; | 535 return true; |