comparison src/gpu/ptx/vm/gpu_ptx.cpp @ 12653:1a7e7011a341

* PTX kernel argument buffer now has naturally aligned arguments as required by PTX JIT compiler. * Change dynamic loading of CUDA driver API functions to load 32-bit or 64-bit versions depending on the host architecture. * Add ability to generate PTX kernels to be launched both on 32-bit and 64-bit hosts. * Use Unified Virtual Memory APIs to perform array argument marshalling. * PTX array storage test runs on the device and returns correct results. * More integer test failures on GPU fixed.
author S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com>
date Fri, 01 Nov 2013 18:34:03 -0400
parents f020e149c1b6
children 220ed109bf77
comparison
equal deleted inserted replaced
12652:0dd597c6c9c7 12653:1a7e7011a341
47 gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel; 47 gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel;
48 gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function; 48 gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function;
49 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex; 49 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex;
50 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh; 50 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh;
51 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree; 51 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree;
52 52 gpu::Ptx::cuda_cu_mem_host_register_func_t gpu::Ptx::_cuda_cu_mem_host_register;
53 gpu::Ptx::cuda_cu_mem_host_get_device_pointer_func_t gpu::Ptx::_cuda_cu_mem_host_get_device_pointer;
54 gpu::Ptx::cuda_cu_mem_host_unregister_func_t gpu::Ptx::_cuda_cu_mem_host_unregister;
55
56 #define STRINGIFY(x) #x
57
58 #define LOOKUP_CUDA_FUNCTION(name, alias) \
59 _##alias = \
60 CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \
61 if (_##alias == NULL) { \
62 tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \
63 return 0; \
64 } \
65
66 #define LOOKUP_CUDA_V2_FUNCTION(name, alias) LOOKUP_CUDA_FUNCTION(name##_v2, alias)
53 67
54 /* 68 /*
55 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs 69 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs
56 */ 70 */
57 int ncores(int major, int minor) { 71 int ncores(int major, int minor) {
197 211
198 if (status != GRAAL_CUDA_SUCCESS) { 212 if (status != GRAAL_CUDA_SUCCESS) {
199 tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device); 213 tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device);
200 return 0; 214 return 0;
201 } 215 }
202 216
203 status = _cuda_cu_device_get_attribute(&async_engines, 217 status = _cuda_cu_device_get_attribute(&async_engines,
204 GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, 218 GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
205 _cu_device); 219 _cu_device);
206 220
207 if (status != GRAAL_CUDA_SUCCESS) { 221 if (status != GRAAL_CUDA_SUCCESS) {
232 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d", 246 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d",
233 total, async_engines, can_map_host_memory, concurrent_kernels); 247 total, async_engines, can_map_host_memory, concurrent_kernels);
234 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size); 248 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size);
235 } 249 }
236 return (total); 250 return (total);
237 251
238 } 252 }
239 253
240 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) { 254 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) {
241 255
242 struct CUmod_st * cu_module; 256 struct CUmod_st * cu_module;
260 int jit_register_count = 32; 274 int jit_register_count = 32;
261 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS; 275 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS;
262 jit_option_values[2] = (void *)(size_t)jit_register_count; 276 jit_option_values[2] = (void *)(size_t)jit_register_count;
263 277
264 /* Create CUDA context to compile and execute the kernel */ 278 /* Create CUDA context to compile and execute the kernel */
265 int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device); 279 int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device);
266 280
267 if (status != GRAAL_CUDA_SUCCESS) { 281 if (status != GRAAL_CUDA_SUCCESS) {
268 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status); 282 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status);
269 return NULL; 283 return NULL;
270 } 284 }
441 break; 455 break;
442 default: 456 default:
443 tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type); 457 tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type);
444 } 458 }
445 459
446 // Copy all reference arguments from device to host memory.
447 ptxka.copyRefArgsFromDtoH();
448
449 // Free device memory allocated for result 460 // Free device memory allocated for result
450 status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value); 461 status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value);
451 if (status != GRAAL_CUDA_SUCCESS) { 462 if (status != GRAAL_CUDA_SUCCESS) {
452 tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status); 463 tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
453 return false; 464 return false;
485 if (cuda_library_name != NULL) { 496 if (cuda_library_name != NULL) {
486 char *buffer = (char*)malloc(STD_BUFFER_SIZE); 497 char *buffer = (char*)malloc(STD_BUFFER_SIZE);
487 void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE); 498 void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE);
488 free(buffer); 499 free(buffer);
489 if (handle != NULL) { 500 if (handle != NULL) {
490 _cuda_cu_init = 501 LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init);
491 CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit")); 502 LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize);
492 _cuda_cu_ctx_create = 503 LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current);
493 CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate")); 504 LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count);
494 _cuda_cu_ctx_destroy = 505 LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name);
495 CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy")); 506 LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get);
496 _cuda_cu_ctx_synchronize = 507 LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability);
497 CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize")); 508 LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute);
498 _cuda_cu_ctx_set_current = 509 LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function);
499 CAST_TO_FN_PTR(cuda_cu_ctx_set_current_func_t, os::dll_lookup(handle, "cuCtxSetCurrent")); 510 LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex);
500 _cuda_cu_device_get_count = 511 LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel);
501 CAST_TO_FN_PTR(cuda_cu_device_get_count_func_t, os::dll_lookup(handle, "cuDeviceGetCount")); 512 LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register);
502 _cuda_cu_device_get_name = 513 LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister);
503 CAST_TO_FN_PTR(cuda_cu_device_get_name_func_t, os::dll_lookup(handle, "cuDeviceGetName")); 514 #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
504 _cuda_cu_device_get = 515 LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
505 CAST_TO_FN_PTR(cuda_cu_device_get_func_t, os::dll_lookup(handle, "cuDeviceGet")); 516 LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
506 _cuda_cu_device_compute_capability = 517 LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
507 CAST_TO_FN_PTR(cuda_cu_device_compute_capability_func_t, os::dll_lookup(handle, "cuDeviceComputeCapability")); 518 LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree);
508 _cuda_cu_device_get_attribute = 519 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
509 CAST_TO_FN_PTR(cuda_cu_device_get_attribute_func_t, os::dll_lookup(handle, "cuDeviceGetAttribute")); 520 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
510 _cuda_cu_module_get_function = 521 LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
511 CAST_TO_FN_PTR(cuda_cu_module_get_function_func_t, os::dll_lookup(handle, "cuModuleGetFunction")); 522 #else
512 _cuda_cu_module_load_data_ex = 523 LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
513 CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx")); 524 LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
514 _cuda_cu_launch_kernel = 525 LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
515 CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel")); 526 LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree);
516 _cuda_cu_memalloc = 527 LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
517 CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc")); 528 LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
518 _cuda_cu_memfree = 529 LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
519 CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree")); 530 #endif
520 _cuda_cu_memcpy_htod =
521 CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD"));
522 _cuda_cu_memcpy_dtoh =
523 CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH"));
524 531
525 if (TraceGPUInteraction) { 532 if (TraceGPUInteraction) {
526 tty->print_cr("[CUDA] Success: library linkage"); 533 tty->print_cr("[CUDA] Success: library linkage");
527 } 534 }
528 return true; 535 return true;