comparison src/gpu/ptx/vm/gpu_ptx.cpp @ 13819:49db2c1e3bee

added support for co-existing GPU backends (JBS:GRAAL-1)
author Doug Simon <doug.simon@oracle.com>
date Thu, 30 Jan 2014 00:52:33 +0100
parents 80cd5c3b8827
children 5c8a3c09397b ab370d74a8eb
comparison
equal deleted inserted replaced
13818:d2f520f46180 13819:49db2c1e3bee
28 #include "utilities/globalDefinitions.hpp" 28 #include "utilities/globalDefinitions.hpp"
29 #include "utilities/ostream.hpp" 29 #include "utilities/ostream.hpp"
30 #include "memory/allocation.hpp" 30 #include "memory/allocation.hpp"
31 #include "memory/allocation.inline.hpp" 31 #include "memory/allocation.inline.hpp"
32 #include "runtime/interfaceSupport.hpp" 32 #include "runtime/interfaceSupport.hpp"
33 #include "graal/graalEnv.hpp"
34 #include "graal/graalCompiler.hpp"
33 #include "ptxKernelArguments.hpp" 35 #include "ptxKernelArguments.hpp"
36
37 // Entry to GPU native method implementation that transitions current thread to '_thread_in_vm'.
38 #define GPU_VMENTRY(result_type, name, signature) \
39 JNIEXPORT result_type JNICALL name signature { \
40 GRAAL_VM_ENTRY_MARK; \
41
42 // Entry to GPU native method implementation that calls a JNI function
43 // and hence cannot transition current thread to '_thread_in_vm'.
44 #define GPU_ENTRY(result_type, name, signature) \
45 JNIEXPORT result_type JNICALL name signature { \
46
47 #define GPU_END }
48
49 #define CC (char*) /*cast a literal from (const char*)*/
50 #define FN_PTR(f) CAST_FROM_FN_PTR(void*, &(f))
51
52 #define STRING "Ljava/lang/String;"
53
54 JNINativeMethod gpu::Ptx::PTX_methods[] = {
55 {CC"initialize", CC"()Z", FN_PTR(gpu::Ptx::initialize)},
56 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(gpu::Ptx::generate_kernel)},
57 {CC"getLaunchKernelAddress", CC"()J", FN_PTR(gpu::Ptx::get_execute_kernel_from_vm_address)},
58 {CC"getAvailableProcessors0", CC"()I", FN_PTR(gpu::Ptx::get_total_cores)},
59 };
34 60
35 void * gpu::Ptx::_device_context; 61 void * gpu::Ptx::_device_context;
36 int gpu::Ptx::_cu_device = 0; 62 int gpu::Ptx::_cu_device = 0;
37 63
38 gpu::Ptx::cuda_cu_init_func_t gpu::Ptx::_cuda_cu_init; 64 gpu::Ptx::cuda_cu_init_func_t gpu::Ptx::_cuda_cu_init;
60 #define LOOKUP_CUDA_FUNCTION(name, alias) \ 86 #define LOOKUP_CUDA_FUNCTION(name, alias) \
61 _##alias = \ 87 _##alias = \
62 CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \ 88 CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \
63 if (_##alias == NULL) { \ 89 if (_##alias == NULL) { \
64 tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \ 90 tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \
65 return 0; \ 91 return false; \
66 } \ 92 } \
67 93
68 #define LOOKUP_CUDA_V2_FUNCTION(name, alias) LOOKUP_CUDA_FUNCTION(name##_v2, alias) 94 #define LOOKUP_CUDA_V2_FUNCTION(name, alias) LOOKUP_CUDA_FUNCTION(name##_v2, alias)
69 95
70 /* 96 /*
71 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs 97 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs
72 */ 98 */
73 int ncores(int major, int minor) { 99 int gpu::Ptx::ncores(int major, int minor) {
74 int device_type = (major << 4) + minor; 100 int device_type = (major << 4) + minor;
75 101
76 switch (device_type) { 102 switch (device_type) {
77 case 0x10: return 8; 103 case 0x10: return 8;
78 case 0x11: return 8; 104 case 0x11: return 8;
86 tty->print_cr("[CUDA] Warning: Unhandled device %x", device_type); 112 tty->print_cr("[CUDA] Warning: Unhandled device %x", device_type);
87 return 0; 113 return 0;
88 } 114 }
89 } 115 }
90 116
91 bool gpu::Ptx::initialize_gpu() { 117 bool gpu::Ptx::register_natives(JNIEnv* env) {
118 jclass klass = env->FindClass("com/oracle/graal/hotspot/ptx/PTXHotSpotBackend");
119 if (klass == NULL) {
120 if (TraceGPUInteraction) {
121 tty->print_cr("PTXHotSpotBackend class not found");
122 }
123 return false;
124 }
125 jint status = env->RegisterNatives(klass, PTX_methods, sizeof(PTX_methods) / sizeof(JNINativeMethod));
126 if (status != JNI_OK) {
127 if (TraceGPUInteraction) {
128 tty->print_cr("Error registering natives for PTXHotSpotBackend: %d", status);
129 }
130 return false;
131 }
132 return true;
133 }
134
135 GPU_ENTRY(jboolean, gpu::Ptx::initialize, (JNIEnv *env, jclass))
136
137 if (!link()) {
138 return false;
139 }
92 140
93 /* Initialize CUDA driver API */ 141 /* Initialize CUDA driver API */
94 int status = _cuda_cu_init(0); 142 int status = _cuda_cu_init(0);
95 if (status != GRAAL_CUDA_SUCCESS) { 143 if (status != GRAAL_CUDA_SUCCESS) {
96 tty->print_cr("Failed to initialize CUDA device"); 144 if (TraceGPUInteraction) {
145 tty->print_cr("Failed to initialize CUDA device: %d", status);
146 }
97 return false; 147 return false;
98 } 148 }
99 149
100 if (TraceGPUInteraction) { 150 if (TraceGPUInteraction) {
101 tty->print_cr("CUDA driver initialization: Success"); 151 tty->print_cr("CUDA driver initialization: Success");
158 208
159 if (TraceGPUInteraction) { 209 if (TraceGPUInteraction) {
160 tty->print_cr("[CUDA] Using %s", device_name); 210 tty->print_cr("[CUDA] Using %s", device_name);
161 } 211 }
162 212
213 gpu::initialized_gpu(device_name);
163 214
164 return true; 215 return true;
165 } 216 GPU_END
166 217
167 unsigned int gpu::Ptx::total_cores() { 218 GPU_ENTRY(jint, gpu::Ptx::get_total_cores, (JNIEnv *env, jobject))
168 219
169 int minor, major, nmp; 220 int minor, major, nmp;
170 int status = _cuda_cu_device_get_attribute(&minor, 221 int status = _cuda_cu_device_get_attribute(&minor,
171 GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 222 GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
172 _cu_device); 223 _cu_device);
188 status = _cuda_cu_device_get_attribute(&nmp, 239 status = _cuda_cu_device_get_attribute(&nmp,
189 GRAAL_CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 240 GRAAL_CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
190 _cu_device); 241 _cu_device);
191 242
192 if (status != GRAAL_CUDA_SUCCESS) { 243 if (status != GRAAL_CUDA_SUCCESS) {
193 tty->print_cr("[CUDA] Failed to get numberof MPs on device: %d", _cu_device); 244 tty->print_cr("[CUDA] Failed to get number of MPs on device: %d", _cu_device);
194 return 0; 245 return 0;
195 } 246 }
196 247
197 int total = nmp * ncores(major, minor); 248 int total = nmp * ncores(major, minor);
198 249
247 tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor); 298 tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor);
248 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d", 299 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d",
249 total, async_engines, can_map_host_memory, concurrent_kernels); 300 total, async_engines, can_map_host_memory, concurrent_kernels);
250 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size); 301 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size);
251 } 302 }
252 return (total); 303 return total;
253 304 GPU_END
254 } 305
255 306 GPU_ENTRY(jlong, gpu::Ptx::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle))
256 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) { 307 ResourceMark rm;
308 jsize name_len = env->GetStringLength(name_handle);
309 jsize code_len = env->GetArrayLength(code_handle);
310
311 char* name = NEW_RESOURCE_ARRAY(char, name_len + 1);
312 unsigned char *code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1);
313
314 code[code_len] = 0;
315 name[name_len] = 0;
316
317 env->GetByteArrayRegion(code_handle, 0, code_len, (jbyte*) code);
318 env->GetStringUTFRegion(name_handle, 0, name_len, name);
257 319
258 struct CUmod_st * cu_module; 320 struct CUmod_st * cu_module;
259 // Use three JIT compiler options 321 // Use three JIT compiler options
260 const unsigned int jit_num_options = 3; 322 const unsigned int jit_num_options = 3;
261 int *jit_options = NEW_C_HEAP_ARRAY(int, jit_num_options, mtCompiler); 323 int *jit_options = NEW_RESOURCE_ARRAY(int, jit_num_options);
262 void **jit_option_values = NEW_C_HEAP_ARRAY(void *, jit_num_options, mtCompiler); 324 void **jit_option_values = NEW_RESOURCE_ARRAY(void *, jit_num_options);
263 325
264 // Set up PTX JIT compiler options 326 // Set up PTX JIT compiler options
265 // 1. set size of compilation log buffer 327 // 1. set size of compilation log buffer
266 int jit_log_buffer_size = 1024; 328 int jit_log_buffer_size = 1024;
267 jit_options[0] = GRAAL_CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 329 jit_options[0] = GRAAL_CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
268 jit_option_values[0] = (void *)(size_t)jit_log_buffer_size; 330 jit_option_values[0] = (void *)(size_t)jit_log_buffer_size;
269 331
270 // 2. set pointer to compilation log buffer 332 // 2. set pointer to compilation log buffer
271 char *jit_log_buffer = NEW_C_HEAP_ARRAY(char, jit_log_buffer_size, mtCompiler); 333 char *jit_log_buffer = NEW_RESOURCE_ARRAY(char, jit_log_buffer_size);
272 jit_options[1] = GRAAL_CU_JIT_INFO_LOG_BUFFER; 334 jit_options[1] = GRAAL_CU_JIT_INFO_LOG_BUFFER;
273 jit_option_values[1] = jit_log_buffer; 335 jit_option_values[1] = jit_log_buffer;
274 336
275 // 3. set pointer to set the Maximum # of registers (32) for the kernel 337 // 3. set pointer to set the maximum number of registers (32) for the kernel
276 int jit_register_count = 32; 338 int jit_register_count = 32;
277 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS; 339 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS;
278 jit_option_values[2] = (void *)(size_t)jit_register_count; 340 jit_option_values[2] = (void *)(size_t)jit_register_count;
279 341
280 /* Create CUDA context to compile and execute the kernel */ 342 // Create CUDA context to compile and execute the kernel
281 int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device); 343 int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device);
282 344
283 if (status != GRAAL_CUDA_SUCCESS) { 345 if (status != GRAAL_CUDA_SUCCESS) {
284 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status); 346 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status);
285 return NULL; 347 return 0L;
286 } 348 }
287
288 if (TraceGPUInteraction) { 349 if (TraceGPUInteraction) {
289 tty->print_cr("[CUDA] Success: Created context for device: %d", _cu_device); 350 tty->print_cr("[CUDA] Success: Created context for device: %d", _cu_device);
290 } 351 }
291 352
292 status = _cuda_cu_ctx_set_current(_device_context); 353 status = _cuda_cu_ctx_set_current(_device_context);
293 354
294 if (status != GRAAL_CUDA_SUCCESS) { 355 if (status != GRAAL_CUDA_SUCCESS) {
295 tty->print_cr("[CUDA] Failed to set current context for device: %d", _cu_device); 356 tty->print_cr("[CUDA] Failed to set current context for device: %d", _cu_device);
296 return NULL; 357 return 0L;
297 } 358 }
298
299 if (TraceGPUInteraction) { 359 if (TraceGPUInteraction) {
300 tty->print_cr("[CUDA] Success: Set current context for device: %d", _cu_device); 360 tty->print_cr("[CUDA] Success: Set current context for device: %d", _cu_device);
301 }
302
303 if (TraceGPUInteraction) {
304 tty->print_cr("[CUDA] PTX Kernel\n%s", code); 361 tty->print_cr("[CUDA] PTX Kernel\n%s", code);
305 tty->print_cr("[CUDA] Function name : %s", name); 362 tty->print_cr("[CUDA] Function name : %s", name);
306
307 } 363 }
308 364
309 /* Load module's data with compiler options */ 365 /* Load module's data with compiler options */
310 status = _cuda_cu_module_load_data_ex(&cu_module, (void*) code, jit_num_options, 366 status = _cuda_cu_module_load_data_ex(&cu_module, (void*) code, jit_num_options,
311 jit_options, (void **)jit_option_values); 367 jit_options, (void **)jit_option_values);
312 if (status != GRAAL_CUDA_SUCCESS) { 368 if (status != GRAAL_CUDA_SUCCESS) {
313 if (status == GRAAL_CUDA_ERROR_NO_BINARY_FOR_GPU) { 369 if (status == GRAAL_CUDA_ERROR_NO_BINARY_FOR_GPU) {
314 tty->print_cr("[CUDA] Check for malformed PTX kernel or incorrect PTX compilation options"); 370 tty->print_cr("[CUDA] Check for malformed PTX kernel or incorrect PTX compilation options");
315 } 371 }
316 tty->print_cr("[CUDA] *** Error (%d) Failed to load module data with online compiler options for method %s", 372 tty->print_cr("[CUDA] *** Error (%d) Failed to load module data with online compiler options for method %s",
317 status, name); 373 status, name);
318 return NULL; 374 return 0L;
319 } 375 }
320 376
321 if (TraceGPUInteraction) { 377 if (TraceGPUInteraction) {
322 tty->print_cr("[CUDA] Loaded data for PTX Kernel"); 378 tty->print_cr("[CUDA] Loaded data for PTX Kernel");
323 } 379 }
324 380
325 struct CUfunc_st * cu_function; 381 struct CUfunc_st* cu_function;
326
327 status = _cuda_cu_module_get_function(&cu_function, cu_module, name); 382 status = _cuda_cu_module_get_function(&cu_function, cu_module, name);
328 383
329 if (status != GRAAL_CUDA_SUCCESS) { 384 if (status != GRAAL_CUDA_SUCCESS) {
330 tty->print_cr("[CUDA] *** Error: Failed to get function %s", name); 385 tty->print_cr("[CUDA] *** Error: Failed to get function %s", name);
331 return NULL; 386 return 0L;
332 } 387 }
333 388
334 if (TraceGPUInteraction) { 389 if (TraceGPUInteraction) {
335 tty->print_cr("[CUDA] Got function handle for %s kernel address %p", name, cu_function); 390 tty->print_cr("[CUDA] Got function handle for %s kernel address %p", name, cu_function);
336 } 391 }
337 392 return (jlong) cu_function;
338 return cu_function; 393 GPU_END
339 }
340 394
341 // A PtxCall is used to manage executing a GPU kernel. In addition to launching 395 // A PtxCall is used to manage executing a GPU kernel. In addition to launching
342 // the kernel, this class releases resources allocated for the execution. 396 // the kernel, this class releases resources allocated for the execution.
343 class PtxCall: StackObj { 397 class PtxCall: StackObj {
344 private: 398 private:
478 free_return_value(); 532 free_return_value();
479 destroy_context(); 533 destroy_context();
480 } 534 }
481 }; 535 };
482 536
537 GPU_VMENTRY(jlong, gpu::Ptx::get_execute_kernel_from_vm_address, (JNIEnv *env, jclass))
538 return (jlong) gpu::Ptx::execute_kernel_from_vm;
539 GPU_END
483 540
484 JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ, 541 JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
485 jlong buffer, 542 jlong buffer,
486 jint bufferSize, 543 jint bufferSize,
487 jint objectParametersCount, 544 jint objectParametersCount,
491 if (kernel == 0L) { 548 if (kernel == 0L) {
492 SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL); 549 SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL);
493 return 0L; 550 return 0L;
494 } 551 }
495 552
553 #if 0
554 Universe::heap()->collect(GCCause::_jvmti_force_gc);
555 #endif
556
496 PtxCall call(thread, (address) buffer, bufferSize, (oop*) (address) pinnedObjects, encodedReturnTypeSize); 557 PtxCall call(thread, (address) buffer, bufferSize, (oop*) (address) pinnedObjects, encodedReturnTypeSize);
497 558
498 #define TRY(action) do { \ 559 #define TRY(action) do { \
499 action; \ 560 action; \
500 if (HAS_PENDING_EXCEPTION) return 0L; \ 561 if (HAS_PENDING_EXCEPTION) return 0L; \
520 return return_val; 581 return return_val;
521 582
522 #undef TRY 583 #undef TRY
523 584
524 JRT_END 585 JRT_END
525
526 bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
527 return gpu::Ptx::execute_warp(1, 1, 1, kernel, ptxka, ret);
528 }
529
530 bool gpu::Ptx::execute_warp(int dimX, int dimY, int dimZ,
531 address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
532 // grid dimensionality
533 unsigned int gridX = 1;
534 unsigned int gridY = 1;
535 unsigned int gridZ = 1;
536
537 // thread dimensionality
538 unsigned int blockX = dimX;
539 unsigned int blockY = dimY;
540 unsigned int blockZ = dimZ;
541
542 struct CUfunc_st* cu_function = (struct CUfunc_st*) kernel;
543
544 void * config[5] = {
545 GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, ptxka._kernelArgBuffer,
546 GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &(ptxka._bufferOffset),
547 GRAAL_CU_LAUNCH_PARAM_END
548 };
549
550 if (kernel == NULL) {
551 return false;
552 }
553
554 if (TraceGPUInteraction) {
555 tty->print_cr("[CUDA] launching kernel");
556 }
557
558 int status = _cuda_cu_launch_kernel(cu_function,
559 gridX, gridY, gridZ,
560 blockX, blockY, blockZ,
561 0, NULL, NULL, (void **) &config);
562 if (status != GRAAL_CUDA_SUCCESS) {
563 tty->print_cr("[CUDA] Failed to launch kernel");
564 return false;
565 }
566
567 if (TraceGPUInteraction) {
568 tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", blockX, blockY, blockZ);
569 }
570
571 status = _cuda_cu_ctx_synchronize();
572
573 if (status != GRAAL_CUDA_SUCCESS) {
574 tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
575 return false;
576 }
577
578 if (TraceGPUInteraction) {
579 tty->print_cr("[CUDA] Success: Synchronized launch kernel");
580 }
581
582
583 // Get the result. TODO: Move this code to get_return_oop()
584 BasicType return_type = ptxka.get_ret_type();
585 switch (return_type) {
586 case T_INT:
587 {
588 int return_val;
589 status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_INT_BYTE_SIZE);
590 if (status != GRAAL_CUDA_SUCCESS) {
591 tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
592 return false;
593 }
594 ret.set_jint(return_val);
595 }
596 break;
597 case T_BOOLEAN:
598 {
599 int return_val;
600 status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_INT_BYTE_SIZE);
601 if (status != GRAAL_CUDA_SUCCESS) {
602 tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
603 return false;
604 }
605 ret.set_jint(return_val);
606 }
607 break;
608 case T_FLOAT:
609 {
610 float return_val;
611 status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_FLOAT_BYTE_SIZE);
612 if (status != GRAAL_CUDA_SUCCESS) {
613 tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
614 return false;
615 }
616 ret.set_jfloat(return_val);
617 }
618 break;
619 case T_DOUBLE:
620 {
621 double return_val;
622 status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_DOUBLE_BYTE_SIZE);
623 if (status != GRAAL_CUDA_SUCCESS) {
624 tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
625 return false;
626 }
627 ret.set_jdouble(return_val);
628 }
629 break;
630 case T_LONG:
631 {
632 long return_val;
633 status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_LONG_BYTE_SIZE);
634 if (status != GRAAL_CUDA_SUCCESS) {
635 tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
636 return false;
637 }
638 ret.set_jlong(return_val);
639 }
640 break;
641 case T_VOID:
642 break;
643 default:
644 tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type);
645 }
646
647 // Free device memory allocated for result
648 status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value);
649 if (status != GRAAL_CUDA_SUCCESS) {
650 tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
651 return false;
652 }
653
654 if (TraceGPUInteraction) {
655 tty->print_cr("[CUDA] Success: Freed device memory of return value");
656 }
657
658 // Destroy context
659 status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context);
660 if (status != GRAAL_CUDA_SUCCESS) {
661 tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status);
662 return false;
663 }
664
665 if (TraceGPUInteraction) {
666 tty->print_cr("[CUDA] Success: Destroy context");
667 }
668
669 return (status == GRAAL_CUDA_SUCCESS);
670 }
671 586
672 #if defined(LINUX) 587 #if defined(LINUX)
673 static const char cuda_library_name[] = "libcuda.so"; 588 static const char cuda_library_name[] = "libcuda.so";
674 #elif defined(__APPLE__) 589 #elif defined(__APPLE__)
675 static char const cuda_library_name[] = "/usr/local/cuda/lib/libcuda.dylib"; 590 static char const cuda_library_name[] = "/usr/local/cuda/lib/libcuda.dylib";
676 #else 591 #else
677 static char const cuda_library_name[] = ""; 592 static char const cuda_library_name[] = "";
678 #endif 593 #endif
679 594
680 #define STD_BUFFER_SIZE 1024 595 bool gpu::Ptx::link() {
681 596 if (cuda_library_name == NULL) {
682 bool gpu::Ptx::probe_linkage() { 597 if (TraceGPUInteraction) {
683 if (cuda_library_name != NULL) { 598 tty->print_cr("Failed to find CUDA linkage");
684 char *buffer = (char*)malloc(STD_BUFFER_SIZE); 599 }
685 void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE); 600 return false;
686 free(buffer); 601 }
687 if (handle != NULL) { 602 char ebuf[O_BUFLEN];
688 LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init); 603 void *handle = os::dll_load(cuda_library_name, ebuf, O_BUFLEN);
689 LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize); 604 if (handle == NULL) {
690 LOOKUP_CUDA_FUNCTION(cuCtxGetCurrent, cuda_cu_ctx_get_current); 605 if (TraceGPUInteraction) {
691 LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current); 606 tty->print_cr("Unsupported CUDA platform: %s", ebuf);
692 LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count); 607 }
693 LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name); 608 return false;
694 LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get); 609 }
695 LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability); 610
696 LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute); 611 LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init);
697 LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function); 612 LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize);
698 LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex); 613 LOOKUP_CUDA_FUNCTION(cuCtxGetCurrent, cuda_cu_ctx_get_current);
699 LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel); 614 LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current);
700 LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register); 615 LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count);
701 LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister); 616 LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name);
617 LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get);
618 LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability);
619 LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute);
620 LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function);
621 LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex);
622 LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel);
623 LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register);
624 LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister);
702 #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) 625 #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
703 LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create); 626 LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
704 LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy); 627 LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
705 LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc); 628 LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
706 LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree); 629 LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree);
707 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod); 630 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
708 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh); 631 LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
709 LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer); 632 LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
710 #else 633 #else
711 LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create); 634 LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
712 LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy); 635 LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
713 LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc); 636 LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
714 LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree); 637 LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree);
715 LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod); 638 LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
716 LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh); 639 LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
717 LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer); 640 LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
718 #endif 641 #endif
719 642
720 if (TraceGPUInteraction) { 643 if (TraceGPUInteraction) {
721 tty->print_cr("[CUDA] Success: library linkage"); 644 tty->print_cr("[CUDA] Success: library linkage");
722 } 645 }
723 return true; 646 return true;
724 } else {
725 // Unable to dlopen libcuda
726 return false;
727 }
728 } else {
729 tty->print_cr("Unsupported CUDA platform");
730 return false;
731 }
732 tty->print_cr("Failed to find CUDA linkage");
733 return false;
734 } 647 }