truffle: src/gpu/ptx/vm/gpu_ptx.cpp comparison

comparison src/gpu/ptx/vm/gpu_ptx.cpp @ 12653:1a7e7011a341

* PTX kernel argument buffer now has naturally aligned arguments, as required by the PTX JIT compiler. * Change dynamic loading of CUDA driver API functions to load the 32-bit or 64-bit versions depending on the host architecture. * Add ability to generate PTX kernels to be launched on both 32-bit and 64-bit hosts. * Use Unified Virtual Memory APIs to perform array argument marshalling. * PTX array storage test runs on the device and returns correct results. * More integer test failures on GPU fixed.

author	S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com>
date	Fri, 01 Nov 2013 18:34:03 -0400
parents	f020e149c1b6
children	220ed109bf77

comparison

equal deleted inserted replaced

-:0dd597c6c9c7
+:1a7e7011a341
 gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel;
 gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function;
 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex;
 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh;
 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree;
+gpu::Ptx::cuda_cu_mem_host_register_func_t gpu::Ptx::_cuda_cu_mem_host_register;
+gpu::Ptx::cuda_cu_mem_host_get_device_pointer_func_t gpu::Ptx::_cuda_cu_mem_host_get_device_pointer;
+gpu::Ptx::cuda_cu_mem_host_unregister_func_t gpu::Ptx::_cuda_cu_mem_host_unregister;
+#define STRINGIFY(x)     #x
+#define LOOKUP_CUDA_FUNCTION(name, alias)  \
+_##alias =                               \
+CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \
+if (_##alias == NULL) {      \
+tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \
+return 0; \
+} \
+#define LOOKUP_CUDA_V2_FUNCTION(name, alias)  LOOKUP_CUDA_FUNCTION(name##_v2, alias)
 /*
 * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs
 */
 int ncores(int major, int minor) {
 if (status != GRAAL_CUDA_SUCCESS) {
 tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device);
 return 0;
 }
 status = _cuda_cu_device_get_attribute(&async_engines,
 GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
 _cu_device);
 if (status != GRAAL_CUDA_SUCCESS) {
 tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d",
 total, async_engines, can_map_host_memory, concurrent_kernels);
 tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size);
 }
 return (total);
 }
 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) {
 struct CUmod_st * cu_module;
 int jit_register_count = 32;
 jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS;
 jit_option_values[2] = (void *)(size_t)jit_register_count;
 /* Create CUDA context to compile and execute the kernel */
-int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device);
+int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device);
 if (status != GRAAL_CUDA_SUCCESS) {
 tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status);
 return NULL;
 }
 break;
 default:
 tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type);
 }
-// Copy all reference arguments from device to host memory.
-ptxka.copyRefArgsFromDtoH();
 // Free device memory allocated for result
 status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value);
 if (status != GRAAL_CUDA_SUCCESS) {
 tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
 return false;
 if (cuda_library_name != NULL) {
 char *buffer = (char*)malloc(STD_BUFFER_SIZE);
 void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE);
 free(buffer);
 if (handle != NULL) {
-_cuda_cu_init =
+LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init);
-CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit"));
+LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize);
-_cuda_cu_ctx_create =
+LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current);
-CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate"));
+LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count);
-_cuda_cu_ctx_destroy =
+LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name);
-CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy"));
+LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get);
-_cuda_cu_ctx_synchronize =
+LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability);
-CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize"));
+LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute);
-_cuda_cu_ctx_set_current =
+LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function);
-CAST_TO_FN_PTR(cuda_cu_ctx_set_current_func_t, os::dll_lookup(handle, "cuCtxSetCurrent"));
+LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex);
-_cuda_cu_device_get_count =
+LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel);
-CAST_TO_FN_PTR(cuda_cu_device_get_count_func_t, os::dll_lookup(handle, "cuDeviceGetCount"));
+LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register);
-_cuda_cu_device_get_name =
+LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister);
-CAST_TO_FN_PTR(cuda_cu_device_get_name_func_t, os::dll_lookup(handle, "cuDeviceGetName"));
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
-_cuda_cu_device_get =
+LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
-CAST_TO_FN_PTR(cuda_cu_device_get_func_t, os::dll_lookup(handle, "cuDeviceGet"));
+LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
-_cuda_cu_device_compute_capability =
+LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
-CAST_TO_FN_PTR(cuda_cu_device_compute_capability_func_t, os::dll_lookup(handle, "cuDeviceComputeCapability"));
+LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree);
-_cuda_cu_device_get_attribute =
+LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
-CAST_TO_FN_PTR(cuda_cu_device_get_attribute_func_t, os::dll_lookup(handle, "cuDeviceGetAttribute"));
+LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
-_cuda_cu_module_get_function =
+LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
-CAST_TO_FN_PTR(cuda_cu_module_get_function_func_t, os::dll_lookup(handle, "cuModuleGetFunction"));
+#else
-_cuda_cu_module_load_data_ex =
+LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
-CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx"));
+LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
-_cuda_cu_launch_kernel =
+LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
-CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel"));
+LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree);
-_cuda_cu_memalloc =
+LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
-CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc"));
+LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
-_cuda_cu_memfree =
+LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
-CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree"));
+#endif
-_cuda_cu_memcpy_htod =
-CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD"));
-_cuda_cu_memcpy_dtoh =
-CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH"));
 if (TraceGPUInteraction) {
 tty->print_cr("[CUDA] Success: library linkage");
 }
 return true;

Mercurial > hg > truffle

comparison src/gpu/ptx/vm/gpu_ptx.cpp @ 12653:1a7e7011a341