Mercurial > hg > truffle
diff src/gpu/ptx/vm/gpu_ptx.cpp @ 12371:e32f2b195867
Merge
author | Christos Kotselidis <christos.kotselidis@oracle.com> |
---|---|
date | Fri, 11 Oct 2013 17:21:14 +0200 |
parents | 67a1e27a8dbb |
children | f020e149c1b6 |
line wrap: on
line diff
--- a/src/gpu/ptx/vm/gpu_ptx.cpp Fri Oct 11 17:14:35 2013 +0200 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Fri Oct 11 17:21:14 2013 +0200 @@ -50,6 +50,28 @@ gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh; gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree; + +/* + * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs + */ +int ncores(int major, int minor) { + int device_type = (major << 4) + minor; + + switch (device_type) { + case 0x10: return 8; + case 0x11: return 8; + case 0x12: return 8; + case 0x13: return 8; + case 0x20: return 32; + case 0x21: return 48; + case 0x30: return 192; + case 0x35: return 192; + default: + tty->print_cr("[CUDA] Warning: Unhandled device %x", device_type); + return 0; + } +} + bool gpu::Ptx::initialize_gpu() { /* Initialize CUDA driver API */ @@ -95,24 +117,7 @@ } /* Get device attributes */ - int minor, major, unified_addressing; - status = _cuda_cu_device_get_attribute(&minor, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _cu_device); - - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", _cu_device); - return false; - } - - status = _cuda_cu_device_get_attribute(&major, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _cu_device); - - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to get major attribute of device: %d", _cu_device); - return false; - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor); - } + int unified_addressing; status = _cuda_cu_device_get_attribute(&unified_addressing, GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, _cu_device); @@ -139,9 +144,99 @@ tty->print_cr("[CUDA] Using %s", device_name); } + return true; } +unsigned int gpu::Ptx::total_cores() { + + int minor, major, nmp; + int status = _cuda_cu_device_get_attribute(&minor, + GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", _cu_device); + return 0; + } + + status = _cuda_cu_device_get_attribute(&major, + GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get major attribute of device: %d", _cu_device); + return 0; + } + + status = _cuda_cu_device_get_attribute(&nmp, + GRAAL_CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get numberof MPs on device: %d", _cu_device); + return 0; + } + + int total = nmp * ncores(major, minor); + + int max_threads_per_block, warp_size, async_engines, can_map_host_memory, concurrent_kernels; + + status = _cuda_cu_device_get_attribute(&max_threads_per_block, + GRAAL_CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: %d", _cu_device); + return 0; + } + + status = _cuda_cu_device_get_attribute(&warp_size, + GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device); + return 0; + } + + status = _cuda_cu_device_get_attribute(&async_engines, + GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device); + return 0; + } + + status = _cuda_cu_device_get_attribute(&can_map_host_memory, + GRAAL_CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: %d", _cu_device); + return 0; + } + + status = _cuda_cu_device_get_attribute(&concurrent_kernels, + GRAAL_CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, + _cu_device); + + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: %d", _cu_device); + return 0; + } + + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor); + tty->print_cr("[CUDA] Number of cores: %d async engines: %d can map host mem: %d concurrent kernels: %d", + total, async_engines, can_map_host_memory, concurrent_kernels); + tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size); + } + return (total); + +} + void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) { struct CUmod_st * cu_module; @@ -228,15 +323,20 @@ } bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) { + return gpu::Ptx::execute_warp(1, 1, 1, kernel, ptxka, ret); +} + +bool gpu::Ptx::execute_warp(int dimX, int dimY, int dimZ, + address kernel, PTXKernelArguments &ptxka, JavaValue &ret) { // grid dimensionality unsigned int gridX = 1; unsigned int gridY = 1; unsigned int gridZ = 1; // thread dimensionality - unsigned int blockX = 1; - unsigned int blockY = 1; - unsigned int blockZ = 1; + unsigned int blockX = dimX; + unsigned int blockY = dimY; + unsigned int blockZ = dimZ; struct CUfunc_st* cu_function = (struct CUfunc_st*) kernel; @@ -264,7 +364,7 @@ } if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Kernel Launch"); + tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", blockX, blockY, blockZ); } status = _cuda_cu_ctx_synchronize(); @@ -282,7 +382,18 @@ // Get the result. TODO: Move this code to get_return_oop() BasicType return_type = ptxka.get_ret_type(); switch (return_type) { - case T_INT : + case T_INT: + { + int return_val; + status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status); + return false; + } + ret.set_jint(return_val); + } + break; + case T_BOOLEAN: { int return_val; status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE); @@ -293,7 +404,29 @@ ret.set_jint(return_val); } break; - case T_LONG : + case T_FLOAT: + { + float return_val; + status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_FLOAT_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status); + return false; + } + ret.set_jfloat(return_val); + } + break; + case T_DOUBLE: + { + double return_val; + status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_DOUBLE_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status); + return false; + } + ret.set_jdouble(return_val); + } + break; + case T_LONG: { long return_val; status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_LONG_BYTE_SIZE); @@ -304,10 +437,14 @@ ret.set_jlong(return_val); } break; + case T_VOID: + break; default: - tty->print_cr("[CUDA] TODO *** Unhandled return type"); + tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type); } + // handle post-invocation object and array arguemtn + ptxka.reiterate(); // Free device memory allocated for result status = gpu::Ptx::_cuda_cu_memfree(ptxka._return_value_ptr);