diff src/gpu/ptx/vm/gpu_ptx.cpp @ 11485:49bb1bc983c6

Implement several missing PTX codegen features; add return-value capture and method-argument passing for Java methods executed on the GPU.
author bharadwaj
date Fri, 30 Aug 2013 16:39:05 -0400
parents d876002b98e6
children c99e65785936
line wrap: on
line diff
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Aug 30 17:31:59 2013 +0200
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Aug 30 16:39:05 2013 -0400
@@ -29,12 +29,14 @@
 #include "utilities/ostream.hpp"
 #include "memory/allocation.hpp"
 #include "memory/allocation.inline.hpp"
+#include "kernelArguments.hpp"
 
 void * gpu::Ptx::_device_context;
+int    gpu::Ptx::_cu_device = 0;
 
 gpu::Ptx::cuda_cu_init_func_t gpu::Ptx::_cuda_cu_init;
 gpu::Ptx::cuda_cu_ctx_create_func_t gpu::Ptx::_cuda_cu_ctx_create;
-gpu::Ptx::cuda_cu_ctx_detach_func_t gpu::Ptx::_cuda_cu_ctx_detach;
+gpu::Ptx::cuda_cu_ctx_destroy_func_t gpu::Ptx::_cuda_cu_ctx_destroy;
 gpu::Ptx::cuda_cu_ctx_synchronize_func_t gpu::Ptx::_cuda_cu_ctx_synchronize;
 gpu::Ptx::cuda_cu_device_get_count_func_t gpu::Ptx::_cuda_cu_device_get_count;
 gpu::Ptx::cuda_cu_device_get_name_func_t gpu::Ptx::_cuda_cu_device_get_name;
@@ -44,6 +46,8 @@
 gpu::Ptx::cuda_cu_launch_kernel_func_t gpu::Ptx::_cuda_cu_launch_kernel;
 gpu::Ptx::cuda_cu_module_get_function_func_t gpu::Ptx::_cuda_cu_module_get_function;
 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex;
+gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh;
+gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree;
 
 void gpu::probe_linkage() {
 #if defined(__APPLE__) || defined(LINUX)
@@ -67,9 +71,9 @@
   }
 }
 
-bool gpu::execute_kernel(address kernel, JavaCallArguments * jca) {
+bool gpu::execute_kernel(address kernel, PTXKernelArguments & ptxka, JavaValue& ret) {
   if (gpu::has_gpu_linkage()) {
-    return (gpu::Ptx::execute_kernel(kernel, jca));
+    return (gpu::Ptx::execute_kernel(kernel, ptxka, ret));
   } else {
     return false;
   }
@@ -108,8 +112,7 @@
   /* Get the handle to the first compute device */
   int device_id = 0;
   /* Compute-capable device handle */
-  int cu_device = 0;
-  status = _cuda_cu_device_get(&cu_device, device_id);
+  status = _cuda_cu_device_get(&_cu_device, device_id);
 
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] Failed to get handle of first compute-capable device i.e., the one at ordinal: %d", device_id);
@@ -122,42 +125,42 @@
 
   /* Get device attributes */
   int minor, major, unified_addressing;
-  status = _cuda_cu_device_get_attribute(&minor, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device);
+  status = _cuda_cu_device_get_attribute(&minor, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _cu_device);
 
   if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", cu_device);
+    tty->print_cr("[CUDA] Failed to get minor attribute of device: %d", _cu_device);
     return false;
   }
 
-  status = _cuda_cu_device_get_attribute(&major, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device);
+  status = _cuda_cu_device_get_attribute(&major, GRAAL_CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _cu_device);
 
   if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to get major attribute of device: %d", cu_device);
+    tty->print_cr("[CUDA] Failed to get major attribute of device: %d", _cu_device);
     return false;
   }
 
   if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", cu_device, major, minor);
+    tty->print_cr("[CUDA] Compatibility version of device %d: %d.%d", _cu_device, major, minor);
   }
 
-  status = _cuda_cu_device_get_attribute(&unified_addressing, GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device);
+  status = _cuda_cu_device_get_attribute(&unified_addressing, GRAAL_CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, _cu_device);
 
   if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to query unified addressing mode of device: %d", cu_device);
+    tty->print_cr("[CUDA] Failed to query unified addressing mode of device: %d", _cu_device);
     return false;
   }
 
   if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Unified addressing support on device %d: %d", cu_device, unified_addressing);
+    tty->print_cr("[CUDA] Unified addressing support on device %d: %d", _cu_device, unified_addressing);
   }
 
 
   /* Get device name */
   char device_name[256];
-  status = _cuda_cu_device_get_name(device_name, 256, cu_device);
+  status = _cuda_cu_device_get_name(device_name, 256, _cu_device);
 
   if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to get name of device: %d", cu_device);
+    tty->print_cr("[CUDA] Failed to get name of device: %d", _cu_device);
     return false;
   }
 
@@ -165,18 +168,6 @@
     tty->print_cr("[CUDA] Using %s", device_name);
   }
 
-  /* Create CUDA context */
-  status = _cuda_cu_ctx_create(&_device_context, 0, cu_device);
-
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to create CUDA context for device: %d", cu_device);
-    return false;
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Created context for device: %d", cu_device);
-  }
-
   return true;
 }
 
@@ -210,8 +201,20 @@
 
   }
 
+  /* Create CUDA context to compile and execute the kernel */
+  int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device);
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to create CUDA context for device: %d", _cu_device);
+    return NULL;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Created context for device: %d", _cu_device);
+  }
+
   /* Load module's data with compiler options */
-  int status = _cuda_cu_module_load_data_ex(&cu_module, code, jit_num_options,
+  status = _cuda_cu_module_load_data_ex(&cu_module, (void*) code, jit_num_options,
                                             jit_options, (void **)jit_option_values);
   if (status != GRAAL_CUDA_SUCCESS) {
     if (status == GRAAL_CUDA_ERROR_NO_BINARY_FOR_GPU) {
@@ -238,10 +241,11 @@
   if (TraceGPUInteraction) {
     tty->print_cr("[CUDA] Got function handle for %s", name);
   }
+
   return cu_function;
 }
 
-bool gpu::Ptx::execute_kernel(address kernel, JavaCallArguments * jca) {
+bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
   // grid dimensionality
   unsigned int gridX = 1;
   unsigned int gridY = 1;
@@ -252,14 +256,11 @@
   unsigned int blockY = 1;
   unsigned int blockZ = 1;
   
-  int *cu_function = (int *)kernel;
+  struct CUfunc_st* cu_function = (struct CUfunc_st*) kernel;
 
-  char * paramBuffer = (char *) jca->parameters();
-  size_t paramBufferSz = (size_t) jca->size_of_parameters();
-
-  void * config[] = {
-    GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer,
-    GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &paramBufferSz,
+  void * config[5] = {
+    GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, ptxka._kernelArgBuffer,
+    GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &(ptxka._bufferOffset),
     GRAAL_CU_LAUNCH_PARAM_END
   };
 
@@ -270,10 +271,11 @@
   if (TraceGPUInteraction) {
     tty->print_cr("[CUDA] launching kernel");
   }
+
   int status = _cuda_cu_launch_kernel(cu_function,
                                       gridX, gridY, gridZ,
                                       blockX, blockY, blockZ,
-                                      0, NULL, NULL, config);
+                                      0, NULL, NULL, (void **) &config);
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] Failed to launch kernel");
     return false;
@@ -282,7 +284,72 @@
   if (TraceGPUInteraction) {
     tty->print_cr("[CUDA] Success: Kernel Launch");
   }
-  return status == 0;  // GRAAL_CUDA_SUCCESS
+
+  status = _cuda_cu_ctx_synchronize();
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
+    return false;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Synchronized launch kernel");
+  }
+
+  // Copy the kernel's result out of the device buffer ptxka._return_value_ptr
+  // and into 'ret', according to the return type recorded in ptxka.
+  // TODO: Move this code to get_return_oop()
+  BasicType return_type = ptxka.get_ret_type();
+  switch (return_type) {
+     case T_INT :
+       {
+         int return_val;
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE);
+         if (status != GRAAL_CUDA_SUCCESS) {
+           tty->print_cr("[CUDA] *** Error (%d) Failed to copy return value from device", status);
+           return false;
+         }
+         ret.set_jint(return_val);
+       }
+       break;
+     case T_LONG :
+       {
+         jlong return_val;  // jlong, not long: 'long' is only 4 bytes on ILP32/LLP64, too small if T_LONG_BYTE_SIZE is 8 (TODO confirm)
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_LONG_BYTE_SIZE);
+         if (status != GRAAL_CUDA_SUCCESS) {
+           tty->print_cr("[CUDA] *** Error (%d) Failed to copy return value from device", status);
+           return false;
+         }
+         ret.set_jlong(return_val);
+       }
+       break;
+     default:  // NOTE(review): 'ret' is left unset here, yet execution continues and can still report success
+       tty->print_cr("[CUDA] TODO *** Unhandled return type");
+  }
+
+  // Free device memory allocated for result
+  status = gpu::Ptx::_cuda_cu_memfree(ptxka._return_value_ptr);
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
+    return false;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Freed device memory of return value");
+  }
+
+  // Destroy context
+  status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context);
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status);
+    return false;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Destroy context");
+  }
+
+  return (status == GRAAL_CUDA_SUCCESS);
 }
 
 #if defined(LINUX)
@@ -305,8 +372,8 @@
         CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit"));
       _cuda_cu_ctx_create =
         CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate"));
-      _cuda_cu_ctx_detach =
-        CAST_TO_FN_PTR(cuda_cu_ctx_detach_func_t, os::dll_lookup(handle, "cuCtxDetach"));
+      _cuda_cu_ctx_destroy =
+        CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy"));
       _cuda_cu_ctx_synchronize =
         CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize"));
       _cuda_cu_device_get_count =
@@ -325,6 +392,15 @@
         CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx"));
       _cuda_cu_launch_kernel =
         CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel"));
+      _cuda_cu_memalloc =
+        CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc"));
+      _cuda_cu_memfree =
+        CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree"));
+      _cuda_cu_memcpy_htod =
+        CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD"));
+      _cuda_cu_memcpy_dtoh =
+        CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH"));
+
       if (TraceGPUInteraction) {
         tty->print_cr("[CUDA] Success: library linkage");
       }