changeset 12653:1a7e7011a341

* PTX kernel argument buffer now has naturally aligned arguments as required by PTX JIT compiler. * Change dynamic loading of CUDA driver API functions to load 32-bit or 64-bit versions depending on the host architecture. * Add ability to generate PTX kernels to be launched both on 32-bit and 64-bit hosts. * Use Unified Virtual Memory APIs to perform array argument marshalling. * PTX array storage test runs on the device and returns correct results. * More integer test failures on GPU fixed.
author S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com>
date Fri, 01 Nov 2013 18:34:03 -0400
parents 0dd597c6c9c7
children 1c0261ebeeed
files graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXMemOp.java src/gpu/ptx/vm/gpu_ptx.cpp src/gpu/ptx/vm/gpu_ptx.hpp src/gpu/ptx/vm/ptxKernelArguments.cpp src/gpu/ptx/vm/ptxKernelArguments.hpp src/share/vm/graal/graalCompilerToGPU.cpp
diffstat 9 files changed, 363 insertions(+), 275 deletions(-) [+]
line wrap: on
line diff
--- a/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java	Fri Nov 01 13:07:22 2013 +0100
+++ b/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java	Fri Nov 01 18:34:03 2013 -0400
@@ -105,6 +105,7 @@
         protected Value source1;
         protected Value source2;
         private boolean logicInstruction = false;
+        private boolean ldRetAddrInstruction = false;
 
         public StandardFormat(Variable dst, Value src1, Value src2) {
             setDestination(dst);
@@ -139,8 +140,18 @@
             logicInstruction = b;
         }
 
+        public void setLdRetAddrInstruction(boolean b) {
+            ldRetAddrInstruction = b;
+        }
+
         public String typeForKind(Kind k) {
-            if (logicInstruction) {
+            if (ldRetAddrInstruction) {
+                if (System.getProperty("os.arch").compareTo("amd64") == 0) {
+                    return "u64";
+                } else {
+                    return "u32";
+                }
+            } else if (logicInstruction) {
                 switch (k.getTypeChar()) {
                     case 's':
                         return "b16";
@@ -658,16 +669,16 @@
     }
 
     public static class Param extends SingleOperandFormat {
-
-        private boolean lastParameter;
+        // Last parameter holds the return parameter.
+        private boolean returnParameter;
 
         public Param(Variable d, boolean lastParam) {
             super(d, null);
-            setLastParameter(lastParam);
+            setReturnParameter(lastParam);
         }
 
-        public void setLastParameter(boolean value) {
-            lastParameter = value;
+        public void setReturnParameter(boolean value) {
+            returnParameter = value;
         }
 
         public String emitParameter(Variable v) {
@@ -675,30 +686,38 @@
         }
 
         public void emit(PTXAssembler asm) {
-            asm.emitString(".param ." + paramForKind(dest.getKind()) + emitParameter(dest) + (lastParameter ? "" : ","));
+            asm.emitString(".param ." + paramForKind(dest.getKind()) + emitParameter(dest) + (returnParameter ? "" : ","));
         }
 
         public String paramForKind(Kind k) {
-            switch (k.getTypeChar()) {
-                case 'z':
-                case 'f':
-                    return "s32";
-                case 'b':
-                    return "s8";
-                case 's':
-                    return "s16";
-                case 'c':
-                    return "u16";
-                case 'i':
-                    return "s32";
-                case 'j':
-                    return "s64";
-                case 'd':
-                    return "f64";
-                case 'a':
+            if (returnParameter) {
+                if (System.getProperty("os.arch").compareTo("amd64") == 0) {
                     return "u64";
-                default:
-                    throw GraalInternalError.shouldNotReachHere();
+                } else {
+                    return "u32";
+                }
+            } else {
+                switch (k.getTypeChar()) {
+                    case 'z':
+                    case 'f':
+                        return "s32";
+                    case 'b':
+                        return "s8";
+                    case 's':
+                        return "s16";
+                    case 'c':
+                        return "u16";
+                    case 'i':
+                        return "s32";
+                    case 'j':
+                        return "s64";
+                    case 'd':
+                        return "f64";
+                    case 'a':
+                        return "u64";
+                    default:
+                        throw GraalInternalError.shouldNotReachHere();
+                }
             }
         }
 
--- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java	Fri Nov 01 13:07:22 2013 +0100
+++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java	Fri Nov 01 18:34:03 2013 -0400
@@ -40,7 +40,11 @@
         int[] array3 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
 
         invoke(compile("testStoreArray1I"), array1, 2);
-        printReport("testStoreArray1I: " + Arrays.toString(array1));
+        if (array1[2] == 42) {
+            printReport("testStoreArray1I: " + Arrays.toString(array1) + " PASSED");
+        } else {
+            printReport("testStoreArray1I: " + Arrays.toString(array1) + " FAILED");
+        }
 
         invoke(compile("testStoreArrayWarp0"), array2, 2);
         printReport("testStoreArrayWarp0: " + Arrays.toString(array2));
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Fri Nov 01 13:07:22 2013 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Fri Nov 01 18:34:03 2013 -0400
@@ -44,6 +44,7 @@
 import com.oracle.graal.lir.ptx.*;
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.cfg.*;
+import com.oracle.graal.lir.ptx.PTXMemOp.LoadReturnAddrOp;
 
 /**
  * HotSpot PTX specific backend.
@@ -64,6 +65,84 @@
         return new PTXFrameMap(getCodeCache());
     }
 
+    static final class RegisterAnalysis extends ValueProcedure {
+        private final SortedSet<Integer> unsigned64 = new TreeSet<>();
+        private final SortedSet<Integer> signed64 = new TreeSet<>();
+        private final SortedSet<Integer> float32 = new TreeSet<>();
+        private final SortedSet<Integer> signed32 = new TreeSet<>();
+        private final SortedSet<Integer> float64 = new TreeSet<>();
+
+        LIRInstruction op;
+
+        void emitDeclarations(Buffer codeBuffer) {
+            for (Integer i : signed32) {
+                codeBuffer.emitString(".reg .s32 %r" + i.intValue() + ";");
+            }
+            for (Integer i : signed64) {
+                codeBuffer.emitString(".reg .s64 %r" + i.intValue() + ";");
+            }
+            for (Integer i : unsigned64) {
+                codeBuffer.emitString(".reg .u64 %r" + i.intValue() + ";");
+            }
+            for (Integer i : float32) {
+                codeBuffer.emitString(".reg .f32 %r" + i.intValue() + ";");
+            }
+            for (Integer i : float64) {
+                codeBuffer.emitString(".reg .f64 %r" + i.intValue() + ";");
+            }
+        }
+
+        @Override
+        public Value doValue(Value value, OperandMode mode, EnumSet<OperandFlag> flags) {
+            if (isVariable(value)) {
+                Variable regVal = (Variable) value;
+                Kind regKind = regVal.getKind();
+                if ((op instanceof LoadReturnAddrOp) && (mode == OperandMode.DEF)) {
+                    unsigned64.add(regVal.index);
+                } else {
+                    switch (regKind) {
+                        case Int:
+                            // If the register was used as a wider signed type
+                            // do not add it here
+                            if (!signed64.contains(regVal.index)) {
+                                signed32.add(regVal.index);
+                            }
+                            break;
+                        case Long:
+                            // If the register was used as a narrower signed type
+                            // remove it from there and add it to wider type.
+                            if (signed32.contains(regVal.index)) {
+                                signed32.remove(regVal.index);
+                            }
+                            signed64.add(regVal.index);
+                            break;
+                        case Float:
+                            // If the register was used as a wider signed type
+                            // do not add it here
+                            if (!float64.contains(regVal.index)) {
+                                float32.add(regVal.index);
+                            }
+                            break;
+                        case Double:
+                            // If the register was used as a narrower signed type
+                            // remove it from there and add it to wider type.
+                            if (float32.contains(regVal.index)) {
+                                float32.remove(regVal.index);
+                            }
+                            float64.add(regVal.index);
+                            break;
+                        case Object:
+                            unsigned64.add(regVal.index);
+                            break;
+                        default:
+                            throw GraalInternalError.shouldNotReachHere("unhandled register type " + value.toString());
+                    }
+                }
+            }
+            return value;
+        }
+    }
+
     class PTXFrameContext implements FrameContext {
 
         @Override
@@ -147,94 +226,27 @@
         assert codeCacheOwner != null : lirGen.getGraph() + " is not associated with a method";
 
         Buffer codeBuffer = tasm.asm.codeBuffer;
-
-        final SortedSet<Integer> signed32 = new TreeSet<>();
-        final SortedSet<Integer> signed64 = new TreeSet<>();
-        final SortedSet<Integer> unsigned64 = new TreeSet<>();
-        final SortedSet<Integer> float32 = new TreeSet<>();
-        final SortedSet<Integer> float64 = new TreeSet<>();
-
-        ValueProcedure trackRegisterKind = new ValueProcedure() {
-
-            @Override
-            public Value doValue(Value value, OperandMode mode, EnumSet<OperandFlag> flags) {
-                if (isVariable(value)) {
-                    Variable regVal = (Variable) value;
-                    Kind regKind = regVal.getKind();
-                    switch (regKind) {
-                        case Int:
-                            // If the register was used as a wider signed type
-                            // do not add it here
-                            if (!signed64.contains(regVal.index)) {
-                                signed32.add(regVal.index);
-                            }
-                            break;
-                        case Long:
-                            // If the register was used as a narrower signed type
-                            // remove it from there and add it to wider type.
-                            if (signed32.contains(regVal.index)) {
-                                signed32.remove(regVal.index);
-                            }
-                            signed64.add(regVal.index);
-                            break;
-                        case Float:
-                            // If the register was used as a wider signed type
-                            // do not add it here
-                            if (!float64.contains(regVal.index)) {
-                                float32.add(regVal.index);
-                            }
-                            break;
-                        case Double:
-                            // If the register was used as a narrower signed type
-                            // remove it from there and add it to wider type.
-                            if (float32.contains(regVal.index)) {
-                                float32.remove(regVal.index);
-                            }
-                            float64.add(regVal.index);
-                            break;
-                        case Object:
-                            unsigned64.add(regVal.index);
-                            break;
-                        default:
-                            throw GraalInternalError.shouldNotReachHere("unhandled register type " + value.toString());
-                    }
-                }
-                return value;
-            }
-        };
+        RegisterAnalysis registerAnalysis = new RegisterAnalysis();
 
         for (Block b : lirGen.lir.codeEmittingOrder()) {
             for (LIRInstruction op : lirGen.lir.lir(b)) {
                 if (op instanceof LabelOp) {
                     // Don't consider this as a definition
                 } else {
-                    op.forEachTemp(trackRegisterKind);
-                    op.forEachOutput(trackRegisterKind);
+                    registerAnalysis.op = op;
+                    op.forEachTemp(registerAnalysis);
+                    op.forEachOutput(registerAnalysis);
                 }
             }
         }
 
-        for (Integer i : signed32) {
-            codeBuffer.emitString(".reg .s32 %r" + i.intValue() + ";");
-        }
-        for (Integer i : signed64) {
-            codeBuffer.emitString(".reg .s64 %r" + i.intValue() + ";");
-        }
-        for (Integer i : unsigned64) {
-            codeBuffer.emitString(".reg .u64 %r" + i.intValue() + ";");
-        }
-        for (Integer i : float32) {
-            codeBuffer.emitString(".reg .f32 %r" + i.intValue() + ";");
-        }
-        for (Integer i : float64) {
-            codeBuffer.emitString(".reg .f64 %r" + i.intValue() + ";");
-        }
+        registerAnalysis.emitDeclarations(codeBuffer);
+
         // emit predicate register declaration
         int maxPredRegNum = ((PTXLIRGenerator) lirGen).getNextPredRegNumber();
         if (maxPredRegNum > 0) {
             codeBuffer.emitString(".reg .pred %p<" + maxPredRegNum + ">;");
         }
-        codeBuffer.emitString(".reg .pred %r;");  // used for setp bool
     }
 
     @Override
--- a/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXMemOp.java	Fri Nov 01 13:07:22 2013 +0100
+++ b/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXMemOp.java	Fri Nov 01 18:34:03 2013 -0400
@@ -166,7 +166,9 @@
                 case Long:
                 case Float:
                 case Double:
-                    new Ld(Parameter, result, addr.getBase(), Constant.forLong(addr.getDisplacement())).emit(masm);
+                    Ld ldIns = new Ld(Parameter, result, addr.getBase(), Constant.forLong(addr.getDisplacement()));
+                    ldIns.setLdRetAddrInstruction(true);
+                    ldIns.emit(masm);
                     break;
                 default:
                     throw GraalInternalError.shouldNotReachHere();
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Nov 01 13:07:22 2013 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Nov 01 18:34:03 2013 -0400
@@ -49,7 +49,21 @@
 gpu::Ptx::cuda_cu_module_load_data_ex_func_t gpu::Ptx::_cuda_cu_module_load_data_ex;
 gpu::Ptx::cuda_cu_memcpy_dtoh_func_t gpu::Ptx::_cuda_cu_memcpy_dtoh;
 gpu::Ptx::cuda_cu_memfree_func_t gpu::Ptx::_cuda_cu_memfree;
+gpu::Ptx::cuda_cu_mem_host_register_func_t gpu::Ptx::_cuda_cu_mem_host_register;
+gpu::Ptx::cuda_cu_mem_host_get_device_pointer_func_t gpu::Ptx::_cuda_cu_mem_host_get_device_pointer;
+gpu::Ptx::cuda_cu_mem_host_unregister_func_t gpu::Ptx::_cuda_cu_mem_host_unregister;
 
+#define STRINGIFY(x)     #x
+
+#define LOOKUP_CUDA_FUNCTION(name, alias)  \
+  _##alias =                               \
+    CAST_TO_FN_PTR(alias##_func_t, os::dll_lookup(handle, STRINGIFY(name))); \
+  if (_##alias == NULL) {      \
+  tty->print_cr("[CUDA] ***** Error: Failed to lookup %s", STRINGIFY(name)); \
+        return 0; \
+  } \
+
+#define LOOKUP_CUDA_V2_FUNCTION(name, alias)  LOOKUP_CUDA_FUNCTION(name##_v2, alias)
 
 /*
  * see http://en.wikipedia.org/wiki/CUDA#Supported_GPUs
@@ -199,7 +213,7 @@
         tty->print_cr("[CUDA] Failed to get GRAAL_CU_DEVICE_ATTRIBUTE_WARP_SIZE: %d", _cu_device);
         return 0;
     }
-    
+
     status = _cuda_cu_device_get_attribute(&async_engines,
                                            GRAAL_CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
                                            _cu_device);
@@ -234,7 +248,7 @@
         tty->print_cr("[CUDA] Max threads per block: %d warp size: %d", max_threads_per_block, warp_size);
     }
     return (total);
-    
+
 }
 
 void *gpu::Ptx::generate_kernel(unsigned char *code, int code_len, const char *name) {
@@ -262,7 +276,7 @@
   jit_option_values[2] = (void *)(size_t)jit_register_count;
 
   /* Create CUDA context to compile and execute the kernel */
-  int status = _cuda_cu_ctx_create(&_device_context, 0, _cu_device);
+  int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device);
 
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status);
@@ -443,9 +457,6 @@
        tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type);
   }
 
-  // Copy all reference arguments from device to host memory.
-  ptxka.copyRefArgsFromDtoH();
-
   // Free device memory allocated for result
   status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value);
   if (status != GRAAL_CUDA_SUCCESS) {
@@ -487,40 +498,36 @@
     void *handle = os::dll_load(cuda_library_name, buffer, STD_BUFFER_SIZE);
         free(buffer);
     if (handle != NULL) {
-      _cuda_cu_init =
-        CAST_TO_FN_PTR(cuda_cu_init_func_t, os::dll_lookup(handle, "cuInit"));
-      _cuda_cu_ctx_create =
-        CAST_TO_FN_PTR(cuda_cu_ctx_create_func_t, os::dll_lookup(handle, "cuCtxCreate"));
-      _cuda_cu_ctx_destroy =
-        CAST_TO_FN_PTR(cuda_cu_ctx_destroy_func_t, os::dll_lookup(handle, "cuCtxDestroy"));
-      _cuda_cu_ctx_synchronize =
-        CAST_TO_FN_PTR(cuda_cu_ctx_synchronize_func_t, os::dll_lookup(handle, "cuCtxSynchronize"));
-      _cuda_cu_ctx_set_current =
-        CAST_TO_FN_PTR(cuda_cu_ctx_set_current_func_t, os::dll_lookup(handle, "cuCtxSetCurrent"));
-      _cuda_cu_device_get_count =
-        CAST_TO_FN_PTR(cuda_cu_device_get_count_func_t, os::dll_lookup(handle, "cuDeviceGetCount"));
-      _cuda_cu_device_get_name =
-        CAST_TO_FN_PTR(cuda_cu_device_get_name_func_t, os::dll_lookup(handle, "cuDeviceGetName"));
-      _cuda_cu_device_get =
-        CAST_TO_FN_PTR(cuda_cu_device_get_func_t, os::dll_lookup(handle, "cuDeviceGet"));
-      _cuda_cu_device_compute_capability =
-        CAST_TO_FN_PTR(cuda_cu_device_compute_capability_func_t, os::dll_lookup(handle, "cuDeviceComputeCapability"));
-      _cuda_cu_device_get_attribute =
-        CAST_TO_FN_PTR(cuda_cu_device_get_attribute_func_t, os::dll_lookup(handle, "cuDeviceGetAttribute"));
-      _cuda_cu_module_get_function =
-        CAST_TO_FN_PTR(cuda_cu_module_get_function_func_t, os::dll_lookup(handle, "cuModuleGetFunction"));
-      _cuda_cu_module_load_data_ex =
-        CAST_TO_FN_PTR(cuda_cu_module_load_data_ex_func_t, os::dll_lookup(handle, "cuModuleLoadDataEx"));
-      _cuda_cu_launch_kernel =
-        CAST_TO_FN_PTR(cuda_cu_launch_kernel_func_t, os::dll_lookup(handle, "cuLaunchKernel"));
-      _cuda_cu_memalloc =
-        CAST_TO_FN_PTR(cuda_cu_memalloc_func_t, os::dll_lookup(handle, "cuMemAlloc"));
-      _cuda_cu_memfree =
-        CAST_TO_FN_PTR(cuda_cu_memfree_func_t, os::dll_lookup(handle, "cuMemFree"));
-      _cuda_cu_memcpy_htod =
-        CAST_TO_FN_PTR(cuda_cu_memcpy_htod_func_t, os::dll_lookup(handle, "cuMemcpyHtoD"));
-      _cuda_cu_memcpy_dtoh =
-        CAST_TO_FN_PTR(cuda_cu_memcpy_dtoh_func_t, os::dll_lookup(handle, "cuMemcpyDtoH"));
+      LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init);
+      LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize);
+      LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current);
+      LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count);
+      LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name);
+      LOOKUP_CUDA_FUNCTION(cuDeviceGet, cuda_cu_device_get);
+      LOOKUP_CUDA_FUNCTION(cuDeviceComputeCapability, cuda_cu_device_compute_capability);
+      LOOKUP_CUDA_FUNCTION(cuDeviceGetAttribute, cuda_cu_device_get_attribute);
+      LOOKUP_CUDA_FUNCTION(cuModuleGetFunction, cuda_cu_module_get_function);
+      LOOKUP_CUDA_FUNCTION(cuModuleLoadDataEx, cuda_cu_module_load_data_ex);
+      LOOKUP_CUDA_FUNCTION(cuLaunchKernel, cuda_cu_launch_kernel);
+      LOOKUP_CUDA_FUNCTION(cuMemHostRegister, cuda_cu_mem_host_register);
+      LOOKUP_CUDA_FUNCTION(cuMemHostUnregister, cuda_cu_mem_host_unregister);
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+      LOOKUP_CUDA_V2_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
+      LOOKUP_CUDA_V2_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
+      LOOKUP_CUDA_V2_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
+      LOOKUP_CUDA_V2_FUNCTION(cuMemFree, cuda_cu_memfree);
+      LOOKUP_CUDA_V2_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
+      LOOKUP_CUDA_V2_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
+      LOOKUP_CUDA_V2_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
+#else
+      LOOKUP_CUDA_FUNCTION(cuCtxCreate, cuda_cu_ctx_create);
+      LOOKUP_CUDA_FUNCTION(cuCtxDestroy, cuda_cu_ctx_destroy);
+      LOOKUP_CUDA_FUNCTION(cuMemAlloc, cuda_cu_memalloc);
+      LOOKUP_CUDA_FUNCTION(cuMemFree, cuda_cu_memfree);
+      LOOKUP_CUDA_FUNCTION(cuMemcpyHtoD, cuda_cu_memcpy_htod);
+      LOOKUP_CUDA_FUNCTION(cuMemcpyDtoH, cuda_cu_memcpy_dtoh);
+      LOOKUP_CUDA_FUNCTION(cuMemHostGetDevicePointer, cuda_cu_mem_host_get_device_pointer);
+#endif
 
       if (TraceGPUInteraction) {
         tty->print_cr("[CUDA] Success: library linkage");
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Fri Nov 01 13:07:22 2013 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Fri Nov 01 18:34:03 2013 -0400
@@ -46,6 +46,13 @@
 #define GRAAL_CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES              4
 #define GRAAL_CUDA_ERROR_NO_BINARY_FOR_GPU                 209
 
+/*
+ * Flags for cuMemHostRegister
+ */
+
+#define GRAAL_CU_MEMHOSTREGISTER_PORTABLE                    1
+#define GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP                   2
+
 /**
  * End of array terminator for the extra parameter to
  * ::cuLaunchKernel
@@ -73,6 +80,12 @@
  */
 #define GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE    ((void*) 0x02)
 
+/*
+ * Context creation flags
+ */
+
+#define GRAAL_CU_CTX_MAP_HOST 0x08
+
 class Ptx {
   friend class gpu;
 
@@ -90,9 +103,11 @@
   typedef unsigned int CUdeviceptr;
 #endif
 
+typedef int CUdevice;     /**< CUDA device */
+
 private:
   typedef int (*cuda_cu_init_func_t)(unsigned int);
-  typedef int (*cuda_cu_ctx_create_func_t)(void*, int, int);
+  typedef int (*cuda_cu_ctx_create_func_t)(void*, unsigned int, CUdevice);
   typedef int (*cuda_cu_ctx_destroy_func_t)(void*);
   typedef int (*cuda_cu_ctx_synchronize_func_t)(void);
   typedef int (*cuda_cu_ctx_set_current_func_t)(void*);
@@ -107,10 +122,13 @@
                                               unsigned int, void*, void**, void**);
   typedef int (*cuda_cu_module_get_function_func_t)(void*, void*, const char*);
   typedef int (*cuda_cu_module_load_data_ex_func_t)(void*, void*, unsigned int, void*, void**);
-  typedef int (*cuda_cu_memalloc_func_t)(void*, size_t);
+  typedef int (*cuda_cu_memalloc_func_t)(gpu::Ptx::CUdeviceptr*, size_t);
   typedef int (*cuda_cu_memfree_func_t)(gpu::Ptx::CUdeviceptr);
   typedef int (*cuda_cu_memcpy_htod_func_t)(gpu::Ptx::CUdeviceptr, const void*, unsigned int);
   typedef int (*cuda_cu_memcpy_dtoh_func_t)(const void*, gpu::Ptx::CUdeviceptr,  unsigned int);
+  typedef int (*cuda_cu_mem_host_register_func_t)(void*, size_t, unsigned int);
+  typedef int (*cuda_cu_mem_host_get_device_pointer_func_t)(gpu::Ptx::CUdeviceptr*, void*, unsigned int);
+  typedef int (*cuda_cu_mem_host_unregister_func_t)(void*);
 
 public:
   static cuda_cu_init_func_t                      _cuda_cu_init;
@@ -130,6 +148,9 @@
   static cuda_cu_memcpy_htod_func_t               _cuda_cu_memcpy_htod;
   static cuda_cu_memcpy_dtoh_func_t               _cuda_cu_memcpy_dtoh;
   static cuda_cu_ctx_set_current_func_t           _cuda_cu_ctx_set_current;
+  static cuda_cu_mem_host_register_func_t         _cuda_cu_mem_host_register;
+  static cuda_cu_mem_host_get_device_pointer_func_t _cuda_cu_mem_host_get_device_pointer;
+  static cuda_cu_mem_host_unregister_func_t        _cuda_cu_mem_host_unregister;
 
 protected:
   static void* _device_context;
--- a/src/gpu/ptx/vm/ptxKernelArguments.cpp	Fri Nov 01 13:07:22 2013 +0100
+++ b/src/gpu/ptx/vm/ptxKernelArguments.cpp	Fri Nov 01 18:34:03 2013 -0400
@@ -38,20 +38,32 @@
   return arg;
 }
 
+/*
+ * Pad kernel argument buffer to naturally align for given size.
+ */
+void PTXKernelArguments::pad_kernel_argument_buffer(size_t dataSz) {
+  while ((_bufferOffset % dataSz) != 0) {
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (char) 0;
+    _bufferOffset += sizeof(char);
+  }
+  return;
+}
 void PTXKernelArguments::do_int() {
   // If the parameter is a return value,
   if (is_return_type()) {
-    if (is_kernel_arg_setup()) {
-      // Allocate device memory for T_INT return value pointer on device. Size in bytes
-      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE);
-      if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-      }
-      // Push _dev_return_value to _kernelBuffer
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Allocate device memory for T_INT return value pointer on device. Size in bytes
+    int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      _success = false;
+      return;
     }
+
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(_dev_return_value));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     _bufferOffset += sizeof(_dev_return_value);
   } else {
     // Get the next java argument and its value which should be a T_INT
@@ -63,9 +75,13 @@
       _success = false;
       return;
     }
-    if (is_kernel_arg_setup()) {
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i;
-    }
+
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(intval.i));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(intval.i);
   }
@@ -75,17 +91,18 @@
 void PTXKernelArguments::do_float() {
   // If the parameter is a return value,
   if (is_return_type()) {
-    if (is_kernel_arg_setup()) {
-      // Allocate device memory for T_INT return value pointer on device. Size in bytes
-      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE);
-      if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-      }
-      // Push _dev_return_value to _kernelBuffer
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Allocate device memory for T_FLOAT return value pointer on device. Size in bytes
+    int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      _success = false;
+      return;
     }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(_dev_return_value));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     // Advance _bufferOffset
     _bufferOffset += sizeof(_dev_return_value);
   } else {
@@ -98,9 +115,11 @@
       _success = false;
       return;
     }
-    if (is_kernel_arg_setup()) {
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f;
-    }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(floatval.f));
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(floatval.f);
   }
@@ -111,18 +130,19 @@
   // If the parameter is a return value,
   jvalue doubleval;
   if (is_return_type()) {
-    if (is_kernel_arg_setup()) {
-      // Allocate device memory for T_INT return value pointer on device. Size in bytes
-      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE);
-      if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-      }
-      // Push _dev_return_value to _kernelBuffer
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Allocate device memory for T_DOUBLE return value pointer on device. Size in bytes
+    int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      _success = false;
+      return;
     }
-    // Advance _bufferOffset
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(_dev_return_value));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Advance _bufferOffset.
     _bufferOffset += sizeof(doubleval.d);
   } else {
     // Get the next java argument and its value which should be a T_INT
@@ -133,11 +153,16 @@
       _success = false;
       return;
     }
-    if (is_kernel_arg_setup()) {
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d;
-    }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(doubleval.d));
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(doubleval.d);
+    // For a 64-bit host, since size of double is 8, there is no need
+    // to pad the kernel argument buffer to ensure 8-byte alignment of
+    // the next potential argument to be pushed.
   }
   return;
 }
@@ -145,17 +170,18 @@
 void PTXKernelArguments::do_long() {
   // If the parameter is a return value,
   if (is_return_type()) {
-    if (is_kernel_arg_setup()) {
-      // Allocate device memory for T_LONG return value pointer on device. Size in bytes
-      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE);
-      if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-      }
-      // Push _dev_return_value to _kernelBuffer
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Allocate device memory for T_LONG return value pointer on device. Size in bytes
+    int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      _success = false;
+      return;
     }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(_dev_return_value));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     // Advance _bufferOffset
     _bufferOffset += sizeof(_dev_return_value);
   } else {
@@ -168,11 +194,16 @@
       _success = false;
       return;
     }
-    if (is_kernel_arg_setup()) {
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j;
-    }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(val.j));
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(val.j);
+    // For a 64-bit host, since size of long is 8, there is no need
+    // to pad the kernel argument buffer to ensure 8-byte alignment of
+    // the next potential argument to be pushed.
   }
   return;
 }
@@ -180,17 +211,19 @@
 void PTXKernelArguments::do_byte() {
   // If the parameter is a return value,
   if (is_return_type()) {
-    if (is_kernel_arg_setup()) {
-      // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
-      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE);
-      if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-      }
-      // Push _dev_return_value to _kernelBuffer
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
+    int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      _success = false;
+      return;
     }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(_dev_return_value));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(_dev_return_value);
   } else {
@@ -203,11 +236,16 @@
       _success = false;
       return;
     }
-    if (is_kernel_arg_setup()) {
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b;
-    }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(val.b));
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(val.b);
+    // Any alignment padding required by the next argument is
+    // inserted by pad_kernel_argument_buffer() when that argument
+    // is pushed, so no trailing padding is needed here.
   }
   return;
 }
@@ -215,32 +253,34 @@
 void PTXKernelArguments::do_bool() {
   // If the parameter is a return value,
   if (is_return_type()) {
-    if (is_kernel_arg_setup()) {
-      // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
-      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE);
-      if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-      }
-      // Push _dev_return_value to _kernelBuffer
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
+    // Allocate device memory for T_BOOLEAN return value pointer on device. Size in bytes
+    int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      _success = false;
+      return;
     }
-    // Advance _bufferOffset
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(_dev_return_value));
+    // Push _dev_return_value to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     _bufferOffset += sizeof(_dev_return_value);
   } else {
-    // Get the next java argument and its value which should be a T_BYTE
-    oop arg = next_arg(T_BYTE);
+    // Get the next java argument and its value which should be a T_BOOLEAN
+    oop arg = next_arg(T_BOOLEAN);
     // Copy the java argument value to kernelArgBuffer
     jvalue val;
     if (java_lang_boxing_object::get_value(arg, &val) != T_BOOLEAN) {
-      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE");
+      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BOOLEAN");
       _success = false;
       return;
     }
-    if (is_kernel_arg_setup()) {
-      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z;
-    }
+    // Kernel arguments are expected to be naturally aligned.
+    // Insert padding into kernel argument buffer, if needed.
+    pad_kernel_argument_buffer(sizeof(val.z));
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z;
+
     // Advance _bufferOffset
     _bufferOffset += sizeof(val.z);
   }
@@ -257,35 +297,28 @@
   gpu::Ptx::CUdeviceptr arrayArgOnDev;
   int status;
 
-  if (is_kernel_arg_setup()) {
-    // Allocate device memory for array argument on device. Size in bytes
-    status = gpu::Ptx::_cuda_cu_memalloc(&arrayArgOnDev, argSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for array argument on device",
-                    status);
-      _success = false;
-      return;
-    }
-    // Copy array argument to device
-    status = gpu::Ptx::_cuda_cu_memcpy_htod(arrayArgOnDev, arg, argSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument content to device memory",
-                    status);
-      _success = false;
-      return;
-    }
+  // Register host memory for use by the device. Size in bytes
+  status = gpu::Ptx::_cuda_cu_mem_host_register(arg, argSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP);
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] *** Error (%d) Failed to register host memory for array argument on device",
+                  status);
+    _success = false;
+    return;
+  }
+  // Get device pointer
+  status = gpu::Ptx::_cuda_cu_mem_host_get_device_pointer(&arrayArgOnDev, arg, 0);
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] *** Error (%d) Failed to get device pointer of mapped pinned memory of array argument.",
+                  status);
+    _success = false;
+    return;
+  }
 
-    // Push device array argument to _kernelBuffer
-    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev;
-  } else {
-    arrayArgOnDev = *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]);
-    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(arg, arrayArgOnDev, argSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument to host", status);
-      _success = false;
-      return;
-    }
-  }
+  // Kernel arguments are expected to be naturally aligned.
+  // Insert padding into kernel argument buffer, if needed.
+  pad_kernel_argument_buffer(sizeof(arrayArgOnDev));
+  // Push device array argument to _kernelBuffer
+  *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev;
 
   // Advance _bufferOffset
   _bufferOffset += sizeof(arrayArgOnDev);
--- a/src/gpu/ptx/vm/ptxKernelArguments.hpp	Fri Nov 01 13:07:22 2013 +0100
+++ b/src/gpu/ptx/vm/ptxKernelArguments.hpp	Fri Nov 01 18:34:03 2013 -0400
@@ -45,10 +45,6 @@
   // Device pointer holding return value
   gpu::Ptx::CUdeviceptr _dev_return_value;
 
-  // Indicates if signature iteration is being done during kernel
-  // setup i.e., java arguments are being copied to device pointers.
-  bool _kernelArgSetup;
-
 private:
   // Array of java argument oops
   arrayOop _args;
@@ -68,8 +64,6 @@
     _success = true;
     _bufferOffset = 0;
     _dev_return_value = 0;
-    _kernelArgSetup = true;
-    //_dev_call_by_reference_args_index = 0;
     if (!is_static) {
       // TODO : Create a device argument for receiver object and add it to _kernelBuffer
       tty->print_cr("{CUDA] ****** TODO: Support for execution of non-static java methods not implemented yet.");
@@ -87,17 +81,6 @@
     return _bufferOffset;
   }
 
-  void copyRefArgsFromDtoH() {
-    _kernelArgSetup = false;
-    _bufferOffset = 0;
-    _index = 0;
-    iterate();
-  }
-
-  inline bool is_kernel_arg_setup() {
-    return _kernelArgSetup;
-  }
-
   // Get the return oop value
   oop get_return_oop();
 
@@ -106,6 +89,10 @@
       return _dev_return_value;
   }
 
+  /*
+   * Pad kernel argument buffer to naturally align for given size.
+   */
+  void pad_kernel_argument_buffer(size_t);
 
   void do_byte();
   void do_bool();
--- a/src/share/vm/graal/graalCompilerToGPU.cpp	Fri Nov 01 13:07:22 2013 +0100
+++ b/src/share/vm/graal/graalCompilerToGPU.cpp	Fri Nov 01 18:34:03 2013 -0400
@@ -99,16 +99,19 @@
     if (TraceGPUInteraction) {
       switch (ptxka.get_ret_type()) {
         case T_INT:
-          tty->print_cr("GPU execution returned %d", result.get_jint());
+          tty->print_cr("GPU execution returned (int) %d", result.get_jint());
+          break;
+        case T_LONG:
+          tty->print_cr("GPU execution returned (long) %ld", result.get_jlong());
           break;
         case T_FLOAT:
-          tty->print_cr("GPU execution returned %f", result.get_jfloat());
+          tty->print_cr("GPU execution returned (float) %f", result.get_jfloat());
           break;
         case T_DOUBLE:
-          tty->print_cr("GPU execution returned %f", result.get_jdouble());
+          tty->print_cr("GPU execution returned (double) %f", result.get_jdouble());
           break;
         default:
-          tty->print_cr("GPU returned unhandled");
+          tty->print_cr("**** Value returned by GPU not yet handled");
           break;
         }
     }