# HG changeset patch
# User S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com>
# Date 1382466941 14400
# Node ID f020e149c1b6be311be84c8c6950044fbf4a3bd0
# Parent  0916da3633acf54b32aebde0b44ea71c1515da68
PTX codegen enhancements; fixes to PTX test regressions.

diff -r 0916da3633ac -r f020e149c1b6 graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java
--- a/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java	Tue Oct 22 17:03:01 2013 +0200
+++ b/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java	Tue Oct 22 14:35:41 2013 -0400
@@ -360,18 +360,18 @@
             assert var instanceof Variable;
             assert val instanceof Constant;
             Constant constant = (Constant) val;
-            return ("[" + emitRegister((Variable) var, false) + " + " + constant.asBoxedValue() + "]");
+            return ("[" + ((space == PTXStateSpace.Parameter) ? emitParameter((Variable) var) : emitRegister((Variable) var, false)) + " + " + constant.asBoxedValue() + "]");
         }
 
         @Override
         public String emitRegister(Variable var, boolean comma) {
-            /*
-             * if (space == Parameter) { return ("param" + var.index); } else { return ("%r" +
-             * var.index); }
-             */
             return ("%r" + var.index);
         }
 
+        public String emitParameter(Variable v) {
+            return ("param" + v.index);
+        }
+
         public String emit(boolean isLoad) {
             if (isLoad) {
                 return (space.getStateName() + "." + typeForKind(valueKind) + " " + emitRegister(dest, false) + ", " + emitAddress(source1, source2) + ";");
@@ -671,7 +671,7 @@
         }
 
         public String emitParameter(Variable v) {
-            return (" %r" + v.index);
+            return (" param" + v.index);
         }
 
         public void emit(PTXAssembler asm) {
diff -r 0916da3633ac -r f020e149c1b6 graal/com.oracle.graal.nodes/src/com/oracle/graal/nodes/GuardNode.java
--- a/graal/com.oracle.graal.nodes/src/com/oracle/graal/nodes/GuardNode.java	Tue Oct 22 17:03:01 2013 +0200
+++ b/graal/com.oracle.graal.nodes/src/com/oracle/graal/nodes/GuardNode.java	Tue Oct 22 14:35:41 2013 -0400
@@ -110,5 +110,5 @@
 
     public void setAction(DeoptimizationAction invalidaterecompile) {
         this.action = invalidaterecompile;
+    }
 }
-}
diff -r 0916da3633ac -r f020e149c1b6 src/gpu/ptx/vm/gpu_ptx.cpp
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Tue Oct 22 17:03:01 2013 +0200
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Tue Oct 22 14:35:41 2013 -0400
@@ -385,7 +385,7 @@
      case T_INT:
        {
          int return_val;
-         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE);
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_INT_BYTE_SIZE);
          if (status != GRAAL_CUDA_SUCCESS) {
            tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
            return false;
@@ -396,7 +396,7 @@
      case T_BOOLEAN:
        {
          int return_val;
-         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE);
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_INT_BYTE_SIZE);
          if (status != GRAAL_CUDA_SUCCESS) {
            tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
            return false;
@@ -407,7 +407,7 @@
      case T_FLOAT:
        {
          float return_val;
-         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_FLOAT_BYTE_SIZE);
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_FLOAT_BYTE_SIZE);
          if (status != GRAAL_CUDA_SUCCESS) {
            tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
            return false;
@@ -418,7 +418,7 @@
      case T_DOUBLE:
        {
          double return_val;
-         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_DOUBLE_BYTE_SIZE);
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_DOUBLE_BYTE_SIZE);
          if (status != GRAAL_CUDA_SUCCESS) {
            tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
            return false;
@@ -429,7 +429,7 @@
      case T_LONG:
        {
          long return_val;
-         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_LONG_BYTE_SIZE);
+         status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._dev_return_value, T_LONG_BYTE_SIZE);
          if (status != GRAAL_CUDA_SUCCESS) {
            tty->print_cr("[CUDA] *** Error (%d) Failed to copy value to device argument", status);
            return false;
@@ -443,11 +443,11 @@
        tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type);
   }
 
-  // handle post-invocation object and array arguemtn
-  ptxka.reiterate();
+  // Copy all reference arguments from device to host memory.
+  ptxka.copyRefArgsFromDtoH();
 
   // Free device memory allocated for result
-  status = gpu::Ptx::_cuda_cu_memfree(ptxka._return_value_ptr);
+  status = gpu::Ptx::_cuda_cu_memfree(ptxka._dev_return_value);
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
     return false;
diff -r 0916da3633ac -r f020e149c1b6 src/gpu/ptx/vm/ptxKernelArguments.cpp
--- a/src/gpu/ptx/vm/ptxKernelArguments.cpp	Tue Oct 22 17:03:01 2013 +0200
+++ b/src/gpu/ptx/vm/ptxKernelArguments.cpp	Tue Oct 22 14:35:41 2013 -0400
@@ -32,127 +32,132 @@
 // Get next java argument
 oop PTXKernelArguments::next_arg(BasicType expectedType) {
   assert(_index < _args->length(), "out of bounds");
-
   oop arg = ((objArrayOop) (_args))->obj_at(_index++);
   assert(expectedType == T_OBJECT ||
          java_lang_boxing_object::is_instance(arg, expectedType), "arg type mismatch");
-
   return arg;
 }
 
 void PTXKernelArguments::do_int() {
-    if (is_after_invocation()) {
+  // If the parameter is a return value,
+  if (is_return_type()) {
+    if (is_kernel_arg_setup()) {
+      // Allocate device memory for T_INT return value pointer on device. Size in bytes
+      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_INT_BYTE_SIZE);
+      if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+        _success = false;
         return;
+      }
+      // Push _dev_return_value to _kernelBuffer
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     }
-    // If the parameter is a return value,
-    if (is_return_type()) {
-        // Allocate device memory for T_INT return value pointer on device. Size in bytes
-        int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_INT_BYTE_SIZE);
-        if (status != GRAAL_CUDA_SUCCESS) {
-            tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-            _success = false;
-            return;
-        }
-        // Push _return_value_ptr to _kernelBuffer
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-        _bufferOffset += sizeof(_return_value_ptr);
-    } else {
-        // Get the next java argument and its value which should be a T_INT
-        oop arg = next_arg(T_INT);
-        // Copy the java argument value to kernelArgBuffer
-        jvalue intval;
-        if (java_lang_boxing_object::get_value(arg, &intval) != T_INT) {
-            tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_INT");
-            _success = false;
-            return;
-        }
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i;
-        _bufferOffset += sizeof(intval.i);
+    _bufferOffset += sizeof(_dev_return_value);
+  } else {
+    // Get the next java argument and its value which should be a T_INT
+    oop arg = next_arg(T_INT);
+    // Copy the java argument value to kernelArgBuffer
+    jvalue intval;
+    if (java_lang_boxing_object::get_value(arg, &intval) != T_INT) {
+      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_INT");
+      _success = false;
+      return;
     }
-    return;
+    if (is_kernel_arg_setup()) {
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = intval.i;
+    }
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(intval.i);
+  }
+  return;
 }
 
 void PTXKernelArguments::do_float() {
-    if (is_after_invocation()) {
+  // If the parameter is a return value,
+  if (is_return_type()) {
+    if (is_kernel_arg_setup()) {
+      // Allocate device memory for T_INT return value pointer on device. Size in bytes
+      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_FLOAT_BYTE_SIZE);
+      if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+        _success = false;
         return;
+      }
+      // Push _dev_return_value to _kernelBuffer
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     }
-    // If the parameter is a return value,
-    if (is_return_type()) {
-        // Allocate device memory for T_INT return value pointer on device. Size in bytes
-        int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_FLOAT_BYTE_SIZE);
-        if (status != GRAAL_CUDA_SUCCESS) {
-            tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-            _success = false;
-            return;
-        }
-        // Push _return_value_ptr to _kernelBuffer
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-        _bufferOffset += sizeof(_return_value_ptr);
-    } else {
-        // Get the next java argument and its value which should be a T_INT
-        oop arg = next_arg(T_FLOAT);
-        // Copy the java argument value to kernelArgBuffer
-        jvalue floatval;
-        if (java_lang_boxing_object::get_value(arg, &floatval) != T_FLOAT) {
-            tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_INT");
-            _success = false;
-            return;
-        }
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) floatval.f;
-        _bufferOffset += sizeof(floatval.f);
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(_dev_return_value);
+  } else {
+    // Get the next java argument and its value which should be a T_FLOAT
+    oop arg = next_arg(T_FLOAT);
+    // Copy the java argument value to kernelArgBuffer
+    jvalue floatval;
+    if (java_lang_boxing_object::get_value(arg, &floatval) != T_FLOAT) {
+      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_FLOAT");
+      _success = false;
+      return;
     }
-    return;
+    if (is_kernel_arg_setup()) {
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = floatval.f;
+    }
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(floatval.f);
+  }
+  return;
 }
 
 void PTXKernelArguments::do_double() {
-    if (is_after_invocation()) {
+  // If the parameter is a return value,
+  jvalue doubleval;
+  if (is_return_type()) {
+    if (is_kernel_arg_setup()) {
+      // Allocate device memory for T_INT return value pointer on device. Size in bytes
+      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_DOUBLE_BYTE_SIZE);
+      if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+        _success = false;
         return;
+      }
+      // Push _dev_return_value to _kernelBuffer
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     }
-    // If the parameter is a return value,
-    jvalue doubleval;
-    if (is_return_type()) {
-        // Allocate device memory for T_INT return value pointer on device. Size in bytes
-        int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_DOUBLE_BYTE_SIZE);
-        if (status != GRAAL_CUDA_SUCCESS) {
-            tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-            _success = false;
-            return;
-        }
-        // Push _return_value_ptr to _kernelBuffer
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-        // _bufferOffset += sizeof(_return_value_ptr);
-        _bufferOffset += sizeof(doubleval.d);
-    } else {
-        // Get the next java argument and its value which should be a T_INT
-        oop arg = next_arg(T_FLOAT);
-        // Copy the java argument value to kernelArgBuffer
-        if (java_lang_boxing_object::get_value(arg, &doubleval) != T_DOUBLE) {
-            tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_INT");
-            _success = false;
-            return;
-        }
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = (gpu::Ptx::CUdeviceptr) doubleval.d;
-        _bufferOffset += sizeof(doubleval.d);
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(doubleval.d);
+  } else {
+    // Get the next java argument and its value which should be a T_INT
+    oop arg = next_arg(T_FLOAT);
+    // Copy the java argument value to kernelArgBuffer
+    if (java_lang_boxing_object::get_value(arg, &doubleval) != T_DOUBLE) {
+      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_INT");
+      _success = false;
+      return;
     }
-    return;
+    if (is_kernel_arg_setup()) {
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = doubleval.d;
+    }
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(doubleval.d);
+  }
+  return;
 }
 
 void PTXKernelArguments::do_long() {
-  if (is_after_invocation()) {
-    return;
-  }
   // If the parameter is a return value,
   if (is_return_type()) {
-    // Allocate device memory for T_LONG return value pointer on device. Size in bytes
-    int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_LONG_BYTE_SIZE);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-      _success = false;
-      return;
+    if (is_kernel_arg_setup()) {
+      // Allocate device memory for T_LONG return value pointer on device. Size in bytes
+      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_LONG_BYTE_SIZE);
+      if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+        _success = false;
+        return;
+      }
+      // Push _dev_return_value to _kernelBuffer
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     }
-    // Push _return_value_ptr to _kernelBuffer
-    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-    _bufferOffset += sizeof(_return_value_ptr);
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(_dev_return_value);
   } else {
     // Get the next java argument and its value which should be a T_LONG
     oop arg = next_arg(T_LONG);
@@ -163,119 +168,132 @@
       _success = false;
       return;
     }
-    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j;
+    if (is_kernel_arg_setup()) {
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.j;
+    }
+    // Advance _bufferOffset
     _bufferOffset += sizeof(val.j);
   }
   return;
 }
 
 void PTXKernelArguments::do_byte() {
-    if (is_after_invocation()) {
+  // If the parameter is a return value,
+  if (is_return_type()) {
+    if (is_kernel_arg_setup()) {
+      // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
+      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BYTE_SIZE);
+      if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+        _success = false;
         return;
+      }
+      // Push _dev_return_value to _kernelBuffer
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     }
-    // If the parameter is a return value,
-    if (is_return_type()) {
-        // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
-        int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_BYTE_SIZE);
-        if (status != GRAAL_CUDA_SUCCESS) {
-            tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-            _success = false;
-            return;
-        }
-        // Push _return_value_ptr to _kernelBuffer
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-        _bufferOffset += sizeof(_return_value_ptr);
-    } else {
-        // Get the next java argument and its value which should be a T_BYTE
-        oop arg = next_arg(T_BYTE);
-        // Copy the java argument value to kernelArgBuffer
-        jvalue val;
-        if (java_lang_boxing_object::get_value(arg, &val) != T_BYTE) {
-            tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE");
-            _success = false;
-            return;
-        }
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b;
-        _bufferOffset += sizeof(val.b);
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(_dev_return_value);
+  } else {
+    // Get the next java argument and its value which should be a T_BYTE
+    oop arg = next_arg(T_BYTE);
+    // Copy the java argument value to kernelArgBuffer
+    jvalue val;
+    if (java_lang_boxing_object::get_value(arg, &val) != T_BYTE) {
+      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE");
+      _success = false;
+      return;
     }
-    return;
+    if (is_kernel_arg_setup()) {
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b;
+    }
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(val.b);
+  }
+  return;
 }
 
 void PTXKernelArguments::do_bool() {
-    if (is_after_invocation()) {
+  // If the parameter is a return value,
+  if (is_return_type()) {
+    if (is_kernel_arg_setup()) {
+      // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
+      int status = gpu::Ptx::_cuda_cu_memalloc(&_dev_return_value, T_BOOLEAN_SIZE);
+      if (status != GRAAL_CUDA_SUCCESS) {
+        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+        _success = false;
         return;
+      }
+      // Push _dev_return_value to _kernelBuffer
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _dev_return_value;
     }
-    // If the parameter is a return value,
-    if (is_return_type()) {
-        // Allocate device memory for T_BYTE return value pointer on device. Size in bytes
-        int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_BOOLEAN_SIZE);
-        if (status != GRAAL_CUDA_SUCCESS) {
-            tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-            _success = false;
-            return;
-        }
-        // Push _return_value_ptr to _kernelBuffer
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-        _bufferOffset += sizeof(_return_value_ptr);
-    } else {
-        // Get the next java argument and its value which should be a T_BYTE
-        oop arg = next_arg(T_BYTE);
-        // Copy the java argument value to kernelArgBuffer
-        jvalue val;
-        if (java_lang_boxing_object::get_value(arg, &val) != T_BOOLEAN) {
-            tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE");
-            _success = false;
-            return;
-        }
-        *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z;
-        _bufferOffset += sizeof(val.z);
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(_dev_return_value);
+  } else {
+    // Get the next java argument and its value which should be a T_BYTE
+    oop arg = next_arg(T_BYTE);
+    // Copy the java argument value to kernelArgBuffer
+    jvalue val;
+    if (java_lang_boxing_object::get_value(arg, &val) != T_BOOLEAN) {
+      tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE");
+      _success = false;
+      return;
     }
-    return;
+    if (is_kernel_arg_setup()) {
+      *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.z;
+    }
+    // Advance _bufferOffset
+    _bufferOffset += sizeof(val.z);
+  }
+  return;
 }
 
 void PTXKernelArguments::do_array(int begin, int end) {
-    gpu::Ptx::CUdeviceptr _array_ptr;
-    int status;
-
-    // Get the next java argument and its value which should be a T_ARRAY
-    oop arg = next_arg(T_OBJECT);
-    int array_size = arg->size() * HeapWordSize;
+  // Get the next java argument and its value which should be a T_ARRAY
+  oop arg = next_arg(T_OBJECT);
+  assert(arg->is_array(), "argument value not an array");
+  // Size of array argument
+  int argSize = arg->size() * HeapWordSize;
+  // Device pointer to array argument.
+  gpu::Ptx::CUdeviceptr arrayArgOnDev;
+  int status;
 
-    if (is_after_invocation()) {
-        _array_ptr = *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]);
-        status = gpu::Ptx::_cuda_cu_memcpy_dtoh(arg, _array_ptr, array_size);
-        if (status != GRAAL_CUDA_SUCCESS) {
-            tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument to host", status);
-            _success = false;
-            return;
-        } else {
-            // tty->print_cr("device: %x host: %x size: %d", _array_ptr, arg, array_size);
-        }
-        return;
+  if (is_kernel_arg_setup()) {
+    // Allocate device memory for array argument on device. Size in bytes
+    status = gpu::Ptx::_cuda_cu_memalloc(&arrayArgOnDev, argSize);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for array argument on device",
+                    status);
+      _success = false;
+      return;
     }
-    // Allocate device memory for T_ARRAY return value pointer on device. Size in bytes
-    status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, array_size);
-    if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-        _success = false;
-        return;
-    }
-    status = gpu::Ptx::_cuda_cu_memcpy_htod(_return_value_ptr, arg, array_size);
+    // Copy array argument to device
+    status = gpu::Ptx::_cuda_cu_memcpy_htod(arrayArgOnDev, arg, argSize);
     if (status != GRAAL_CUDA_SUCCESS) {
-        tty->print_cr("[CUDA] *** Error (%d) Failed to copy array to device argument", status);
-        _success = false;
-        return;
-    } else {
-        // tty->print_cr("host: %x device: %x size: %d", arg, _return_value_ptr, array_size);
+      tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument content to device memory",
+                    status);
+      _success = false;
+      return;
     }
-    // Push _return_value_ptr to _kernelBuffer
-    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr;
-    _bufferOffset += sizeof(_return_value_ptr);
-    return;
+
+    // Push device array argument to _kernelBuffer
+    *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = arrayArgOnDev;
+  } else {
+    arrayArgOnDev = *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]);
+    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(arg, arrayArgOnDev, argSize);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument to host", status);
+      _success = false;
+      return;
+    }
+  }
+
+  // Advance _bufferOffset
+  _bufferOffset += sizeof(arrayArgOnDev);
+  return;
 }
 
 void PTXKernelArguments::do_void() {
-    return;
+  return;
 }
 
 // TODO implement other do_*
diff -r 0916da3633ac -r f020e149c1b6 src/gpu/ptx/vm/ptxKernelArguments.hpp
--- a/src/gpu/ptx/vm/ptxKernelArguments.hpp	Tue Oct 22 17:03:01 2013 +0200
+++ b/src/gpu/ptx/vm/ptxKernelArguments.hpp	Tue Oct 22 14:35:41 2013 -0400
@@ -42,7 +42,13 @@
   char _kernelArgBuffer[1024];
   // Current offset into _kernelArgBuffer
   size_t _bufferOffset;
-  gpu::Ptx::CUdeviceptr _return_value_ptr;
+  // Device pointer holding return value
+  gpu::Ptx::CUdeviceptr _dev_return_value;
+
+  // Indicates if signature iteration is being done during kernel
+  // setup i.e., java arguments are being copied to device pointers.
+  bool _kernelArgSetup;
+
 private:
   // Array of java argument oops
   arrayOop _args;
@@ -51,7 +57,6 @@
   // Flag to indicate successful creation of kernel argument buffer
   bool _success;
 
-    bool _afterInvoocation;
   // Get next java argument
   oop next_arg(BasicType expectedType);
 
@@ -62,7 +67,9 @@
     _args = args;
     _success = true;
     _bufferOffset = 0;
-    _return_value_ptr = 0;
+    _dev_return_value = 0;
+    _kernelArgSetup = true;
+    //_dev_call_by_reference_args_index = 0;
     if (!is_static) {
       // TODO : Create a device argument for receiver object and add it to _kernelBuffer
       tty->print_cr("{CUDA] ****** TODO: Support for execution of non-static java methods not implemented yet.");
@@ -80,23 +87,23 @@
     return _bufferOffset;
   }
 
-    void reiterate() {
-        _afterInvoocation = true;
-        _bufferOffset = 0;
-        _index = 0;
-        iterate();
-    }
+  void copyRefArgsFromDtoH() {
+    _kernelArgSetup = false;
+    _bufferOffset = 0;
+    _index = 0;
+    iterate();
+  }
 
-    inline bool is_after_invocation() {
-        return _afterInvoocation;
-    }
+  inline bool is_kernel_arg_setup() {
+    return _kernelArgSetup;
+  }
 
   // Get the return oop value
   oop get_return_oop();
 
   // get device return value ptr
-  gpu::Ptx::CUdeviceptr get_return_value_ptr() {
-      return _return_value_ptr;
+  gpu::Ptx::CUdeviceptr get_dev_return_value() {
+      return _dev_return_value;
   }