changeset 13759:eed1aafead0d

Merge.
author Doug Simon <doug.simon@oracle.com>
date Fri, 24 Jan 2014 18:34:18 +0100
parents 57d1746e3b3d (diff) 9de3efd2ea8f (current diff)
children a12017c18d5d
diffstat 12 files changed, 526 insertions(+), 196 deletions(-)
--- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ControlPTXTest.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ControlPTXTest.java	Fri Jan 24 18:34:18 2014 +0100
@@ -26,6 +26,7 @@
 
 public class ControlPTXTest extends PTXTest {
 
+    @Ignore("[CUDA] *** Error (status=702): Synchronize kernel")
     @Test
     public void testControl() {
         test("testLoop", 42);
--- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/FloatPTXTest.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/FloatPTXTest.java	Fri Jan 24 18:34:18 2014 +0100
@@ -63,7 +63,6 @@
         return 32.0 + a;
     }
 
-    @Ignore
     @Test
     public void testSub() {
         compileKernel("testSub2F");
@@ -98,7 +97,7 @@
         return 32.0 - a;
     }
 
-    @Ignore
+    @Ignore("[CUDA] *** Error (209) Failed to load module data with online compiler options for method testMul2F")
     @Test
     public void testMul() {
         compileKernel("testMul2F");
@@ -133,7 +132,7 @@
         return 32.0 * a;
     }
 
-    @Ignore
+    @Ignore("[CUDA] *** Error (209) Failed to load module data with online compiler options for method testDiv2F")
     @Test
     public void testDiv() {
         compileKernel("testDiv2F");
@@ -168,7 +167,6 @@
         return 32.0 / a;
     }
 
-    @Ignore
     @Test
     public void testNeg() {
         compileKernel("testNeg2F");
@@ -183,12 +181,11 @@
         return -a;
     }
 
-    @Ignore
+    @Ignore("need linkage to PTX remainder")
     @Test
     public void testRem() {
-        // need linkage to PTX remainder()
-        // compileKernel("testRem2F");
-        // compileKernel("testRem2D");
+        compileKernel("testRem2F");
+        compileKernel("testRem2D");
     }
 
     public static float testRem2F(float a, float b) {
@@ -199,7 +196,7 @@
         return a % b;
     }
 
-    @Ignore
+    @Ignore("[CUDA] *** Error (209) Failed to load module data with online compiler options for method testF2I")
     @Test
     public void testFloatConversion() {
         compileKernel("testF2I");
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ObjectPTXTest.java	Fri Jan 24 18:34:18 2014 +0100
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.ptx.test;
+
+import java.util.*;
+
+import org.junit.*;
+
+public class ObjectPTXTest extends PTXTest {
+
+    static class A {
+        boolean z = true;
+        byte b = 17;
+        char c = 'D';
+        short s = 12345;
+        int i = 0x1234565;
+        long l;
+        Object o;
+        float f;
+        double d;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test0() {
+        for (long l : new long[]{Long.MIN_VALUE, -10, 0, 1, 2, 10, Long.MAX_VALUE}) {
+            A a = new A();
+            a.l = l;
+            test("testLong", l * 2, a);
+        }
+    }
+
+    public static long testLong(long l, A a) {
+        return a.l + l;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test1() {
+        for (int i : new int[]{Integer.MIN_VALUE, -10, 0, 1, 2, 10, Integer.MAX_VALUE}) {
+            A a = new A();
+            a.i = i;
+            test("testInt", i * 2, a);
+        }
+    }
+
+    public static int testInt(int i, A a) {
+        return a.i + i;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test2() {
+        A a = new A();
+        a.z = true;
+        test("testBoolean", a);
+        a.z = false;
+        test("testBoolean", a);
+    }
+
+    public static boolean testBoolean(A a) {
+        return a.z;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test3() {
+        for (byte b : new byte[]{Byte.MIN_VALUE, -10, 0, 1, 2, 10, Byte.MAX_VALUE}) {
+            A a = new A();
+            a.b = b;
+            test("testByte", b, a);
+        }
+    }
+
+    public static int testByte(byte b, A a) {
+        return a.b + b;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test4() {
+        for (short s : new short[]{Short.MIN_VALUE, -10, 0, 1, 2, 10, Short.MAX_VALUE}) {
+            A a = new A();
+            a.s = s;
+            test("testShort", s, a);
+        }
+    }
+
+    public static int testShort(short s, A a) {
+        return a.s + s;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test5() {
+        for (char c : new char[]{Character.MIN_VALUE, 1, 2, 10, Character.MAX_VALUE}) {
+            A a = new A();
+            a.c = c;
+            test("testChar", (char) (c - 5), a);
+        }
+    }
+
+    public static int testChar(char c, A a) {
+        return a.c + c;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test6() {
+        for (float f : new float[]{Float.MIN_VALUE, Float.MIN_NORMAL, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.NaN, -11.45F, -0.0F, 0.0F, 2, 10, Float.MAX_VALUE}) {
+            A a = new A();
+            a.f = f;
+            test("testFloat", f * 2, a);
+        }
+    }
+
+    public static float testFloat(float f, A a) {
+        return a.f + f;
+    }
+
+    @Ignore("Object parameters not yet GC safe")
+    @Test
+    public void test7() {
+        for (double d : new double[]{Double.MIN_VALUE, Double.MIN_NORMAL, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, -11.45D, -0.0D, 0.0D, 2, 10, Double.MAX_VALUE}) {
+            A a = new A();
+            a.d = d;
+            test("testDouble", d * 2, a);
+        }
+    }
+
+    public static double testDouble(double d, A a) {
+        return a.d + d;
+    }
+
+    @Ignore("Object return values not yet supported")
+    @Test
+    public void test9() {
+        for (Object o : new Object[]{null, "object", new Object(), new HashMap()}) {
+            A a = new A();
+            a.o = o;
+            test("testObject", a);
+        }
+    }
+
+    public static Object testObject(A a) {
+        return a.o;
+    }
+}
--- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXPhase.java	Fri Jan 24 17:43:14 2014 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-package com.oracle.graal.compiler.ptx.test;
-
-import com.oracle.graal.nodes.*;
-import com.oracle.graal.nodes.type.*;
-import com.oracle.graal.phases.*;
-
-public class PTXPhase extends Phase {
-
-    @Override
-    protected void run(StructuredGraph graph) {
-        /*
-         * Assume that null checks would be done on the CPU caller side prior to copying data onto
-         * the GPU.
-         */
-        for (ParameterNode param : graph.getNodes(ParameterNode.class)) {
-            if (param.stamp() instanceof ObjectStamp) {
-                param.setStamp(StampFactory.declaredNonNull(((ObjectStamp) param.stamp()).type()));
-            }
-        }
-    }
-}
--- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTest.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTest.java	Fri Jan 24 18:34:18 2014 +0100
@@ -69,7 +69,15 @@
         Assume.assumeTrue(ptxBackend.isDeviceInitialized());
         HotSpotNmethod installedPTXCode = installKernel(method, ptxCode);
         StructuredGraph wrapper = new PTXWrapperBuilder(method, installedPTXCode, (HotSpotProviders) getProviders()).getGraph();
-        return super.getCode(method, wrapper);
+
+        // The PTX C++ layer expects a 1:1 relationship between kernel compilation
+        // and kernel execution as it creates a cuContext in the former and
+        // destroys it in the latter. So, each kernel installed requires a unique
+        // wrapper.
+        // TODO: do cuContext management properly
+        boolean forceCompile = true;
+
+        return getCode(method, wrapper, forceCompile);
     }
 
     protected static void compileAndPrintCode(PTXTest test) {
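The comment added above refers to the CUDA driver-API context lifecycle: the C++ layer currently creates a cuContext when a kernel is generated and destroys it after the kernel runs, hence the forceCompile workaround. The TODO points toward reusing one context instead. A rough sketch of that pattern in plain driver-API terms (standard cuda.h calls only, illustrative error handling, not the Graal/HotSpot wrapper code):

// Sketch: create a CUDA context once and reuse it across launches, destroying it
// only at teardown. Assumes the standard CUDA driver API (cuda.h), not the
// gpu::Ptx function-pointer wrappers used elsewhere in this changeset.
#include <cuda.h>
#include <cstdio>

static CUcontext ensure_context(CUdevice device) {
  CUcontext ctx = NULL;
  if (cuCtxGetCurrent(&ctx) == CUDA_SUCCESS && ctx != NULL) {
    return ctx;  // a context is already current on this thread; reuse it
  }
  if (cuCtxCreate(&ctx, CU_CTX_MAP_HOST | CU_CTX_SCHED_BLOCKING_SYNC, device) != CUDA_SUCCESS) {
    fprintf(stderr, "failed to create CUDA context\n");
    return NULL;
  }
  return ctx;  // destroyed with cuCtxDestroy(ctx) at teardown, not per kernel
}

With a long-lived context along these lines, a compiled wrapper could presumably be executed repeatedly without forcing recompilation.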
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILCompilationResult.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILCompilationResult.java	Fri Jan 24 18:34:18 2014 +0100
@@ -43,8 +43,8 @@
 import com.oracle.graal.lir.asm.*;
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.java.*;
-import com.oracle.graal.nodes.type.*;
 import com.oracle.graal.phases.*;
+import com.oracle.graal.phases.common.*;
 import com.oracle.graal.phases.tiers.*;
 import com.oracle.graal.phases.util.*;
 import com.oracle.graal.runtime.*;
@@ -139,8 +139,7 @@
         Providers providers = backend.getProviders();
         TargetDescription target = providers.getCodeCache().getTarget();
         PhaseSuite<HighTierContext> graphBuilderSuite = backend.getSuites().getDefaultGraphBuilderSuite().copy();
-        graphBuilderSuite.appendPhase(new HSAILPhase());
-        new HSAILPhase().apply(graph);
+        graphBuilderSuite.appendPhase(new NonNullParametersPhase());
         CallingConvention cc = CodeUtil.getCallingConvention(providers.getCodeCache(), Type.JavaCallee, graph.method(), false);
         SuitesProvider suitesProvider = backend.getSuites();
         try {
@@ -173,18 +172,6 @@
         }
     }
 
-    private static class HSAILPhase extends Phase {
-
-        @Override
-        protected void run(StructuredGraph graph) {
-            for (ParameterNode param : graph.getNodes(ParameterNode.class)) {
-                if (param.stamp() instanceof ObjectStamp) {
-                    param.setStamp(StampFactory.declaredNonNull(((ObjectStamp) param.stamp()).type()));
-                }
-            }
-        }
-    }
-
     protected HSAILCompilationResult() {
     }
 
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Fri Jan 24 18:34:18 2014 +0100
@@ -58,6 +58,7 @@
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.cfg.*;
 import com.oracle.graal.phases.*;
+import com.oracle.graal.phases.common.*;
 import com.oracle.graal.phases.tiers.*;
 import com.oracle.graal.word.*;
 
@@ -67,7 +68,7 @@
 public class PTXHotSpotBackend extends HotSpotBackend {
 
     /**
-     * Descriptor for the PTX runtime method for launching a kernel. The C++ signature is:
+     * Descriptor for the PTX runtime method for calling a kernel. The C++ signature is:
      * 
      * <pre>
      *     jlong (JavaThread* thread,
@@ -77,11 +78,14 @@
      *            jint dimZ,
      *            jlong parametersAndReturnValueBuffer,
      *            jint parametersAndReturnValueBufferSize,
+     *            jint objectParametersCount,
+     *            jlong objectParametersOffsets,
+     *            jlong pinnedObjects,
      *            jint encodedReturnTypeSize)
      * </pre>
      */
     // @formatter:off
-    public static final ForeignCallDescriptor LAUNCH_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class,
+    public static final ForeignCallDescriptor CALL_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class,
                     Word.class, // thread
                     long.class, // kernel
                     int.class,  // dimX
@@ -89,6 +93,9 @@
                     int.class,  // dimZ
                     long.class, // parametersAndReturnValueBuffer
                     int.class,  // parametersAndReturnValueBufferSize
+                    int.class,  // objectParameterCount
+                    long.class, // objectParameterOffsets
+                    long.class, // pinnedObjects
                     int.class); // encodedReturnTypeSize
     // @formatter:on
 
@@ -114,7 +121,7 @@
         CompilerToGPU compilerToGPU = getRuntime().getCompilerToGPU();
         if (deviceInitialized) {
             long launchKernel = compilerToGPU.getLaunchKernelAddress();
-            hostForeignCalls.registerForeignCall(LAUNCH_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION);
+            hostForeignCalls.registerForeignCall(CALL_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION);
         }
         super.completeInitialization();
     }
@@ -166,11 +173,13 @@
         HotSpotProviders providers = getProviders();
         CallingConvention cc = getCallingConvention(providers.getCodeCache(), Type.JavaCallee, method, false);
         PhaseSuite<HighTierContext> graphBuilderSuite = providers.getSuites().getDefaultGraphBuilderSuite();
+        graphBuilderSuite.appendPhase(new NonNullParametersPhase());
         Suites suites = providers.getSuites().getDefaultSuites();
         ExternalCompilationResult ptxCode = compileGraph(graph, cc, method, providers, this, this.getTarget(), null, graphBuilderSuite, OptimisticOptimizations.NONE, getProfilingInfo(graph),
                         new SpeculationLog(), suites, true, new ExternalCompilationResult(), CompilationResultBuilderFactory.Default);
         if (makeBinary) {
             try (Scope ds = Debug.scope("GeneratingKernelBinary")) {
+                assert ptxCode.getTargetCode() != null;
                 long kernel = getRuntime().getCompilerToGPU().generateKernel(ptxCode.getTargetCode(), method.getName());
                 ptxCode.setEntryPoint(kernel);
             } catch (Throwable e) {
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java	Fri Jan 24 18:34:18 2014 +0100
@@ -55,21 +55,62 @@
 
 /**
  * Utility for building a graph that "wraps" a compiled PTX kernel. Such a wrapper handles the
- * transition from the host CPU to the GPU and back. The graph created is something like the
- * following pseudo code with UPPER CASE denoting compile-time constants:
+ * transition from the host CPU to the GPU and back. The wrapper allocates 3 on-stack buffers:
+ * <ul>
+ * <li>PARAMS: a buffer for the kernel parameters and one word for the on-device address of the
+ * return value (if any).</li>
+ * <li>PINNED: a buffer into which the addresses of pinned objects are saved.</li>
+ * <li>OBJECT_OFFSETS: the offsets of the object values in PARAMS.</li>
+ * </ul>
+ * 
+ * 
+ * The PARAMS buffer is the {@code CU_LAUNCH_PARAM_BUFFER_POINTER} buffer passed in the
+ * {@code extra} argument to the {@code cuLaunchKernel} function. This buffer contains the
+ * parameters to the call. The buffer is word aligned and each parameter is aligned in the buffer
+ * according to its data size. The wrapper copies the incoming arguments into the buffer as is. The
+ * native {@link PTXHotSpotBackend#CALL_KERNEL callKernel} function will pin the memory for each
+ * object parameter (using {@code cuMemHostRegister}) and then replace the object pointer in PARAMS
+ * with an on-device pointer to the object's memory (see {@code cuMemHostGetDevicePointer}). The
+ * function saves each pinned object pointer into PINNED so that it can be unpinned once the kernel returns.
+ * The object pointers in PARAMS are specified by OBJECT_OFFSETS.
+ * <p>
+ * As a concrete example, for a kernel whose Java method signature is:
  * 
  * <pre>
- *     T kernel(p0, p1, ..., pN) {
- *         jint bufSize = SIZE_OF_ALIGNED_PARAMS_AND_RETURN_VALUE_WITH_PADDING(p0, p1, ..., pN);
- *         jbyte buf[bufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
- *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, dimX, dimY, dimZ, buf, bufSize, encodedReturnTypeSize);
- *         return convert(result);
+ *     static int kernel(int p1, short p2, Object p3, long p4)
+ * </pre>
+ * 
+ * the graph created is shown below as pseudo-code:
+ * 
+ * <pre>
+ *     int kernel_wrapper(int p1, short p2, oop p3, long p4) {
+ *         address kernelAddr = kernel.start;
+ *         if (kernelAddr == 0) {
+ *             deopt(InvalidateRecompile, RuntimeConstraint);
+ *         }
+ *         byte PARAMS[32];
+ *         word PINNED[1]; // note: no refmap
+ *         int OBJECT_OFFSETS[1] = {8};
+ *         ((int*) PARAMS)[0] = p1;
+ *         ((short*) PARAMS)[2] = p2;
+ *         ((word*) PARAMS)[1] = p3;
+ *         ((long*) PARAMS)[2] = p4;
+ *         int result = CALL_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, 1, 1, 1, PARAMS, 32, 1, OBJECT_OFFSETS, PINNED, 4);
+ *         if (clearPendingException(thread)) {
+ *             deopt(None, RuntimeConstraint);
+ *         }
+ *         return result;
  *     }
  * </pre>
  * <p>
  * The generated graph includes a reference to the {@link HotSpotNmethod} for the kernel. There must
  * be another reference to the same {@link HotSpotNmethod} object to ensure that the nmethod is not
  * unloaded by the next full GC.
+ * <p>
+ * TODO: Only the memory for objects passed as parameters is pinned. Surely the memory for other
+ * objects reachable from the parameter objects and accessed by the kernel needs to be pinned as well?
+ * <p>
+ * TODO: Object references within kernels are currently completely hidden from the GC.
  */
 public class PTXWrapperBuilder extends GraphKit {
 
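The PARAMS layout in the pseudo-code above follows the driver-API convention of passing one packed argument blob through the extra argument of cuLaunchKernel. As a stand-alone illustration of that convention only (plain cuda.h, a hypothetical launch_packed helper, 64-bit host assumed; this is not the graph that PTXWrapperBuilder emits):

// Sketch: pack (int, short, device pointer, long) plus a return-value slot into a
// 32-byte buffer and launch it via CU_LAUNCH_PARAM_BUFFER_POINTER, mirroring the
// pseudo-code layout above. Hypothetical helper; standard CUDA driver API assumed.
#include <cuda.h>
#include <cstring>

CUresult launch_packed(CUfunction f, int p1, short p2, CUdeviceptr p3, long long p4) {
  unsigned char params[32];
  std::memset(params, 0, sizeof params);
  std::memcpy(params +  0, &p1, sizeof p1);   // int at offset 0
  std::memcpy(params +  4, &p2, sizeof p2);   // short at offset 4 (2 bytes of padding follow)
  std::memcpy(params +  8, &p3, sizeof p3);   // object, already mapped to a device pointer
  std::memcpy(params + 16, &p4, sizeof p4);   // long at offset 16
                                              // offsets 24..31: return-value pointer slot
  size_t size = sizeof params;
  void* config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, params,   // the whole buffer is passed as one blob
    CU_LAUNCH_PARAM_BUFFER_SIZE,    &size,
    CU_LAUNCH_PARAM_END
  };
  return cuLaunchKernel(f, 1, 1, 1,           // grid dimensions
                        1, 1, 1,              // block dimensions (dimX, dimY, dimZ)
                        0, NULL,              // shared memory, stream
                        NULL, config);        // kernelParams unused; extra carries the blob
}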
@@ -92,11 +133,23 @@
     int[] javaParameterOffsetsInKernelParametersBuffer;
 
     /**
-     * Constants denoting the arguments to {@link PTXHotSpotBackend#LAUNCH_KERNEL}.
+     * Constants denoting the arguments to {@link PTXHotSpotBackend#CALL_KERNEL}.
      */
+    // @formatter:off
     enum LaunchArg {
-        Thread, Kernel, DimX, DimY, DimZ, ParametersAndReturnValueBuffer, ParametersAndReturnValueBufferSize, EncodedReturnTypeSize
+        Thread,
+        Kernel,
+        DimX,
+        DimY,
+        DimZ,
+        ParametersAndReturnValueBuffer,
+        ParametersAndReturnValueBufferSize,
+        ObjectParametersCount,
+        ObjectParametersOffsets,
+        PinnedObjects,
+        EncodedReturnTypeSize
     }
+    // @formatter:on
 
     /**
      * Creates the graph implementing the CPU to GPU transition.
@@ -108,6 +161,7 @@
     public PTXWrapperBuilder(ResolvedJavaMethod method, HotSpotNmethod kernel, HotSpotProviders providers) {
         super(new StructuredGraph(method), providers);
         int wordSize = providers.getCodeCache().getTarget().wordSize;
+        int intSize = Integer.SIZE / Byte.SIZE;
         Kind wordKind = providers.getCodeCache().getTarget().wordKind;
         Signature sig = method.getSignature();
         boolean isStatic = isStatic(method.getModifiers());
@@ -117,17 +171,18 @@
         int javaParametersIndex = 0;
         Kind returnKind = sig.getReturnKind();
 
-        BitSet objects = new BitSet();
+        List<Integer> objectSlots = new ArrayList<>(javaParameters.length);
         if (!isStatic) {
-            allocateParameter(Kind.Object, javaParametersIndex++, objects, wordSize);
+            allocateParameter(Kind.Object, javaParametersIndex++, objectSlots, wordSize);
         }
         for (int sigIndex = 0; sigIndex < sigCount; sigIndex++) {
             Kind kind = sig.getParameterKind(sigIndex);
-            allocateParameter(kind, javaParametersIndex++, objects, wordSize);
+            allocateParameter(kind, javaParametersIndex++, objectSlots, wordSize);
         }
         bufSize = roundUp(bufSize, wordSize);
 
-        // Add slot for holding pointer to device memory storing return value
+        // Add a slot for a pointer to device memory. The kernel stores its
+        // return value through this pointer.
         int encodedReturnTypeSize = 0;
         if (returnKind != Kind.Void) {
             bufSize += wordSize;
@@ -140,7 +195,29 @@
 
         InvokeNode kernelStart = createInvoke(getClass(), "getKernelStart", ConstantNode.forObject(kernel, providers.getMetaAccess(), getGraph()));
 
-        AllocaNode buf = append(new AllocaNode(bufSize / wordSize, objects));
+        AllocaNode buf = append(new AllocaNode(bufSize / wordSize, new BitSet()));
+        ValueNode objectParametersOffsets;
+        ValueNode pinnedObjects;
+        ConstantNode nullWord = ConstantNode.forIntegerKind(wordKind, 0L, getGraph());
+        if (objectSlots.isEmpty()) {
+            objectParametersOffsets = ConstantNode.forLong(0, getGraph());
+            pinnedObjects = ConstantNode.forLong(0, getGraph());
+        } else {
+            int intsPerWord = wordSize / intSize;
+            int slots = roundUp(objectSlots.size(), intsPerWord);
+            objectParametersOffsets = append(new AllocaNode(slots, new BitSet()));
+            // No refmap for pinned objects list since kernel execution is (currently) GC unsafe
+            pinnedObjects = append(new AllocaNode(objectSlots.size(), new BitSet()));
+
+            // Initialize the object parameter offsets array
+            int index = 0;
+            for (int slot : objectSlots) {
+                int offset = slot * wordSize;
+                LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, Kind.Int, index * intSize, getGraph());
+                append(new WriteNode(objectParametersOffsets, ConstantNode.forInt(offset, getGraph()), location, BarrierType.NONE, false, false));
+                index++;
+            }
+        }
 
         Map<LaunchArg, ValueNode> args = new EnumMap<>(LaunchArg.class);
         args.put(Thread, append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false)));
@@ -150,6 +227,9 @@
         args.put(DimZ, forInt(1, getGraph()));
         args.put(ParametersAndReturnValueBuffer, buf);
         args.put(ParametersAndReturnValueBufferSize, forInt(bufSize, getGraph()));
+        args.put(ObjectParametersCount, forInt(objectSlots.size(), getGraph()));
+        args.put(ObjectParametersOffsets, objectParametersOffsets);
+        args.put(PinnedObjects, pinnedObjects);
         args.put(EncodedReturnTypeSize, forInt(encodedReturnTypeSize, getGraph()));
 
         int sigIndex = isStatic ? 0 : -1;
@@ -162,7 +242,7 @@
         }
         if (returnKind != Kind.Void) {
             LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, bufSize - wordSize, getGraph());
-            append(new WriteNode(buf, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false));
+            append(new WriteNode(buf, nullWord, location, BarrierType.NONE, false, false));
         }
 
         FrameStateBuilder fsb = new FrameStateBuilder(method, getGraph(), true);
@@ -170,7 +250,7 @@
         getGraph().start().setStateAfter(fs);
 
         ValueNode[] launchArgsArray = args.values().toArray(new ValueNode[args.size()]);
-        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, launchArgsArray));
+        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), CALL_KERNEL, launchArgsArray));
         result.setDeoptimizationState(fs);
 
         ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph());
@@ -193,7 +273,11 @@
             case Long:
                 returnValue = result;
                 break;
-            case Float:
+            case Float: {
+                ValueNode asInt = unique(new ConvertNode(Kind.Long, Kind.Int, result));
+                returnValue = unique(new ReinterpretNode(Kind.Float, asInt));
+                break;
+            }
             case Double:
                 returnValue = unique(new ReinterpretNode(returnKind, result));
                 break;
@@ -220,12 +304,12 @@
     }
 
     /**
-     * Allocates a slot in the kernel parameters' buffer for a Java parameter.
+     * Computes the offset and size of the space in PARAMS for a Java parameter.
      * 
      * @param kind the kind of the parameter
      * @param javaParametersIndex the index of the Java parameter
      */
-    private void allocateParameter(Kind kind, int javaParametersIndex, BitSet objects, int wordSize) {
+    private void allocateParameter(Kind kind, int javaParametersIndex, List<Integer> objectSlots, int wordSize) {
         int kindByteSize = kind == Kind.Object ? wordSize : kind.getBitCount() / Byte.SIZE;
         bufSize = roundUp(bufSize, kindByteSize);
         javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = bufSize;
@@ -233,7 +317,7 @@
         if (kind == Kind.Object) {
             stamp = StampFactory.object();
             int slot = bufSize / wordSize;
-            objects.set(slot);
+            objectSlots.add(slot);
         } else {
             stamp = StampFactory.forKind(kind);
         }
--- a/graal/com.oracle.graal.lir/src/com/oracle/graal/lir/FrameMap.java	Fri Jan 24 17:43:14 2014 +0100
+++ b/graal/com.oracle.graal.lir/src/com/oracle/graal/lir/FrameMap.java	Fri Jan 24 18:34:18 2014 +0100
@@ -316,7 +316,7 @@
         spillSize += (slots * target.wordSize);
 
         if (!objects.isEmpty()) {
-            assert objects.length() < slots;
+            assert objects.length() <= slots;
             StackSlot result = null;
             for (int slotIndex = 0; slotIndex < slots; slotIndex++) {
                 StackSlot objectSlot = null;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.phases.common/src/com/oracle/graal/phases/common/NonNullParametersPhase.java	Fri Jan 24 18:34:18 2014 +0100
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.phases.common;
+
+import com.oracle.graal.nodes.*;
+import com.oracle.graal.nodes.type.*;
+import com.oracle.graal.phases.*;
+
+/**
+ * Modifies the stamp of all object {@linkplain ParameterNode parameters} in a graph to denote they
+ * are non-null. This can be used for graphs where the caller null checks all arguments.
+ */
+public class NonNullParametersPhase extends Phase {
+
+    @Override
+    protected void run(StructuredGraph graph) {
+        for (ParameterNode param : graph.getNodes(ParameterNode.class)) {
+            if (param.stamp() instanceof ObjectStamp) {
+                param.setStamp(StampFactory.declaredNonNull(((ObjectStamp) param.stamp()).type()));
+            }
+        }
+    }
+}
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Jan 24 17:43:14 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Fri Jan 24 18:34:18 2014 +0100
@@ -39,6 +39,7 @@
 gpu::Ptx::cuda_cu_ctx_create_func_t gpu::Ptx::_cuda_cu_ctx_create;
 gpu::Ptx::cuda_cu_ctx_destroy_func_t gpu::Ptx::_cuda_cu_ctx_destroy;
 gpu::Ptx::cuda_cu_ctx_synchronize_func_t gpu::Ptx::_cuda_cu_ctx_synchronize;
+gpu::Ptx::cuda_cu_ctx_get_current_func_t gpu::Ptx::_cuda_cu_ctx_get_current;
 gpu::Ptx::cuda_cu_ctx_set_current_func_t gpu::Ptx::_cuda_cu_ctx_set_current;
 gpu::Ptx::cuda_cu_device_get_count_func_t gpu::Ptx::_cuda_cu_device_get_count;
 gpu::Ptx::cuda_cu_device_get_name_func_t gpu::Ptx::_cuda_cu_device_get_name;
@@ -337,123 +338,189 @@
   return cu_function;
 }
 
+// A PtxCall manages the execution of a GPU kernel. In addition to launching
+// the kernel, it releases the resources allocated for the execution.
+class PtxCall: StackObj {
+ private:
+  JavaThread*  _thread;        // the thread on which this call is made
+  address      _buffer;        // buffer containing parameters and the _ret_value slot
+  int          _buffer_size;   // size (in bytes) of _buffer
+  oop*         _pinned;        // objects that have been pinned with cuMemHostRegister
+  int          _pinned_length; // length of _pinned
+  gpu::Ptx::CUdeviceptr  _ret_value;     // pointer to slot in GPU memory holding the return value
+  int          _ret_type_size; // size of the return type value
+  bool         _ret_is_object; // specifies if the return type is Object
+
+  bool check(int status, const char *action) {
+    if (status != GRAAL_CUDA_SUCCESS) {
+      Thread* THREAD = _thread;
+      char* message = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, O_BUFLEN + 1);
+      jio_snprintf(message, O_BUFLEN, "[CUDA] *** Error (status=%d): %s", status, action);
+      if (TraceGPUInteraction || HAS_PENDING_EXCEPTION) {
+        tty->print_cr(message);
+      }
+      if (!HAS_PENDING_EXCEPTION) {
+        SharedRuntime::throw_and_post_jvmti_exception(_thread, vmSymbols::java_lang_RuntimeException(), message);
+      }
+      return false;
+    }
+    if (TraceGPUInteraction) {
+      tty->print_cr("[CUDA] Success: %s", action);
+    }
+    return true;
+  }
+
+ public:
+  PtxCall(JavaThread* thread, address buffer, int buffer_size, oop* pinned, int encodedReturnTypeSize) : _thread(thread),
+      _buffer(buffer), _buffer_size(buffer_size), _pinned(pinned), _pinned_length(0), _ret_value(0), _ret_is_object(encodedReturnTypeSize < 0) {
+    _ret_type_size = _ret_is_object ? -encodedReturnTypeSize : encodedReturnTypeSize;
+  }
+
+  bool is_object_return() { return _ret_is_object; }
+
+  void alloc_return_value() {
+    if (_ret_type_size != 0) {
+      if (check(gpu::Ptx::_cuda_cu_memalloc(&_ret_value, _ret_type_size), "Allocate device memory for return value")) {
+        gpu::Ptx::CUdeviceptr* retValuePtr = (gpu::Ptx::CUdeviceptr*) ((_buffer + _buffer_size) - sizeof(_ret_value));
+        *retValuePtr = _ret_value;
+      }
+    }
+  }
+
+  void pin_objects(int count, int* objectOffsets) {
+    if (count == 0) {
+      return;
+    }
+    for (int i = 0; i < count; i++) {
+      int offset = objectOffsets[i];
+      oop* argPtr = (oop*) (_buffer + offset);
+      oop obj = *argPtr;
+      if (obj != NULL) {
+        // Size (in bytes) of object
+        int objSize = obj->size() * HeapWordSize;
+        //tty->print_cr("Pinning object %d at offset %d: %p", i, offset, obj);
+        if (!check(gpu::Ptx::_cuda_cu_mem_host_register(obj, objSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP), "Pin object")) {
+          return;
+        }
+
+        // Record original oop so that its memory can be unpinned
+        _pinned[_pinned_length++] = obj;
+
+        // Replace host pointer to object with device pointer
+        // to object in kernel parameters buffer
+        if (!check(gpu::Ptx::_cuda_cu_mem_host_get_device_pointer((gpu::Ptx::CUdeviceptr*) argPtr, obj, 0), "Get device pointer for pinned object")) {
+          return;
+        }
+      }
+    }
+  }
+
+  void launch(address kernel, jint dimX, jint dimY, jint dimZ) {
+    // grid dimensionality
+    unsigned int gridX = 1;
+    unsigned int gridY = 1;
+    unsigned int gridZ = 1;
+    void * config[] = {
+      GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) _buffer,
+      GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &_buffer_size,
+      GRAAL_CU_LAUNCH_PARAM_END
+    };
+    if (check(gpu::Ptx::_cuda_cu_launch_kernel((struct CUfunc_st*) (address) kernel,
+                                      gridX, gridY, gridZ,
+                                      dimX, dimY, dimZ,
+                                      0, NULL, NULL, (void**) &config), "Launch kernel")) {
+    }
+  }
+
+  void synchronize() {
+    check(gpu::Ptx::_cuda_cu_ctx_synchronize(), "Synchronize kernel");
+  }
+
+  void unpin_objects() {
+    while (_pinned_length > 0) {
+      oop obj = _pinned[--_pinned_length];
+      assert(obj != NULL, "npe");
+      //tty->print_cr("Unpinning object %d: %p", _pinned_length, obj);
+      if (!check(gpu::Ptx::_cuda_cu_mem_host_unregister(obj), "Unpin object")) {
+        return;
+      }
+    }
+  }
+
+  oop get_object_return_value() {
+    oop return_val;
+    check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, T_OBJECT_BYTE_SIZE), "Copy return value from device");
+    return return_val;
+  }
+
+  jlong get_primitive_return_value() {
+    jlong return_val;
+    check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, _ret_type_size), "Copy return value from device");
+    return return_val;
+  }
+
+  void free_return_value() {
+    if (_ret_value != 0) {
+      check(gpu::Ptx::_cuda_cu_memfree(_ret_value), "Free device memory");
+      _ret_value = 0;
+    }
+  }
+
+  void destroy_context() {
+    if (gpu::Ptx::_device_context != NULL) {
+      check(gpu::Ptx::_cuda_cu_ctx_destroy(gpu::Ptx::_device_context), "Destroy context");
+      gpu::Ptx::_device_context = NULL;
+    }
+  }
+
+  ~PtxCall() {
+    unpin_objects();
+    free_return_value();
+    destroy_context();
+  }
+};
+
+
 JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
-                                                  jlong parametersAndReturnValueBuffer,
-                                                  jint parametersAndReturnValueBufferSize,
+                                                  jlong buffer,
+                                                  jint bufferSize,
+                                                  jint objectParametersCount,
+                                                  jlong objectParametersOffsets,
+                                                  jlong pinnedObjects,
                                                   int encodedReturnTypeSize))
   if (kernel == 0L) {
     SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL);
     return 0L;
   }
 
-  // grid dimensionality
-  unsigned int gridX = 1;
-  unsigned int gridY = 1;
-  unsigned int gridZ = 1;
-
-  struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel;
+  PtxCall call(thread, (address) buffer, bufferSize, (oop*) (address) pinnedObjects, encodedReturnTypeSize);
 
-  void * config[5] = {
-    GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) parametersAndReturnValueBuffer,
-    GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &parametersAndReturnValueBufferSize,
-    GRAAL_CU_LAUNCH_PARAM_END
-  };
+#define TRY(action) do { \
+  action; \
+  if (HAS_PENDING_EXCEPTION) return 0L; \
+} while (0)
 
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] launching kernel");
-  }
+  TRY(call.alloc_return_value());
 
-  bool isObjectReturn = encodedReturnTypeSize < 0;
-  int returnTypeSize = encodedReturnTypeSize < 0 ? -encodedReturnTypeSize : encodedReturnTypeSize;
-  gpu::Ptx::CUdeviceptr device_return_value;
-  int status;
-  if (returnTypeSize != 0) {
-    status = _cuda_cu_memalloc(&device_return_value, returnTypeSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to allocate memory for return value pointer on device");
-      return 0L;
-    }
-    // Push device_return_value to kernelParams
-    gpu::Ptx::CUdeviceptr* returnValuePtr = (gpu::Ptx::CUdeviceptr*)
-                                               ((address) parametersAndReturnValueBuffer +
-                                                parametersAndReturnValueBufferSize - sizeof(device_return_value));
-    *returnValuePtr = device_return_value;
-  }
+  TRY(call.pin_objects(objectParametersCount, (int*) (address) objectParametersOffsets));
+
+  TRY(call.launch((address) kernel, dimX, dimY, dimZ));
 
-  status = _cuda_cu_launch_kernel(cu_function,
-                                      gridX, gridY, gridZ,
-                                      dimX, dimY, dimZ,
-                                      0, NULL, NULL, (void **) &config);
+  TRY(call.synchronize());
 
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to launch kernel");
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to launch kernel");
+  if (call.is_object_return()) {
+    oop return_val;
+    TRY(return_val = call.get_object_return_value());
+    thread->set_vm_result(return_val);
     return 0L;
   }
 
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", dimX, dimY, dimZ);
-  }
-
-  status = _cuda_cu_ctx_synchronize();
-
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to synchronize launched kernel");
-    return 0L;
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Synchronized launch kernel");
-  }
+  jlong return_val;
+  TRY(return_val = call.get_primitive_return_value());
+  return return_val;
 
-  jlong primitiveReturnValue = 0L;
-  if (isObjectReturn) {
-    oop return_val;
-    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, device_return_value, T_OBJECT_BYTE_SIZE);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
-      return 0L;
-    }
-    thread->set_vm_result(return_val);
-  } else if (returnTypeSize > 0) {
-    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&primitiveReturnValue, device_return_value, returnTypeSize);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
-      return 0L;
-    }
-  }
+#undef TRY
 
-  // Free device memory allocated for result
-  if (returnTypeSize != 0) {
-    status = gpu::Ptx::_cuda_cu_memfree(device_return_value);
-    if (status != GRAAL_CUDA_SUCCESS) {
-      tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
-      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to free device memory of return value");
-      return 0L;
-    }
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Freed device memory of return value");
-  }
-
-  // Destroy context
-  status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context);
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status);
-    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to destroy context");
-    return 0L;
-  }
-
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Destroy context");
-  }
-
-  return primitiveReturnValue;
 JRT_END
 
 bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
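In driver-API terms, PtxCall::pin_objects and unpin_objects above amount to the following per-object sequence. This is a generic sketch with hypothetical pin_and_map/unpin helpers against plain cuda.h; it leaves out the oop bookkeeping and pending-exception handling that the VM code performs:

// Sketch: page-lock an object's memory, obtain the device-visible address that
// replaces the host pointer in the parameter buffer, and unpin after the kernel.
#include <cuda.h>

CUresult pin_and_map(void* host_obj, size_t size, CUdeviceptr* dev_ptr_out) {
  // Pin the host memory so the device can access it directly (DEVICEMAP flag).
  CUresult status = cuMemHostRegister(host_obj, size, CU_MEMHOSTREGISTER_DEVICEMAP);
  if (status != CUDA_SUCCESS) {
    return status;
  }
  // Get the device pointer that is written back into the parameter buffer.
  status = cuMemHostGetDevicePointer(dev_ptr_out, host_obj, 0);
  if (status != CUDA_SUCCESS) {
    cuMemHostUnregister(host_obj);  // undo the pin on failure
  }
  return status;
}

void unpin(void* host_obj) {
  cuMemHostUnregister(host_obj);    // must happen only after the kernel has completed
}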
@@ -620,6 +687,7 @@
     if (handle != NULL) {
       LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init);
       LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize);
+      LOOKUP_CUDA_FUNCTION(cuCtxGetCurrent, cuda_cu_ctx_get_current);
       LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current);
       LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count);
       LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name);
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Fri Jan 24 17:43:14 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Fri Jan 24 18:34:18 2014 +0100
@@ -84,16 +84,19 @@
  * Context creation flags
  */
 
-#define GRAAL_CU_CTX_MAP_HOST 0x08
+#define GRAAL_CU_CTX_MAP_HOST            0x08
+#define GRAAL_CU_CTX_SCHED_BLOCKING_SYNC 0x04
 
 class Ptx {
   friend class gpu;
+  friend class PtxCall;
 
  protected:
   static bool probe_linkage();
   static bool initialize_gpu();
   static unsigned int total_cores();
-  static void * generate_kernel(unsigned char *code, int code_len, const char *name);
+  static void* get_context();
+  static void* generate_kernel(unsigned char *code, int code_len, const char *name);
   static bool execute_warp(int dimX, int dimY, int dimZ, address kernel, PTXKernelArguments & ka, JavaValue &ret);
   static bool execute_kernel(address kernel, PTXKernelArguments & ka, JavaValue &ret);
 public:
@@ -106,8 +109,11 @@
 typedef int CUdevice;     /* CUDA device */
 
   static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ,
-                                      jlong parametersAndReturnValueBuffer,
-                                      jint parametersAndReturnValueBufferSize,
+                                      jlong buffer,
+                                      jint bufferSize,
+                                      jint objectParametersCount,
+                                      jlong objectParametersOffsets,
+                                      jlong pinnedObjects,
                                       int encodedReturnTypeSize);
 
 private:
@@ -115,6 +121,7 @@
   typedef int (*cuda_cu_ctx_create_func_t)(void*, unsigned int, CUdevice);
   typedef int (*cuda_cu_ctx_destroy_func_t)(void*);
   typedef int (*cuda_cu_ctx_synchronize_func_t)(void);
+  typedef int (*cuda_cu_ctx_get_current_func_t)(void*);
   typedef int (*cuda_cu_ctx_set_current_func_t)(void*);
   typedef int (*cuda_cu_device_get_count_func_t)(int*);
   typedef int (*cuda_cu_device_get_name_func_t)(char*, int, int);
@@ -152,6 +159,7 @@
   static cuda_cu_memfree_func_t                   _cuda_cu_memfree;
   static cuda_cu_memcpy_htod_func_t               _cuda_cu_memcpy_htod;
   static cuda_cu_memcpy_dtoh_func_t               _cuda_cu_memcpy_dtoh;
+  static cuda_cu_ctx_get_current_func_t           _cuda_cu_ctx_get_current;
   static cuda_cu_ctx_set_current_func_t           _cuda_cu_ctx_set_current;
   static cuda_cu_mem_host_register_func_t         _cuda_cu_mem_host_register;
   static cuda_cu_mem_host_get_device_pointer_func_t _cuda_cu_mem_host_get_device_pointer;
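The function-pointer typedefs above are filled in at VM startup by the LOOKUP_CUDA_FUNCTION calls in gpu_ptx.cpp; the macro's expansion is not part of this changeset. A generic sketch of that style of dynamic lookup (plain dlfcn.h, hypothetical names, not the HotSpot macro):

// Sketch: resolve a CUDA driver entry point at runtime into a typedef'd function
// pointer. Illustrative only; the real code goes through LOOKUP_CUDA_FUNCTION.
#include <dlfcn.h>
#include <cstdio>

typedef int (*cuda_cu_ctx_get_current_func_t)(void*);
static cuda_cu_ctx_get_current_func_t cu_ctx_get_current = NULL;

bool lookup_cuda_functions() {
  void* handle = dlopen("libcuda.so", RTLD_LAZY);
  if (handle == NULL) {
    fprintf(stderr, "could not load libcuda.so\n");
    return false;
  }
  cu_ctx_get_current = (cuda_cu_ctx_get_current_func_t) dlsym(handle, "cuCtxGetCurrent");
  if (cu_ctx_get_current == NULL) {
    fprintf(stderr, "cuCtxGetCurrent not found in libcuda.so\n");
    return false;
  }
  return true;  // cu_ctx_get_current(&ctx) can now be called like the typedefs above
}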