changeset 13624:220ed109bf77

initial code for calling PTX kernel code from Java, with parameter marshaling and return value unmarshaling performed by a wrapper built via manual graph construction
author Doug Simon <doug.simon@oracle.com>
date Mon, 13 Jan 2014 22:32:27 +0100
parents c70dddf5ce4a
children 881dd7f896de
files graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java graal/com.oracle.graal.hotspot.ptx.test/src/com/oracle/graal/hotspot/ptx/test/PTXLaunchKernelTest.java graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotForeignCallsProviderImpl.java graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXControlFlow.java mx/projects src/gpu/ptx/vm/gpu_ptx.cpp src/gpu/ptx/vm/gpu_ptx.hpp src/gpu/ptx/vm/ptxKernelArguments.hpp src/share/vm/graal/graalCompilerToGPU.cpp
diffstat 13 files changed, 558 insertions(+), 17 deletions(-)
--- a/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java	Mon Jan 13 22:28:57 2014 +0100
+++ b/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java	Mon Jan 13 22:32:27 2014 +0100
@@ -29,7 +29,7 @@
 import static com.oracle.graal.lir.ptx.PTXBitManipulationOp.IntrinsicOpcode.*;
 import static com.oracle.graal.lir.ptx.PTXCompare.*;
 
-import java.lang.annotation.*;
+import java.lang.reflect.*;
 
 import com.oracle.graal.api.code.*;
 import com.oracle.graal.api.meta.*;
@@ -154,17 +154,13 @@
         }
 
         for (ParameterNode param : graph.getNodes(ParameterNode.class)) {
+            int localIndex = param.index();
             Value paramValue = params[param.index()];
-            Annotation[] annos = graph.method().getParameterAnnotations()[param.index()];
-            Warp warpAnnotation = null;
-
-            if (annos != null) {
-                for (int a = 0; a < annos.length; a++) {
-                    if (annos[a].annotationType().equals(Warp.class)) {
-                        warpAnnotation = (Warp) annos[a];
-                    }
-                }
+            int parameterIndex = localIndex;
+            if (!Modifier.isStatic(graph.method().getModifiers())) {
+                parameterIndex--;
             }
+            Warp warpAnnotation = parameterIndex >= 0 ? MetaUtil.getParameterAnnotation(Warp.class, parameterIndex, graph.method()) : null;
             if (warpAnnotation != null) {
                 setResult(param, emitWarpParam(paramValue.getKind(), warpAnnotation));
             } else {
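
A minimal sketch (not part of the changeset) of the index adjustment the rewritten annotation lookup above relies on: parameter annotations are indexed over declared parameters only, so for a non-static kernel method the receiver in local slot 0 has no annotation entry and the annotation index is the local index minus one.

    // Hypothetical helper; names are illustrative only.
    static int annotationIndex(int localIndex, boolean isStatic) {
        // The receiver of a virtual method occupies local slot 0 but carries no
        // parameter annotations, so shift the index down by one in that case.
        return isStatic ? localIndex : localIndex - 1;
    }
    // annotationIndex(0, false) == -1  -> receiver: no @Warp lookup is performed
    // annotationIndex(1, false) == 0   -> first declared parameter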
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.hotspot.ptx.test/src/com/oracle/graal/hotspot/ptx/test/PTXLaunchKernelTest.java	Mon Jan 13 22:32:27 2014 +0100
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.hotspot.ptx.test;
+
+import static com.oracle.graal.api.code.CodeUtil.*;
+import static com.oracle.graal.compiler.GraalCompiler.*;
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.*;
+
+import org.junit.*;
+
+import com.oracle.graal.api.code.*;
+import com.oracle.graal.api.code.CallingConvention.Type;
+import com.oracle.graal.api.meta.*;
+import com.oracle.graal.compiler.test.*;
+import com.oracle.graal.debug.*;
+import com.oracle.graal.debug.Debug.Scope;
+import com.oracle.graal.hotspot.*;
+import com.oracle.graal.hotspot.bridge.*;
+import com.oracle.graal.hotspot.meta.*;
+import com.oracle.graal.hotspot.ptx.*;
+import com.oracle.graal.lir.asm.*;
+import com.oracle.graal.nodes.*;
+import com.oracle.graal.phases.*;
+import com.oracle.graal.phases.tiers.*;
+import com.oracle.graal.ptx.*;
+
+/**
+ * Tests the mechanism for launching a PTX kernel method via wrapper code generated by
+ * {@link PTXLaunchKernelGraphKit}.
+ */
+public class PTXLaunchKernelTest extends GraalCompilerTest {
+
+    public PTXLaunchKernelTest() {
+        super();
+    }
+
+    /**
+     * Compiles and installs PTX kernel code for a given method.
+     */
+    static class PTXKernel extends GraalCompilerTest {
+        public PTXKernel() {
+            super(PTX.class);
+        }
+
+        static CompilerToGPU toGPU = HotSpotGraalRuntime.runtime().getCompilerToGPU();
+        static boolean validDevice = toGPU.deviceInit();
+
+        @Override
+        protected CompilationResult compile(ResolvedJavaMethod method, StructuredGraph graph) {
+            CallingConvention cc = getCallingConvention(getCodeCache(), Type.JavaCallee, graph.method(), false);
+
+            /*
+             * Use Suites.createDefaultSuites() instead of GraalCompilerTest.suites. The
+             * GraalCompilerTest.suites variable contains the Suites for the HotSpotRuntime. This
+             * code will not run on HotSpot, so it should use the plain Graal default suites,
+             * without HotSpot specific phases.
+             * 
+             * Ultimately we might want both the kernel and natively compiled CPU code, so that a
+             * kernel invocation can fall back from GPU to CPU in case of an ECC failure.
+             */
+            Suites suites = Suites.createDefaultSuites();
+            PTXHotSpotBackend ptxBackend = (PTXHotSpotBackend) getBackend();
+            ExternalCompilationResult kernelResult = compileGraph(graph, cc, method, getProviders(), ptxBackend, ptxBackend.getTarget(), null, getDefaultGraphBuilderSuite(),
+                            OptimisticOptimizations.NONE, getProfilingInfo(graph), new SpeculationLog(), suites, true, new ExternalCompilationResult(), CompilationResultBuilderFactory.Default);
+
+            Assume.assumeTrue(validDevice);
+            Assert.assertTrue(kernelResult.getTargetCode() != null);
+            try (Scope ds = Debug.scope("GeneratingKernel")) {
+                long kernel = toGPU.generateKernel(kernelResult.getTargetCode(), method.getName());
+                kernelResult.setEntryPoint(kernel);
+            } catch (Throwable e) {
+                throw Debug.handle(e);
+            }
+            return kernelResult;
+        }
+
+        @Override
+        protected InstalledCode addMethod(ResolvedJavaMethod method, CompilationResult compResult) {
+            HotSpotCodeCacheProvider codeCache = (HotSpotCodeCacheProvider) getCodeCache();
+            return codeCache.addExternalMethod(method, compResult);
+        }
+
+        /**
+         * Compiles and installs PTX kernel code for {@code method}.
+         */
+        InstalledCode getKernelCode(ResolvedJavaMethod method, StructuredGraph graph) {
+            return getCode(method, graph, true);
+        }
+    }
+
+    @Override
+    protected InstalledCode getCode(ResolvedJavaMethod method, StructuredGraph graph) {
+        InstalledCode kernelCode = new PTXKernel().getKernelCode(method, graph);
+        StructuredGraph launchKernel = new PTXLaunchKernelGraphKit(method, kernelCode.getStart(), runtime().getHostProviders()).getGraph();
+        return super.getCode(method, launchKernel);
+    }
+
+    @Test
+    public void testStaticIntKernel() {
+        test("staticIntKernel", 'a', 42);
+    }
+
+    @Test
+    public void testVirtualIntKernel() {
+        test("virtualIntKernel", 'a', 42);
+    }
+
+    public static int staticIntKernel(char p0, int p1) {
+        return p1 + p0;
+    }
+
+    public int virtualIntKernel(char p0, int p1) {
+        return p1 + p0;
+    }
+}
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Mon Jan 13 22:28:57 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Mon Jan 13 22:32:27 2014 +0100
@@ -22,6 +22,11 @@
  */
 package com.oracle.graal.hotspot.ptx;
 
+import static com.oracle.graal.api.code.CallingConvention.Type.*;
+import static com.oracle.graal.api.meta.LocationIdentity.*;
+import static com.oracle.graal.hotspot.HotSpotForeignCallLinkage.RegisterEffect.*;
+import static com.oracle.graal.hotspot.HotSpotForeignCallLinkage.Transition.*;
+import static com.oracle.graal.hotspot.meta.HotSpotForeignCallsProviderImpl.*;
 import static com.oracle.graal.lir.LIRValueUtil.*;
 
 import java.util.*;
@@ -42,15 +47,25 @@
 import com.oracle.graal.lir.StandardOp.LabelOp;
 import com.oracle.graal.lir.asm.*;
 import com.oracle.graal.lir.ptx.*;
+import com.oracle.graal.lir.ptx.PTXMemOp.LoadReturnAddrOp;
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.cfg.*;
-import com.oracle.graal.lir.ptx.PTXMemOp.LoadReturnAddrOp;
+import com.oracle.graal.word.*;
 
 /**
  * HotSpot PTX specific backend.
  */
 public class PTXHotSpotBackend extends HotSpotBackend {
 
+    /**
+     * Descriptor for the PTX runtime method for launching a kernel. The C++ signature is:
+     * 
+     * <pre>
+     *     jlong gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, jint encodedReturnTypeSize)
+     * </pre>
+     */
+    public static final ForeignCallDescriptor LAUNCH_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class, Word.class, long.class, long.class, int.class, int.class);
+
     public PTXHotSpotBackend(HotSpotGraalRuntime runtime, HotSpotProviders providers) {
         super(runtime, providers);
     }
@@ -61,6 +76,14 @@
     }
 
     @Override
+    public void completeInitialization() {
+        HotSpotHostForeignCallsProvider hostForeignCalls = (HotSpotHostForeignCallsProvider) getRuntime().getHostProviders().getForeignCalls();
+        long launchKernel = getRuntime().getCompilerToGPU().getLaunchKernelAddress();
+        hostForeignCalls.registerForeignCall(LAUNCH_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION);
+        super.completeInitialization();
+    }
+
+    @Override
     public FrameMap newFrameMap() {
         return new PTXFrameMap(getCodeCache());
     }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java	Mon Jan 13 22:32:27 2014 +0100
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.hotspot.ptx;
+
+import static com.oracle.graal.api.meta.DeoptimizationReason.*;
+import static com.oracle.graal.api.meta.LocationIdentity.*;
+import static com.oracle.graal.asm.NumUtil.*;
+import static com.oracle.graal.hotspot.ptx.PTXHotSpotBackend.*;
+import static com.oracle.graal.hotspot.replacements.HotSpotReplacementsUtil.*;
+import static java.lang.reflect.Modifier.*;
+
+import java.util.*;
+
+import com.oracle.graal.api.meta.*;
+import com.oracle.graal.debug.*;
+import com.oracle.graal.graph.*;
+import com.oracle.graal.hotspot.meta.*;
+import com.oracle.graal.hotspot.nodes.*;
+import com.oracle.graal.hotspot.stubs.*;
+import com.oracle.graal.java.*;
+import com.oracle.graal.nodes.*;
+import com.oracle.graal.nodes.HeapAccess.BarrierType;
+import com.oracle.graal.nodes.calc.*;
+import com.oracle.graal.nodes.extended.*;
+import com.oracle.graal.nodes.type.*;
+import com.oracle.graal.replacements.nodes.*;
+import com.oracle.graal.word.*;
+
+/**
+ * Utility for building a graph for launching a PTX kernel compiled for a method. The graph created
+ * is something like the following pseudo code:
+ * 
+ * <pre>
+ *     jlong kernel(p0, p1, ..., pN) {
+ *         jint kernelParamsBufSize = SIZE_OF_ALIGNED_PARAMS_WITH_PADDING(p0, p1, ..., pN);
+ *         jbyte kernelParamsBuf[kernelParamsBufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
+ *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, kernelParamsBuf, kernelParamsBufSize);
+ *         return result;
+ *     }
+ * </pre>
+ */
+public class PTXLaunchKernelGraphKit extends GraphKit {
+
+    /**
+     * The incoming Java arguments to the kernel invocation.
+     */
+    ParameterNode[] javaParameters;
+
+    /**
+     * The size of the buffer holding the parameters and the extra word for storing the pointer to
+     * device memory for the return value. This will be the same as
+     * PTXKernelArguments::device_argument_buffer_size().
+     */
+    int kernelParametersAndReturnValueBufferSize;
+
+    /**
+     * Offsets of each Java argument in the parameters buffer.
+     */
+    int[] javaParameterOffsetsInKernelParametersBuffer;
+
+    /**
+     * Creates a graph implementing the transition from Java to the native routine that launches
+     * some compiled PTX code.
+     * 
+     * @param kernelMethod a method that has been compiled to PTX kernel code
+     * @param kernelAddress the address of the installed PTX code for {@code kernelMethod}
+     */
+    public PTXLaunchKernelGraphKit(ResolvedJavaMethod kernelMethod, long kernelAddress, HotSpotProviders providers) {
+        super(new StructuredGraph(kernelMethod), providers);
+        int wordSize = providers.getCodeCache().getTarget().wordSize;
+        Kind wordKind = providers.getCodeCache().getTarget().wordKind;
+        Signature sig = kernelMethod.getSignature();
+        boolean isStatic = isStatic(kernelMethod.getModifiers());
+        int sigCount = sig.getParameterCount(false);
+        javaParameters = new ParameterNode[(!isStatic ? 1 : 0) + sigCount];
+        javaParameterOffsetsInKernelParametersBuffer = new int[javaParameters.length];
+        int javaParametersIndex = 0;
+        Kind returnKind = sig.getReturnKind();
+
+        BitSet objects = new BitSet();
+        if (!isStatic) {
+            javaParameters[javaParametersIndex] = unique(new ParameterNode(javaParametersIndex, StampFactory.declaredNonNull(kernelMethod.getDeclaringClass())));
+            kernelParametersAndReturnValueBufferSize += wordSize;
+            javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex++] = 0;
+            objects.set(0);
+        }
+        for (int i = 0; i < sigCount; i++) {
+            Kind kind = sig.getParameterKind(i);
+            int kindByteSize = kind.getBitCount() / Byte.SIZE;
+            while ((kernelParametersAndReturnValueBufferSize % kindByteSize) != 0) {
+                kernelParametersAndReturnValueBufferSize++;
+            }
+            javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = kernelParametersAndReturnValueBufferSize;
+            Stamp stamp;
+            if (kind == Kind.Object) {
+                stamp = StampFactory.object();
+                int slot = kernelParametersAndReturnValueBufferSize / wordSize;
+                objects.set(slot);
+            } else {
+                stamp = StampFactory.forKind(kind);
+            }
+            ParameterNode param = unique(new ParameterNode(javaParametersIndex, stamp));
+            javaParameters[javaParametersIndex++] = param;
+            kernelParametersAndReturnValueBufferSize += kindByteSize;
+        }
+        kernelParametersAndReturnValueBufferSize = roundUp(kernelParametersAndReturnValueBufferSize, wordSize);
+
+        // Add slot for holding pointer to device memory storing return value
+        int encodedReturnTypeSize = 0;
+        if (returnKind != Kind.Void) {
+            kernelParametersAndReturnValueBufferSize += wordSize;
+            if (returnKind == Kind.Object) {
+                encodedReturnTypeSize = -wordSize;
+            } else {
+                encodedReturnTypeSize = returnKind.getBitCount() / Byte.SIZE;
+            }
+        }
+
+        ReadRegisterNode threadArg = append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false));
+        ConstantNode kernelAddressArg = ConstantNode.forLong(kernelAddress, getGraph());
+        AllocaNode kernelParametersAndReturnValueBufferArg = append(new AllocaNode(kernelParametersAndReturnValueBufferSize / wordSize, objects));
+        ConstantNode kernelParametersAndReturnValueBufferSizeArg = ConstantNode.forInt(kernelParametersAndReturnValueBufferSize, getGraph());
+        ConstantNode encodedReturnTypeSizeArg = ConstantNode.forInt(encodedReturnTypeSize, getGraph());
+
+        for (javaParametersIndex = 0; javaParametersIndex < javaParameters.length; javaParametersIndex++) {
+            ParameterNode javaParameter = javaParameters[javaParametersIndex];
+            int javaParameterOffset = javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex];
+            LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, javaParameter.kind(), javaParameterOffset, getGraph());
+            append(new WriteNode(kernelParametersAndReturnValueBufferArg, javaParameter, location, BarrierType.NONE, false, false));
+        }
+        if (returnKind != Kind.Void) {
+            LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, kernelParametersAndReturnValueBufferSize - wordSize, getGraph());
+            append(new WriteNode(kernelParametersAndReturnValueBufferArg, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false));
+        }
+
+        FrameStateBuilder fsb = new FrameStateBuilder(kernelMethod, getGraph(), true);
+        FrameState fs = fsb.create(0);
+        getGraph().start().setStateAfter(fs);
+
+        ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, threadArg, kernelAddressArg, kernelParametersAndReturnValueBufferArg,
+                        kernelParametersAndReturnValueBufferSizeArg, encodedReturnTypeSizeArg));
+        result.setDeoptimizationState(fs);
+
+        ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph());
+        InvokeNode handlePendingException = createInvoke(getClass(), "handlePendingException", threadArg, isObjectResultArg);
+        handlePendingException.setStateAfter(fs);
+        InvokeNode getObjectResult = null;
+
+        ValueNode returnValue;
+        switch (returnKind) {
+            case Void:
+                returnValue = null;
+                break;
+            case Boolean:
+            case Byte:
+            case Short:
+            case Char:
+            case Int:
+                returnValue = unique(new ConvertNode(Kind.Long, Kind.Int, result));
+                break;
+            case Long:
+                returnValue = result;
+                break;
+            case Float:
+            case Double:
+                returnValue = unique(new ReinterpretNode(returnKind, result));
+                break;
+            case Object:
+                getObjectResult = createInvoke(getClass(), "getObjectResult", threadArg);
+                returnValue = append(getObjectResult);
+                break;
+            default:
+                throw new GraalInternalError("%s return kind not supported", returnKind);
+        }
+
+        append(new ReturnNode(returnValue));
+
+        if (Debug.isDumpEnabled()) {
+            Debug.dump(getGraph(), "Initial kernel launch graph");
+        }
+
+        rewriteWordTypes();
+        inlineInvokes();
+
+        if (Debug.isDumpEnabled()) {
+            Debug.dump(getGraph(), "Kernel launch graph before compilation");
+        }
+    }
+
+    public static void handlePendingException(Word thread, boolean isObjectResult) {
+        if (clearPendingException(thread)) {
+            if (isObjectResult) {
+                getAndClearObjectResult(thread);
+            }
+            DeoptimizeNode.deopt(DeoptimizationAction.None, RuntimeConstraint);
+        }
+    }
+
+    public static Object getObjectResult(Word thread) {
+        return getAndClearObjectResult(thread);
+    }
+}
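
A worked sketch (not part of the changeset) of the buffer layout the constructor above computes for the test kernel virtualIntKernel(char, int) with an 8-byte word: the receiver sits at offset 0, the char parameter is 2-byte aligned at offset 8, the int parameter is padded to offset 12, the buffer is rounded up to 16 bytes, and one trailing word for the return-value pointer gives a 24-byte buffer with encodedReturnTypeSize = 4.

    // Minimal illustration of the alignment rule used above; the helper name is hypothetical.
    static int alignTo(int offset, int size) {
        while (offset % size != 0) {
            offset++; // pad until the parameter is naturally aligned
        }
        return offset;
    }
    // receiver @ 0 (8 bytes), char @ alignTo(8, 2) = 8, int @ alignTo(10, 4) = 12,
    // buffer rounded up from 16 to the 8-byte word size = 16, plus one word for
    // the return-value slot = 24 bytes.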
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java	Mon Jan 13 22:28:57 2014 +0100
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java	Mon Jan 13 22:32:27 2014 +0100
@@ -58,4 +58,9 @@
     Object executeExternalMethodVarargs(Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
 
     Object executeParallelMethodVarargs(int dimX, int dimY, int dimZ, Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
+
+    /**
+     * Gets the address of the runtime function for launching a kernel.
+     */
+    long getLaunchKernelAddress();
 }
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java	Mon Jan 13 22:28:57 2014 +0100
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java	Mon Jan 13 22:32:27 2014 +0100
@@ -42,4 +42,6 @@
     public native Object executeExternalMethodVarargs(Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
 
     public native Object executeParallelMethodVarargs(int dimX, int dimY, int dimZ, Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException;
+
+    public native long getLaunchKernelAddress();
 }
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotForeignCallsProviderImpl.java	Mon Jan 13 22:28:57 2014 +0100
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotForeignCallsProviderImpl.java	Mon Jan 13 22:32:27 2014 +0100
@@ -61,7 +61,7 @@
     /**
      * Registers the linkage for a foreign call.
      */
-    protected HotSpotForeignCallLinkage register(HotSpotForeignCallLinkage linkage) {
+    public HotSpotForeignCallLinkage register(HotSpotForeignCallLinkage linkage) {
         assert !foreignCalls.containsKey(linkage.getDescriptor()) : "already registered linkage for " + linkage.getDescriptor();
         foreignCalls.put(linkage.getDescriptor(), linkage);
         return linkage;
@@ -77,7 +77,7 @@
      * @param transition specifies if this is a {@linkplain Transition#LEAF leaf} call
      * @param killedLocations the memory locations killed by the stub call
      */
-    protected HotSpotForeignCallLinkage registerStubCall(ForeignCallDescriptor descriptor, boolean reexecutable, Transition transition, LocationIdentity... killedLocations) {
+    public HotSpotForeignCallLinkage registerStubCall(ForeignCallDescriptor descriptor, boolean reexecutable, Transition transition, LocationIdentity... killedLocations) {
         return register(HotSpotForeignCallLinkage.create(metaAccess, codeCache, this, descriptor, 0L, PRESERVES_REGISTERS, JavaCall, JavaCallee, transition, reexecutable, killedLocations));
     }
 
@@ -95,7 +95,7 @@
      *            cannot be re-executed.
      * @param killedLocations the memory locations killed by the foreign call
      */
-    protected HotSpotForeignCallLinkage registerForeignCall(ForeignCallDescriptor descriptor, long address, CallingConvention.Type outgoingCcType, RegisterEffect effect, Transition transition,
+    public HotSpotForeignCallLinkage registerForeignCall(ForeignCallDescriptor descriptor, long address, CallingConvention.Type outgoingCcType, RegisterEffect effect, Transition transition,
                     boolean reexecutable, LocationIdentity... killedLocations) {
         Class<?> resultType = descriptor.getResultType();
         assert transition != NOT_LEAF || resultType.isPrimitive() || Word.class.isAssignableFrom(resultType) : "non-leaf foreign calls must return objects in thread local storage: " + descriptor;
@@ -115,7 +115,7 @@
      *            cannot be re-executed.
      * @param killedLocations the memory locations killed by the foreign call
      */
-    protected void linkForeignCall(HotSpotProviders providers, ForeignCallDescriptor descriptor, long address, boolean prependThread, Transition transition, boolean reexecutable,
+    public void linkForeignCall(HotSpotProviders providers, ForeignCallDescriptor descriptor, long address, boolean prependThread, Transition transition, boolean reexecutable,
                     LocationIdentity... killedLocations) {
         ForeignCallStub stub = new ForeignCallStub(providers, address, descriptor, prependThread, transition, reexecutable, killedLocations);
         HotSpotForeignCallLinkage linkage = stub.getLinkage();
--- a/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXControlFlow.java	Mon Jan 13 22:28:57 2014 +0100
+++ b/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXControlFlow.java	Mon Jan 13 22:32:27 2014 +0100
@@ -57,7 +57,7 @@
         }
     }
 
-    public static class ReturnNoValOp extends PTXLIRInstruction {
+    public static class ReturnNoValOp extends PTXLIRInstruction implements BlockEndOp {
 
         public ReturnNoValOp() {
         }
--- a/mx/projects	Mon Jan 13 22:28:57 2014 +0100
+++ b/mx/projects	Mon Jan 13 22:32:27 2014 +0100
@@ -202,6 +202,14 @@
 project@com.oracle.graal.hotspot.ptx@javaCompliance=1.7
 project@com.oracle.graal.hotspot.ptx@workingSets=Graal,HotSpot,PTX
 
+# graal.hotspot.ptx.test
+project@com.oracle.graal.hotspot.ptx.test@subDir=graal
+project@com.oracle.graal.hotspot.ptx.test@sourceDirs=src
+project@com.oracle.graal.hotspot.ptx.test@dependencies=com.oracle.graal.hotspot.ptx,com.oracle.graal.replacements.test
+project@com.oracle.graal.hotspot.ptx.test@checkstyle=com.oracle.graal.graph
+project@com.oracle.graal.hotspot.ptx.test@javaCompliance=1.7
+project@com.oracle.graal.hotspot.ptx.test@workingSets=Graal,HotSpot,PTX
+
 # graal.hotspot.hsail
 project@com.oracle.graal.hotspot.hsail@subDir=graal
 project@com.oracle.graal.hotspot.hsail@sourceDirs=src
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Mon Jan 13 22:28:57 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Mon Jan 13 22:32:27 2014 +0100
@@ -29,6 +29,7 @@
 #include "utilities/ostream.hpp"
 #include "memory/allocation.hpp"
 #include "memory/allocation.inline.hpp"
+#include "runtime/interfaceSupport.hpp"
 #include "ptxKernelArguments.hpp"
 
 void * gpu::Ptx::_device_context;
@@ -336,6 +337,144 @@
   return cu_function;
 }
 
+JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, int encodedReturnTypeSize))
+  tty->print_cr("*** gpu::Ptx::execute_kernel_from_vm(kernel=%p, parametersAndReturnValueBuffer=%p, parametersAndReturnValueBufferSize=%d, encodedReturnTypeSize=%d)",
+      kernel, parametersAndReturnValueBuffer, parametersAndReturnValueBufferSize, encodedReturnTypeSize);
+  tty->print("  buffer as bytes: ");
+  for (int i = 0; i < parametersAndReturnValueBufferSize; i++) {
+    tty->print(" 0x%02x", ((jbyte*) (address) parametersAndReturnValueBuffer)[i] & 0xFF);
+  }
+  tty->cr();
+  tty->print("  buffer as ints: ");
+  for (int i = 0; i < (parametersAndReturnValueBufferSize / 4); i++) {
+    tty->print(" %d", ((jint*) (address) parametersAndReturnValueBuffer)[i]);
+  }
+  tty->cr();
+  tty->print("  buffer as words: ");
+  for (unsigned i = 0; i < (parametersAndReturnValueBufferSize / sizeof(void*)); i++) {
+    tty->print(" "INTPTR_FORMAT, ((void**) (address) parametersAndReturnValueBuffer)[i]);
+  }
+  tty->cr();
+  if (kernel == 0L) {
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL);
+    return 0L;
+  }
+
+
+  // grid dimensionality
+  unsigned int gridX = 1;
+  unsigned int gridY = 1;
+  unsigned int gridZ = 1;
+
+  // thread dimensionality
+  unsigned int blockX = 1;
+  unsigned int blockY = 1;
+  unsigned int blockZ = 1;
+
+  struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel;
+
+  void * config[5] = {
+    GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) parametersAndReturnValueBuffer,
+    GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &parametersAndReturnValueBufferSize,
+    GRAAL_CU_LAUNCH_PARAM_END
+  };
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] launching kernel");
+  }
+
+  bool isObjectReturn = encodedReturnTypeSize < 0;
+  int returnTypeSize = encodedReturnTypeSize < 0 ? -encodedReturnTypeSize : encodedReturnTypeSize;
+  gpu::Ptx::CUdeviceptr device_return_value;
+  int status;
+  if (returnTypeSize != 0) {
+    status = _cuda_cu_memalloc(&device_return_value, returnTypeSize);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to allocate memory for return value pointer on device");
+      return 0L;
+    }
+    // Push device_return_value to kernelParams
+    gpu::Ptx::CUdeviceptr* returnValuePtr = (gpu::Ptx::CUdeviceptr*) (address) parametersAndReturnValueBuffer + parametersAndReturnValueBufferSize - sizeof(device_return_value);
+    *returnValuePtr = device_return_value;
+  }
+
+  status = _cuda_cu_launch_kernel(cu_function,
+                                      gridX, gridY, gridZ,
+                                      blockX, blockY, blockZ,
+                                      0, NULL, NULL, (void **) &config);
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to launch kernel");
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to launch kernel");
+    return 0L;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", blockX, blockY, blockZ);
+  }
+
+  status = _cuda_cu_ctx_synchronize();
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to synchronize launched kernel");
+    return 0L;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Synchronized launch kernel");
+  }
+
+  jlong primitiveReturnValue = 0L;
+  if (isObjectReturn) {
+    oop return_val;
+    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, device_return_value, T_OBJECT_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
+      return 0L;
+    }
+    thread->set_vm_result(return_val);
+  } else if (returnTypeSize > 0) {
+    jlong result;
+    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&primitiveReturnValue, device_return_value, T_LONG_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
+      return 0L;
+    }
+  }
+
+  // Free device memory allocated for result
+  if (returnTypeSize != 0) {
+    status = gpu::Ptx::_cuda_cu_memfree(device_return_value);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to free device memory of return value");
+      return 0L;
+    }
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Freed device memory of return value");
+  }
+
+  // Destroy context
+  status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context);
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status);
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to destroy context");
+    return 0L;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Destroy context");
+  }
+
+  return primitiveReturnValue;
+JRT_END
+
 bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
     return gpu::Ptx::execute_warp(1, 1, 1, kernel, ptxka, ret);
 }
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Mon Jan 13 22:28:57 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Mon Jan 13 22:32:27 2014 +0100
@@ -103,7 +103,9 @@
   typedef unsigned int CUdeviceptr;
 #endif
 
-typedef int CUdevice;     /**< CUDA device */
+typedef int CUdevice;     /* CUDA device */
+
+  static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, int encodedReturnTypeSize);
 
 private:
   typedef int (*cuda_cu_init_func_t)(unsigned int);
--- a/src/gpu/ptx/vm/ptxKernelArguments.hpp	Mon Jan 13 22:28:57 2014 +0100
+++ b/src/gpu/ptx/vm/ptxKernelArguments.hpp	Mon Jan 13 22:32:27 2014 +0100
@@ -34,6 +34,7 @@
 #define T_FLOAT_BYTE_SIZE  4
 #define T_DOUBLE_BYTE_SIZE 8
 #define T_LONG_BYTE_SIZE   8
+#define T_OBJECT_BYTE_SIZE 8
 #define T_ARRAY_BYTE_SIZE  8
 
 class PTXKernelArguments : public SignatureIterator {
--- a/src/share/vm/graal/graalCompilerToGPU.cpp	Mon Jan 13 22:28:57 2014 +0100
+++ b/src/share/vm/graal/graalCompilerToGPU.cpp	Mon Jan 13 22:32:27 2014 +0100
@@ -175,6 +175,13 @@
   }
 C2V_END
 
+C2V_VMENTRY(jlong, getLaunchKernelAddress, (JNIEnv *env, jobject))
+  if (gpu::get_target_il_type() == gpu::PTX) {
+    return (jlong) gpu::Ptx::execute_kernel_from_vm;
+  }
+  return 0L;
+C2V_END
+
 C2V_VMENTRY(jboolean, deviceInit, (JNIEnv *env, jobject))
   if (gpu::is_available() == false || gpu::has_gpu_linkage() == false) {
     if (TraceGPUInteraction) {
@@ -247,6 +254,7 @@
   {CC"availableProcessors",           CC"()I",                                    FN_PTR(availableProcessors)},
   {CC"executeExternalMethodVarargs",  CC"(["OBJECT HS_INSTALLED_CODE")"OBJECT,    FN_PTR(executeExternalMethodVarargs)},
   {CC"executeParallelMethodVarargs",  CC"(III["OBJECT HS_INSTALLED_CODE")"OBJECT, FN_PTR(executeParallelMethodVarargs)},
+  {CC"getLaunchKernelAddress",        CC"()J",                                    FN_PTR(getLaunchKernelAddress)},
 };
 
 int CompilerToGPU_methods_count() {