# HG changeset patch # User Doug Simon # Date 1389648747 -3600 # Node ID 220ed109bf7774fb6bd80cde3898a697fa5890c3 # Parent c70dddf5ce4a9c01e6cc0da6fb6a09f22079d9c7 initial code for calling PTX kernel code from Java with parameter marshaling and return value unmarshaling performed by a wrapper specified via manual graph construction diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java --- a/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java Mon Jan 13 22:28:57 2014 +0100 +++ b/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java Mon Jan 13 22:32:27 2014 +0100 @@ -29,7 +29,7 @@ import static com.oracle.graal.lir.ptx.PTXBitManipulationOp.IntrinsicOpcode.*; import static com.oracle.graal.lir.ptx.PTXCompare.*; -import java.lang.annotation.*; +import java.lang.reflect.*; import com.oracle.graal.api.code.*; import com.oracle.graal.api.meta.*; @@ -154,17 +154,13 @@ } for (ParameterNode param : graph.getNodes(ParameterNode.class)) { + int localIndex = param.index(); Value paramValue = params[param.index()]; - Annotation[] annos = graph.method().getParameterAnnotations()[param.index()]; - Warp warpAnnotation = null; - - if (annos != null) { - for (int a = 0; a < annos.length; a++) { - if (annos[a].annotationType().equals(Warp.class)) { - warpAnnotation = (Warp) annos[a]; - } - } + int parameterIndex = localIndex; + if (!Modifier.isStatic(graph.method().getModifiers())) { + parameterIndex--; } + Warp warpAnnotation = parameterIndex >= 0 ? 
MetaUtil.getParameterAnnotation(Warp.class, parameterIndex, graph.method()) : null; if (warpAnnotation != null) { setResult(param, emitWarpParam(paramValue.getKind(), warpAnnotation)); } else { diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.hotspot.ptx.test/src/com/oracle/graal/hotspot/ptx/test/PTXLaunchKernelTest.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.hotspot.ptx.test/src/com/oracle/graal/hotspot/ptx/test/PTXLaunchKernelTest.java Mon Jan 13 22:32:27 2014 +0100 @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package com.oracle.graal.hotspot.ptx.test; + +import static com.oracle.graal.api.code.CodeUtil.*; +import static com.oracle.graal.compiler.GraalCompiler.*; +import static com.oracle.graal.hotspot.HotSpotGraalRuntime.*; + +import org.junit.*; + +import com.oracle.graal.api.code.*; +import com.oracle.graal.api.code.CallingConvention.Type; +import com.oracle.graal.api.meta.*; +import com.oracle.graal.compiler.test.*; +import com.oracle.graal.debug.*; +import com.oracle.graal.debug.Debug.Scope; +import com.oracle.graal.hotspot.*; +import com.oracle.graal.hotspot.bridge.*; +import com.oracle.graal.hotspot.meta.*; +import com.oracle.graal.hotspot.ptx.*; +import com.oracle.graal.lir.asm.*; +import com.oracle.graal.nodes.*; +import com.oracle.graal.phases.*; +import com.oracle.graal.phases.tiers.*; +import com.oracle.graal.ptx.*; + +/** + * Tests the mechanism for launching a PTX kernel method via wrapper code generated by + * {@link PTXLaunchKernelGraphKit}. + */ +public class PTXLaunchKernelTest extends GraalCompilerTest { + + public PTXLaunchKernelTest() { + super(); + } + + /** + * Compiles and installs PTX kernel code for a given method. + */ + static class PTXKernel extends GraalCompilerTest { + public PTXKernel() { + super(PTX.class); + } + + static CompilerToGPU toGPU = HotSpotGraalRuntime.runtime().getCompilerToGPU(); + static boolean validDevice = toGPU.deviceInit(); + + @Override + protected CompilationResult compile(ResolvedJavaMethod method, StructuredGraph graph) { + CallingConvention cc = getCallingConvention(getCodeCache(), Type.JavaCallee, graph.method(), false); + + /* + * Use Suites.createDefaultSuites() instead of GraalCompilerTest.suites. The + * GraalCompilerTest.suites variable contains the Suites for the HotSpotRuntime. This + * code will not run on HotSpot, so it should use the plain Graal default suites, + * without HotSpot specific phases. 
+ * + * Ultimately we might want to have both the kernel and the code natively compiled for + * GPU fallback to CPU in cases of ECC failure on kernel invocation. + */ + Suites suites = Suites.createDefaultSuites(); + PTXHotSpotBackend ptxBackend = (PTXHotSpotBackend) getBackend(); + ExternalCompilationResult kernelResult = compileGraph(graph, cc, method, getProviders(), ptxBackend, ptxBackend.getTarget(), null, getDefaultGraphBuilderSuite(), + OptimisticOptimizations.NONE, getProfilingInfo(graph), new SpeculationLog(), suites, true, new ExternalCompilationResult(), CompilationResultBuilderFactory.Default); + + Assume.assumeTrue(validDevice); + Assert.assertTrue(kernelResult.getTargetCode() != null); + try (Scope ds = Debug.scope("GeneratingKernel")) { + long kernel = toGPU.generateKernel(kernelResult.getTargetCode(), method.getName()); + kernelResult.setEntryPoint(kernel); + } catch (Throwable e) { + throw Debug.handle(e); + } + return kernelResult; + } + + @Override + protected InstalledCode addMethod(ResolvedJavaMethod method, CompilationResult compResult) { + HotSpotCodeCacheProvider codeCache = (HotSpotCodeCacheProvider) getCodeCache(); + return codeCache.addExternalMethod(method, compResult); + } + + /** + * Compiles and installs PTX kernel code for {@code method}. 
+ */ + InstalledCode getKernelCode(ResolvedJavaMethod method, StructuredGraph graph) { + return getCode(method, graph, true); + } + } + + @Override + protected InstalledCode getCode(ResolvedJavaMethod method, StructuredGraph graph) { + InstalledCode kernelCode = new PTXKernel().getKernelCode(method, graph); + StructuredGraph launchKernel = new PTXLaunchKernelGraphKit(method, kernelCode.getStart(), runtime().getHostProviders()).getGraph(); + return super.getCode(method, launchKernel); + } + + @Test + public void testStaticIntKernel() { + test("staticIntKernel", 'a', 42); + } + + @Test + public void testVirtualIntKernel() { + test("virtualIntKernel", 'a', 42); + } + + public static int staticIntKernel(char p0, int p1) { + return p1 + p0; + } + + public int virtualIntKernel(char p0, int p1) { + return p1 + p0; + } +} diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java --- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Mon Jan 13 22:28:57 2014 +0100 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Mon Jan 13 22:32:27 2014 +0100 @@ -22,6 +22,11 @@ */ package com.oracle.graal.hotspot.ptx; +import static com.oracle.graal.api.code.CallingConvention.Type.*; +import static com.oracle.graal.api.meta.LocationIdentity.*; +import static com.oracle.graal.hotspot.HotSpotForeignCallLinkage.RegisterEffect.*; +import static com.oracle.graal.hotspot.HotSpotForeignCallLinkage.Transition.*; +import static com.oracle.graal.hotspot.meta.HotSpotForeignCallsProviderImpl.*; import static com.oracle.graal.lir.LIRValueUtil.*; import java.util.*; @@ -42,15 +47,25 @@ import com.oracle.graal.lir.StandardOp.LabelOp; import com.oracle.graal.lir.asm.*; import com.oracle.graal.lir.ptx.*; +import com.oracle.graal.lir.ptx.PTXMemOp.LoadReturnAddrOp; import com.oracle.graal.nodes.*; import com.oracle.graal.nodes.cfg.*; -import 
com.oracle.graal.lir.ptx.PTXMemOp.LoadReturnAddrOp; +import com.oracle.graal.word.*; /** * HotSpot PTX specific backend. */ public class PTXHotSpotBackend extends HotSpotBackend { + /** + * Descriptor for the PTX runtime method for launching a kernel. The C++ signature is: + * + *
+     *     jlong gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, jint encodedReturnTypeSize)
+     * 
+ */ + public static final ForeignCallDescriptor LAUNCH_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class, Word.class, long.class, long.class, int.class, int.class); + public PTXHotSpotBackend(HotSpotGraalRuntime runtime, HotSpotProviders providers) { super(runtime, providers); } @@ -61,6 +76,14 @@ } @Override + public void completeInitialization() { + HotSpotHostForeignCallsProvider hostForeignCalls = (HotSpotHostForeignCallsProvider) getRuntime().getHostProviders().getForeignCalls(); + long launchKernel = getRuntime().getCompilerToGPU().getLaunchKernelAddress(); + hostForeignCalls.registerForeignCall(LAUNCH_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION); + super.completeInitialization(); + } + + @Override public FrameMap newFrameMap() { return new PTXFrameMap(getCodeCache()); } diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXLaunchKernelGraphKit.java Mon Jan 13 22:32:27 2014 +0100 @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package com.oracle.graal.hotspot.ptx; + +import static com.oracle.graal.api.meta.DeoptimizationReason.*; +import static com.oracle.graal.api.meta.LocationIdentity.*; +import static com.oracle.graal.asm.NumUtil.*; +import static com.oracle.graal.hotspot.ptx.PTXHotSpotBackend.*; +import static com.oracle.graal.hotspot.replacements.HotSpotReplacementsUtil.*; +import static java.lang.reflect.Modifier.*; + +import java.util.*; + +import com.oracle.graal.api.meta.*; +import com.oracle.graal.debug.*; +import com.oracle.graal.graph.*; +import com.oracle.graal.hotspot.meta.*; +import com.oracle.graal.hotspot.nodes.*; +import com.oracle.graal.hotspot.stubs.*; +import com.oracle.graal.java.*; +import com.oracle.graal.nodes.*; +import com.oracle.graal.nodes.HeapAccess.BarrierType; +import com.oracle.graal.nodes.calc.*; +import com.oracle.graal.nodes.extended.*; +import com.oracle.graal.nodes.type.*; +import com.oracle.graal.replacements.nodes.*; +import com.oracle.graal.word.*; + +/** + * Utility for building a graph for launching a PTX kernel compiled for a method. This graph created + * is something like the following pseudo code: + * + *
+ *     jlong kernel(p0, p1, ..., pN) {
+ *         jint kernelParamsBufSize = SIZE_OF_ALIGNED_PARAMS_WITH_PADDING(p0, p1, ..., pN);
+ *         jbyte kernelParamsBuf[kernelParamsBufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
+ *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, KERNEL_ADDRESS, kernelParamsBuf, kernelParamsBufSize, ENCODED_RETURN_TYPE_SIZE);
+ *         return result;
+ *     }
+ * 
+ */ +public class PTXLaunchKernelGraphKit extends GraphKit { + + /** + * The incoming Java arguments to the kernel invocation. + */ + ParameterNode[] javaParameters; + + /** + * The size of the buffer holding the parameters and the extra word for storing the pointer to + * device memory for the return value. This will be the same as + * PTXKernelArguments::device_argument_buffer_size(). + */ + int kernelParametersAndReturnValueBufferSize; + + /** + * Offsets of each Java argument in the parameters buffer. + */ + int[] javaParameterOffsetsInKernelParametersBuffer; + + /** + * Creates a graph implementing the transition from Java to the native routine that launches + * some compiled PTX code. + * + * @param kernelMethod a method that has been compiled to PTX kernel code + * @param kernelAddress the address of the installed PTX code for {@code kernelMethod} + */ + public PTXLaunchKernelGraphKit(ResolvedJavaMethod kernelMethod, long kernelAddress, HotSpotProviders providers) { + super(new StructuredGraph(kernelMethod), providers); + int wordSize = providers.getCodeCache().getTarget().wordSize; + Kind wordKind = providers.getCodeCache().getTarget().wordKind; + Signature sig = kernelMethod.getSignature(); + boolean isStatic = isStatic(kernelMethod.getModifiers()); + int sigCount = sig.getParameterCount(false); + javaParameters = new ParameterNode[(!isStatic ? 
1 : 0) + sigCount]; + javaParameterOffsetsInKernelParametersBuffer = new int[javaParameters.length]; + int javaParametersIndex = 0; + Kind returnKind = sig.getReturnKind(); + + BitSet objects = new BitSet(); + if (!isStatic) { + javaParameters[javaParametersIndex] = unique(new ParameterNode(javaParametersIndex, StampFactory.declaredNonNull(kernelMethod.getDeclaringClass()))); + kernelParametersAndReturnValueBufferSize += wordSize; + javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex++] = 0; + objects.set(0); + } + for (int i = 0; i < sigCount; i++) { + Kind kind = sig.getParameterKind(i); + int kindByteSize = kind.getBitCount() / Byte.SIZE; + while ((kernelParametersAndReturnValueBufferSize % kindByteSize) != 0) { + kernelParametersAndReturnValueBufferSize++; + } + javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = kernelParametersAndReturnValueBufferSize; + Stamp stamp; + if (kind == Kind.Object) { + stamp = StampFactory.object(); + int slot = kernelParametersAndReturnValueBufferSize / wordSize; + objects.set(slot); + } else { + stamp = StampFactory.forKind(kind); + } + ParameterNode param = unique(new ParameterNode(javaParametersIndex, stamp)); + javaParameters[javaParametersIndex++] = param; + kernelParametersAndReturnValueBufferSize += kindByteSize; + } + kernelParametersAndReturnValueBufferSize = roundUp(kernelParametersAndReturnValueBufferSize, wordSize); + + // Add slot for holding pointer to device memory storing return value + int encodedReturnTypeSize = 0; + if (returnKind != Kind.Void) { + kernelParametersAndReturnValueBufferSize += wordSize; + if (returnKind == Kind.Object) { + encodedReturnTypeSize = -wordSize; + } else { + encodedReturnTypeSize = returnKind.getBitCount() / Byte.SIZE; + } + } + + ReadRegisterNode threadArg = append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false)); + ConstantNode kernelAddressArg = ConstantNode.forLong(kernelAddress, getGraph()); + AllocaNode 
kernelParametersAndReturnValueBufferArg = append(new AllocaNode(kernelParametersAndReturnValueBufferSize / wordSize, objects)); + ConstantNode kernelParametersAndReturnValueBufferSizeArg = ConstantNode.forInt(kernelParametersAndReturnValueBufferSize, getGraph()); + ConstantNode encodedReturnTypeSizeArg = ConstantNode.forInt(encodedReturnTypeSize, getGraph()); + + for (javaParametersIndex = 0; javaParametersIndex < javaParameters.length; javaParametersIndex++) { + ParameterNode javaParameter = javaParameters[javaParametersIndex]; + int javaParameterOffset = javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex]; + LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, javaParameter.kind(), javaParameterOffset, getGraph()); + append(new WriteNode(kernelParametersAndReturnValueBufferArg, javaParameter, location, BarrierType.NONE, false, false)); + } + if (returnKind != Kind.Void) { + LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, kernelParametersAndReturnValueBufferSize - wordSize, getGraph()); + append(new WriteNode(kernelParametersAndReturnValueBufferArg, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false)); + } + + FrameStateBuilder fsb = new FrameStateBuilder(kernelMethod, getGraph(), true); + FrameState fs = fsb.create(0); + getGraph().start().setStateAfter(fs); + + ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, threadArg, kernelAddressArg, kernelParametersAndReturnValueBufferArg, + kernelParametersAndReturnValueBufferSizeArg, encodedReturnTypeSizeArg)); + result.setDeoptimizationState(fs); + + ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph()); + InvokeNode handlePendingException = createInvoke(getClass(), "handlePendingException", threadArg, isObjectResultArg); + handlePendingException.setStateAfter(fs); + InvokeNode getObjectResult = null; + + ValueNode returnValue; + 
switch (returnKind) { + case Void: + returnValue = null; + break; + case Boolean: + case Byte: + case Short: + case Char: + case Int: + returnValue = unique(new ConvertNode(Kind.Long, Kind.Int, result)); + break; + case Long: + returnValue = result; + break; + case Float: + case Double: + returnValue = unique(new ReinterpretNode(returnKind, result)); + break; + case Object: + getObjectResult = createInvoke(getClass(), "getObjectResult", threadArg); + returnValue = append(getObjectResult); + break; + default: + throw new GraalInternalError("%s return kind not supported", returnKind); + } + + append(new ReturnNode(returnValue)); + + if (Debug.isDumpEnabled()) { + Debug.dump(getGraph(), "Initial kernel launch graph"); + } + + rewriteWordTypes(); + inlineInvokes(); + + if (Debug.isDumpEnabled()) { + Debug.dump(getGraph(), "Kernel launch graph before compilation"); + } + } + + public static void handlePendingException(Word thread, boolean isObjectResult) { + if (clearPendingException(thread)) { + if (isObjectResult) { + getAndClearObjectResult(thread); + } + DeoptimizeNode.deopt(DeoptimizationAction.None, RuntimeConstraint); + } + } + + public static Object getObjectResult(Word thread) { + return getAndClearObjectResult(thread); + } +} diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java --- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java Mon Jan 13 22:28:57 2014 +0100 +++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPU.java Mon Jan 13 22:32:27 2014 +0100 @@ -58,4 +58,9 @@ Object executeExternalMethodVarargs(Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException; Object executeParallelMethodVarargs(int dimX, int dimY, int dimZ, Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException; + + /** + * Gets the address of the runtime function for launching 
a kernel function. + */ + long getLaunchKernelAddress(); } diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java --- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java Mon Jan 13 22:28:57 2014 +0100 +++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/bridge/CompilerToGPUImpl.java Mon Jan 13 22:32:27 2014 +0100 @@ -42,4 +42,6 @@ public native Object executeExternalMethodVarargs(Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException; public native Object executeParallelMethodVarargs(int dimX, int dimY, int dimZ, Object[] args, HotSpotInstalledCode hotspotInstalledCode) throws InvalidInstalledCodeException; + + public native long getLaunchKernelAddress(); } diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotForeignCallsProviderImpl.java --- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotForeignCallsProviderImpl.java Mon Jan 13 22:28:57 2014 +0100 +++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotForeignCallsProviderImpl.java Mon Jan 13 22:32:27 2014 +0100 @@ -61,7 +61,7 @@ /** * Registers the linkage for a foreign call. */ - protected HotSpotForeignCallLinkage register(HotSpotForeignCallLinkage linkage) { + public HotSpotForeignCallLinkage register(HotSpotForeignCallLinkage linkage) { assert !foreignCalls.containsKey(linkage.getDescriptor()) : "already registered linkage for " + linkage.getDescriptor(); foreignCalls.put(linkage.getDescriptor(), linkage); return linkage; @@ -77,7 +77,7 @@ * @param transition specifies if this is a {@linkplain Transition#LEAF leaf} call * @param killedLocations the memory locations killed by the stub call */ - protected HotSpotForeignCallLinkage registerStubCall(ForeignCallDescriptor descriptor, boolean reexecutable, Transition transition, LocationIdentity... 
killedLocations) { + public HotSpotForeignCallLinkage registerStubCall(ForeignCallDescriptor descriptor, boolean reexecutable, Transition transition, LocationIdentity... killedLocations) { return register(HotSpotForeignCallLinkage.create(metaAccess, codeCache, this, descriptor, 0L, PRESERVES_REGISTERS, JavaCall, JavaCallee, transition, reexecutable, killedLocations)); } @@ -95,7 +95,7 @@ * cannot be re-executed. * @param killedLocations the memory locations killed by the foreign call */ - protected HotSpotForeignCallLinkage registerForeignCall(ForeignCallDescriptor descriptor, long address, CallingConvention.Type outgoingCcType, RegisterEffect effect, Transition transition, + public HotSpotForeignCallLinkage registerForeignCall(ForeignCallDescriptor descriptor, long address, CallingConvention.Type outgoingCcType, RegisterEffect effect, Transition transition, boolean reexecutable, LocationIdentity... killedLocations) { Class resultType = descriptor.getResultType(); assert transition != NOT_LEAF || resultType.isPrimitive() || Word.class.isAssignableFrom(resultType) : "non-leaf foreign calls must return objects in thread local storage: " + descriptor; @@ -115,7 +115,7 @@ * cannot be re-executed. * @param killedLocations the memory locations killed by the foreign call */ - protected void linkForeignCall(HotSpotProviders providers, ForeignCallDescriptor descriptor, long address, boolean prependThread, Transition transition, boolean reexecutable, + public void linkForeignCall(HotSpotProviders providers, ForeignCallDescriptor descriptor, long address, boolean prependThread, Transition transition, boolean reexecutable, LocationIdentity... 
killedLocations) { ForeignCallStub stub = new ForeignCallStub(providers, address, descriptor, prependThread, transition, reexecutable, killedLocations); HotSpotForeignCallLinkage linkage = stub.getLinkage(); diff -r c70dddf5ce4a -r 220ed109bf77 graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXControlFlow.java --- a/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXControlFlow.java Mon Jan 13 22:28:57 2014 +0100 +++ b/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXControlFlow.java Mon Jan 13 22:32:27 2014 +0100 @@ -57,7 +57,7 @@ } } - public static class ReturnNoValOp extends PTXLIRInstruction { + public static class ReturnNoValOp extends PTXLIRInstruction implements BlockEndOp { public ReturnNoValOp() { } diff -r c70dddf5ce4a -r 220ed109bf77 mx/projects --- a/mx/projects Mon Jan 13 22:28:57 2014 +0100 +++ b/mx/projects Mon Jan 13 22:32:27 2014 +0100 @@ -202,6 +202,14 @@ project@com.oracle.graal.hotspot.ptx@javaCompliance=1.7 project@com.oracle.graal.hotspot.ptx@workingSets=Graal,HotSpot,PTX +# graal.hotspot.ptx.test +project@com.oracle.graal.hotspot.ptx.test@subDir=graal +project@com.oracle.graal.hotspot.ptx.test@sourceDirs=src +project@com.oracle.graal.hotspot.ptx.test@dependencies=com.oracle.graal.hotspot.ptx,com.oracle.graal.replacements.test +project@com.oracle.graal.hotspot.ptx.test@checkstyle=com.oracle.graal.graph +project@com.oracle.graal.hotspot.ptx.test@javaCompliance=1.7 +project@com.oracle.graal.hotspot.ptx.test@workingSets=Graal,HotSpot,PTX + # graal.hotspot.hsail project@com.oracle.graal.hotspot.hsail@subDir=graal project@com.oracle.graal.hotspot.hsail@sourceDirs=src diff -r c70dddf5ce4a -r 220ed109bf77 src/gpu/ptx/vm/gpu_ptx.cpp --- a/src/gpu/ptx/vm/gpu_ptx.cpp Mon Jan 13 22:28:57 2014 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Mon Jan 13 22:32:27 2014 +0100 @@ -29,6 +29,7 @@ #include "utilities/ostream.hpp" #include "memory/allocation.hpp" #include "memory/allocation.inline.hpp" +#include 
"runtime/interfaceSupport.hpp"
 #include "ptxKernelArguments.hpp"
 
 void * gpu::Ptx::_device_context;
@@ -336,6 +337,162 @@
   return cu_function;
 }
 
+// Launches a PTX kernel on behalf of the Java wrapper code that calls the
+// "execute_kernel_from_vm" foreign call.
+//
+//   kernel                              - kernel handle, used as a CUfunc_st*; 0 raises NullPointerException
+//   parametersAndReturnValueBuffer      - address of the marshaled parameters; if a value is returned, the
+//                                         trailing sizeof(CUdeviceptr) bytes receive the device address
+//                                         allocated here for that value
+//   parametersAndReturnValueBufferSize  - size of that buffer in bytes
+//   encodedReturnTypeSize               - 0: no return value, > 0: byte size of a primitive return value,
+//                                         < 0: negated byte size of an object return value
+//
+// Returns the primitive result, or 0 when there is none or an exception was posted. An object result is
+// handed back through thread->set_vm_result() instead.
+JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, int encodedReturnTypeSize))
+  // NOTE(review): this dump is unconditional, unlike the TraceGPUInteraction-guarded messages below - confirm it is intended.
+  tty->print_cr("*** gpu::Ptx::execute_kernel_from_vm(kernel=%p, parametersAndReturnValueBuffer=%p, parametersAndReturnValueBufferSize=%d, encodedReturnTypeSize=%d)",
+      kernel, parametersAndReturnValueBuffer, parametersAndReturnValueBufferSize, encodedReturnTypeSize);
+  tty->print(" buffer as bytes: ");
+  for (int i = 0; i < parametersAndReturnValueBufferSize; i++) {
+    tty->print(" 0x%02x", ((jbyte*) (address) parametersAndReturnValueBuffer)[i] & 0xFF);
+  }
+  tty->cr();
+  tty->print(" buffer as ints: ");
+  for (int i = 0; i < (parametersAndReturnValueBufferSize / 4); i++) {
+    tty->print(" %d", ((jint*) (address) parametersAndReturnValueBuffer)[i]);
+  }
+  tty->cr();
+  tty->print(" buffer as words: ");
+  for (unsigned i = 0; i < (parametersAndReturnValueBufferSize / sizeof(void*)); i++) {
+    tty->print(" "INTPTR_FORMAT, ((void**) (address) parametersAndReturnValueBuffer)[i]);
+  }
+  tty->cr();
+  if (kernel == 0L) {
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_NullPointerException(), NULL);
+    return 0L;
+  }
+
+
+  // grid dimensionality
+  unsigned int gridX = 1;
+  unsigned int gridY = 1;
+  unsigned int gridZ = 1;
+
+  // thread dimensionality
+  unsigned int blockX = 1;
+  unsigned int blockY = 1;
+  unsigned int blockZ = 1;
+
+  struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel;
+
+  void * config[5] = {
+    GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) parametersAndReturnValueBuffer,
+    GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &parametersAndReturnValueBufferSize,
+    GRAAL_CU_LAUNCH_PARAM_END
+  };
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] launching kernel");
+  }
+
+  bool isObjectReturn = encodedReturnTypeSize < 0;
+  int returnTypeSize = encodedReturnTypeSize < 0 ? -encodedReturnTypeSize : encodedReturnTypeSize;
+  gpu::Ptx::CUdeviceptr device_return_value;
+  int status;
+  if (returnTypeSize != 0) {
+    status = _cuda_cu_memalloc(&device_return_value, returnTypeSize);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to allocate memory for return value pointer on device");
+      return 0L;
+    }
+    // NOTE(review): device_return_value is not freed on the error returns further down.
+    // Push device_return_value to kernelParams. The offset is a byte offset, so it is applied to the
+    // byte address before the cast to CUdeviceptr*.
+    gpu::Ptx::CUdeviceptr* returnValuePtr = (gpu::Ptx::CUdeviceptr*) (((address) parametersAndReturnValueBuffer) + parametersAndReturnValueBufferSize - sizeof(device_return_value));
+    *returnValuePtr = device_return_value;
+  }
+
+  status = _cuda_cu_launch_kernel(cu_function,
+                                  gridX, gridY, gridZ,
+                                  blockX, blockY, blockZ,
+                                  0, NULL, NULL, (void **) &config);
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to launch kernel");
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to launch kernel");
+    return 0L;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", blockX, blockY, blockZ);
+  }
+
+  status = _cuda_cu_ctx_synchronize();
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status);
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to synchronize launched kernel");
+    return 0L;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Synchronized launch kernel");
+  }
+
+  jlong primitiveReturnValue = 0L;
+  if (isObjectReturn) {
+    oop return_val;
+    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, device_return_value, T_OBJECT_BYTE_SIZE);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
+      return 0L;
+    }
+    thread->set_vm_result(return_val);
+  } else if (returnTypeSize > 0) {
+    // Copy only the returnTypeSize bytes that were allocated on the device; the remaining bytes of
+    // primitiveReturnValue stay zero (assumes a little-endian host so the value lands in the low bytes).
+    status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&primitiveReturnValue, device_return_value, returnTypeSize);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument");
+      return 0L;
+    }
+  }
+
+  // Free device memory allocated for result
+  if (returnTypeSize != 0) {
+    status = gpu::Ptx::_cuda_cu_memfree(device_return_value);
+    if (status != GRAAL_CUDA_SUCCESS) {
+      tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status);
+      SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to free device memory of return value");
+      return 0L;
+    }
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Freed device memory of return value");
+  }
+
+  // Destroy context
+  // NOTE(review): _device_context is destroyed after every launch - confirm that a later launch re-creates it.
+  status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context);
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status);
+    SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to destroy context");
+    return 0L;
+  }
+
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Destroy context");
+  }
+
+  return primitiveReturnValue;
+JRT_END
+
 bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) {
   return gpu::Ptx::execute_warp(1, 1, 1, kernel, ptxka, ret);
 }
diff -r c70dddf5ce4a -r 220ed109bf77 src/gpu/ptx/vm/gpu_ptx.hpp
--- 
a/src/gpu/ptx/vm/gpu_ptx.hpp Mon Jan 13 22:28:57 2014 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.hpp Mon Jan 13 22:32:27 2014 +0100 @@ -103,7 +103,9 @@ typedef unsigned int CUdeviceptr; #endif -typedef int CUdevice; /**< CUDA device */ +typedef int CUdevice; /* CUDA device */ + + static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jlong parametersAndReturnValueBuffer, jint parametersAndReturnValueBufferSize, int encodedReturnTypeSize); private: typedef int (*cuda_cu_init_func_t)(unsigned int); diff -r c70dddf5ce4a -r 220ed109bf77 src/gpu/ptx/vm/ptxKernelArguments.hpp --- a/src/gpu/ptx/vm/ptxKernelArguments.hpp Mon Jan 13 22:28:57 2014 +0100 +++ b/src/gpu/ptx/vm/ptxKernelArguments.hpp Mon Jan 13 22:32:27 2014 +0100 @@ -34,6 +34,7 @@ #define T_FLOAT_BYTE_SIZE 4 #define T_DOUBLE_BYTE_SIZE 8 #define T_LONG_BYTE_SIZE 8 +#define T_OBJECT_BYTE_SIZE 8 #define T_ARRAY_BYTE_SIZE 8 class PTXKernelArguments : public SignatureIterator { diff -r c70dddf5ce4a -r 220ed109bf77 src/share/vm/graal/graalCompilerToGPU.cpp --- a/src/share/vm/graal/graalCompilerToGPU.cpp Mon Jan 13 22:28:57 2014 +0100 +++ b/src/share/vm/graal/graalCompilerToGPU.cpp Mon Jan 13 22:32:27 2014 +0100 @@ -175,6 +175,13 @@ } C2V_END +C2V_VMENTRY(jlong, getLaunchKernelAddress, (JNIEnv *env, jobject)) + if (gpu::get_target_il_type() == gpu::PTX) { + return (jlong) gpu::Ptx::execute_kernel_from_vm; + } + return 0L; +C2V_END + C2V_VMENTRY(jboolean, deviceInit, (JNIEnv *env, jobject)) if (gpu::is_available() == false || gpu::has_gpu_linkage() == false) { if (TraceGPUInteraction) { @@ -247,6 +254,7 @@ {CC"availableProcessors", CC"()I", FN_PTR(availableProcessors)}, {CC"executeExternalMethodVarargs", CC"(["OBJECT HS_INSTALLED_CODE")"OBJECT, FN_PTR(executeExternalMethodVarargs)}, {CC"executeParallelMethodVarargs", CC"(III["OBJECT HS_INSTALLED_CODE")"OBJECT, FN_PTR(executeParallelMethodVarargs)}, + {CC"getLaunchKernelAddress", CC"()J", FN_PTR(getLaunchKernelAddress)}, }; int CompilerToGPU_methods_count() {