# HG changeset patch # User Doug Simon # Date 1390584858 -3600 # Node ID eed1aafead0d938d1dbd5544c85a4128c1dce21a # Parent 57d1746e3b3d9bcb39f8eebbdfc6d0b658c852a2# Parent 9de3efd2ea8f1b3a2f4f62d5c7802b8ab59011c4 Merge. diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ControlPTXTest.java --- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ControlPTXTest.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ControlPTXTest.java Fri Jan 24 18:34:18 2014 +0100 @@ -26,6 +26,7 @@ public class ControlPTXTest extends PTXTest { + @Ignore("[CUDA] *** Error (status=702): Synchronize kernel") @Test public void testControl() { test("testLoop", 42); diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/FloatPTXTest.java --- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/FloatPTXTest.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/FloatPTXTest.java Fri Jan 24 18:34:18 2014 +0100 @@ -63,7 +63,6 @@ return 32.0 + a; } - @Ignore @Test public void testSub() { compileKernel("testSub2F"); @@ -98,7 +97,7 @@ return 32.0 - a; } - @Ignore + @Ignore("[CUDA] *** Error (209) Failed to load module data with online compiler options for method testMul2F") @Test public void testMul() { compileKernel("testMul2F"); @@ -133,7 +132,7 @@ return 32.0 * a; } - @Ignore + @Ignore("[CUDA] *** Error (209) Failed to load module data with online compiler options for method testDiv2F") @Test public void testDiv() { compileKernel("testDiv2F"); @@ -168,7 +167,6 @@ return 32.0 / a; } - @Ignore @Test public void testNeg() { compileKernel("testNeg2F"); @@ -183,12 +181,11 @@ return -a; } - @Ignore + @Ignore("need linkage to PTX remainder") @Test public void testRem() { - // need linkage to PTX remainder() - // compileKernel("testRem2F"); - // compileKernel("testRem2D"); + compileKernel("testRem2F"); + compileKernel("testRem2D"); } public static float testRem2F(float a, float b) { @@ -199,7 +196,7 @@ return a % b; } - @Ignore + @Ignore("[CUDA] *** Error (209) Failed to load module data with online compiler options for method testF2I") @Test public void testFloatConversion() { compileKernel("testF2I"); diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ObjectPTXTest.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ObjectPTXTest.java Fri Jan 24 18:34:18 2014 +0100 @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package com.oracle.graal.compiler.ptx.test; + +import java.util.*; + +import org.junit.*; + +public class ObjectPTXTest extends PTXTest { + + static class A { + boolean z = true; + byte b = 17; + char c = 'D'; + short s = 12345; + int i = 0x1234565; + long l; + Object o; + float f; + double d; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test0() { + for (long l : new long[]{Long.MIN_VALUE, -10, 0, 1, 2, 10, Long.MAX_VALUE}) { + A a = new A(); + a.l = l; + test("testLong", l * 2, a); + } + } + + public static long testLong(long l, A a) { + return a.l + l; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test1() { + for (int i : new int[]{Integer.MIN_VALUE, -10, 0, 1, 2, 10, Integer.MAX_VALUE}) { + A a = new A(); + a.i = i; + test("testInt", i * 2, a); + } + } + + public static int testInt(int i, A a) { + return a.i + i; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test2() { + A a = new A(); + a.z = true; + test("testBoolean", a); + a.z = false; + test("testBoolean", a); + } + + public static boolean testBoolean(A a) { + return a.z; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test3() { + for (byte b : new byte[]{Byte.MIN_VALUE, -10, 0, 1, 2, 10, Byte.MAX_VALUE}) { + A a = new A(); + a.b = b; + test("testByte", b, a); + } + } + + public static int testByte(byte b, A a) { + return a.b + b; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test4() { + for (short s : new short[]{Short.MIN_VALUE, -10, 0, 1, 2, 10, Short.MAX_VALUE}) { + A a = new A(); + a.s = s; + test("testShort", s, a); + } + } + + public static int testShort(short s, A a) { + return a.s + s; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test5() { + for (char c : new char[]{Character.MIN_VALUE, 1, 2, 10, Character.MAX_VALUE}) { + A a = new A(); + a.c = c; + test("testChar", (char) (c - 5), a); + } + } + + public static int testChar(char c, A a) { + return a.c + c; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test6() { + for (float f : new float[]{Float.MIN_VALUE, Float.MIN_NORMAL, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.NaN, -11.45F, -0.0F, 0.0F, 2, 10, Float.MAX_VALUE}) { + A a = new A(); + a.f = f; + test("testFloat", f * 2, a); + } + } + + public static float testFloat(float f, A a) { + return a.f + f; + } + + @Ignore("Object parameters not yet GC safe") + @Test + public void test7() { + for (double d : new double[]{Double.MIN_VALUE, Double.MIN_NORMAL, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY - 11.45D, -0.0D, 0.0D, 2, 10, Double.MAX_VALUE}) { + A a = new A(); + a.d = d; + test("testDouble", d * 2, a); + } + } + + public static double testDouble(double d, A a) { + return a.d + d; + } + + @Ignore("Object return values not yet supported") + @Test + public void test9() { + for (Object o : new Object[]{null, "object", new Object(), new HashMap()}) { + A a = new A(); + a.o = o; + test("testObject", a); + } + } + + public static Object testObject(A a) { + return a.o; + } +} diff -r 9de3efd2ea8f -r 
eed1aafead0d graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXPhase.java --- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXPhase.java Fri Jan 24 17:43:14 2014 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -package com.oracle.graal.compiler.ptx.test; - -import com.oracle.graal.nodes.*; -import com.oracle.graal.nodes.type.*; -import com.oracle.graal.phases.*; - -public class PTXPhase extends Phase { - - @Override - protected void run(StructuredGraph graph) { - /* - * Assume that null checks would be done on the CPU caller side prior to copying data onto - * the GPU. - */ - for (ParameterNode param : graph.getNodes(ParameterNode.class)) { - if (param.stamp() instanceof ObjectStamp) { - param.setStamp(StampFactory.declaredNonNull(((ObjectStamp) param.stamp()).type())); - } - } - } -} diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTest.java --- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTest.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/PTXTest.java Fri Jan 24 18:34:18 2014 +0100 @@ -69,7 +69,15 @@ Assume.assumeTrue(ptxBackend.isDeviceInitialized()); HotSpotNmethod installedPTXCode = installKernel(method, ptxCode); StructuredGraph wrapper = new PTXWrapperBuilder(method, installedPTXCode, (HotSpotProviders) getProviders()).getGraph(); - return super.getCode(method, wrapper); + + // The PTX C++ layer expects a 1:1 relationship between kernel compilation + // and kernel execution as it creates a cuContext in the former and + // destroys it in the latter. So, each kernel installed requires a unique + // wrapper. 
+ // TODO: do cuContext management properly + boolean forceCompile = true; + + return getCode(method, wrapper, forceCompile); } protected static void compileAndPrintCode(PTXTest test) { diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILCompilationResult.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILCompilationResult.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILCompilationResult.java Fri Jan 24 18:34:18 2014 +0100 @@ -43,8 +43,8 @@ import com.oracle.graal.lir.asm.*; import com.oracle.graal.nodes.*; import com.oracle.graal.nodes.java.*; -import com.oracle.graal.nodes.type.*; import com.oracle.graal.phases.*; +import com.oracle.graal.phases.common.*; import com.oracle.graal.phases.tiers.*; import com.oracle.graal.phases.util.*; import com.oracle.graal.runtime.*; @@ -139,8 +139,7 @@ Providers providers = backend.getProviders(); TargetDescription target = providers.getCodeCache().getTarget(); PhaseSuite graphBuilderSuite = backend.getSuites().getDefaultGraphBuilderSuite().copy(); - graphBuilderSuite.appendPhase(new HSAILPhase()); - new HSAILPhase().apply(graph); + graphBuilderSuite.appendPhase(new NonNullParametersPhase()); CallingConvention cc = CodeUtil.getCallingConvention(providers.getCodeCache(), Type.JavaCallee, graph.method(), false); SuitesProvider suitesProvider = backend.getSuites(); try { @@ -173,18 +172,6 @@ } } - private static class HSAILPhase extends Phase { - - @Override - protected void run(StructuredGraph graph) { - for (ParameterNode param : graph.getNodes(ParameterNode.class)) { - if (param.stamp() instanceof ObjectStamp) { - param.setStamp(StampFactory.declaredNonNull(((ObjectStamp) param.stamp()).type())); - } - } - } - } - protected HSAILCompilationResult() { } diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java --- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java Fri Jan 24 18:34:18 2014 +0100 @@ -58,6 +58,7 @@ import com.oracle.graal.nodes.*; import com.oracle.graal.nodes.cfg.*; import com.oracle.graal.phases.*; +import com.oracle.graal.phases.common.*; import com.oracle.graal.phases.tiers.*; import com.oracle.graal.word.*; @@ -67,7 +68,7 @@ public class PTXHotSpotBackend extends HotSpotBackend { /** - * Descriptor for the PTX runtime method for launching a kernel. The C++ signature is: + * Descriptor for the PTX runtime method for calling a kernel. The C++ signature is: * *
      *     jlong (JavaThread* thread,
@@ -77,11 +78,14 @@
      *            jint dimZ,
      *            jlong parametersAndReturnValueBuffer,
      *            jint parametersAndReturnValueBufferSize,
+     *            jint objectParametersCount,
+     *            jlong objectParametersOffsets,
+     *            jlong pinnedObjects,
      *            jint encodedReturnTypeSize)
      * 
*/ // @formatter:off - public static final ForeignCallDescriptor LAUNCH_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class, + public static final ForeignCallDescriptor CALL_KERNEL = new ForeignCallDescriptor("execute_kernel_from_vm", long.class, Word.class, // thread long.class, // kernel int.class, // dimX @@ -89,6 +93,9 @@ int.class, // dimZ long.class, // parametersAndReturnValueBuffer int.class, // parametersAndReturnValueBufferSize + int.class, // objectParameterCount + long.class, // objectParameterOffsets + long.class, // pinnedObjects int.class); // encodedReturnTypeSize // @formatter:on @@ -114,7 +121,7 @@ CompilerToGPU compilerToGPU = getRuntime().getCompilerToGPU(); if (deviceInitialized) { long launchKernel = compilerToGPU.getLaunchKernelAddress(); - hostForeignCalls.registerForeignCall(LAUNCH_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION); + hostForeignCalls.registerForeignCall(CALL_KERNEL, launchKernel, NativeCall, DESTROYS_REGISTERS, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION); } super.completeInitialization(); } @@ -166,11 +173,13 @@ HotSpotProviders providers = getProviders(); CallingConvention cc = getCallingConvention(providers.getCodeCache(), Type.JavaCallee, method, false); PhaseSuite graphBuilderSuite = providers.getSuites().getDefaultGraphBuilderSuite(); + graphBuilderSuite.appendPhase(new NonNullParametersPhase()); Suites suites = providers.getSuites().getDefaultSuites(); ExternalCompilationResult ptxCode = compileGraph(graph, cc, method, providers, this, this.getTarget(), null, graphBuilderSuite, OptimisticOptimizations.NONE, getProfilingInfo(graph), new SpeculationLog(), suites, true, new ExternalCompilationResult(), CompilationResultBuilderFactory.Default); if (makeBinary) { try (Scope ds = Debug.scope("GeneratingKernelBinary")) { + assert ptxCode.getTargetCode() != null; long kernel = getRuntime().getCompilerToGPU().generateKernel(ptxCode.getTargetCode(), method.getName()); ptxCode.setEntryPoint(kernel); } catch (Throwable e) { diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java --- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXWrapperBuilder.java Fri Jan 24 18:34:18 2014 +0100 @@ -55,21 +55,62 @@ /** * Utility for building a graph that "wraps" a compiled PTX kernel. Such a wrapper handles the - * transition from the host CPU to the GPU and back. The graph created is something like the - * following pseudo code with UPPER CASE denoting compile-time constants: + * transition from the host CPU to the GPU and back. The wrapper allocate 3 on-stack buffers: + *
+ * <ul>
+ * <li>PARAMS: a buffer for the kernel parameters and one word for the on-device address of the
+ * return value (if any).</li>
+ * <li>PINNED: a buffer into which the address of pinned objects is saved.</li>
+ * <li>OBJECT_OFFSETS: the offsets of the object values in PARAMS.</li>
+ * </ul>
+ *
+ * The PARAMS buffer is the {@code CU_LAUNCH_PARAM_BUFFER_POINTER} buffer passed in the
+ * {@code extra} argument to the {@code cuLaunchKernel} function. This buffer contains the
+ * parameters to the call. The buffer is word aligned and each parameter is aligned in the buffer
+ * according to its data size. The wrapper copies the incoming arguments into the buffer as is. The
+ * native {@link PTXHotSpotBackend#CALL_KERNEL callKernel} function will pin the memory for each
+ * object parameter (using {@code cuMemHostRegister}) and then replace the object pointer in PARAMS
+ * with an on-device pointer to the object's memory (see {@code cuMemHostGetDevicePointer}). The
+ * function saves each pinned object pointer into PINNED so that it can be unpinned once the kernel
+ * returns. The object pointers in PARAMS are specified by OBJECT_OFFSETS.
+ *

+ * As a concrete example, for a kernel whose Java method signature is: * *

- *     T kernel(p0, p1, ..., pN) {
- *         jint bufSize = SIZE_OF_ALIGNED_PARAMS_AND_RETURN_VALUE_WITH_PADDING(p0, p1, ..., pN);
- *         jbyte buf[bufSize] = {p0, PAD(p1), p1, ..., PAD(pN), pN};
- *         jlong result = PTX_LAUNCH_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, dimX, dimY, dimZ, buf, bufSize, encodedReturnTypeSize);
- *         return convert(result);
+ *     static int kernel(int p1, short p2, Object p3, long p4)
+ * 
+ *
+ * the graph created is shown below as pseudo-code:
+ *
+ *     int kernel_wrapper(int p1, short p2, oop p3, long p4) {
+ *         address kernelAddr = kernel.start;
+ *         if (kernelAddr == 0) {
+ *             deopt(InvalidateRecompile, RuntimeConstraint);
+ *         }
+ *         byte PARAMS[32];
+ *         word PINNED[1]; // note: no refmap
+ *         int OBJECT_OFFSETS[1] = {8};
+ *         ((int*) PARAMS)[0] = p1;
+ *         ((short*) PARAMS)[2] = p2;
+ *         ((word*) PARAMS)[1] = p3;
+ *         ((long*) PARAMS)[2] = p4;
+ *         int result = CALL_KERNEL(THREAD_REGISTER, KERNEL_ENTRY_POINT, 1, 1, 1, PARAMS, 32, 1, OBJECT_OFFSETS, PINNED, 4);
+ *         if (clearPendingException(thread)) {
+ *             deopt(None, RuntimeConstraint);
+ *         }
+ *         return result;
  *     }
  * 
*

 * The generated graph includes a reference to the {@link HotSpotNmethod} for the kernel. There must
 * be another reference to the same {@link HotSpotNmethod} object to ensure that the nmethod is not
 * unloaded by the next full GC.
+ *

+ * TODO: Only the memory for objects passed as parameters is pinned. Surely the memory for other
+ * objects accessed in the kernel reachable from the parameter objects needs to be pinned as well?
+ *

+ * TODO: Objects references within kernels are currently completely hidden from GC. */ public class PTXWrapperBuilder extends GraphKit { @@ -92,11 +133,23 @@ int[] javaParameterOffsetsInKernelParametersBuffer; /** - * Constants denoting the arguments to {@link PTXHotSpotBackend#LAUNCH_KERNEL}. + * Constants denoting the arguments to {@link PTXHotSpotBackend#CALL_KERNEL}. */ + // @formatter:off enum LaunchArg { - Thread, Kernel, DimX, DimY, DimZ, ParametersAndReturnValueBuffer, ParametersAndReturnValueBufferSize, EncodedReturnTypeSize + Thread, + Kernel, + DimX, + DimY, + DimZ, + ParametersAndReturnValueBuffer, + ParametersAndReturnValueBufferSize, + ObjectParametersCount, + ObjectParametersOffsets, + PinnedObjects, + EncodedReturnTypeSize } + // @formatter:on /** * Creates the graph implementing the CPU to GPU transition. @@ -108,6 +161,7 @@ public PTXWrapperBuilder(ResolvedJavaMethod method, HotSpotNmethod kernel, HotSpotProviders providers) { super(new StructuredGraph(method), providers); int wordSize = providers.getCodeCache().getTarget().wordSize; + int intSize = Integer.SIZE / Byte.SIZE; Kind wordKind = providers.getCodeCache().getTarget().wordKind; Signature sig = method.getSignature(); boolean isStatic = isStatic(method.getModifiers()); @@ -117,17 +171,18 @@ int javaParametersIndex = 0; Kind returnKind = sig.getReturnKind(); - BitSet objects = new BitSet(); + List objectSlots = new ArrayList<>(javaParameters.length); if (!isStatic) { - allocateParameter(Kind.Object, javaParametersIndex++, objects, wordSize); + allocateParameter(Kind.Object, javaParametersIndex++, objectSlots, wordSize); } for (int sigIndex = 0; sigIndex < sigCount; sigIndex++) { Kind kind = sig.getParameterKind(sigIndex); - allocateParameter(kind, javaParametersIndex++, objects, wordSize); + allocateParameter(kind, javaParametersIndex++, objectSlots, wordSize); } bufSize = roundUp(bufSize, wordSize); - // Add slot for holding pointer to device memory storing return value + // Add slot for the device memory pointer. The kernel writes a + // pointer in this slot that points to the return value. 
int encodedReturnTypeSize = 0; if (returnKind != Kind.Void) { bufSize += wordSize; @@ -140,7 +195,29 @@ InvokeNode kernelStart = createInvoke(getClass(), "getKernelStart", ConstantNode.forObject(kernel, providers.getMetaAccess(), getGraph())); - AllocaNode buf = append(new AllocaNode(bufSize / wordSize, objects)); + AllocaNode buf = append(new AllocaNode(bufSize / wordSize, new BitSet())); + ValueNode objectParametersOffsets; + ValueNode pinnedObjects; + ConstantNode nullWord = ConstantNode.forIntegerKind(wordKind, 0L, getGraph()); + if (objectSlots.isEmpty()) { + objectParametersOffsets = ConstantNode.forLong(0, getGraph()); + pinnedObjects = ConstantNode.forLong(0, getGraph()); + } else { + int intsPerWord = wordSize / intSize; + int slots = roundUp(objectSlots.size(), intsPerWord); + objectParametersOffsets = append(new AllocaNode(slots, new BitSet())); + // No refmap for pinned objects list since kernel execution is (currently) GC unsafe + pinnedObjects = append(new AllocaNode(objectSlots.size(), new BitSet())); + + // Initialize the object parameter offsets array + int index = 0; + for (int slot : objectSlots) { + int offset = slot * wordSize; + LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, Kind.Int, index * intSize, getGraph()); + append(new WriteNode(objectParametersOffsets, ConstantNode.forInt(offset, getGraph()), location, BarrierType.NONE, false, false)); + index++; + } + } Map args = new EnumMap<>(LaunchArg.class); args.put(Thread, append(new ReadRegisterNode(providers.getRegisters().getThreadRegister(), true, false))); @@ -150,6 +227,9 @@ args.put(DimZ, forInt(1, getGraph())); args.put(ParametersAndReturnValueBuffer, buf); args.put(ParametersAndReturnValueBufferSize, forInt(bufSize, getGraph())); + args.put(ObjectParametersCount, forInt(objectSlots.size(), getGraph())); + args.put(ObjectParametersOffsets, objectParametersOffsets); + args.put(PinnedObjects, pinnedObjects); args.put(EncodedReturnTypeSize, forInt(encodedReturnTypeSize, getGraph())); int sigIndex = isStatic ? 0 : -1; @@ -162,7 +242,7 @@ } if (returnKind != Kind.Void) { LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, wordKind, bufSize - wordSize, getGraph()); - append(new WriteNode(buf, ConstantNode.forIntegerKind(wordKind, 0L, getGraph()), location, BarrierType.NONE, false, false)); + append(new WriteNode(buf, nullWord, location, BarrierType.NONE, false, false)); } FrameStateBuilder fsb = new FrameStateBuilder(method, getGraph(), true); @@ -170,7 +250,7 @@ getGraph().start().setStateAfter(fs); ValueNode[] launchArgsArray = args.values().toArray(new ValueNode[args.size()]); - ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), LAUNCH_KERNEL, launchArgsArray)); + ForeignCallNode result = append(new ForeignCallNode(providers.getForeignCalls(), CALL_KERNEL, launchArgsArray)); result.setDeoptimizationState(fs); ConstantNode isObjectResultArg = ConstantNode.forBoolean(returnKind == Kind.Object, getGraph()); @@ -193,7 +273,11 @@ case Long: returnValue = result; break; - case Float: + case Float: { + ValueNode asInt = unique(new ConvertNode(Kind.Long, Kind.Int, result)); + returnValue = unique(new ReinterpretNode(Kind.Float, asInt)); + break; + } case Double: returnValue = unique(new ReinterpretNode(returnKind, result)); break; @@ -220,12 +304,12 @@ } /** - * Allocates a slot in the kernel parameters' buffer for a Java parameter. + * Computes offset and size of space in PARAMS for a Java parameter. 
* * @param kind the kind of the parameter * @param javaParametersIndex the index of the Java parameter */ - private void allocateParameter(Kind kind, int javaParametersIndex, BitSet objects, int wordSize) { + private void allocateParameter(Kind kind, int javaParametersIndex, List objectSlots, int wordSize) { int kindByteSize = kind == Kind.Object ? wordSize : kind.getBitCount() / Byte.SIZE; bufSize = roundUp(bufSize, kindByteSize); javaParameterOffsetsInKernelParametersBuffer[javaParametersIndex] = bufSize; @@ -233,7 +317,7 @@ if (kind == Kind.Object) { stamp = StampFactory.object(); int slot = bufSize / wordSize; - objects.set(slot); + objectSlots.add(slot); } else { stamp = StampFactory.forKind(kind); } diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.lir/src/com/oracle/graal/lir/FrameMap.java --- a/graal/com.oracle.graal.lir/src/com/oracle/graal/lir/FrameMap.java Fri Jan 24 17:43:14 2014 +0100 +++ b/graal/com.oracle.graal.lir/src/com/oracle/graal/lir/FrameMap.java Fri Jan 24 18:34:18 2014 +0100 @@ -316,7 +316,7 @@ spillSize += (slots * target.wordSize); if (!objects.isEmpty()) { - assert objects.length() < slots; + assert objects.length() <= slots; StackSlot result = null; for (int slotIndex = 0; slotIndex < slots; slotIndex++) { StackSlot objectSlot = null; diff -r 9de3efd2ea8f -r eed1aafead0d graal/com.oracle.graal.phases.common/src/com/oracle/graal/phases/common/NonNullParametersPhase.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.phases.common/src/com/oracle/graal/phases/common/NonNullParametersPhase.java Fri Jan 24 18:34:18 2014 +0100 @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package com.oracle.graal.phases.common; + +import com.oracle.graal.nodes.*; +import com.oracle.graal.nodes.type.*; +import com.oracle.graal.phases.*; + +/** + * Modifies the stamp of all object {@linkplain ParameterNode parameters} in a graph to denote they + * are non-null. This can be used for graphs where the caller null checks all arguments. 
+ */ +public class NonNullParametersPhase extends Phase { + + @Override + protected void run(StructuredGraph graph) { + for (ParameterNode param : graph.getNodes(ParameterNode.class)) { + if (param.stamp() instanceof ObjectStamp) { + param.setStamp(StampFactory.declaredNonNull(((ObjectStamp) param.stamp()).type())); + } + } + } +} diff -r 9de3efd2ea8f -r eed1aafead0d src/gpu/ptx/vm/gpu_ptx.cpp --- a/src/gpu/ptx/vm/gpu_ptx.cpp Fri Jan 24 17:43:14 2014 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Fri Jan 24 18:34:18 2014 +0100 @@ -39,6 +39,7 @@ gpu::Ptx::cuda_cu_ctx_create_func_t gpu::Ptx::_cuda_cu_ctx_create; gpu::Ptx::cuda_cu_ctx_destroy_func_t gpu::Ptx::_cuda_cu_ctx_destroy; gpu::Ptx::cuda_cu_ctx_synchronize_func_t gpu::Ptx::_cuda_cu_ctx_synchronize; +gpu::Ptx::cuda_cu_ctx_get_current_func_t gpu::Ptx::_cuda_cu_ctx_get_current; gpu::Ptx::cuda_cu_ctx_set_current_func_t gpu::Ptx::_cuda_cu_ctx_set_current; gpu::Ptx::cuda_cu_device_get_count_func_t gpu::Ptx::_cuda_cu_device_get_count; gpu::Ptx::cuda_cu_device_get_name_func_t gpu::Ptx::_cuda_cu_device_get_name; @@ -337,123 +338,189 @@ return cu_function; } +// A PtxCall is used to manage executing a GPU kernel. In addition to launching +// the kernel, this class releases resources allocated for the execution. +class PtxCall: StackObj { + private: + JavaThread* _thread; // the thread on which this call is made + address _buffer; // buffer containing parameters and _return_value + int _buffer_size; // size (in bytes) of _buffer + oop* _pinned; // objects that have been pinned with cuMemHostRegister + int _pinned_length; // length of _pinned + gpu::Ptx::CUdeviceptr _ret_value; // pointer to slot in GPU memory holding the return value + int _ret_type_size; // size of the return type value + bool _ret_is_object; // specifies if the return type is Object + + bool check(int status, const char *action) { + if (status != GRAAL_CUDA_SUCCESS) { + Thread* THREAD = _thread; + char* message = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, O_BUFLEN + 1); + jio_snprintf(message, O_BUFLEN, "[CUDA] *** Error (status=%d): %s", status, action); + if (TraceGPUInteraction || HAS_PENDING_EXCEPTION) { + tty->print_cr(message); + } + if (!HAS_PENDING_EXCEPTION) { + SharedRuntime::throw_and_post_jvmti_exception(_thread, vmSymbols::java_lang_RuntimeException(), message); + } + return false; + } + if (TraceGPUInteraction) { + tty->print_cr("[CUDA] Success: %s", action); + } + return true; + } + + public: + PtxCall(JavaThread* thread, address buffer, int buffer_size, oop* pinned, int encodedReturnTypeSize) : _thread(thread), + _buffer(buffer), _buffer_size(buffer_size), _pinned(pinned), _pinned_length(0), _ret_value(0), _ret_is_object(encodedReturnTypeSize < 0) { + _ret_type_size = _ret_is_object ? 
-encodedReturnTypeSize : encodedReturnTypeSize; + } + + bool is_object_return() { return _ret_is_object; } + + void alloc_return_value() { + if (_ret_type_size != 0) { + if (check(gpu::Ptx::_cuda_cu_memalloc(&_ret_value, _ret_type_size), "Allocate device memory for return value")) { + gpu::Ptx::CUdeviceptr* retValuePtr = (gpu::Ptx::CUdeviceptr*) ((_buffer + _buffer_size) - sizeof(_ret_value)); + *retValuePtr = _ret_value; + } + } + } + + void pin_objects(int count, int* objectOffsets) { + if (count == 0) { + return; + } + for (int i = 0; i < count; i++) { + int offset = objectOffsets[i]; + oop* argPtr = (oop*) (_buffer + offset); + oop obj = *argPtr; + if (obj != NULL) { + // Size (in bytes) of object + int objSize = obj->size() * HeapWordSize; + //tty->print_cr("Pinning object %d at offset %d: %p", i, offset, obj); + if (!check(gpu::Ptx::_cuda_cu_mem_host_register(obj, objSize, GRAAL_CU_MEMHOSTREGISTER_DEVICEMAP), "Pin object")) { + return; + } + + // Record original oop so that its memory can be unpinned + _pinned[_pinned_length++] = obj; + + // Replace host pointer to object with device pointer + // to object in kernel parameters buffer + if (!check(gpu::Ptx::_cuda_cu_mem_host_get_device_pointer((gpu::Ptx::CUdeviceptr*) argPtr, obj, 0), "Get device pointer for pinned object")) { + return; + } + } + } + } + + void launch(address kernel, jint dimX, jint dimY, jint dimZ) { + // grid dimensionality + unsigned int gridX = 1; + unsigned int gridY = 1; + unsigned int gridZ = 1; + void * config[] = { + GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) _buffer, + GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, &_buffer_size, + GRAAL_CU_LAUNCH_PARAM_END + }; + if (check(gpu::Ptx::_cuda_cu_launch_kernel((struct CUfunc_st*) (address) kernel, + gridX, gridY, gridZ, + dimX, dimY, dimZ, + 0, NULL, NULL, (void**) &config), "Launch kernel")) { + } + } + + void synchronize() { + check(gpu::Ptx::_cuda_cu_ctx_synchronize(), "Synchronize kernel"); + } + + void unpin_objects() { + while (_pinned_length > 0) { + oop obj = _pinned[--_pinned_length]; + assert(obj != NULL, "npe"); + //tty->print_cr("Unpinning object %d: %p", _pinned_length, obj); + if (!check(gpu::Ptx::_cuda_cu_mem_host_unregister(obj), "Unpin object")) { + return; + } + } + } + + oop get_object_return_value() { + oop return_val; + check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, T_OBJECT_BYTE_SIZE), "Copy return value from device"); + return return_val; + } + + jlong get_primitive_return_value() { + jlong return_val; + check(gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, _ret_value, _ret_type_size), "Copy return value from device"); + return return_val; + } + + void free_return_value() { + if (_ret_value != 0) { + check(gpu::Ptx::_cuda_cu_memfree(_ret_value), "Free device memory"); + _ret_value = 0; + } + } + + void destroy_context() { + if (gpu::Ptx::_device_context != NULL) { + check(gpu::Ptx::_cuda_cu_ctx_destroy(gpu::Ptx::_device_context), "Destroy context"); + gpu::Ptx::_device_context = NULL; + } + } + + ~PtxCall() { + unpin_objects(); + free_return_value(); + destroy_context(); + } +}; + + JRT_ENTRY(jlong, gpu::Ptx::execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ, - jlong parametersAndReturnValueBuffer, - jint parametersAndReturnValueBufferSize, + jlong buffer, + jint bufferSize, + jint objectParametersCount, + jlong objectParametersOffsets, + jlong pinnedObjects, int encodedReturnTypeSize)) if (kernel == 0L) { SharedRuntime::throw_and_post_jvmti_exception(thread, 
vmSymbols::java_lang_NullPointerException(), NULL); return 0L; } - // grid dimensionality - unsigned int gridX = 1; - unsigned int gridY = 1; - unsigned int gridZ = 1; - - struct CUfunc_st* cu_function = (struct CUfunc_st*) (address) kernel; + PtxCall call(thread, (address) buffer, bufferSize, (oop*) (address) pinnedObjects, encodedReturnTypeSize); - void * config[5] = { - GRAAL_CU_LAUNCH_PARAM_BUFFER_POINTER, (char*) (address) parametersAndReturnValueBuffer, - GRAAL_CU_LAUNCH_PARAM_BUFFER_SIZE, ¶metersAndReturnValueBufferSize, - GRAAL_CU_LAUNCH_PARAM_END - }; +#define TRY(action) do { \ + action; \ + if (HAS_PENDING_EXCEPTION) return 0L; \ +} while (0) - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] launching kernel"); - } + TRY(call.alloc_return_value()); - bool isObjectReturn = encodedReturnTypeSize < 0; - int returnTypeSize = encodedReturnTypeSize < 0 ? -encodedReturnTypeSize : encodedReturnTypeSize; - gpu::Ptx::CUdeviceptr device_return_value; - int status; - if (returnTypeSize != 0) { - status = _cuda_cu_memalloc(&device_return_value, returnTypeSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to allocate memory for return value pointer on device"); - return 0L; - } - // Push device_return_value to kernelParams - gpu::Ptx::CUdeviceptr* returnValuePtr = (gpu::Ptx::CUdeviceptr*) - ((address) parametersAndReturnValueBuffer + - parametersAndReturnValueBufferSize - sizeof(device_return_value)); - *returnValuePtr = device_return_value; - } + TRY(call.pin_objects(objectParametersCount, (int*) (address) objectParametersOffsets)); + + TRY(call.launch((address) kernel, dimX, dimY, dimZ)); - status = _cuda_cu_launch_kernel(cu_function, - gridX, gridY, gridZ, - dimX, dimY, dimZ, - 0, NULL, NULL, (void **) &config); + TRY(call.synchronize()); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to launch kernel"); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to launch kernel"); + if (call.is_object_return()) { + oop return_val; + TRY(return_val = call.get_object_return_value()); + thread->set_vm_result(return_val); return 0L; } - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Kernel Launch: X: %d Y: %d Z: %d", dimX, dimY, dimZ); - } - - status = _cuda_cu_ctx_synchronize(); - - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] Failed to synchronize launched kernel (%d)", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_RuntimeException(), "[CUDA] Failed to synchronize launched kernel"); - return 0L; - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Synchronized launch kernel"); - } + jlong return_val; + TRY(return_val = call.get_primitive_return_value()); + return return_val; - jlong primitiveReturnValue = 0L; - if (isObjectReturn) { - oop return_val; - status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, device_return_value, T_OBJECT_BYTE_SIZE); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument"); - return 0L; - } - thread->set_vm_result(return_val); - } else if (returnTypeSize > 0) { - status = 
gpu::Ptx::_cuda_cu_memcpy_dtoh(&primitiveReturnValue, device_return_value, returnTypeSize); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to copy value from device argument", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to copy value from device argument"); - return 0L; - } - } +#undef TRY - // Free device memory allocated for result - if (returnTypeSize != 0) { - status = gpu::Ptx::_cuda_cu_memfree(device_return_value); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to free device memory of return value", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to free device memory of return value"); - return 0L; - } - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Freed device memory of return value"); - } - - // Destroy context - status = gpu::Ptx::_cuda_cu_ctx_destroy(_device_context); - if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to destroy context", status); - SharedRuntime::throw_and_post_jvmti_exception(thread, vmSymbols::java_lang_Exception(), "[CUDA] Failed to destroy context"); - return 0L; - } - - if (TraceGPUInteraction) { - tty->print_cr("[CUDA] Success: Destroy context"); - } - - return primitiveReturnValue; JRT_END bool gpu::Ptx::execute_kernel(address kernel, PTXKernelArguments &ptxka, JavaValue &ret) { @@ -620,6 +687,7 @@ if (handle != NULL) { LOOKUP_CUDA_FUNCTION(cuInit, cuda_cu_init); LOOKUP_CUDA_FUNCTION(cuCtxSynchronize, cuda_cu_ctx_synchronize); + LOOKUP_CUDA_FUNCTION(cuCtxGetCurrent, cuda_cu_ctx_get_current); LOOKUP_CUDA_FUNCTION(cuCtxSetCurrent, cuda_cu_ctx_set_current); LOOKUP_CUDA_FUNCTION(cuDeviceGetCount, cuda_cu_device_get_count); LOOKUP_CUDA_FUNCTION(cuDeviceGetName, cuda_cu_device_get_name); diff -r 9de3efd2ea8f -r eed1aafead0d src/gpu/ptx/vm/gpu_ptx.hpp --- a/src/gpu/ptx/vm/gpu_ptx.hpp Fri Jan 24 17:43:14 2014 +0100 +++ b/src/gpu/ptx/vm/gpu_ptx.hpp Fri Jan 24 18:34:18 2014 +0100 @@ -84,16 +84,19 @@ * Context creation flags */ -#define GRAAL_CU_CTX_MAP_HOST 0x08 +#define GRAAL_CU_CTX_MAP_HOST 0x08 +#define GRAAL_CU_CTX_SCHED_BLOCKING_SYNC 0x04 class Ptx { friend class gpu; + friend class PtxCall; protected: static bool probe_linkage(); static bool initialize_gpu(); static unsigned int total_cores(); - static void * generate_kernel(unsigned char *code, int code_len, const char *name); + static void* get_context(); + static void* generate_kernel(unsigned char *code, int code_len, const char *name); static bool execute_warp(int dimX, int dimY, int dimZ, address kernel, PTXKernelArguments & ka, JavaValue &ret); static bool execute_kernel(address kernel, PTXKernelArguments & ka, JavaValue &ret); public: @@ -106,8 +109,11 @@ typedef int CUdevice; /* CUDA device */ static jlong execute_kernel_from_vm(JavaThread* thread, jlong kernel, jint dimX, jint dimY, jint dimZ, - jlong parametersAndReturnValueBuffer, - jint parametersAndReturnValueBufferSize, + jlong buffer, + jint bufferSize, + jint objectParametersCount, + jlong objectParametersOffsets, + jlong pinnedObjects, int encodedReturnTypeSize); private: @@ -115,6 +121,7 @@ typedef int (*cuda_cu_ctx_create_func_t)(void*, unsigned int, CUdevice); typedef int (*cuda_cu_ctx_destroy_func_t)(void*); typedef int (*cuda_cu_ctx_synchronize_func_t)(void); + typedef int (*cuda_cu_ctx_get_current_func_t)(void*); typedef int (*cuda_cu_ctx_set_current_func_t)(void*); typedef 
int (*cuda_cu_device_get_count_func_t)(int*); typedef int (*cuda_cu_device_get_name_func_t)(char*, int, int); @@ -152,6 +159,7 @@ static cuda_cu_memfree_func_t _cuda_cu_memfree; static cuda_cu_memcpy_htod_func_t _cuda_cu_memcpy_htod; static cuda_cu_memcpy_dtoh_func_t _cuda_cu_memcpy_dtoh; + static cuda_cu_ctx_get_current_func_t _cuda_cu_ctx_get_current; static cuda_cu_ctx_set_current_func_t _cuda_cu_ctx_set_current; static cuda_cu_mem_host_register_func_t _cuda_cu_mem_host_register; static cuda_cu_mem_host_get_device_pointer_func_t _cuda_cu_mem_host_get_device_pointer;
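
For reference, a minimal Java sketch (not part of this changeset; class and variable names are hypothetical) of the PARAMS packing rule described in the PTXWrapperBuilder javadoc above: the buffer is word aligned, each parameter is aligned to its own size, object slots are recorded for OBJECT_OFFSETS, and a trailing word receives the on-device address of the return value. The last lines also illustrate, in plain Java, the float return-value decoding that the wrapper performs with ConvertNode and ReinterpretNode.

import java.util.ArrayList;
import java.util.List;

class ParamsLayoutSketch {

    static int roundUp(int number, int mod) {
        return ((number + mod - 1) / mod) * mod;
    }

    public static void main(String[] args) {
        final int wordSize = 8;                          // assumes a 64-bit target
        // static int kernel(int p1, short p2, Object p3, long p4)
        int[] sizes = {4, 2, wordSize, 8};
        boolean[] isObject = {false, false, true, false};

        int bufSize = 0;
        List<Integer> objectOffsets = new ArrayList<>();
        for (int i = 0; i < sizes.length; i++) {
            bufSize = roundUp(bufSize, sizes[i]);        // align parameter to its own size
            if (isObject[i]) {
                objectOffsets.add(bufSize);              // offset reported via OBJECT_OFFSETS
            }
            System.out.printf("p%d -> offset %d%n", i + 1, bufSize);
            bufSize += sizes[i];
        }
        bufSize = roundUp(bufSize, wordSize) + wordSize; // trailing slot for the return value pointer
        System.out.println("PARAMS size = " + bufSize + ", OBJECT_OFFSETS = " + objectOffsets);
        // Prints offsets 0, 4, 8, 16 and PARAMS size 32 with OBJECT_OFFSETS = [8],
        // matching the byte PARAMS[32] / int OBJECT_OFFSETS[1] = {8} pseudo-code above.

        // Decoding a float return value from the long returned by the kernel call,
        // equivalent to the wrapper's ConvertNode(long -> int) + ReinterpretNode(int -> float):
        long rawResult = Float.floatToRawIntBits(1.5f) & 0xFFFFFFFFL; // stand-in for the call result
        float decoded = Float.intBitsToFloat((int) rawResult);
        System.out.println("decoded float return value = " + decoded);
    }
}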