# HG changeset patch
# User Doug Simon
# Date 1402999534 -7200
# Node ID 310994c667a7d544f98ef090bd8b54adb77647e1
# Parent 3b4690ddd92e8458a168933089902996d24764a9
HSAIL: support offloading some IntStream.reduce() operations to HSA
Contributed-by: Eric Caspole
diff -r 3b4690ddd92e -r 310994c667a7 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMaxTest.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMaxTest.java Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.hsail.test.lambda;
+
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.runtime;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import static org.junit.Assert.*;
+import org.junit.*;
+
+import java.util.*;
+import java.util.stream.IntStream;
+
+public class ReduceMaxTest {
+    // The length of the input array
+    static int jobSize = 1027 * 1023 * 13;
+    static int loops = 1;
+
+    // The source array
+    int bigArray[] = null;
+
+    // result for baseline single threaded stream
+    int resultStream = 0;
+    // result for parallel CPU and offloaded streams
+    int resultOffload = 0;
+
+    int evaluate(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                OptionalInt resultParallel = s.parallel().reduce(Integer::max);
+                result = resultParallel.getAsInt();
+            } else {
+                result = s.reduce(Integer::max).getAsInt();
+            }
+        }
+        return result;
+    }
+
+    int evaluateWithIdentity(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                result = s.parallel().reduce(0, Integer::max);
+            } else {
+                result = s.reduce(0, Integer::max);
+            }
+        }
+        return result;
+    }
+
+    @Test
+    public void testReduce() {
+        // The handmade reduce kernel supports neither +UseCompressedOops nor
+        // +UseHSAILDeoptimization. An assumption makes the runner report the test as
+        // skipped in those configurations instead of letting it pass without running.
+        HotSpotVMConfig config = runtime().getConfig();
+        Assume.assumeTrue(!config.useCompressedOops && !config.useHSAILDeoptimization);
+
+        bigArray = new int[jobSize];
+        for (int i = 0; i < jobSize; i++) {
+            // Values start at -1023 so that negative elements are reduced as well
+            bigArray[i] = -1024 + i + 1;
+        }
+
+        // Get non parallel baseline
+        resultStream = evaluate(false);
+
+        // Get OptionalInt version kernel
+        resultOffload = evaluate(true);
+        assertEquals("reduce(Integer::max)", resultStream, resultOffload);
+
+        // Do identity version kernel
+        // Get non parallel baseline
+        resultStream = evaluateWithIdentity(false);
+
+        resultOffload = evaluateWithIdentity(true);
+        assertEquals("reduce(0, Integer::max)", resultStream, resultOffload);
+    }
+}
diff -r 3b4690ddd92e -r 310994c667a7 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMinTest.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMinTest.java Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.hsail.test.lambda;
+
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.runtime;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import static org.junit.Assert.*;
+import org.junit.*;
+
+import java.util.*;
+import java.util.stream.IntStream;
+
+public class ReduceMinTest {
+    // The length of the input array
+    static int jobSize = 1027 * 1023 * 13;
+    static int loops = 1;
+
+    // The input array to the kernel
+    int bigArray[] = null;
+
+    // result for baseline single threaded stream
+    int resultStream = 0;
+    // result for parallel CPU and offloaded streams
+    int resultOffload = 0;
+
+    int evaluate(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                OptionalInt resultParallel = s.parallel().reduce(Integer::min);
+                result = resultParallel.getAsInt();
+            } else {
+                result = s.reduce(Integer::min).getAsInt();
+            }
+        }
+        return result;
+    }
+
+    int evaluateWithIdentity(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                result = s.parallel().reduce(0, Integer::min);
+            } else {
+                result = s.reduce(0, Integer::min);
+            }
+        }
+        return result;
+    }
+
+    @Test
+    public void testReduce() {
+        // The handmade reduce kernel supports neither +UseCompressedOops nor
+        // +UseHSAILDeoptimization. An assumption makes the runner report the test as
+        // skipped in those configurations instead of letting it pass without running.
+        HotSpotVMConfig config = runtime().getConfig();
+        Assume.assumeTrue(!config.useCompressedOops && !config.useHSAILDeoptimization);
+
+        bigArray = new int[jobSize];
+        for (int i = 0; i < jobSize; i++) {
+            bigArray[i] = -1024 + i + 1;
+        }
+
+        // Get non parallel baseline
+        resultStream = evaluate(false);
+
+        // Get OptionalInt version kernel
+        resultOffload = evaluate(true);
+        assertEquals("reduce(Integer::min)", resultStream, resultOffload);
+
+        // Do identity version kernel
+        // Get non parallel baseline
+        resultStream = evaluateWithIdentity(false);
+
+        resultOffload = evaluateWithIdentity(true);
+        assertEquals("reduce(0, Integer::min)", resultStream, resultOffload);
+    }
+}
diff -r 3b4690ddd92e -r 310994c667a7 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceSumTest.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceSumTest.java Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.hsail.test.lambda;
+
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.runtime;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import static org.junit.Assert.*;
+import org.junit.*;
+
+import java.util.*;
+import java.util.stream.IntStream;
+
+public class ReduceSumTest {
+    // The length of the input array
+    static int jobSize = 1027 * 1023 * 13;
+    static int loops = 1;
+
+    // The array to be summed
+    int bigArray[] = null;
+
+    // sum for baseline single threaded stream
+    int sumStream = 0;
+    // sum for parallel CPU and offloaded streams
+    int sumOffload = 0;
+
+    int evaluate(boolean doParallelStream) {
+        int sum = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                OptionalInt resultParallel = s.parallel().reduce(Integer::sum);
+                sum = resultParallel.getAsInt();
+            } else {
+                OptionalInt resultStream = s.reduce(Integer::sum);
+                sum = resultStream.getAsInt();
+            }
+        }
+        return sum;
+    }
+
+    int evaluateWithIdentity(boolean doParallelStream) {
+        int sum = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                sum = s.parallel().reduce(0, Integer::sum);
+            } else {
+                sum = s.reduce(0, Integer::sum);
+            }
+        }
+        return sum;
+    }
+
+    @Test
+    public void testReduce() {
+        // The handmade reduce kernel supports neither +UseCompressedOops nor
+        // +UseHSAILDeoptimization. An assumption makes the runner report the test as
+        // skipped in those configurations instead of letting it pass without running.
+        HotSpotVMConfig config = runtime().getConfig();
+        Assume.assumeTrue(!config.useCompressedOops && !config.useHSAILDeoptimization);
+
+        bigArray = new int[jobSize];
+        for (int i = 0; i < jobSize; i++) {
+            bigArray[i] = -1024 + i + 1;
+        }
+
+        // Get non parallel baseline
+        sumStream = evaluate(false);
+
+        // Get OptionalInt version kernel (JUnit assertion: a Java assert is skipped without -ea)
+        sumOffload = evaluate(true);
+        assertEquals("Offload sum is wrong", sumStream, sumOffload);
+
+        // Get identity version kernel
+        sumOffload = evaluateWithIdentity(true);
+        assertEquals("Offload sum is wrong", sumStream, sumOffload);
+    }
+}
diff -r 3b4690ddd92e -r 310994c667a7 graal/com.oracle.graal.compiler.hsail/src/com/oracle/graal/compiler/hsail/CompileAndDispatch.java
--- a/graal/com.oracle.graal.compiler.hsail/src/com/oracle/graal/compiler/hsail/CompileAndDispatch.java Tue Jun 17 10:09:11 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail/src/com/oracle/graal/compiler/hsail/CompileAndDispatch.java Tue Jun 17 12:05:34 2014 +0200
@@ -32,4 +32,12 @@
     Object createKernel(Class consumerClass);
 
     boolean dispatchKernel(Object kernel, int jobSize, Object[] args);
+
+    Object createKernelFromHsailString(String code, String methodName);
+
+    String getIntegerReduceIntrinsic(String reducerName);
+
+    Integer offloadIntReduceImpl(Object kernel, int identity, int[] streamSource);
+
+    String getIntReduceTargetName(Class opClass);
 }
diff -r 3b4690ddd92e -r 310994c667a7 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/ForEachToGraal.java
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/ForEachToGraal.java Tue Jun 17 10:09:11 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/ForEachToGraal.java Tue Jun 17 12:05:34 2014 +0200
@@ -26,15 +26,20 @@
 import static com.oracle.graal.hotspot.HotSpotGraalRuntime.*;
 
 import java.lang.reflect.*;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.function.*;
 
 import com.oracle.graal.api.code.*;
 import com.oracle.graal.api.meta.*;
+import com.oracle.graal.compiler.common.GraalInternalError;
 import com.oracle.graal.compiler.hsail.*;
 import com.oracle.graal.compiler.target.*;
 import com.oracle.graal.debug.*;
 import com.oracle.graal.debug.internal.*;
 import com.oracle.graal.gpu.*;
 import com.oracle.graal.graph.iterators.*;
+import com.oracle.graal.hotspot.*;
 import com.oracle.graal.hotspot.meta.*;
 import
com.oracle.graal.hsail.*;
 import com.oracle.graal.java.*;
@@ -55,11 +60,65 @@
         return (HSAILHotSpotBackend) backend;
     }
 
+    ConcurrentHashMap<Class<?>, String> resolvedConsumerTargetMethods = new ConcurrentHashMap<>();
+
     /**
-     * Gets a compiled and installed kernel for the lambda called by the {@code accept(int value)}
-     * method in a class implementing {@code java.util.function.IntConsumer}.
-     * 
-     * @param intConsumerClass a class implementing {@code java.util.function.IntConsumer}
+     * Returns the name of the method that a reduction operator class forwards to.
+     * <p>
+     * The single call made by the {@code applyAsInt} method of {@code opClass} is looked up; its
+     * name is returned only if the called method is declared by {@link Integer}.
+     *
+     * @param opClass a class with a public {@code applyAsInt} method, e.g. an implementation of
+     *            {@link IntBinaryOperator}
+     * @return the name of the reduction method, or {@code null} if {@code opClass} has no
+     *         {@code applyAsInt} method or the method it calls is not declared by {@link Integer}
+     */
+    public String getIntReduceTargetName(Class opClass) {
+        String cachedMethodName = resolvedConsumerTargetMethods.get(Objects.requireNonNull(opClass));
+        if (cachedMethodName != null) {
+            return cachedMethodName;
+        } else {
+            Method acceptMethod = null;
+            for (Method m : opClass.getMethods()) {
+                if (m.getName().equals("applyAsInt")) {
+                    assert acceptMethod == null : "found more than one implementation of applyAsInt in " + opClass;
+                    acceptMethod = m;
+                }
+            }
+            if (acceptMethod == null) {
+                // No applyAsInt method: there is nothing to resolve, so do not build a graph for null
+                return null;
+            }
+            // Ensure a debug configuration for this thread is initialized
+            if (DebugScope.getConfig() == null) {
+                DebugEnvironment.initialize(System.out);
+            }
+
+            HSAILHotSpotBackend backend = getHSAILBackend();
+            Providers providers = backend.getProviders();
+            StructuredGraph graph = new StructuredGraph(((HotSpotMetaAccessProvider) providers.getMetaAccess()).lookupJavaMethod(acceptMethod));
+            new GraphBuilderPhase.Instance(providers.getMetaAccess(), GraphBuilderConfiguration.getDefault(), OptimisticOptimizations.ALL).apply(graph);
+            NodeIterable<MethodCallTargetNode> calls = graph.getNodes(MethodCallTargetNode.class);
+            assert calls.count() == 1;
+            ResolvedJavaMethod lambdaMethod = calls.first().targetMethod();
+            Debug.log("target ... %s", lambdaMethod);
+
+            String className = lambdaMethod.getDeclaringClass().getName();
+            if (!className.equals("Ljava/lang/Integer;")) {
+                return null;
+            }
+            // Cache the interned name so that cache hits and misses return the same instance
+            String methodName = lambdaMethod.getName().intern();
+            resolvedConsumerTargetMethods.put(opClass, methodName);
+            return methodName;
+        }
+    }
+
+    /**
+     * Gets a compiled and installed kernel for the lambda called by the
+     * {@link IntConsumer#accept(int)} method in a class implementing {@link IntConsumer}.
+     *
+     * @param intConsumerClass a class implementing {@link IntConsumer}
      * @return a {@link HotSpotNmethod} handle to the compiled and installed kernel
      */
     private static HotSpotNmethod getCompiledLambda(Class intConsumerClass) {
@@ -108,6 +167,38 @@
     }
 
     @Override
+    public Object createKernelFromHsailString(String code, String methodName) {
+        ExternalCompilationResult hsailCode = new ExternalCompilationResult();
+        try (Debug.Scope ds = Debug.scope("GeneratingKernelBinary")) {
+
+            HSAILHotSpotBackend backend = getHSAILBackend();
+            Providers providers = backend.getProviders();
+            Method integerOffloadMethod = null;
+
+            for (Method m : Integer.class.getMethods()) {
+                if (m.getName().equals(methodName)) {
+                    integerOffloadMethod = m;
+                    break;
+                }
+            }
+            if (integerOffloadMethod != null) {
+                ResolvedJavaMethod rm = ((HotSpotMetaAccessProvider) providers.getMetaAccess()).lookupJavaMethod(integerOffloadMethod);
+
+                long kernel = HSAILHotSpotBackend.generateKernel(code.getBytes(), "Integer::" + methodName);
+                if (kernel == 0) {
+                    throw new GraalInternalError("Failed to compile HSAIL kernel from String");
+                }
+                hsailCode.setEntryPoint(kernel);
+                return backend.installKernel(rm, hsailCode); // is a HotSpotNmethod
+            } else {
+                return null;
+            }
+        } catch (Throwable e) {
+            throw Debug.handle(e);
+        }
+    }
+
+    @Override
     public boolean dispatchKernel(Object kernel, int jobSize, Object[] args) {
         HotSpotNmethod code = (HotSpotNmethod) kernel;
         if (code != null) {
@@ -125,4 +216,258 @@
             return false;
         }
     }
+
+    /**
+     * Running with a larger global size seems to
increase the performance for sum, but it might be
+     * different for other reductions so it is a knob.
+     */
+    private static final int GlobalSize = 1024 * Integer.getInteger("com.amd.sumatra.reduce.globalsize.multiple", 1);
+
+    @Override
+    public Integer offloadIntReduceImpl(Object okraKernel, int identity, int[] streamSource) {
+        // NOTE - this reduce requires local size of 64 which is the SumatraUtils default (NOTE(review): the comment in getIntegerReduceIntrinsic says group size 256 - confirm)
+
+        // Handmade reduce supports neither +UseCompressedOops nor +UseHSAILDeoptimization
+        HotSpotVMConfig config = runtime().getConfig();
+        if (config.useCompressedOops || config.useHSAILDeoptimization) {
+            throw new GraalInternalError("Reduce offload not compatible with +UseCompressedOops or +UseHSAILDeoptimization");
+        }
+
+        try {
+            assert streamSource.length >= GlobalSize : "Input array length=" + streamSource.length + " smaller than requested global_size=" + GlobalSize;
+
+            int[] result = {identity};
+            Object[] args = {streamSource, result, streamSource.length};
+            if (!dispatchKernel(okraKernel, GlobalSize, args)) {
+                // Dispatch failed: result[0] still holds the identity, which must not be reported as the reduction
+                return null;
+            }
+            // kernel result is result[0].
+            return result[0];
+        } catch (Exception e) {
+            System.err.println(e);
+            e.printStackTrace();
+        }
+        return null;
+    }
+
+    @Override
+    public String getIntegerReduceIntrinsic(String reducerName) {
+
+        // Note all of these depend on group size of 256
+
+        String reduceOp = "/* Invalid */ ";
+        String atomicResultProduction = "/* Invalid */ ";
+        if (reducerName.equals("sum")) {
+            reduceOp = "add_u32 ";
+            atomicResultProduction = "atomicnoret_add_global_u32 ";
+        } else if (reducerName.equals("max")) {
+            reduceOp = "max_s32 ";
+            atomicResultProduction = "atomicnoret_max_global_s32 ";
+        } else if (reducerName.equals("min")) {
+            reduceOp = "min_s32 ";
+            atomicResultProduction = "atomicnoret_min_global_s32 ";
+        } else {
+            return "/* Invalid */ ";
+        }
+
+        // @formatter:off
+        return new String(
+            "version 0:95:$full:$large; // BRIG Object Format Version 0:4" + "\n" +
+            "" + "\n" +
+            "kernel &run(" + "\n" +
+            " align 8 kernarg_u64 %arg_p3," + "\n" +
+            " align 8 kernarg_u64 %arg_p4," + "\n" +
+            " align 4 kernarg_u32 %arg_p5)" + "\n" +
+            "{" + "\n" +
+            "" + "\n" +
+            " align 4 group_u32 %reduce_cllocal_scratch[256];" + "\n" +
+            "" + "\n" +
+            " workitemabsid_u32 $s2, 0;" + "\n" +
+            "" + "\n" +
+            " ld_kernarg_u32 $s1, [%arg_p5];" + "\n" +
+            " ld_kernarg_u64 $d0, [%arg_p4];" + "\n" +
+            " ld_kernarg_u64 $d1, [%arg_p3];" + "\n" +
+            "" + "\n" +
+            " add_u64 $d0, $d0, 24; // adjust over obj array headers" + "\n" +
+            " add_u64 $d1, $d1, 24;" + "\n" +
+            " cmp_ge_b1_s32 $c0, $s2, $s1; // if(gloId < length){" + "\n" +
+            " cbr $c0, @BB0_1;" + "\n" +
+            " gridsize_u32 $s0, 0; // s0 is globalsize" + "\n" +
+            " add_u32 $s0, $s0, $s2; // gx += globalsize" + "\n" +
+            " cvt_s64_s32 $d2, $s2; // s2 is global id" + "\n" +
+            " shl_u64 $d2, $d2, 2;" + "\n" +
+            " add_u64 $d2, $d1, $d2;" + "\n" +
+            " ld_global_u32 $s3, [$d2]; // load this element from input" + "\n" +
+            " brn @BB0_3;" + "\n" +
+            "" + "\n" +
+            "@BB0_1:" + "\n" +
+            " mov_b32 $s0, $s2;" + "\n" + "" + "\n" +
+            "@BB0_3:" + "\n" +
+            " cmp_ge_b1_s32 $c1, $s0, $s1; // while (gx < length)" + "\n" +
+            " cbr $c1, @BB0_6;" + "\n" +
+            " gridsize_u32 $s2, 0;" + "\n" +
+            "" + "\n" +
+            "@BB0_5:" + "\n" +
+            " cvt_s64_s32 $d2, $s0;" + "\n" +
+            " shl_u64 $d2, $d2, 2;" + "\n" +
+            " add_u64 $d2, $d1, $d2;" + "\n" +
+            " ld_global_u32 $s4, [$d2];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " add_u32 $s0, $s0, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c1, $s0, $s1;" + "\n" +
+            " cbr $c1, @BB0_5;" + "\n" +
+            "" + "\n" +
+            "@BB0_6:" + "\n" +
+            " workgroupid_u32 $s0, 0;" + "\n" +
+            " workgroupsize_u32 $s2, 0;" + "\n" +
+            " mul_u32 $s2, $s2, $s0;" + "\n" +
+            " sub_u32 $s2, $s1, $s2;" + "\n" +
+            " workitemid_u32 $s1, 0;" + "\n" +
+            " add_u32 $s4, $s1, 128;" +
+            "\n" +
+            " cmp_lt_b1_u32 $c1, $s4, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 128;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " cvt_s64_s32 $d1, $s1;" + "\n" +
+            " shl_u64 $d1, $d1, 2;" + "\n" +
+            " lda_group_u64 $d2, [%reduce_cllocal_scratch];" + "\n" +
+            " add_u64 $d1, $d2, $d1;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_8;" + "\n" +
+            " ld_group_u32 $s3, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s4;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s4, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_8:" + "\n" +
+            " add_u32 $s3, $s1, 64;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 64;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_10;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" +
+            "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_10:" + "\n" +
+            " add_u32 $s3, $s1, 32;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 32;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_12;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_12:" + "\n" +
+            " add_u32 $s3, $s1, 16;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 16;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_14;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_14:" + "\n" +
+            " add_u32 $s3, $s1, 8;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 8;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_16;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_16:" + "\n" +
+            " add_u32 $s3, $s1, 4;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 4;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_18;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_18:" + "\n" +
+            " add_u32 $s3, $s1, 2;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 2;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_20;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d3, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d3];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_20:" + "\n" +
+            " add_u32 $s3, $s1, 1;" + "\n" +
+            " cmp_lt_b1_u32 $c1, $s3, $s2;" + "\n" +
+            " cmp_lt_b1_s32 $c2, $s1, 1;" + "\n" +
+            " and_b1 $c1, $c2, $c1;" + "\n" +
+            " barrier_fgroup;" + "\n" +
+            " not_b1 $c1, $c1;" + "\n" +
+            " cbr $c1, @BB0_22;" + "\n" +
+            " ld_group_u32 $s4, [$d1];" + "\n" +
+            " cvt_s64_s32 $d3, $s3;" + "\n" +
+            " shl_u64 $d3, $d3, 2;" + "\n" +
+            " add_u64 $d2, $d2, $d3;" + "\n" +
+            " ld_group_u32 $s3, [$d2];" + "\n" +
+            reduceOp + " $s3, $s3, $s4;" + "\n" +
+            " st_group_u32 $s3, [$d1];" + "\n" +
+            "" + "\n" +
+            "@BB0_22:" + "\n" +
+            " cmp_gt_b1_u32 $c0, $s1, 0; // s1 is local id, done if > 0" + "\n" +
+            " cbr $c0, @BB0_24;" + "\n" +
+            "" + "\n" +
+            " ld_group_u32 $s2, [%reduce_cllocal_scratch]; // s2 is result[get_group_id(0)];" + "\n" +
+            atomicResultProduction + " [$d0], $s2; // build global result from local results" + "\n" +
+            "" + "\n" +
+            "@BB0_24:" + "\n" +
+            " ret;" + "\n" +
+            "};" + "\n");
+        //@formatter:on
+    }
 }
diff -r 3b4690ddd92e -r 310994c667a7 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java Tue Jun 17 10:09:11 2014 +0200
+++
b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java Tue Jun 17 12:05:34 2014 +0200
@@ -250,7 +250,7 @@
     /**
      * Generates a GPU binary from HSAIL code.
      */
-    private static native long generateKernel(byte[] hsailCode, String name);
+    static native long generateKernel(byte[] hsailCode, String name); // no longer private: ForEachToGraal.createKernelFromHsailString calls it
 
     /**
      * Installs the {@linkplain ExternalCompilationResult#getEntryPoint() GPU binary} associated
diff -r 3b4690ddd92e -r 310994c667a7 src/gpu/hsail/vm/gpu_hsail.hpp
--- a/src/gpu/hsail/vm/gpu_hsail.hpp Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.hpp Tue Jun 17 12:05:34 2014 +0200
@@ -25,6 +25,7 @@
 #ifndef GPU_HSAIL_VM_GPU_HSAIL_HPP
 #define GPU_HSAIL_VM_GPU_HSAIL_HPP
 
+#include "runtime/gpu.hpp"
 #include "utilities/exceptions.hpp"
 #include "graal/graalEnv.hpp"
 #include "gpu_hsail_Frame.hpp"
diff -r 3b4690ddd92e -r 310994c667a7 src/gpu/hsail/vm/hsailArgumentsBase.cpp
--- a/src/gpu/hsail/vm/hsailArgumentsBase.cpp Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/hsailArgumentsBase.cpp Tue Jun 17 12:05:34 2014 +0200
@@ -38,6 +38,29 @@
   return arg;
 }
 
+void HSAILArgumentsBase::collectArgs() {  // body moved here from the inline definition removed in hsailArgumentsBase.hpp
+  if (TraceGPUInteraction) {
+    tty->print_cr("[HSAIL] %s::collectArgs, sig:%s args length=%d", argsBuilderName(), _signature->as_C_string(), _length);
+  }
+  if (!_is_static) {
+    // First object in args should be 'this'
+    oop arg = _args->obj_at(_index++);
+    assert(arg->is_instance() && (!arg->is_array()), "First arg should be 'this'");
+    if (TraceGPUInteraction) {
+      tty->print_cr("[HSAIL] %s, instance method, this " PTR_FORMAT ", is a %s", argsBuilderName(), (address) arg, arg->klass()->external_name());
+    }
+    pushObject(arg);
+  } else {
+    if (TraceGPUInteraction) {
+      tty->print_cr("[HSAIL] %s, static method", argsBuilderName());
+    }
+  }
+  // Iterate over the entire signature
+  iterate();
+
+  pushTrailingArgs();  // pure virtual in the base class: the subclass decides what follows the signature arguments
+}
+
 void HSAILArgumentsBase::do_bool() {
   // Get the boxed value
   oop arg = _args->obj_at(_index++);
diff -r 3b4690ddd92e -r 310994c667a7 src/gpu/hsail/vm/hsailArgumentsBase.hpp
--- a/src/gpu/hsail/vm/hsailArgumentsBase.hpp Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/hsailArgumentsBase.hpp Tue Jun 17 12:05:34 2014 +0200
@@ -67,10 +67,6 @@
   virtual void handleFinalObjParameter(void* obj) = 0;
   virtual void pushTrailingArgs() = 0;
 
-  void recordNullObjectParameter() {
-    if (_first_null_parameter_index == -1) _first_null_parameter_index = _parameter_index;
-  }
-
 public:
   HSAILArgumentsBase(Symbol* signature, objArrayOop args, bool is_static) : SignatureIterator(signature) {
     this->_return_type = T_ILLEGAL;
@@ -86,32 +82,29 @@
 
   }
 
+  void recordNullObjectParameter() {  // now declared after "public:"; only the first null parameter index is kept
+    if (_first_null_parameter_index == -1) {
+      _first_null_parameter_index = _parameter_index;
+    }
+  }
+
+  bool is_static() {  // read accessor for _is_static
+    return _is_static;
+  }
+
+  int length() {  // read accessor for _length
+    return _length;
+  }
+
+  objArrayOop args() {  // read accessor for _args
+    return _args;
+  }
+
   int getFirstNullParameterIndex() {
     return _first_null_parameter_index;
   }
 
-  void collectArgs() {
-    if (TraceGPUInteraction) {
-      tty->print_cr("[HSAIL] %s::collectArgs, sig:%s args length=%d", argsBuilderName(), _signature->as_C_string(), _length);
-    }
-    if (!_is_static) {
-      // First object in args should be 'this'
-      oop arg = _args->obj_at(_index++);
-      assert(arg->is_instance() && (! arg->is_array()), "First arg should be 'this'");
-      if (TraceGPUInteraction) {
-        tty->print_cr("[HSAIL] %s, instance method, this " PTR_FORMAT ", is a %s", argsBuilderName(), (address) arg, arg->klass()->external_name());
-      }
-      pushObject(arg);
-    } else {
-      if (TraceGPUInteraction) {
-        tty->print_cr("[HSAIL] %s, static method", argsBuilderName());
-      }
-    }
-    // Iterate over the entire signature
-    iterate();
-
-    pushTrailingArgs();
-  }
+  virtual void collectArgs();  // now virtual and defined out of line in hsailArgumentsBase.cpp
 
   void do_bool();
   void do_byte();
diff -r 3b4690ddd92e -r 310994c667a7 src/gpu/hsail/vm/hsailKernelArguments.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/gpu/hsail/vm/hsailKernelArguments.cpp Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+#include "hsailKernelArguments.hpp"
+
+void HSAILKernelArguments::collectArgs() {
+  int index = 0;
+  if (TraceGPUInteraction) {
+    tty->print_cr("[HSAIL] %s::collectArgs, args length=%d", argsBuilderName(), length());
+  }
+
+  // Manually iterate over the actual args array without looking at method signature
+  while (index < length()) {
+    oop arg = args()->obj_at(index++);
+    jvalue jValue;
+    if (arg == NULL) {
+      if (TraceGPUInteraction) {
+        tty->print_cr("[HSAIL] %s::collectArgs object, _index=%d, value = " PTR_FORMAT " is a %s", argsBuilderName(), index, (void*) arg, "null");
+      }
+      recordNullObjectParameter();
+      pushObject(arg);
+    } else {
+      java_lang_boxing_object::get_value(arg, &jValue);
+      BasicType basic_type = java_lang_boxing_object::basic_type(arg);
+      if (basic_type == T_ILLEGAL && (!(arg->is_array()))) {
+        if (TraceGPUInteraction) {
+          tty->print_cr("[HSAIL] %s::collectArgs object, _index=%d, value = " PTR_FORMAT " is a %s", argsBuilderName(), index, (void*) arg, arg->klass()->external_name());
+        }
+        pushObject(arg);
+      } else if (arg->is_array()) {
+        if (TraceGPUInteraction) {
+          int array_length = ((arrayOop) arg)->length();  // arrayOop: arg may be any kind of array here
+          tty->print_cr("[HSAIL] %s::collectArgs array, length=%d, _index=%d, value = " PTR_FORMAT, argsBuilderName(), array_length, index, (void*) arg);
+        }
+        pushObject(arg);
+      } else {
+        switch (basic_type) {  // NOTE(review): no default case - boxed types not listed below are silently not pushed
+          case T_INT:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_INT _index=%d, value = %d", argsBuilderName(), index, jValue.i);
+            }
+            pushInt(jValue.i);
+            break;
+          case T_LONG:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_LONG _index=%d, value = " JLONG_FORMAT, argsBuilderName(), index, jValue.j);
+            }
+            pushLong(jValue.j);
+            break;
+          case T_FLOAT:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_FLOAT _index=%d, value = %f", argsBuilderName(), index, jValue.f);
+            }
+            pushFloat(jValue.f);
+            break;
+          case T_DOUBLE:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_DOUBLE _index=%d, value = %f", argsBuilderName(), index, jValue.d);
+            }
+            pushDouble(jValue.d);
+            break;
+          case T_BYTE:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_BYTE _index=%d, value = %d", argsBuilderName(), index, jValue.b);
+            }
+            pushByte(jValue.b);
+            break;
+          case T_BOOLEAN:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_BOOLEAN _index=%d, value = %d", argsBuilderName(), index, jValue.z);
+            }
+            pushBool(jValue.z);
+            break;
+        }
+      }
+    }
+  }
+
+  pushTrailingArgs();
+}
+
diff -r 3b4690ddd92e -r 310994c667a7 src/gpu/hsail/vm/hsailKernelArguments.hpp
--- a/src/gpu/hsail/vm/hsailKernelArguments.hpp Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/hsailKernelArguments.hpp Tue Jun 17 12:05:34 2014 +0200
@@ -91,16 +91,17 @@
   // For kernel arguments we don't pass the final int parameter
   // since we use the HSAIL workitemid instruction in place of that int value
   virtual void handleFinalIntParameter() {
-    if (TraceGPUInteraction) {
-      tty->print_cr("[HSAIL] HSAILKernelArguments, not pushing trailing int");
-    }
+    ShouldNotReachHere();
   }
 
-  // for kernel arguments, final obj parameter should be an object
+  // For kernel arguments, final obj parameter should be an object
   // stream source array (already checked in the base class) so here we just pass it
   virtual void handleFinalObjParameter(void* arg) {
-    pushObject(arg);
+    ShouldNotReachHere();
   }
+
+  virtual void collectArgs();
+
 };
 
 #endif // GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP