changeset 16119:310994c667a7

HSAIL: support offloading some IntStream.reduce() operations to HSA Contributed-by: Eric Caspole <eric.caspole@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Tue, 17 Jun 2014 12:05:34 +0200
parents 3b4690ddd92e
children af8b7d059e03
files graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMaxTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMinTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceSumTest.java graal/com.oracle.graal.compiler.hsail/src/com/oracle/graal/compiler/hsail/CompileAndDispatch.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/ForEachToGraal.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java src/gpu/hsail/vm/gpu_hsail.hpp src/gpu/hsail/vm/hsailArgumentsBase.cpp src/gpu/hsail/vm/hsailArgumentsBase.hpp src/gpu/hsail/vm/hsailKernelArguments.cpp src/gpu/hsail/vm/hsailKernelArguments.hpp
diffstat 11 files changed, 796 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMaxTest.java	Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.hsail.test.lambda;
+
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.runtime;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import static org.junit.Assert.*;
+import org.junit.*;
+
+import java.util.*;
+import java.util.stream.IntStream;
+
+public class ReduceMaxTest {
+    // Checks that a parallel (offloadable) IntStream max-reduction agrees with the sequential one.
+    static int jobSize = 1027 * 1023 * 13; // length of the input array
+    static int loops = 1;
+
+    // The source array
+    int[] bigArray = null;
+
+    // result for baseline single threaded stream
+    int resultStream = 0;
+    // result for parallel CPU and offloaded streams
+    int resultOffload = 0;
+    // Max of bigArray through the OptionalInt-returning reduce(op); parallel stream when requested.
+    int evaluate(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                OptionalInt resultParallel = s.parallel().reduce(Integer::max);
+                result = resultParallel.getAsInt();
+            } else {
+                result = s.reduce(Integer::max).getAsInt();
+            }
+        }
+        return result;
+    }
+    // Max of bigArray through reduce(identity, op) with identity 0; parallel stream when requested.
+    int evaluateWithIdentity(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                result = s.parallel().reduce(0, Integer::max);
+            } else {
+                result = s.reduce(0, Integer::max);
+            }
+        }
+        return result;
+    }
+
+    @Test
+    public void testReduce() {
+        // Handmade reduce supports neither +UseCompressedOops nor +UseHSAILDeoptimization: nothing to check
+        HotSpotVMConfig config = runtime().getConfig();
+        if (config.useCompressedOops || config.useHSAILDeoptimization) {
+            return;
+        }
+
+        bigArray = new int[jobSize];
+        for (int i = 0; i < jobSize; i++) {
+            // values start at -1023 and increase by one, so negative and positive ints are both covered
+            bigArray[i] = -1024 + i + 1;
+        }
+
+        // Get non parallel baseline
+        resultStream = evaluate(false);
+
+        // Get OptionalInt version kernel
+        resultOffload = evaluate(true);
+        assertEquals("Offload max is wrong (OptionalInt kernel)", resultStream, resultOffload);
+
+        // Do identity version kernel
+        // Get non parallel baseline
+        resultStream = evaluateWithIdentity(false);
+
+        resultOffload = evaluateWithIdentity(true);
+        assertEquals("Offload max is wrong (identity kernel)", resultStream, resultOffload);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceMinTest.java	Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.hsail.test.lambda;
+
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.runtime;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import static org.junit.Assert.*;
+import org.junit.*;
+
+import java.util.*;
+import java.util.stream.IntStream;
+
+public class ReduceMinTest {
+    // Checks that a parallel (offloadable) IntStream min-reduction agrees with the sequential one.
+    static int jobSize = 1027 * 1023 * 13; // length of the input array
+    static int loops = 1;
+
+    // The input array to the kernel
+    int[] bigArray = null;
+
+    // result for baseline single threaded stream
+    int resultStream = 0;
+    // result for parallel CPU and offloaded streams
+    int resultOffload = 0;
+    // Min of bigArray through the OptionalInt-returning reduce(op); parallel stream when requested.
+    int evaluate(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                OptionalInt resultParallel = s.parallel().reduce(Integer::min);
+                result = resultParallel.getAsInt();
+            } else {
+                result = s.reduce(Integer::min).getAsInt();
+            }
+        }
+        return result;
+    }
+    // Min of bigArray through reduce(identity, op) with identity 0; parallel stream when requested.
+    int evaluateWithIdentity(boolean doParallelStream) {
+        int result = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                result = s.parallel().reduce(0, Integer::min);
+            } else {
+                result = s.reduce(0, Integer::min);
+            }
+        }
+        return result;
+    }
+
+    @Test
+    public void testReduce() {
+        // Handmade reduce supports neither +UseCompressedOops nor +UseHSAILDeoptimization: nothing to check
+        HotSpotVMConfig config = runtime().getConfig();
+        if (config.useCompressedOops || config.useHSAILDeoptimization) {
+            return;
+        }
+
+        bigArray = new int[jobSize];
+        for (int i = 0; i < jobSize; i++) {
+            bigArray[i] = -1024 + i + 1;
+        }
+
+        // Get non parallel baseline
+        resultStream = evaluate(false);
+
+        // Get OptionalInt version kernel
+        resultOffload = evaluate(true);
+        assertEquals("Offload min is wrong (OptionalInt kernel)", resultStream, resultOffload);
+
+        // Do identity version kernel
+        // Get non parallel baseline
+        resultStream = evaluateWithIdentity(false);
+
+        resultOffload = evaluateWithIdentity(true);
+        assertEquals("Offload min is wrong (identity kernel)", resultStream, resultOffload);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ReduceSumTest.java	Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.compiler.hsail.test.lambda;
+
+//import com.oracle.graal.compiler.common.GraalInternalError;
+import static com.oracle.graal.hotspot.HotSpotGraalRuntime.runtime;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import org.junit.*;
+
+import java.util.*;
+import java.util.stream.IntStream;
+
+public class ReduceSumTest {
+    // Checks that a parallel (offloadable) IntStream sum-reduction agrees with the sequential one.
+    static int jobSize = 1027 * 1023 * 13; // length of the input array
+    static int loops = 1;
+
+    // The array to be summed
+    int[] bigArray = null;
+
+    // sum for baseline single threaded stream
+    int sumStream = 0;
+    // sum for parallel CPU and offloaded streams
+    int sumOffload = 0;
+    // Sum of bigArray through the OptionalInt-returning reduce(op); parallel stream when requested.
+    int evaluate(boolean doParallelStream) {
+        int sum = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                OptionalInt resultParallel = s.parallel().reduce(Integer::sum);
+                sum = resultParallel.getAsInt();
+            } else {
+                OptionalInt resultStream = s.reduce(Integer::sum);
+                sum = resultStream.getAsInt();
+            }
+        }
+        return sum;
+    }
+    // Sum of bigArray through reduce(identity, op) with identity 0; parallel stream when requested.
+    int evaluateWithIdentity(boolean doParallelStream) {
+        int sum = 0;
+        for (int i = 0; i < loops; i++) {
+            IntStream s = Arrays.stream(bigArray);
+            if (doParallelStream) {
+                sum = s.parallel().reduce(0, Integer::sum);
+            } else {
+                sum = s.reduce(0, Integer::sum);
+            }
+        }
+        return sum;
+    }
+
+    @Test
+    public void testReduce() {
+        // Handmade reduce supports neither +UseCompressedOops nor +UseHSAILDeoptimization: nothing to check
+        HotSpotVMConfig config = runtime().getConfig();
+        if (config.useCompressedOops || config.useHSAILDeoptimization) {
+            return;
+        }
+
+        bigArray = new int[jobSize];
+        for (int i = 0; i < jobSize; i++) {
+            bigArray[i] = -1024 + i + 1;
+        }
+
+        // Get non parallel baseline
+        sumStream = evaluate(false);
+
+        // Get OptionalInt version kernel; JUnit assertion so the check also runs when the JVM has no -ea
+        sumOffload = evaluate(true);
+        Assert.assertEquals("Offload sum is wrong (OptionalInt kernel)", sumStream, sumOffload);
+
+        // Get identity version kernel (identity 0, so the same sequential baseline applies)
+        sumOffload = evaluateWithIdentity(true);
+        Assert.assertEquals("Offload sum is wrong (identity kernel)", sumStream, sumOffload);
+    }
+}
--- a/graal/com.oracle.graal.compiler.hsail/src/com/oracle/graal/compiler/hsail/CompileAndDispatch.java	Tue Jun 17 10:09:11 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail/src/com/oracle/graal/compiler/hsail/CompileAndDispatch.java	Tue Jun 17 12:05:34 2014 +0200
@@ -32,4 +32,12 @@
     Object createKernel(Class<?> consumerClass);
 
     boolean dispatchKernel(Object kernel, int jobSize, Object[] args);
+    /** Creates a kernel from HSAIL source {@code code} for the method named {@code methodName}; may return null. */
+    Object createKernelFromHsailString(String code, String methodName);
+    /** Returns HSAIL source text for the integer reduction named {@code reducerName} (e.g. "sum", "max", "min"). */
+    String getIntegerReduceIntrinsic(String reducerName);
+    /** Reduces {@code streamSource} with {@code kernel}, seeded with {@code identity}; may return null on failure. */
+    Integer offloadIntReduceImpl(Object kernel, int identity, int[] streamSource);
+    /** Returns the name of the reduction method that {@code opClass} calls, or null if none is recognized. */
+    String getIntReduceTargetName(Class<?> opClass);
 }
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/ForEachToGraal.java	Tue Jun 17 10:09:11 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/ForEachToGraal.java	Tue Jun 17 12:05:34 2014 +0200
@@ -26,15 +26,20 @@
 import static com.oracle.graal.hotspot.HotSpotGraalRuntime.*;
 
 import java.lang.reflect.*;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.function.*;
 
 import com.oracle.graal.api.code.*;
 import com.oracle.graal.api.meta.*;
+import com.oracle.graal.compiler.common.GraalInternalError;
 import com.oracle.graal.compiler.hsail.*;
 import com.oracle.graal.compiler.target.*;
 import com.oracle.graal.debug.*;
 import com.oracle.graal.debug.internal.*;
 import com.oracle.graal.gpu.*;
 import com.oracle.graal.graph.iterators.*;
+import com.oracle.graal.hotspot.*;
 import com.oracle.graal.hotspot.meta.*;
 import com.oracle.graal.hsail.*;
 import com.oracle.graal.java.*;
@@ -55,11 +60,54 @@
         return (HSAILHotSpotBackend) backend;
     }
 
+    ConcurrentHashMap<Class<?>, String> resolvedConsumerTargetMethods = new ConcurrentHashMap<>();
+
     /**
-     * Gets a compiled and installed kernel for the lambda called by the {@code accept(int value)}
-     * method in a class implementing {@code java.util.function.IntConsumer}.
-     * 
-     * @param intConsumerClass a class implementing {@code java.util.function.IntConsumer}
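+     * Returns the name of the method called from {@code applyAsInt} in a class implementing a functional
+     * interface such as {@link IntBinaryOperator}, or {@code null} if that method is not in Integer.
+     * @param opClass a class with an {@code applyAsInt} method, e.g. an {@link IntBinaryOperator}.
+     * @return the name of the reduction method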
+     */
+    public String getIntReduceTargetName(Class<?> opClass) {
+        String cachedMethodName = resolvedConsumerTargetMethods.get(Objects.requireNonNull(opClass));
+        if (cachedMethodName != null) {
+            return cachedMethodName;
+        }
+        Method applyMethod = null;
+        for (Method m : opClass.getMethods()) {
+            if (m.getName().equals("applyAsInt")) {
+                assert applyMethod == null : "found more than one implementation of applyAsInt in " + opClass;
+                applyMethod = m;
+            }
+        }
+        if (applyMethod == null) {
+            return null; // no applyAsInt to inspect, so there is no reduction target to report
+        }
+        // Ensure a debug configuration for this thread is initialized
+        if (DebugScope.getConfig() == null) {
+            DebugEnvironment.initialize(System.out);
+        }
+        HSAILHotSpotBackend backend = getHSAILBackend();
+        Providers providers = backend.getProviders();
+        StructuredGraph graph = new StructuredGraph(((HotSpotMetaAccessProvider) providers.getMetaAccess()).lookupJavaMethod(applyMethod));
+        new GraphBuilderPhase.Instance(providers.getMetaAccess(), GraphBuilderConfiguration.getDefault(), OptimisticOptimizations.ALL).apply(graph);
+        NodeIterable<MethodCallTargetNode> calls = graph.getNodes(MethodCallTargetNode.class);
+        assert calls.count() == 1;
+        ResolvedJavaMethod lambdaMethod = calls.first().targetMethod();
+        Debug.log("target ... %s", lambdaMethod);
+        if (!lambdaMethod.getDeclaringClass().getName().equals("Ljava/lang/Integer;")) {
+            return null; // only callees declared by java.lang.Integer are reported
+        }
+        String targetName = lambdaMethod.getName().intern(); // cache the very instance that is returned
+        resolvedConsumerTargetMethods.put(opClass, targetName);
+        return targetName;
+    }
+
+    /**
+     * Gets a compiled and installed kernel for the lambda called by the
+     * {@link IntConsumer#accept(int)} method in a class implementing {@link IntConsumer}.
+     *
+     * @param intConsumerClass a class implementing {@link IntConsumer}
      * @return a {@link HotSpotNmethod} handle to the compiled and installed kernel
      */
     private static HotSpotNmethod getCompiledLambda(Class<?> intConsumerClass) {
@@ -108,6 +156,38 @@
     }
 
     @Override
+    public Object createKernelFromHsailString(String code, String methodName) {
+        ExternalCompilationResult hsailCode = new ExternalCompilationResult();
+        try (Debug.Scope ds = Debug.scope("GeneratingKernelBinary")) {
+            // Turns HSAIL source text into an installed kernel tied to the public Integer method named methodName
+            HSAILHotSpotBackend backend = getHSAILBackend();
+            Providers providers = backend.getProviders();
+            Method integerOffloadMethod = null;
+            // The first public Integer method whose name matches wins; overloads are not told apart
+            for (Method m : Integer.class.getMethods()) {
+                if (m.getName().equals(methodName)) {
+                    integerOffloadMethod = m;
+                    break;
+                }
+            }
+            if (integerOffloadMethod != null) {
+                ResolvedJavaMethod rm = ((HotSpotMetaAccessProvider) providers.getMetaAccess()).lookupJavaMethod(integerOffloadMethod);
+
+                long kernel = HSAILHotSpotBackend.generateKernel(code.getBytes(), "Integer::" + methodName);
+                if (kernel == 0) { // a zero handle is treated as a failed kernel build
+                    throw new GraalInternalError("Failed to compile HSAIL kernel from String");
+                }
+                hsailCode.setEntryPoint(kernel);
+                return backend.installKernel(rm, hsailCode); // is a HotSpotNmethod
+            } else {
+                return null; // Integer has no public method with this name
+            }
+        } catch (Throwable e) {
+            throw Debug.handle(e);
+        }
+    }
+
+    @Override
     public boolean dispatchKernel(Object kernel, int jobSize, Object[] args) {
         HotSpotNmethod code = (HotSpotNmethod) kernel;
         if (code != null) {
@@ -125,4 +205,258 @@
             return false;
         }
     }
+
+    /**
+     * Global size for reduce kernel dispatch. A larger size seems to help sum but may not help other
+     * reductions, so the multiple of 1024 is read from an integer system property (default 1).
+     */
+    private static final int GlobalSize = 1024 * Integer.getInteger("com.amd.sumatra.reduce.globalsize.multiple", 1);
+
+    @Override
+    public Integer offloadIntReduceImpl(Object okraKernel, int identity, int[] streamSource) {
+        // NOTE - this reduce requires local size of 64 which is the SumatraUtils default
+
+        // Handmade reduce does not support +UseCompressedOops or +UseHSAILDeoptimization
+        HotSpotVMConfig config = runtime().getConfig();
+        if (config.useCompressedOops || config.useHSAILDeoptimization) {
+            throw new GraalInternalError("Reduce offload not compatible with +UseCompressedOops or +UseHSAILDeoptimization");
+        }
+
+        try {
+            assert streamSource.length >= GlobalSize : "Input array length=" + streamSource.length + " smaller than requested global_size=" + GlobalSize;
+            // result[0] starts as the identity; the kernel folds each work-group's partial result into it
+            int[] result = {identity};
+            Object[] args = {streamSource, result, streamSource.length};
+
+            // dispatchKernel reports failure with false; result[0] would then still be the bare identity
+            if (!dispatchKernel(okraKernel, GlobalSize, args)) {
+                return null;
+            }
+            return result[0];
+        } catch (Exception e) {
+            System.err.println(e);
+            e.printStackTrace();
+        }
+        return null;
+    }
+
+    @Override
+    public String getIntegerReduceIntrinsic(String reducerName) {
+        // Returns the HSAIL text of a reduce kernel for "sum", "max" or "min"; other names get an invalid-marker string
+        // Note all of these depend on group size of 256
+
+        String reduceOp = "/* Invalid */ "; // placeholder: every branch below overwrites it or returns early
+        String atomicResultProduction = "/* Invalid */ "; // placeholder, same as above
+        if (reducerName.equals("sum")) {
+            reduceOp = "add_u32 ";
+            atomicResultProduction = "atomicnoret_add_global_u32 ";
+        } else if (reducerName.equals("max")) {
+            reduceOp = "max_s32 ";
+            atomicResultProduction = "atomicnoret_max_global_s32 ";
+        } else if (reducerName.equals("min")) {
+            reduceOp = "min_s32 ";
+            atomicResultProduction = "atomicnoret_min_global_s32 ";
+        } else {
+            return "/* Invalid */ ";
+        }
+
+        // @formatter:off
+        return new String(
+                "version 0:95:$full:$large; // BRIG Object Format Version 0:4" + "\n"
+                + "" + "\n"
+                + "kernel &run(" + "\n"
+                + "	align 8 kernarg_u64 %arg_p3," + "\n"
+                + "	align 8 kernarg_u64 %arg_p4," + "\n"
+                + "	align 4 kernarg_u32 %arg_p5)" + "\n"
+                + "{" + "\n"
+                + "" + "\n"
+                + "	align 4 group_u32 %reduce_cllocal_scratch[256];" + "\n"
+                + "" + "\n"
+                + "	workitemabsid_u32 $s2, 0;" + "\n"
+                + "" + "\n"
+                + "	ld_kernarg_u32	$s1, [%arg_p5];" + "\n"
+                + "	ld_kernarg_u64	$d0, [%arg_p4];" + "\n"
+                + "	ld_kernarg_u64	$d1, [%arg_p3];" + "\n"
+                + "" + "\n"
+                + "	add_u64 $d0, $d0, 24;             // adjust over obj array headers" + "\n"
+                + "	add_u64 $d1, $d1, 24;" + "\n"
+                + "	cmp_ge_b1_s32	$c0, $s2, $s1; // if(gloId < length){" + "\n"
+                + "	cbr	$c0, @BB0_1;" + "\n"
+                + "	gridsize_u32	$s0, 0;        // s0 is globalsize" + "\n"
+                + " add_u32 $s0, $s0, $s2;         // gx += globalsize" + "\n"
+                + "	cvt_s64_s32	$d2, $s2;      // s2 is global id" + "\n"
+                + "	shl_u64	$d2, $d2, 2;" + "\n"
+                + "	add_u64	$d2, $d1, $d2;" + "\n"
+                + "	ld_global_u32	$s3, [$d2];    // load this element from input" + "\n"
+                + "	brn	@BB0_3;" + "\n"
+                + "" + "\n"
+                + "@BB0_1:" + "\n"
+                + "	mov_b32	$s0, $s2;" + "\n"                                  + "" + "\n"
+                + "@BB0_3:" + "\n"
+                + "	cmp_ge_b1_s32	$c1, $s0, $s1; // while (gx < length)" + "\n"
+                + "	cbr	$c1, @BB0_6;" + "\n"
+                + "	gridsize_u32	$s2, 0;" + "\n"
+                + "" + "\n"
+                + "@BB0_5:" + "\n"
+                + "	cvt_s64_s32	$d2, $s0;" + "\n"
+                + "	shl_u64	$d2, $d2, 2;" + "\n"
+                + "	add_u64	$d2, $d1, $d2;" + "\n"
+                + "	ld_global_u32	$s4, [$d2];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	add_u32	$s0, $s0, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c1, $s0, $s1;" + "\n"
+                + "	cbr	$c1, @BB0_5;" + "\n"
+                + "" + "\n"
+                + "@BB0_6:" + "\n"
+                + "	workgroupid_u32	$s0, 0;" + "\n"
+                + "	workgroupsize_u32	$s2, 0;" + "\n"
+                + "	mul_u32	$s2, $s2, $s0;" + "\n"
+                + "	sub_u32	$s2, $s1, $s2;" + "\n"
+                + "	workitemid_u32	$s1, 0;" + "\n"
+                + "	add_u32	$s4, $s1, 128;"
+                + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s4, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 128;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	cvt_s64_s32	$d1, $s1;" + "\n"
+                + "	shl_u64	$d1, $d1, 2;" + "\n"
+                + "	lda_group_u64	$d2, [%reduce_cllocal_scratch];" + "\n"
+                + "	add_u64	$d1, $d2, $d1;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_8;" + "\n"
+                + "	ld_group_u32	$s3, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s4;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s4, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_8:" + "\n"
+                + "	add_u32	$s3, $s1, 64;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 64;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_10;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;"
+                + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_10:" + "\n"
+                + "	add_u32	$s3, $s1, 32;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 32;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_12;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_12:" + "\n"
+                + "	add_u32	$s3, $s1, 16;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 16;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_14;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_14:" + "\n"
+                + "	add_u32	$s3, $s1, 8;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 8;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_16;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_16:" + "\n"
+                + "	add_u32	$s3, $s1, 4;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 4;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_18;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_18:" + "\n"
+                + "	add_u32	$s3, $s1, 2;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 2;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_20;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d3, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d3];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_20:" + "\n"
+                + "	add_u32	$s3, $s1, 1;" + "\n"
+                + "	cmp_lt_b1_u32	$c1, $s3, $s2;" + "\n"
+                + "	cmp_lt_b1_s32	$c2, $s1, 1;" + "\n"
+                + "	and_b1	$c1, $c2, $c1;" + "\n"
+                + "	barrier_fgroup;" + "\n"
+                + "	not_b1	$c1, $c1;" + "\n"
+                + "	cbr	$c1, @BB0_22;" + "\n"
+                + "	ld_group_u32	$s4, [$d1];" + "\n"
+                + "	cvt_s64_s32	$d3, $s3;" + "\n"
+                + "	shl_u64	$d3, $d3, 2;" + "\n"
+                + "	add_u64	$d2, $d2, $d3;" + "\n"
+                + "	ld_group_u32	$s3, [$d2];" + "\n"
+                +       reduceOp + "  $s3, $s3, $s4;" + "\n"
+                + "	st_group_u32	$s3, [$d1];" + "\n"
+                + "" + "\n"
+                + "@BB0_22:" + "\n"
+                + "	cmp_gt_b1_u32	$c0, $s1, 0;  // s1 is local id, done if > 0" + "\n"
+                + "	cbr	$c0, @BB0_24;" + "\n"
+                + "" + "\n"
+                + "	ld_group_u32	$s2, [%reduce_cllocal_scratch];  // s2 is result[get_group_id(0)];" + "\n"
+                +       atomicResultProduction + " [$d0], $s2; // build global result from local results" + "\n"
+                + "" + "\n"
+                + "@BB0_24:" + "\n"
+                + "	ret;" + "\n"
+                + "};" + "\n");
+        //@formatter:on
+    }
 }
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Tue Jun 17 10:09:11 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Tue Jun 17 12:05:34 2014 +0200
@@ -250,7 +250,7 @@
     /**
      * Generates a GPU binary from HSAIL code.
      */
-    private static native long generateKernel(byte[] hsailCode, String name);
+    static native long generateKernel(byte[] hsailCode, String name);
 
     /**
      * Installs the {@linkplain ExternalCompilationResult#getEntryPoint() GPU binary} associated
--- a/src/gpu/hsail/vm/gpu_hsail.hpp	Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.hpp	Tue Jun 17 12:05:34 2014 +0200
@@ -25,6 +25,7 @@
 #ifndef GPU_HSAIL_VM_GPU_HSAIL_HPP
 #define GPU_HSAIL_VM_GPU_HSAIL_HPP
 
+#include "runtime/gpu.hpp"
 #include "utilities/exceptions.hpp"
 #include "graal/graalEnv.hpp"
 #include "gpu_hsail_Frame.hpp"
--- a/src/gpu/hsail/vm/hsailArgumentsBase.cpp	Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/hsailArgumentsBase.cpp	Tue Jun 17 12:05:34 2014 +0200
@@ -38,6 +38,29 @@
   return arg;
 }
 
+void HSAILArgumentsBase::collectArgs() {
+  if (TraceGPUInteraction) {
+    tty->print_cr("[HSAIL] %s::collectArgs, sig:%s  args length=%d", argsBuilderName(), _signature->as_C_string(), _length);
+  }
+  if (_is_static) {
+    if (TraceGPUInteraction) {
+      tty->print_cr("[HSAIL] %s, static method", argsBuilderName());
+    }
+  } else {
+    // Instance method: the receiver ('this') is the first entry of the args array and is pushed first
+    oop arg = _args->obj_at(_index++);
+    assert(arg->is_instance() && (!arg->is_array()), "First arg should be 'this'");
+    if (TraceGPUInteraction) {
+      tty->print_cr("[HSAIL] %s, instance method, this " PTR_FORMAT ", is a %s", argsBuilderName(), (address) arg, arg->klass()->external_name());
+    }
+    pushObject(arg);
+  }
+  // Visit every parameter of the signature, then let the subclass append its trailing arguments
+  iterate();
+
+  pushTrailingArgs();
+}
+
 void HSAILArgumentsBase::do_bool() {
   // Get the boxed value
   oop arg = _args->obj_at(_index++);
--- a/src/gpu/hsail/vm/hsailArgumentsBase.hpp	Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/hsailArgumentsBase.hpp	Tue Jun 17 12:05:34 2014 +0200
@@ -67,10 +67,6 @@
     virtual void handleFinalObjParameter(void* obj) = 0;
     virtual void pushTrailingArgs() = 0;
 
-    void recordNullObjectParameter() {
-        if (_first_null_parameter_index == -1) _first_null_parameter_index = _parameter_index;
-    }
-
  public:
   HSAILArgumentsBase(Symbol* signature, objArrayOop args, bool is_static) : SignatureIterator(signature) {
     this->_return_type = T_ILLEGAL;
@@ -86,32 +82,29 @@
 
   }
 
+  void recordNullObjectParameter() {  // remembers the current _parameter_index for the first null seen
+    if (_first_null_parameter_index == -1) {  // -1 is treated as "nothing recorded yet"; later calls are no-ops
+      _first_null_parameter_index = _parameter_index;
+    }
+  }
+
+  bool is_static() {  // read-only accessor; when false, collectArgs() pushes 'this' ahead of the other args
+    return _is_static;
+  }
+
+  int length() {  // read-only accessor; presumably the entry count of args() (traced as "args length") - confirm
+    return _length;
+  }
+
+  objArrayOop args() {  // read-only accessor for the object array the argument values are fetched from
+    return _args;
+  }
+
   int getFirstNullParameterIndex() {
     return _first_null_parameter_index;
   }
 
-  void collectArgs() {
-    if (TraceGPUInteraction) {
-      tty->print_cr("[HSAIL] %s::collectArgs, sig:%s  args length=%d", argsBuilderName(), _signature->as_C_string(), _length);
-    }    
-    if (!_is_static) {      
-      // First object in args should be 'this'
-      oop arg = _args->obj_at(_index++);
-      assert(arg->is_instance() && (! arg->is_array()), "First arg should be 'this'");
-      if (TraceGPUInteraction) {
-        tty->print_cr("[HSAIL] %s, instance method, this " PTR_FORMAT ", is a %s", argsBuilderName(), (address) arg, arg->klass()->external_name());
-      }
-      pushObject(arg);
-    } else {
-      if (TraceGPUInteraction) {
-          tty->print_cr("[HSAIL] %s, static method", argsBuilderName());
-      }
-    }
-    // Iterate over the entire signature
-    iterate();
-    
-    pushTrailingArgs();
-  }
+  virtual void collectArgs();
 
   void do_bool();
   void do_byte();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/gpu/hsail/vm/hsailKernelArguments.cpp	Tue Jun 17 12:05:34 2014 +0200
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+#include "hsailKernelArguments.hpp"
+
+void HSAILKernelArguments::collectArgs() {
+  int index = 0;
+  if (TraceGPUInteraction) {
+    tty->print_cr("[HSAIL] %s::collectArgs, args length=%d", argsBuilderName(), length());
+  }
+  // Kernel args are taken straight from the args array, without looking at the method signature:
+  // nulls, arrays and non-boxed objects are pushed as objects; boxed primitives are pushed by value.
+  while (index < length()) {
+    oop arg = args()->obj_at(index++);
+    jvalue jValue;
+    if (arg == NULL) {
+      if (TraceGPUInteraction) {
+        tty->print_cr("[HSAIL] %s::collectArgs object, _index=%d, value = " PTR_FORMAT " is a %s", argsBuilderName(), index, (void*) arg, "null");
+      }
+      recordNullObjectParameter();  // NOTE(review): records _parameter_index, which this loop does not visibly advance - confirm
+      pushObject(arg);
+    } else {
+      java_lang_boxing_object::get_value(arg, &jValue);
+      BasicType basic_type = java_lang_boxing_object::basic_type(arg);
+      if (basic_type == T_ILLEGAL && (!(arg->is_array()))) {
+        if (TraceGPUInteraction) {
+          tty->print_cr("[HSAIL] %s::collectArgs object, _index=%d, value = " PTR_FORMAT " is a %s", argsBuilderName(), index, (void*) arg, arg->klass()->external_name());
+        }
+        pushObject(arg);
+      } else if (arg->is_array()) {
+        if (TraceGPUInteraction) {
+          int array_length = ((arrayOop) arg)->length();  // arrayOop: is_array() alone does not guarantee an objArrayOop
+          tty->print_cr("[HSAIL] %s::collectArgs array, length=%d, _index=%d, value = " PTR_FORMAT, argsBuilderName(), array_length, index, (void*) arg);
+        }
+        pushObject(arg);
+      } else {
+        switch (basic_type) {  // NOTE(review): no T_CHAR/T_SHORT/default case, so such boxed values push nothing
+          case T_INT:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_INT _index=%d, value = %d", argsBuilderName(), index, jValue.i);
+            }
+            pushInt(jValue.i);
+            break;
+          case T_LONG:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_LONG _index=%d, value = " JLONG_FORMAT, argsBuilderName(), index, jValue.j);
+            }
+            pushLong(jValue.j);
+            break;
+          case T_FLOAT:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_FLOAT _index=%d, value = %f", argsBuilderName(), index, jValue.f);
+            }
+            pushFloat(jValue.f);
+            break;
+          case T_DOUBLE:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_DOUBLE _index=%d, value = %f", argsBuilderName(), index, jValue.d);
+            }
+            pushDouble(jValue.d);
+            break;
+          case T_BYTE:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_BYTE _index=%d, value = %d", argsBuilderName(), index, jValue.b);
+            }
+            pushByte(jValue.b);
+            break;
+          case T_BOOLEAN:
+            if (TraceGPUInteraction) {
+              tty->print_cr("[HSAIL] %s::collectArgs, T_BOOLEAN _index=%d, value = %d", argsBuilderName(), index, jValue.z);
+            }
+            pushBool(jValue.z);
+            break;
+        }
+      }
+    }
+  }
+
+  pushTrailingArgs();
+}
+
--- a/src/gpu/hsail/vm/hsailKernelArguments.hpp	Tue Jun 17 10:09:11 2014 +0200
+++ b/src/gpu/hsail/vm/hsailKernelArguments.hpp	Tue Jun 17 12:05:34 2014 +0200
@@ -91,16 +91,17 @@
     // For kernel arguments we don't pass the final int parameter
     // since we use the HSAIL workitemid instruction in place of that int value
     virtual void handleFinalIntParameter() {
-      if (TraceGPUInteraction) {
-        tty->print_cr("[HSAIL] HSAILKernelArguments, not pushing trailing int");
-      }
+      ShouldNotReachHere();  // expected unreachable: this class overrides collectArgs() and walks the args array itself
     }
 
-    // for kernel arguments, final obj parameter should be an object
+    // For kernel arguments, final obj parameter should be an object
     // stream source array (already checked in the base class) so here we just pass it
     virtual void handleFinalObjParameter(void* arg) {
-      pushObject(arg);
+      ShouldNotReachHere();  // NOTE(review): "we just pass it" above is stale; the collectArgs() override pushes objects itself
     }
+
+    virtual void collectArgs();
+
 };
 
 #endif  // GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP