changeset 16076:06eedda53e14

HSAIL: add support to allocate new TLAB from GPU Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Tue, 10 Jun 2014 22:36:26 +0200
parents b6ab7e7fa0a5
children 2d296658a24e
files graal/com.oracle.graal.compiler.hsail.test.infra/src/com/oracle/graal/compiler/hsail/test/infra/GraalKernelTester.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndAddTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndSetTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndAddTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndSetTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetGidTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntDecAndGetTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndAddTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndDecTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndIncTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntIncAndGetTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongAddAndGetTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndAddTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndIncTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongIncAndGetTest.java graal/com.oracle.graal.gpu/src/com/oracle/graal/gpu/ExternalCompilationResult.java 
graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotLIRGenerator.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectLoadAcquireNode.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectStoreReleaseNode.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILWorkItemAbsIdNode.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java graal/com.oracle.graal.lir.hsail/src/com/oracle/graal/lir/hsail/HSAILMove.java graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectReadNode.java graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectStoreNode.java src/gpu/hsail/vm/gpu_hsail.cpp src/gpu/hsail/vm/gpu_hsail.hpp src/gpu/hsail/vm/gpu_hsail_Frame.hpp src/gpu/hsail/vm/gpu_hsail_Tlab.hpp src/gpu/hsail/vm/hsailArgumentsBase.hpp src/gpu/hsail/vm/hsailJavaCallArguments.hpp src/gpu/hsail/vm/hsailKernelArguments.hpp src/gpu/hsail/vm/vmStructs_hsail.hpp src/share/vm/gc_interface/collectedHeap.hpp
diffstat 37 files changed, 974 insertions(+), 424 deletions(-) [+]
line wrap: on
line diff
--- a/graal/com.oracle.graal.compiler.hsail.test.infra/src/com/oracle/graal/compiler/hsail/test/infra/GraalKernelTester.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test.infra/src/com/oracle/graal/compiler/hsail/test/infra/GraalKernelTester.java	Tue Jun 10 22:36:26 2014 +0200
@@ -152,13 +152,17 @@
         return true;
     }
 
+    HotSpotNmethod installedCode;
+
     @Override
     protected void dispatchKernelOkra(int range, Object... args) {
         HSAILHotSpotBackend backend = getHSAILBackend();
         if (backend.isDeviceInitialized()) {
             try {
-                HotSpotNmethod code = backend.compileAndInstallKernel(testMethod);
-                backend.executeKernel(code, range, args);
+                if (installedCode == null) {
+                    installedCode = backend.compileAndInstallKernel(testMethod);
+                }
+                backend.executeKernel(installedCode, range, args);
             } catch (InvalidInstalledCodeException e) {
                 Debug.log("WARNING:Invalid installed code: " + e);
                 e.printStackTrace();
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndAddTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndAddTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -45,11 +45,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndSetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndSetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -45,11 +45,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndAddTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndAddTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -45,11 +45,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndSetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndSetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -45,11 +45,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetGidTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetGidTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -47,11 +47,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntDecAndGetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntDecAndGetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndAddTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndAddTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndDecTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndDecTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndIncTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndIncTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntIncAndGetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntIncAndGetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongAddAndGetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongAddAndGetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -46,11 +46,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndAddTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndAddTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndIncTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndIncTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongIncAndGetTest.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongIncAndGetTest.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,11 +44,6 @@
     }
 
     @Override
-    protected boolean supportsRequiredCapabilities() {
-        return (canDeoptimize());
-    }
-
-    @Override
     public void runTest() {
         setupArrays();
 
--- a/graal/com.oracle.graal.gpu/src/com/oracle/graal/gpu/ExternalCompilationResult.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.gpu/src/com/oracle/graal/gpu/ExternalCompilationResult.java	Tue Jun 10 22:36:26 2014 +0200
@@ -44,6 +44,7 @@
     private StructuredGraph hostGraph;
 
     private int[] oopMapArray;
+    private boolean usesAllocation;
 
     /**
      * Set the address for the point of entry to the external compilation result.
@@ -86,4 +87,12 @@
         return oopMapArray;
     }
 
+    public void setUsesAllocationFlag(boolean val) {
+        usesAllocation = val;
+    }
+
+    public boolean getUsesAllocationFlag() {
+        return usesAllocation;
+    }
+
 }
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Tue Jun 10 22:36:26 2014 +0200
@@ -66,7 +66,6 @@
 import com.oracle.graal.lir.gen.*;
 import com.oracle.graal.lir.hsail.*;
 import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizingOp;
-import com.oracle.graal.lir.hsail.HSAILMove.AtomicReadAndAddOp;
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.StructuredGraph.GuardsStage;
 import com.oracle.graal.nodes.extended.*;
@@ -103,8 +102,10 @@
         paramTypeMap.put("HotSpotResolvedPrimitiveType<double>", "f64");
         paramTypeMap.put("HotSpotResolvedPrimitiveType<long>", "s64");
 
-        // The order of the conjunction below is important: the OkraUtil
-        // call may provision the native library required by the initialize() call
+        /*
+         * The order of the conjunction below is important: the OkraUtil call may provision the
+         * native library required by the initialize() call
+         */
         deviceInitialized = OkraUtil.okraLibExists() && initialize();
     }
 
@@ -261,8 +262,7 @@
      */
     public final HotSpotNmethod installKernel(ResolvedJavaMethod method, ExternalCompilationResult hsailCode) {
         assert hsailCode.getEntryPoint() != 0L;
-        // code below here lifted from HotSpotCodeCacheProviders.addExternalMethod
-        // used to be return getProviders().getCodeCache().addExternalMethod(method, hsailCode);
+        // Code here based on HotSpotCodeCacheProvider.addExternalMethod().
         HotSpotResolvedJavaMethod javaMethod = (HotSpotResolvedJavaMethod) method;
         if (hsailCode.getId() == -1) {
             hsailCode.setId(javaMethod.allocateCompileId(hsailCode.getEntryBCI()));
@@ -294,6 +294,7 @@
 
         HSAILHotSpotNmethod code = new HSAILHotSpotNmethod(javaMethod, hsailCode.getName(), false, true);
         code.setOopMapArray(hsailCode.getOopMapArray());
+        code.setUsesAllocationFlag(hsailCode.getUsesAllocationFlag());
         HotSpotCompiledNmethod compiled = new HotSpotCompiledNmethod(getTarget(), javaMethod, compilationResult);
         CodeInstallResult result = getRuntime().getCompilerToVM().installCode(compiled, code, null);
         if (result != CodeInstallResult.OK) {
@@ -388,7 +389,9 @@
         } else {
             oopsSaveArea = null;
         }
-        return executeKernel0(kernel, jobSize, args, oopsSaveArea, donorThreadPool.get().getThreads(), HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
+        // Pass the donor thread array if this kernel uses allocation, otherwise null
+        Thread[] donorThreadArray = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? donorThreadPool.get().getThreads() : null;
+        return executeKernel0(kernel, jobSize, args, oopsSaveArea, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
     }
 
     private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Object[] oopsSave, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray)
@@ -449,6 +452,7 @@
 
     static class HSAILHotSpotNmethod extends HotSpotNmethod {
         private int[] oopMapArray;
+        private boolean usesAllocation;
 
         HSAILHotSpotNmethod(HotSpotResolvedJavaMethod method, String name, boolean isDefault, boolean isExternal) {
             super(method, name, isDefault, isExternal);
@@ -461,6 +465,14 @@
         int[] getOopMapArray() {
             return oopMapArray;
         }
+
+        public void setUsesAllocationFlag(boolean val) {
+            usesAllocation = val;
+        }
+
+        public boolean getUsesAllocationFlag() {
+            return usesAllocation;
+        }
     }
 
     @Override
@@ -493,19 +505,22 @@
             Debug.log("+UseHSAILSafepoints requires +UseHSAILDeoptimization");
         }
 
-        // see what graph nodes we have to see if we are using the thread register
-        // if not, we don't have to emit the code that sets that up
-        // maybe there is a better way to do this?
-        boolean usesThreadRegister = false;
+        /*
+         * See what graph nodes we have to see if we are using the thread register. If not, we don't
+         * have to emit the code that sets it up. Maybe there is a better way to do this?
+         */
+        boolean usesAllocation = false;
         search: for (AbstractBlock<?> b : lir.linearScanOrder()) {
             for (LIRInstruction op : lir.getLIRforBlock(b)) {
-                if (op instanceof AtomicReadAndAddOp) {
-                    usesThreadRegister = true;
+                if ((op instanceof HSAILMove.LoadOp) && ((HSAILMove.LoadOp) op).usesThreadRegister()) {
+                    usesAllocation = true;
                     assert useHSAILDeoptimization : "cannot use thread register if HSAIL deopt support is disabled";
                     break search;
                 }
             }
         }
+        // save usesAllocation flag in ExternalCompilationResult
+        ((ExternalCompilationResult) crb.compilationResult).setUsesAllocationFlag(usesAllocation);
 
         // Emit the prologue.
         HSAILAssembler asm = (HSAILAssembler) crb.asm;
@@ -527,8 +542,7 @@
             nonConstantParamCount++;
         }
 
-        // If this is an instance method, include mappings for the "this" parameter
-        // as the first parameter.
+        // If this is an instance method, include the "this" parameter
         if (!isStatic) {
             nonConstantParamCount++;
         }
@@ -564,8 +578,10 @@
         // Include the gid.
         System.arraycopy(paramtypes, 0, ccParamTypes, 0, nonConstantParamCount);
 
-        // Last entry is always int (its register gets used in the workitemabsid instruction)
-        // this is true even for object stream labmdas
+        /*
+         * Last entry is always int (its register gets used in the workitemabsid instruction). This
+         * is true even for object stream lambdas.
+         */
         if (sigParamCount > 0) {
             ccParamTypes[ccParamTypes.length - 1] = metaAccess.lookupJavaType(int.class);
         }
@@ -621,7 +637,6 @@
         if (useHSAILDeoptimization) {
             // Aliases for d16
             RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordKind);
-            RegisterValue d16_donorThreads = d16_deoptInfo;
 
             // Aliases for d17
             RegisterValue d17_donorThreadIndex = HSAIL.d17.asValue(wordKind);
@@ -645,21 +660,20 @@
             asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(Kind.Int, d16_deoptInfo, config.hsailDeoptOccurredOffset).toAddress());
             asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false);
             asm.cbr(deoptInProgressLabel);
-            // load thread register if needed
-            if (usesThreadRegister) {
+            // load thread register if this kernel performs allocation
+            if (usesAllocation) {
+                RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordKind);
                 assert HsailDonorThreads.getValue() > 0;
-                asm.emitLoad(wordKind, d16_donorThreads, new HSAILAddressValue(wordKind, d16_deoptInfo, config.hsailDonorThreadsOffset).toAddress());
+                asm.emitLoad(wordKind, threadReg, new HSAILAddressValue(wordKind, d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress());
                 if (HsailDonorThreads.getValue() != 1) {
                     asm.emitComment("// map workitem to a donor thread");
                     asm.emitString(String.format("rem_u32  $%s, %s, %d;", s34_donorThreadIndex.getRegister(), workItemReg, HsailDonorThreads.getValue()));
                     asm.emitConvert(d17_donorThreadIndex, s34_donorThreadIndex, wordKind, Kind.Int);
-                    asm.emit("mad", d16_donorThreads, d17_donorThreadIndex, Constant.forInt(8), d16_donorThreads);
+                    asm.emit("mad", threadReg, d17_donorThreadIndex, Constant.forInt(8), threadReg);
                 } else {
                     // workitem is already mapped to solitary donor thread
                 }
-                AllocatableValue threadRegValue = getProviders().getRegisters().getThreadRegister().asValue(wordKind);
-                asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to a donor thread for this workitem");
-                asm.emitLoad(wordKind, threadRegValue, new HSAILAddressValue(wordKind, d16_donorThreads).toAddress());
+                asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem");
             }
         }
 
@@ -676,8 +690,10 @@
             boolean useCompressedOops = config.useCompressedOops;
             final int arrayElementsOffset = HotSpotGraalRuntime.getArrayBaseOffset(wordKind);
             String iterationObjArgReg = HSAIL.mapRegister(cc.getArgument(nonConstantParamCount - 1));
-            // iterationObjArgReg will be the highest $d register in use (it is the last parameter)
-            // so tempReg can be the next higher $d register
+            /*
+             * iterationObjArgReg will be the highest $d register in use (it is the last parameter)
+             * so tempReg can be the next higher $d register
+             */
             String tmpReg = "$d" + (asRegister(cc.getArgument(nonConstantParamCount - 1)).encoding() + 1);
             // Convert gid to long.
             asm.emitString("cvt_u64_s32 " + tmpReg + ", " + workItemReg + "; // Convert gid to long");
@@ -740,8 +756,10 @@
         int numDRegs = 0;
         int numStackSlotBytes = 0;
         if (useHSAILDeoptimization) {
-            // get the union of registers and stack slots needed to be saved at the infopoints
-            // while doing this compute the highest register in each category
+            /*
+             * Get the union of registers and stack slots needed to be saved at the infopoints. While
+             * doing this compute the highest register in each category.
+             */
             HSAILHotSpotRegisterConfig hsailRegConfig = (HSAILHotSpotRegisterConfig) regConfig;
             Set<Register> infoUsedRegs = new TreeSet<>();
             Set<StackSlot> infoUsedStackSlots = new HashSet<>();
@@ -836,13 +854,16 @@
 
             asm.emitComment("// Determine next deopt save slot");
             asm.emitAtomicAdd(scratch32, deoptNextIndexAddr, Constant.forInt(1));
-            // scratch32 now holds next index to use
-            // set error condition if no room in save area
+            /*
+             * scratch32 now holds the next index to use; set error condition if no room in save area
+             */
             asm.emitComment("// assert room to save deopt");
             asm.emitCompare(Kind.Int, scratch32, Constant.forInt(maxDeoptIndex), "lt", false, false);
             asm.cbr("@L_StoreDeopt");
-            // if assert fails, store a guaranteed negative workitemid in top level deopt occurred
-            // flag
+            /*
+             * if assert fails, store a guaranteed negative workitemid in top level deopt occurred
+             * flag
+             */
             asm.emitWorkItemAbsId(scratch32);
             asm.emit("mad", scratch32, scratch32, Constant.forInt(-1), Constant.forInt(-1));
             asm.emitStore(scratch32, deoptInfoAddr);
@@ -880,8 +901,10 @@
             asm.emitComment("// store regCounts (" + numSRegs + " $s registers, " + numDRegs + " $d registers, " + numStackSlots + " stack slots)");
             asm.emitStore(Kind.Int, Constant.forInt(numSRegs + (numDRegs << 8) + (numStackSlots << 16)), regCountsAddr);
 
-            // loop thru the usedValues storing each of the registers that are used.
-            // we always store in a fixed location, even if some registers are skipped
+            /*
+             * Loop thru the usedValues storing each of the registers that are used. We always store
+             * in a fixed location, even if some registers are skipped.
+             */
             asm.emitComment("// store used regs");
             for (Register reg : infoUsedRegs) {
                 if (hsailRegConfig.isAllocatableSReg(reg)) {
@@ -961,11 +984,11 @@
         private int intsPerInfopoint;
 
         int[] build(List<Infopoint> infoList, int numSRegs, int numDRegs, int numStackSlots, HSAILHotSpotRegisterConfig hsailRegConfig) {
-            // we are told that infoList is always sorted
-            // each infoPoint can have a different oopMap
-
-            // since numStackSlots is the number of 8-byte stack slots used, it is an upper limit on
-            // the number of oop stack slots
+            /*
+             * We are told that infoList is always sorted. Each infoPoint can have a different
+             * oopMap. Since numStackSlots is the number of 8-byte stack slots used, it is an upper
+             * limit on the number of oop stack slots
+             */
             int bitsPerInfopoint = numDRegs + numStackSlots;
             int intsForBits = (bitsPerInfopoint + 31) / 32;
             int numInfopoints = infoList.size();
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotLIRGenerator.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotLIRGenerator.java	Tue Jun 10 22:36:26 2014 +0200
@@ -38,17 +38,8 @@
 import com.oracle.graal.lir.StandardOp.SaveRegistersOp;
 import com.oracle.graal.lir.gen.*;
 import com.oracle.graal.lir.hsail.*;
-import com.oracle.graal.lir.hsail.HSAILControlFlow.CondMoveOp;
-import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizeOp;
-import com.oracle.graal.lir.hsail.HSAILControlFlow.ForeignCall1ArgOp;
-import com.oracle.graal.lir.hsail.HSAILControlFlow.ForeignCall2ArgOp;
-import com.oracle.graal.lir.hsail.HSAILControlFlow.ForeignCallNoArgOp;
-import com.oracle.graal.lir.hsail.HSAILMove.CompareAndSwapOp;
-import com.oracle.graal.lir.hsail.HSAILMove.LoadOp;
-import com.oracle.graal.lir.hsail.HSAILMove.MoveFromRegOp;
-import com.oracle.graal.lir.hsail.HSAILMove.MoveToRegOp;
-import com.oracle.graal.lir.hsail.HSAILMove.StoreConstantOp;
-import com.oracle.graal.lir.hsail.HSAILMove.StoreOp;
+import com.oracle.graal.lir.hsail.HSAILControlFlow.*;
+import com.oracle.graal.lir.hsail.HSAILMove.*;
 import com.oracle.graal.phases.util.*;
 
 /**
@@ -126,6 +117,13 @@
         return result;
     }
 
+    public Variable emitLoadAcquire(PlatformKind kind, Value address, LIRFrameState state) {
+        HSAILAddressValue loadAddress = asAddressValue(address);
+        Variable result = newVariable(kind);
+        append(new LoadAcquireOp(getMemoryKind(kind), result, loadAddress, state));
+        return result;
+    }
+
     @Override
     public void emitStore(PlatformKind kind, Value address, Value inputVal, LIRFrameState state) {
         HSAILAddressValue storeAddress = asAddressValue(address);
@@ -147,6 +145,13 @@
         }
     }
 
+    public void emitStoreRelease(PlatformKind kind, Value address, Value inputVal, LIRFrameState state) {
+        HSAILAddressValue storeAddress = asAddressValue(address);
+        // TODO: handle Constants here
+        Variable input = load(inputVal);
+        append(new StoreReleaseOp(getMemoryKind(kind), storeAddress, input, state));
+    }
+
     public Value emitCompareAndSwap(Value address, Value expectedValue, Value newValue, Value trueValue, Value falseValue) {
         PlatformKind kind = newValue.getPlatformKind();
         assert kind == expectedValue.getPlatformKind();
@@ -314,4 +319,10 @@
         emitMove(obj, address);
         append(new HSAILMove.NullCheckOp(obj, state));
     }
+
+    public Variable emitWorkItemAbsId() {
+        Variable result = newVariable(Kind.Int);
+        append(new WorkItemAbsIdOp(result));
+        return result;
+    }
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectLoadAcquireNode.java	Tue Jun 10 22:36:26 2014 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.hotspot.hsail.replacements;
+
+import com.oracle.graal.api.meta.*;
+import com.oracle.graal.nodes.*;
+import com.oracle.graal.nodes.spi.*;
+import com.oracle.graal.replacements.nodes.*;
+import com.oracle.graal.hotspot.hsail.*;
+import com.oracle.graal.word.*;
+
+public class HSAILDirectLoadAcquireNode extends DirectReadNode {
+
+    public HSAILDirectLoadAcquireNode(ValueNode address, Kind readKind) {
+        super(address, readKind);
+    }
+
+    @Override
+    public void generate(NodeLIRBuilderTool gen) {
+        HSAILHotSpotLIRGenerator hsailgen = (HSAILHotSpotLIRGenerator) (gen.getLIRGeneratorTool());
+        Value result = hsailgen.emitLoadAcquire(getKind(), gen.operand(getAddress()), null);
+        gen.setResult(this, result);
+    }
+
+    @NodeIntrinsic
+    public static native long loadAcquire(long address, @ConstantNodeParameter Kind kind);
+
+    public static long loadAcquireLong(Word address) {
+        return loadAcquire(address.rawValue(), Kind.Long);
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectStoreReleaseNode.java	Tue Jun 10 22:36:26 2014 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.hotspot.hsail.replacements;
+
+import com.oracle.graal.api.meta.*;
+import com.oracle.graal.nodes.*;
+import com.oracle.graal.nodes.spi.*;
+import com.oracle.graal.replacements.nodes.*;
+import com.oracle.graal.hotspot.hsail.*;
+import com.oracle.graal.word.*;
+
+public class HSAILDirectStoreReleaseNode extends DirectStoreNode {
+
+    public HSAILDirectStoreReleaseNode(ValueNode address, ValueNode value, Kind kind) {
+        super(address, value, kind);
+    }
+
+    @Override
+    public void generate(NodeLIRBuilderTool gen) {
+        HSAILHotSpotLIRGenerator hsailgen = (HSAILHotSpotLIRGenerator) (gen.getLIRGeneratorTool());
+        Value v = gen.operand(getValue());
+        hsailgen.emitStoreRelease(getKind(), gen.operand(getAddress()), v, null);
+    }
+
+    @NodeIntrinsic
+    public static native void storeRelease(long address, long value, @ConstantNodeParameter Kind kind);
+
+    public static void storeReleaseLong(Word address, long value) {
+        storeRelease(address.rawValue(), value, Kind.Long);
+    }
+
+}
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java	Tue Jun 10 22:36:26 2014 +0200
@@ -43,11 +43,37 @@
         hsailRegisters = registers;
     }
 
+    public static final LocationIdentity TLAB_INFO_LOCATION = new NamedLocationIdentity("TlabInfo");
+    public static final LocationIdentity TLABINFO_LASTGOODTOP_LOCATION = new NamedLocationIdentity("TlabInfoLastGoodTop");
+    public static final LocationIdentity TLABINFO_END_LOCATION = new NamedLocationIdentity("TlabInfoEnd");
+    public static final LocationIdentity TLABINFO_TOP_LOCATION = new NamedLocationIdentity("TlabInfoTop");
+    public static final LocationIdentity TLABINFO_START_LOCATION = new NamedLocationIdentity("TlabInfoStart");
+    public static final LocationIdentity TLABINFO_ALLOCINFO_LOCATION = new NamedLocationIdentity("TlabInfoAllocInfo");
+    public static final LocationIdentity TLABINFO_ORIGINALTOP_LOCATION = new NamedLocationIdentity("TlabInfoOriginalTop");
+    public static final LocationIdentity TLABINFO_DONORTHREAD_LOCATION = new NamedLocationIdentity("TlabInfoDonorThread");
+
+    public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLNEXT_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolNext");
+    public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLEND_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolEnd");
+    public static final LocationIdentity ALLOCINFO_TLABALIGNRESERVEBYTES_LOCATION = new NamedLocationIdentity("AllocInfoTlabAlignreservebytes");
+
     /**
-     * Gets the value of the thread register as a Word.
+     * Gets the value of the thread register as a Word. There is a level of indirection here. Thread
+     * register actually points to a holder for tlab info.
      */
-    public static Word thread() {
-        return registerAsWord(threadRegister(), true, false);
+    public static Word getTlabInfoPtr() {
+        Word threadRegAsWord = registerAsWord(threadRegister(), true, false);
+        return threadRegAsWord.readWord(0, TLAB_INFO_LOCATION);
+    }
+
+    public static Word getTlabInfoPtrLoadAcquire() {
+        Word threadRegAsWord = registerAsWord(threadRegister(), true, false);
+        return Word.unsigned(HSAILDirectLoadAcquireNode.loadAcquireLong(threadRegAsWord));
+    }
+
+    public static void writeTlabInfoPtrStoreRelease(Word val) {
+        // this only gets done in the waiting loop so we will always use Store Release
+        Word threadRegAsWord = registerAsWord(threadRegister(), true, false);
+        HSAILDirectStoreReleaseNode.storeReleaseLong(threadRegAsWord, val.rawValue());
     }
 
     @Fold
@@ -55,19 +81,64 @@
         return hsailRegisters.getThreadRegister();
     }
 
-    public static Word atomicGetAndAddTlabTop(Word thread, int size) {
-        return Word.unsigned(AtomicReadAndAddNode.getAndAddLong(null, thread.rawValue() + threadTlabTopOffset(), size, TLAB_TOP_LOCATION));
+    public static Word atomicGetAndAddTlabInfoTop(Word tlabInfo, int delta) {
+        return Word.unsigned(AtomicReadAndAddNode.getAndAddLong(null, tlabInfo.rawValue() + config().hsailTlabInfoTopOffset, delta, TLABINFO_TOP_LOCATION));
+    }
+
+    public static Word readTlabInfoEnd(Word tlabInfo) {
+        return tlabInfo.readWord(config().hsailTlabInfoEndOffset, TLABINFO_END_LOCATION);
+    }
+
+    public static Word readTlabInfoStart(Word tlabInfo) {
+        return tlabInfo.readWord(config().hsailTlabInfoStartOffset, TLABINFO_START_LOCATION);
+    }
+
+    public static void writeTlabInfoLastGoodTop(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoLastGoodTopOffset, val, TLABINFO_LASTGOODTOP_LOCATION);
+    }
+
+    public static void writeTlabInfoStart(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoStartOffset, val, TLABINFO_START_LOCATION);
+    }
+
+    public static void writeTlabInfoTop(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoTopOffset, val, TLABINFO_TOP_LOCATION);
+    }
+
+    public static void writeTlabInfoEnd(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoEndOffset, val, TLABINFO_END_LOCATION);
     }
 
-    public static final LocationIdentity TLAB_PFTOP_LOCATION = new NamedLocationIdentity("TlabPfTop");
+    public static Word readTlabInfoAllocInfo(Word tlabInfo) {
+        return tlabInfo.readWord(config().hsailTlabInfoAllocInfoOffset, TLABINFO_ALLOCINFO_LOCATION);
+    }
 
-    @Fold
-    public static int threadTlabPfTopOffset() {
-        return config().threadTlabPfTopOffset();
+    public static void writeTlabInfoAllocInfo(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoAllocInfoOffset, val, TLABINFO_ALLOCINFO_LOCATION);
+    }
+
+    public static void writeTlabInfoOriginalTop(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoOriginalTopOffset, val, TLABINFO_ORIGINALTOP_LOCATION);
     }
 
-    public static void writeTlabPfTop(Word thread, Word val) {
-        thread.writeWord(threadTlabPfTopOffset(), val, TLAB_PFTOP_LOCATION);
+    public static void writeTlabInfoDonorThread(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoDonorThreadOffset, val, TLABINFO_DONORTHREAD_LOCATION);
+    }
+
+    public static Word readTlabInfoDonorThread(Word tlabInfo) {
+        return tlabInfo.readWord(config().hsailTlabInfoDonorThreadOffset, TLABINFO_DONORTHREAD_LOCATION);
+    }
+
+    public static Word readAllocInfoTlabInfosPoolEnd(Word allocInfo) {
+        return allocInfo.readWord(config().hsailAllocInfoTlabInfosPoolEndOffset, ALLOCINFO_TLABINFOSPOOLEND_LOCATION);
+    }
+
+    public static Word readAllocInfoTlabAlignReserveBytes(Word allocInfo) {
+        return allocInfo.readWord(config().hsailAllocInfoTlabAlignReserveBytesOffset, ALLOCINFO_TLABALIGNRESERVEBYTES_LOCATION);
+    }
+
+    public static Word atomicGetAndAddAllocInfoTlabInfosPoolNext(Word allocInfo, int delta) {
+        return Word.unsigned(AtomicReadAndAddNode.getAndAddLong(null, allocInfo.rawValue() + config().hsailAllocInfoTlabInfosPoolNextOffset, delta, ALLOCINFO_TLABINFOSPOOLNEXT_LOCATION));
     }
 
 }
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java	Tue Jun 10 22:36:26 2014 +0200
@@ -60,6 +60,9 @@
         @Option(help = "In HSAIL allocation, allow allocation from eden as fallback if TLAB is full")
         static final OptionValue<Boolean> HsailUseEdenAllocate = new OptionValue<>(false);
 
+        @Option(help = "In HSAIL allocation, allow GPU to allocate a new tlab if TLAB is full")
+        static final OptionValue<Boolean> HsailNewTlabAllocate = new OptionValue<>(true);
+
         @Option(help = "Estimate of number of bytes allocated by each HSAIL workitem, used to size TLABs")
         static public final OptionValue<Integer> HsailAllocBytesPerWorkitem = new OptionValue<>(64);
 
@@ -67,44 +70,130 @@
     }
 
     private static final boolean hsailUseEdenAllocate = HsailUseEdenAllocate.getValue();
+    private static final boolean hsailNewTlabAllocate = HsailNewTlabAllocate.getValue();
+
+    protected static Word fillNewTlabInfoWithTlab(Word oldTlabInfo) {
+        Word allocInfo = readTlabInfoAllocInfo(oldTlabInfo);
+        Word newTlabInfo = atomicGetAndAddAllocInfoTlabInfosPoolNext(allocInfo, config().hsailTlabInfoSize);
+        Word tlabInfosPoolEnd = readAllocInfoTlabInfosPoolEnd(allocInfo);
+        if (newTlabInfo.aboveOrEqual(tlabInfosPoolEnd)) {
+            // could not get a new tlab info, mark zero and we will later deoptimize
+            return (Word.zero());
+        }
+
+        // make new size depend on old tlab size
+        Word newTlabSize = readTlabInfoEnd(oldTlabInfo).subtract(readTlabInfoStart(oldTlabInfo));
+        // try to allocate a new tlab
+        Word tlabStart = NewInstanceStub.edenAllocate(newTlabSize, false);
+        writeTlabInfoStart(newTlabInfo, tlabStart);  // write this field even if zero
+        if (tlabStart.equal(0)) {
+            // could not get a new tlab, mark zero and we will later deoptimize
+            return (Word.zero());
+        }
+        // here we have a new tlab and a tlabInfo, we can fill it in
+        writeTlabInfoTop(newTlabInfo, tlabStart);
+        writeTlabInfoOriginalTop(newTlabInfo, tlabStart);
+        // set end so that we leave space for the tlab "alignment reserve"
+        Word alignReserveBytes = readAllocInfoTlabAlignReserveBytes(allocInfo);
+        writeTlabInfoEnd(newTlabInfo, tlabStart.add(newTlabSize.subtract(alignReserveBytes)));
+        writeTlabInfoAllocInfo(newTlabInfo, allocInfo);
+        writeTlabInfoDonorThread(newTlabInfo, readTlabInfoDonorThread(oldTlabInfo));
+        return (newTlabInfo);
+    }
+
+    protected static Word allocateFromTlabSlowPath(Word fastPathTlabInfo, int size, Word fastPathTop, Word fastPathEnd) {
+        // eventually this will be a separate call, not inlined
+
+        // we come here from the fastpath allocation
+        // here we know that the tlab has overflowed (top + size > end)
+        // find out if we are the first overflower
+        Word tlabInfo = fastPathTlabInfo;
+        Word top = fastPathTop;
+        Word end = fastPathEnd;
+
+        // start a loop where we try to get a new tlab and then try to allocate from it
+        // keep doing this until we run out of tlabs or tlabInfo structures
+        // initialize result with error return value
+        Word result = Word.zero();
+        while (result.equal(Word.zero()) && tlabInfo.notEqual(Word.zero())) {
+            boolean firstOverflower = top.belowOrEqual(end);
+            if (firstOverflower) {
+                // store the last good top before overflow into last_good_top field
+                // we will move it back into top later when back in the VM
+                writeTlabInfoLastGoodTop(tlabInfo, top);
+            }
+
+            // if all this allocate tlab from gpu logic is disabled,
+            // just immediately set tlabInfo to 0 here
+            if (!hsailNewTlabAllocate) {
+                tlabInfo = Word.zero();
+            } else {
+                // loop here waiting for the first overflower to get a new tlab
+                // note that on an hsa device we must be careful how we loop in order to ensure
+                // "forward progress". For example we must not break out of the loop.
+                Word oldTlabInfo = tlabInfo;
+                do {
+                    if (firstOverflower) {
+                        // allocate new tlabInfo and new tlab to fill it, returning 0 if any
+                        // problems
+                        // this will get all spinners out of this loop.
+                        tlabInfo = fillNewTlabInfoWithTlab(oldTlabInfo);
+                        writeTlabInfoPtrStoreRelease(tlabInfo);
+                    } else {
+                        tlabInfo = getTlabInfoPtrLoadAcquire();
+                    }
+                } while (tlabInfo.equal(oldTlabInfo));
+                // when we get out of the loop if tlabInfoPtr contains 0, it means we
+                // can't get any more tlabs and will have to deoptimize
+                // otherwise, we have a valid new tlabInfo/tlab and can try to allocate again.
+                if (tlabInfo.notEqual(0)) {
+                    top = atomicGetAndAddTlabInfoTop(tlabInfo, size);
+                    end = readTlabInfoEnd(tlabInfo);
+                    Word newTop = top.add(size);
+                    if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) {
+                        result = top;
+                    }
+                }
+            }
+        } // while ((result == 0) && (tlabInfo != 0))
+        return result;
+    }
+
+    protected static Object addressToFormattedObject(Word addr, @ConstantParameter int size, Word hub, Word prototypeMarkWord, @ConstantParameter boolean fillContents,
+                    @ConstantParameter String typeContext) {
+        Object result = formatObject(hub, size, addr, prototypeMarkWord, fillContents, true, false, true);
+        profileAllocation("instance", size, typeContext);
+        return piCast(verifyOop(result), StampFactory.forNodeIntrinsic());
+    }
 
     @Snippet
     public static Object allocateInstanceAtomic(@ConstantParameter int size, Word hub, Word prototypeMarkWord, @ConstantParameter boolean fillContents, @ConstantParameter String typeContext) {
-        Word thread = thread();
         boolean haveResult = false;
         if (useTLAB()) {
-            Word top = atomicGetAndAddTlabTop(thread, size);
-            Word end = readTlabEnd(thread);
-            Word newTop = top.add(size);
-            if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) {
-                // writeTlabTop(thread, newTop) was done by the atomicGetAndAdd
-                Object result = formatObject(hub, size, top, prototypeMarkWord, fillContents, true, false, true);
-                profileAllocation("instance", size, typeContext);
-                return piCast(verifyOop(result), StampFactory.forNodeIntrinsic());
-            } else {
-                // only one overflower will be the first overflower, detectable because
-                // oldtop was still below end
-                if (top.belowOrEqual(end)) {
-                    // hack alert: store the last good top before overflow into pf_top
-                    // we will move it back into top later when back in the VM
-                    writeTlabPfTop(thread, top);
+            // inlining this manually here because it resulted in better fastpath codegen
+            Word tlabInfo = getTlabInfoPtr();
+            if (probability(FAST_PATH_PROBABILITY, tlabInfo.notEqual(0))) {
+                Word top = atomicGetAndAddTlabInfoTop(tlabInfo, size);
+                Word end = readTlabInfoEnd(tlabInfo);
+                Word newTop = top.add(size);
+                if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) {
+                    return addressToFormattedObject(top, size, hub, prototypeMarkWord, fillContents, typeContext);
+                } else {
+                    Word addr = allocateFromTlabSlowPath(tlabInfo, size, top, end);
+                    if (addr.notEqual(0)) {
+                        return addressToFormattedObject(addr, size, hub, prototypeMarkWord, fillContents, typeContext);
+                    }
                 }
-                // useless logic but see notes on deopt path below
-                haveResult = newTop.belowOrEqual(end);
             }
         }
+
+        // we could not allocate from tlab, try allocating directly from eden
         if (hsailUseEdenAllocate) {
-            // originally:
-            // result = NewInstanceStubCall.call(hub);
-
-            // we could not allocate from tlab, try allocating directly from eden
             // false for no logging
-            Word memory = NewInstanceStub.edenAllocate(Word.unsigned(size), false);
-            if (memory.notEqual(0)) {
+            Word addr = NewInstanceStub.edenAllocate(Word.unsigned(size), false);
+            if (addr.notEqual(0)) {
                 new_eden.inc();
-                Object result = formatObject(hub, size, memory, prototypeMarkWord, fillContents, true, false, true);
-                profileAllocation("instance", size, typeContext);
-                return piCast(verifyOop(result), StampFactory.forNodeIntrinsic());
+                return addressToFormattedObject(addr, size, hub, prototypeMarkWord, fillContents, typeContext);
             }
         }
         // haveResult test here helps avoid dropping earlier stores were seen to be dropped without
@@ -126,44 +215,43 @@
         return allocateArrayAtomicImpl(hub, length, prototypeMarkWord, headerSize, log2ElementSize, fillContents, maybeUnroll, typeContext);
     }
 
+    protected static Object addressToFormattedArray(Word addr, int allocationSize, int length, int headerSize, Word hub, Word prototypeMarkWord, boolean fillContents, boolean maybeUnroll,
+                    @ConstantParameter String typeContext) {
+        // we are not in a stub so we can set useSnippetCounters to true
+        Object result = formatArray(hub, allocationSize, length, headerSize, addr, prototypeMarkWord, fillContents, maybeUnroll, true);
+        profileAllocation("array", allocationSize, typeContext);
+        return piArrayCast(verifyOop(result), length, StampFactory.forNodeIntrinsic());
+    }
+
     private static Object allocateArrayAtomicImpl(Word hub, int length, Word prototypeMarkWord, int headerSize, int log2ElementSize, boolean fillContents, boolean maybeUnroll, String typeContext) {
         int alignment = wordSize();
         int allocationSize = computeArrayAllocationSize(length, alignment, headerSize, log2ElementSize);
-        Word thread = thread();
         boolean haveResult = false;
         if (useTLAB()) {
-            Word top = atomicGetAndAddTlabTop(thread, allocationSize);
-            Word end = readTlabEnd(thread);
-            Word newTop = top.add(allocationSize);
-            if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) {
-                // writeTlabTop(thread, newTop) was done by the atomicGetAndAdd
-                newarray_loopInit.inc();
-                // we are not in a stub so we can set useSnippetCounters to true
-                Object result = formatArray(hub, allocationSize, length, headerSize, top, prototypeMarkWord, fillContents, maybeUnroll, true);
-                profileAllocation("array", allocationSize, typeContext);
-                return piArrayCast(verifyOop(result), length, StampFactory.forNodeIntrinsic());
-            } else {
-                // only one overflower will be the first overflower, detectable because
-                // oldtop was still below end
-                if (top.belowOrEqual(end)) {
-                    // hack alert: store the last good top before overflow into pf_top
-                    // we will move it back into top later when back in the VM
-                    writeTlabPfTop(thread, top);
+            // inlining this manually here because it resulted in better fastpath codegen
+            Word tlabInfo = getTlabInfoPtr();
+            if (probability(FAST_PATH_PROBABILITY, tlabInfo.notEqual(0))) {
+                Word top = atomicGetAndAddTlabInfoTop(tlabInfo, allocationSize);
+                Word end = readTlabInfoEnd(tlabInfo);
+                Word newTop = top.add(allocationSize);
+                if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) {
+                    return addressToFormattedArray(top, allocationSize, length, headerSize, hub, prototypeMarkWord, fillContents, maybeUnroll, typeContext);
+                } else {
+                    Word addr = allocateFromTlabSlowPath(tlabInfo, allocationSize, top, end);
+                    if (addr.notEqual(0)) {
+                        return addressToFormattedArray(addr, allocationSize, length, headerSize, hub, prototypeMarkWord, fillContents, maybeUnroll, typeContext);
+                    }
                 }
-                // useless logic but see notes on deopt path below
-                haveResult = newTop.belowOrEqual(end);
             }
         }
+
         // we could not allocate from tlab, try allocating directly from eden
         if (hsailUseEdenAllocate) {
             // false for no logging
-            Word memory = NewInstanceStub.edenAllocate(Word.unsigned(allocationSize), false);
-            if (memory.notEqual(0)) {
+            Word addr = NewInstanceStub.edenAllocate(Word.unsigned(allocationSize), false);
+            if (addr.notEqual(0)) {
                 newarray_eden.inc();
-                // we are not in a stub so we can set useSnippetCounters to true
-                Object result = formatArray(hub, allocationSize, length, headerSize, memory, prototypeMarkWord, fillContents, maybeUnroll, true);
-                profileAllocation("array", allocationSize, typeContext);
-                return piArrayCast(verifyOop(result), length, StampFactory.forNodeIntrinsic());
+                return addressToFormattedArray(addr, allocationSize, length, headerSize, hub, prototypeMarkWord, fillContents, maybeUnroll, typeContext);
             }
         }
         if (!haveResult) {
@@ -250,6 +338,7 @@
     private static final SnippetCounter new_eden = new SnippetCounter(countersNew, "eden", "used edenAllocate");
 
     private static final SnippetCounter.Group countersNewArray = SnippetCounters.getValue() ? new SnippetCounter.Group("NewArray") : null;
-    private static final SnippetCounter newarray_loopInit = new SnippetCounter(countersNewArray, "tlabLoopInit", "TLAB alloc with zeroing in a loop");
+    // private static final SnippetCounter newarray_loopInit = new SnippetCounter(countersNewArray,
+    // "tlabLoopInit", "TLAB alloc with zeroing in a loop");
     private static final SnippetCounter newarray_eden = new SnippetCounter(countersNewArray, "eden", "used edenAllocate");
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILWorkItemAbsIdNode.java	Tue Jun 10 22:36:26 2014 +0200
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.graal.hotspot.hsail.replacements;
+
+import com.oracle.graal.api.meta.*;
+import com.oracle.graal.compiler.common.type.*;
+import com.oracle.graal.nodes.*;
+import com.oracle.graal.nodes.spi.*;
+import com.oracle.graal.hotspot.hsail.*;
+
+public class HSAILWorkItemAbsIdNode extends FixedWithNextNode implements LIRLowerable {
+
+    public HSAILWorkItemAbsIdNode() {
+        super(StampFactory.forKind(Kind.Int));
+    }
+
+    @Override
+    public void generate(NodeLIRBuilderTool gen) {
+        HSAILHotSpotLIRGenerator hsailgen = (HSAILHotSpotLIRGenerator) (gen.getLIRGeneratorTool());
+        Value result = hsailgen.emitWorkItemAbsId();
+        gen.setResult(this, result);
+    }
+
+    @NodeIntrinsic
+    public static native int getWorkItemAbsId();
+
+}
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java	Tue Jun 10 22:36:26 2014 +0200
@@ -1029,9 +1029,10 @@
      */
     @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_notice_safepoints", type = "jint*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailNoticeSafepointsOffset;
     @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_deopt_occurred", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptOccurredOffset;
-    @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_never_ran_array", type = "jboolean *", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailNeverRanArrayOffset;
+    @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_never_ran_array", type = "jboolean*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailNeverRanArrayOffset;
     @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_deopt_next_index", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptNextIndexOffset;
-    @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_donor_threads", type = "JavaThread**", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDonorThreadsOffset;
+    @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_alloc_info", type = "HSAILAllocationInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoOffset;
+    @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_cur_tlab_info", type = "HSAILTlabInfo**", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailCurTlabInfoOffset;
 
     @HotSpotVMField(name = "Hsail::HSAILKernelDeoptimization::_workitemid", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptimizationWorkItem;
     @HotSpotVMField(name = "Hsail::HSAILKernelDeoptimization::_actionAndReason", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptimizationReason;
@@ -1043,6 +1044,20 @@
     @HotSpotVMType(name = "Hsail::HSAILKernelDeoptimization", get = HotSpotVMType.Type.SIZE) @Stable public int hsailKernelDeoptimizationHeaderSize;
     @HotSpotVMType(name = "Hsail::HSAILDeoptimizationInfo", get = HotSpotVMType.Type.SIZE) @Stable public int hsailDeoptimizationInfoHeaderSize;
 
+    @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_infos_pool_start", type = "HSAILTlabInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabInfosPoolStartOffset;
+    @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_infos_pool_next", type = "HSAILTlabInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabInfosPoolNextOffset;
+    @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_infos_pool_end", type = "HSAILTlabInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabInfosPoolEndOffset;
+    @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_align_reserve_bytes", type = "size_t", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabAlignReserveBytesOffset;
+
+    @HotSpotVMField(name = "HSAILTlabInfo::_start", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoStartOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoTopOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_end", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoEndOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_last_good_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoLastGoodTopOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_original_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoOriginalTopOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_donor_thread", type = "JavaThread*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoDonorThreadOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_alloc_info", type = "HSAILAllocationInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoAllocInfoOffset;
+    @HotSpotVMType(name = "HSAILTlabInfo", get = HotSpotVMType.Type.SIZE) @Stable public int hsailTlabInfoSize;
+
     /**
      * Mark word right shift to get identity hash code.
      */
--- a/graal/com.oracle.graal.lir.hsail/src/com/oracle/graal/lir/hsail/HSAILMove.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.lir.hsail/src/com/oracle/graal/lir/hsail/HSAILMove.java	Tue Jun 10 22:36:26 2014 +0200
@@ -33,6 +33,7 @@
 import com.oracle.graal.lir.*;
 import com.oracle.graal.lir.StandardOp.MoveOp;
 import com.oracle.graal.lir.asm.*;
+import com.oracle.graal.hsail.*;
 
 /**
  * Implementation of move instructions.
@@ -167,6 +168,25 @@
             HSAILAddress addr = address.toAddress();
             masm.emitLoad(kind, result, addr);
         }
+
+        public boolean usesThreadRegister() {
+            return (address.toAddress().getBase() == HSAIL.threadRegister);
+        }
+    }
+
+    /**
+     * A LoadOp that uses the HSAIL ld_acq instruction
+     */
+    public static class LoadAcquireOp extends LoadOp {
+        public LoadAcquireOp(Kind kind, AllocatableValue result, HSAILAddressValue address, LIRFrameState state) {
+            super(kind, result, address, state);
+        }
+
+        @Override
+        public void emitMemAccess(HSAILAssembler masm) {
+            HSAILAddress addr = address.toAddress();
+            masm.emitLoadAcquire(result, addr);
+        }
     }
 
     public static class StoreOp extends MemOp {
@@ -186,6 +206,22 @@
         }
     }
 
+    /**
+     * A StoreOp that uses the HSAIL st_rel instruction
+     */
+    public static class StoreReleaseOp extends StoreOp {
+        public StoreReleaseOp(Kind kind, HSAILAddressValue address, AllocatableValue input, LIRFrameState state) {
+            super(kind, address, input, state);
+        }
+
+        @Override
+        public void emitMemAccess(HSAILAssembler masm) {
+            assert isRegister(input);
+            HSAILAddress addr = address.toAddress();
+            masm.emitStoreRelease(input, addr);
+        }
+    }
+
     public static class StoreConstantOp extends MemOp {
 
         protected final Constant input;
@@ -465,4 +501,18 @@
         }
     }
 
+    public static class WorkItemAbsIdOp extends HSAILLIRInstruction {
+
+        @Def({REG}) protected AllocatableValue result;
+
+        public WorkItemAbsIdOp(AllocatableValue result) {
+            this.result = result;
+        }
+
+        @Override
+        public void emitCode(CompilationResultBuilder crb, HSAILAssembler masm) {
+            masm.emitWorkItemAbsId(result);
+        }
+    }
+
 }
--- a/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectReadNode.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectReadNode.java	Tue Jun 10 22:36:26 2014 +0200
@@ -45,6 +45,10 @@
         this.readKind = readKind;
     }
 
+    protected ValueNode getAddress() {
+        return address;
+    }
+
     @Override
     public void generate(NodeLIRBuilderTool gen) {
         gen.setResult(this, gen.getLIRGeneratorTool().emitLoad(readKind, gen.operand(address), null));
--- a/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectStoreNode.java	Tue Jun 10 19:08:33 2014 +0200
+++ b/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectStoreNode.java	Tue Jun 10 22:36:26 2014 +0200
@@ -53,6 +53,14 @@
         gen.getLIRGeneratorTool().emitStore(kind, gen.operand(address), v, null);
     }
 
+    protected ValueNode getAddress() {
+        return address;
+    }
+
+    protected ValueNode getValue() {
+        return value;
+    }
+
     /*
      * The kind of the store is provided explicitly in these intrinsics because it is not always
      * possible to determine the kind from the given value during compilation (because stack kinds
--- a/src/gpu/hsail/vm/gpu_hsail.cpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.cpp	Tue Jun 10 22:36:26 2014 +0200
@@ -69,8 +69,8 @@
   {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
 };
 
-void * Hsail::_device_context = NULL;
-jint   Hsail::_notice_safepoints = false;
+void* Hsail::_device_context = NULL;
+jint  Hsail::_notice_safepoints = false;
 
 Hsail::okra_create_context_func_t  Hsail::_okra_create_context;
 Hsail::okra_create_kernel_func_t   Hsail::_okra_create_kernel;
@@ -85,43 +85,6 @@
 Hsail::okra_clearargs_func_t       Hsail::_okra_clearargs;
 Hsail::okra_register_heap_func_t   Hsail::_okra_register_heap;
 
-struct Stats {
-  int _dispatches;
-  int _deopts;
-  int _overflows;
-  bool _changeSeen;
-
-public:
-  Stats() {
-    _dispatches = _deopts = _overflows = 0;
-    _changeSeen = false;
-  }
-
-  void incDeopts() {
-    _deopts++;
-    _changeSeen = true;
-  }
-  void incOverflows() {
-    _overflows++;
-    _changeSeen = true;
-  }
-
-  void finishDispatch() {
-    _dispatches++;
-    if (_changeSeen) {
-      // print();
-      _changeSeen = false;
-    }
-  }
-
-  void print() {
-    tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows);
-  }
-
-};
-
-static Stats kernelStats;
-
 //static jint in_kernel = 0;
 
 void Hsail::notice_safepoints() {
@@ -165,7 +128,7 @@
 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
 GPU_END
 
-static void showRanges(jboolean *a, int len) {
+static void showRanges(jboolean* a, int len) {
   // show ranges
   bool lookFor = true;
   for (int i = 0; i < len; i++) {
@@ -182,38 +145,6 @@
   }
 }
 
-// fill and retire old tlab and get a new one
-// if we can't get one, no problem someone will eventually do a gc
-void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) {
-  tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)
-
-  // get a size for a new tlab that is at least tlabMinHsail.
-  size_t new_tlab_size = tlab->compute_size(tlabMinHsail);
-  if (new_tlab_size == 0) return;
-
-  HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
-  if (tlab_start == NULL) return;
-
-  // ..and clear it if required
-  if (ZeroTLAB) {
-    Copy::zero_to_words(tlab_start, new_tlab_size);
-  }
-  // and init the tlab pointers
-  tlab->fill(tlab_start, tlab_start, new_tlab_size);
-}
-
-static void printTlabInfo (ThreadLocalAllocBuffer* tlab) {
-  HeapWord *start = tlab->start();
-  HeapWord *top = tlab->top();
-  HeapWord *end = tlab->end();
-  // sizes are in bytes
-  size_t tlabFree = tlab->free() * HeapWordSize;
-  size_t tlabUsed = tlab->used() * HeapWordSize;
-  size_t tlabSize = tlabFree + tlabUsed;
-  double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
-  tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
-}
-
 class OopSaver : public StackObj {
 private:
   objArrayOop _oopsSaveArray;
@@ -260,21 +191,21 @@
     _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array);
   }
 
-  void * getOopForBit(HSAILFrame * hsailFrame, int bit) {
+  void* getOopForBit(HSAILFrame* hsailFrame, int bit) {
     assert(isOop(hsailFrame, bit), "");
-    void *oop;
+    void* oop;
     if (bit < hsailFrame->num_d_regs()) {
       // d register
       oop = (void*) hsailFrame->get_d_reg(bit);
     } else {
       // stack slot
       int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
-      oop = (void *) hsailFrame->get_stackslot64(stackOffset);
+      oop = (void*) hsailFrame->get_stackslot64(stackOffset);
     }
     return oop;
   }
 
-  void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) {
+  void putOopForBit(HSAILFrame* hsailFrame, int bit, void* oop) {
     assert(isOop(hsailFrame, bit), "");
     if (bit < hsailFrame->num_d_regs()) {
       // d register
@@ -286,7 +217,7 @@
     }
   }
 
-  void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){
+  void saveOopsFromFrame(HSAILFrame* hsailFrame, int deoptSlot){
     // as used, no need to resolve arrays on each call
     int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
 
@@ -300,7 +231,7 @@
     }
   }
 
-  void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){
+  void restoreOopsToFrame(HSAILFrame* hsailFrame, int deoptSlot, int workitem){
     // need to re-resolve on each restore
     resolveArrays();
     int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
@@ -310,13 +241,13 @@
       if (isOop(hsailFrame, bit)) {
         // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame
         int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
-        void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex);
-        void * oldValue = getOopForBit(hsailFrame, bit);
+        void* newValue = (void*) _oopsSaveArray->obj_at(saveArrayIndex);
+        void* oldValue = getOopForBit(hsailFrame, bit);
         assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved");
         if (newValue != oldValue) {
           if (TraceGPUInteraction) {
             int numDRegs = hsailFrame->num_d_regs();
-            const char *name = (bit < numDRegs ? "$d" : "stk");
+            const char* name = (bit < numDRegs ? "$d" : "stk");
             int num = (bit < numDRegs ? bit : bit - numDRegs);
             tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p",
                           name, num, workitem, deoptSlot, oldValue, newValue);
@@ -327,7 +258,7 @@
     }
   }
 
-  bool isOop(HSAILFrame * hsailFrame, int bit){
+  bool isOop(HSAILFrame* hsailFrame, int bit){
     // re-resolve on each access
     resolveArrays();
     if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) {
@@ -347,47 +278,15 @@
 
 };
 
-jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save,
+jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oops_save,
                                                 jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
   ResourceMark rm(THREAD);
   objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
 
-  // TODO: avoid donor thread logic if kernel does not allocate
-  objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads);
-  int numDonorThreads = donorThreadObjects->length();
-  guarantee(numDonorThreads > 0, "need at least one donor thread");
-  JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads);
-  for (int i = 0; i < numDonorThreads; i++) {
-    donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
-  }
-
-
-  // compute tlabMinHsail based on number of workitems, number of donor
-  // threads, allocBytesPerWorkitem rounded up
-  size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads;
-  if (TraceGPUInteraction) {
-    tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail);
-  }
-
-  for (int i = 0; i < numDonorThreads; i++) {
-    JavaThread* donorThread = donorThreads[i];
-    ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
-    if (TraceGPUInteraction) {
-      tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
-      printTlabInfo(tlab);
-    }
-
-    // note: this used vs. free limit checking should be based on some
-    // heuristic where we see how much this kernel tends to allocate
-    if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) {
-      getNewTlabForDonorThread(tlab, tlabMinHsail);
-      if (TraceGPUInteraction) {
-        tty->print("donorThread %d, refilled tlab, -> ", i);
-        printTlabInfo(tlab);
-      }
-    }
-  }
-
+  // We avoid HSAILAllocationInfo logic if kernel does not allocate
+  // in which case the donor_thread array passed in will be null
+  HSAILAllocationInfo* allocInfo = (donor_threads == NULL ? NULL : new HSAILAllocationInfo(donor_threads, dimX, allocBytesPerWorkitem));
+  
   // Reset the kernel arguments
   _okra_clearargs(kernel);
 
@@ -400,7 +299,11 @@
     int numStackSlots = (saveAreaCounts >> 16);
     int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8;
 
-    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, donorThreads);
+    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, allocInfo);
+    // copy cur_tlab_infos
+    if (allocInfo != NULL) {
+      e->setCurTlabInfos(allocInfo->getCurTlabInfos());
+    }
   }
 
   // This object sets up the kernel arguments
@@ -409,8 +312,8 @@
     tty->print_cr("[HSAIL] range=%d", dimX);
   }
 
-  // if any object passed was null, throw an exception here
-  // doing this means the kernel code can avoid null checks on the object parameters.
+  // If any object passed was null, throw an exception here. Doing this
+  // means the kernel code can avoid null checks on the object parameters.
   if (hka.getFirstNullParameterIndex() >= 0) {
     char buf[64];
     sprintf(buf, "Null Kernel Parameter seen, Parameter Index: %d", hka.getFirstNullParameterIndex());
@@ -431,23 +334,9 @@
     //in_kernel = 0;
   }
 
-  // fix up any tlab tops that overflowed
-  bool anyOverflows = false;
-  for (int i = 0; i < numDonorThreads; i++) {
-    JavaThread * donorThread = donorThreads[i];
-    ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
-    if (tlab->top() > tlab->end()) {
-      anyOverflows = true;
-      long overflowAmount = (long) tlab->top() - (long) tlab->pf_top(); 
-      // tlab->set_top is private this ugly hack gets around that
-      *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top();
-      if (TraceGPUInteraction) {
-        tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top());
-      }
-    }
-  }
-  if (anyOverflows) {
-    kernelStats.incOverflows();
+  // avoid HSAILAllocationInfo logic if kernel does not allocate
+  if (allocInfo != NULL) {
+    allocInfo->postKernelCleanup();
   }
 
   if (UseHSAILDeoptimization) {
@@ -465,13 +354,11 @@
         guarantee(deoptcode == 1, msg);
       }
     } else {
-      kernelStats.incDeopts();
-
       {
         TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction);
         if (TraceGPUInteraction) {
           tty->print_cr("deopt happened.");
-          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0);
+          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(0);
           tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
         }
 
@@ -485,7 +372,7 @@
         // since slots are allocated from the beginning, we know how far to look
         assert(e->num_deopts() < e->num_slots(), "deopt save state overflow");
         for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
+          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k);
           assert (pdeopt->workitem() >= 0, "bad workitem in deopt");
           // this is a workitem that deopted
           oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k);
@@ -494,15 +381,15 @@
         // Handle any deopting workitems.
         int count_deoptimized = 0;
         for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
+          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k);
 
           jint workitem = pdeopt->workitem();
           if (workitem != -1) {
             int deoptId = pdeopt->pc_offset();
-            HSAILFrame *hsailFrame = pdeopt->first_frame();
+            HSAILFrame* hsailFrame = pdeopt->first_frame();
 
-            // update the hsailFrame from the oopsSaveArray
-            // will re-resolve the handles each time
+            // Update the hsailFrame from the oopsSaveArray
+            // will re-resolve the handles each time.
             oopSaver.restoreOopsToFrame(hsailFrame, k, workitem);
 
             JavaValue result(T_VOID);
@@ -511,7 +398,7 @@
             javaArgs.push_int(deoptId);
             javaArgs.push_long((jlong) hsailFrame);
 
-            // override the deoptimization action with Action_none until we decide
+            // Override the deoptimization action with Action_none until we decide
             // how to handle the other actions.
             int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none);
             javaArgs.push_int(myActionReason);
@@ -551,7 +438,7 @@
       // turn off verbose trace stuff for javacall arg setup
       bool savedTraceGPUInteraction = TraceGPUInteraction;
       TraceGPUInteraction = false;
-      jboolean *never_ran_array = e->never_ran_array();
+      jboolean* never_ran_array = e->never_ran_array();
       if (handleNeverRansHere) {
         for (int k = 0; k < dimX; k++) {
           if (never_ran_array[k]) {
@@ -562,9 +449,10 @@
             JavaCallArguments javaArgs;
             // re-resolve the args_handle here
             objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args);
-            // This object sets up the javaCall arguments
-            // the way argsArray is set up, this should work for instance methods as well
-            // (the receiver will be the first oop pushed)
+
+            // This object sets up the javaCall arguments. The way
+            // argsArray is set up, this should work for instance
+            // methods as well (the receiver will be the first oop pushed)
             HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static());
             if (mh->is_static()) {
               JavaCalls::call_static(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD);
@@ -583,19 +471,19 @@
     }
 
     delete e;
+    delete allocInfo;
   }
-  kernelStats.finishDispatch();
   return success;
 }
 
-GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle))
+GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle))
   guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked");
   ResourceMark rm;
   jsize name_len = env->GetStringLength(name_handle);
   jsize code_len = env->GetArrayLength(code_handle);
 
   char* name = NEW_RESOURCE_ARRAY(char, name_len + 1);
-  unsigned char *code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1);
+  unsigned char* code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1);
 
   code[code_len] = 0;
   name[name_len] = 0;
@@ -631,7 +519,7 @@
         return false; \
   } \
 
-GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv *env, jclass))
+GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv* env, jclass))
   if (okra_library_name == NULL) {
     if (TraceGPUInteraction) {
       tty->print_cr("Unsupported HSAIL platform");
@@ -641,14 +529,14 @@
 
   // here we know we have a valid okra_library_name to try to load
   char ebuf[O_BUFLEN];
-  char *okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_");
+  char* okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_");
   if (okra_lib_name_from_env_var != NULL) {
     okra_library_name = okra_lib_name_from_env_var;
   }
   if (TraceGPUInteraction) {
     tty->print_cr("[HSAIL] library is %s", okra_library_name);
   }
-  void *okra_lib_handle = NULL;
+  void* okra_lib_handle = NULL;
 #if defined(LINUX)
   // Check first if the Okra library is already loaded.
   // TODO: Figure out how to do this on other OSes.
@@ -668,8 +556,8 @@
   
   guarantee(_okra_create_context == NULL, "cannot repeat GPU initialization");
 
-  // at this point we know  okra_lib_handle is valid whether we loaded
-  // here or earlier.  In either case, we can lookup the functions
+  // At this point we know  okra_lib_handle is valid whether we loaded
+  // here or earlier.  In either case, we can lookup the functions.
   LOOKUP_OKRA_FUNCTION(okra_create_context, okra_create_context);
   LOOKUP_OKRA_FUNCTION(okra_create_kernel, okra_create_kernel);
   LOOKUP_OKRA_FUNCTION(okra_push_object, okra_push_object);
--- a/src/gpu/hsail/vm/gpu_hsail.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -22,12 +22,47 @@
  *
  */
 
-#ifndef GPU_HSAIL_HPP
-#define GPU_HSAIL_HPP
+#ifndef GPU_HSAIL_VM_GPU_HSAIL_HPP
+#define GPU_HSAIL_VM_GPU_HSAIL_HPP
 
 #include "utilities/exceptions.hpp"
 #include "graal/graalEnv.hpp"
 #include "gpu_hsail_Frame.hpp"
+#include "gpu_hsail_Tlab.hpp"
+
+struct HSAILKernelStats {
+  int _dispatches;
+  int _deopts;
+  int _overflows;
+  bool _changeSeen;
+  
+public:
+  HSAILKernelStats() {
+    _dispatches = _deopts = _overflows = 0;
+    _changeSeen = false;
+  }
+  
+  void incDeopts() {
+    _deopts++;
+    _changeSeen = true;
+  }
+  void incOverflows() {
+    _overflows++;
+    _changeSeen = true;
+  }
+  
+  void finishDispatch() {
+    _dispatches++;
+    if (_changeSeen) {
+      // print();
+      _changeSeen = false;
+    }
+  }
+  
+  void print() {
+    tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows);
+  }
+};
 
 class Hsail : public Gpu {
 
@@ -46,9 +81,9 @@
     inline jint workitem() { return _workitemid; }
     inline jint reason() { return _actionAndReason; }
     inline jint pc_offset() { return first_frame()->pc_offset(); }
-    inline HSAILFrame *first_frame() {
+    inline HSAILFrame* first_frame() {
       // starts after the "header" fields
-      return (HSAILFrame *) (((jbyte *) this) + sizeof(*this));
+      return (HSAILFrame*) (((jbyte*) this) + sizeof(*this));
     }
   };
 
@@ -56,38 +91,41 @@
 // TODO: query the device to get this number
 #define MAX_DEOPT_SLOTS    (8 * 40 * 64)
 
+
   class HSAILDeoptimizationInfo : public CHeapObj<mtInternal> {
     friend class VMStructs;
    private:
     jint* _notice_safepoints;
     jint _deopt_occurred;
     jint _deopt_next_index;
-    JavaThread** _donor_threads;
     jint _num_slots;
     jint _deopt_span;
+    HSAILTlabInfo** _cur_tlab_info;   // copy of what was in the HSAILAllocationInfo, to avoid an extra indirection
+    HSAILAllocationInfo* _alloc_info;
     char _ignore;
     // keep a pointer last so save area following it is word aligned
-    jboolean * _never_ran_array; 
+    jboolean* _never_ran_array; 
 
    public:
+    // static HSAILKernelStats kernelStats;
     HSAILKernelDeoptimization _deopt_save_states[1];  // number and size of these can vary per kernel
 
     static inline size_t hdr_size() {
       return sizeof(HSAILDeoptimizationInfo);
     }
 
-    inline jbyte * save_area_start() {
+    inline jbyte* save_area_start() {
       return (jbyte*) (this) + hdr_size();
     }
 
-    inline HSAILDeoptimizationInfo(int numSlots, int bytesPerSaveArea, int dimX, JavaThread** donorThreads) {
+    inline HSAILDeoptimizationInfo(int numSlots, int bytesPerSaveArea, int dimX, HSAILAllocationInfo* allocInfo) {
       _notice_safepoints = &Hsail::_notice_safepoints;
       _deopt_occurred = 0;
       _deopt_next_index = 0;
       _num_slots = numSlots;
       _never_ran_array = NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal);
       memset(_never_ran_array, 0, dimX * sizeof(jboolean));
-      _donor_threads = donorThreads;
+      _alloc_info = allocInfo;
       _deopt_span = sizeof(HSAILKernelDeoptimization) + sizeof(HSAILFrame) + bytesPerSaveArea;
       if (TraceGPUInteraction) {
         tty->print_cr("HSAILDeoptimizationInfo allocated, %d slots of size %d, total size = 0x%lx bytes", _num_slots, _deopt_span, (_num_slots * _deopt_span + sizeof(HSAILDeoptimizationInfo)));
@@ -102,21 +140,25 @@
       return _deopt_occurred;
     }
     inline jint num_deopts() { return _deopt_next_index; }
-    inline jboolean *never_ran_array() { return _never_ran_array; }
+    inline jboolean* never_ran_array() { return _never_ran_array; }
     inline jint num_slots() {return _num_slots;}
 
-    inline HSAILKernelDeoptimization * get_deopt_save_state(int slot) {
+    inline HSAILKernelDeoptimization* get_deopt_save_state(int slot) {
       // use _deopt_span to index into _deopt_states
-      return (HSAILKernelDeoptimization *) (save_area_start() + _deopt_span * slot);
+      return (HSAILKernelDeoptimization*) (save_area_start() + _deopt_span * slot);
     }
 
-    void * operator new (size_t hdrSize, int numSlots, int bytesPerSaveArea) {
+    void setCurTlabInfos(HSAILTlabInfo** ptlabInfos) {
+      _cur_tlab_info = ptlabInfos;
+    }
+
+    void* operator new (size_t hdrSize, int numSlots, int bytesPerSaveArea) {
       assert(hdrSize <= hdr_size(), "");
       size_t totalSizeBytes = hdr_size()  + numSlots * (sizeof(HSAILKernelDeoptimization) + sizeof(HSAILFrame) + bytesPerSaveArea);
       return NEW_C_HEAP_ARRAY(char, totalSizeBytes, mtInternal);
     }
 
-    void operator delete (void *ptr) {
+    void operator delete (void* ptr) {
       FREE_C_HEAP_ARRAY(char, ptr, mtInternal);
     }
   };
@@ -126,21 +168,16 @@
   static JNINativeMethod HSAIL_methods[];
 
   // static native boolean initialize();
-  JNIEXPORT static jboolean initialize(JNIEnv *env, jclass);
+  JNIEXPORT static jboolean initialize(JNIEnv* env, jclass);
 
   // static native long generateKernel(byte[] targetCode, String name);
-  JNIEXPORT static jlong generate_kernel(JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle);
+  JNIEXPORT static jlong generate_kernel(JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle);
 
   // static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args);
-  JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv *env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args, jobject oopsSave,
+  JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv* env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args, jobject oopsSave,
                                                    jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array);
 
-  // static native void getThreadPointers(Object[] donorThreads, long[] threadPointersOut);
-  JNIEXPORT static void get_thread_pointers(JNIEnv *env, jclass, jobject donor_threads_handle, jobject thread_ptrs_handle);
-
-  static void getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail);
-
-  static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oopsSave,
+  static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oopsSave,
                                                   jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS);
 
   static void register_heap();
@@ -165,7 +202,7 @@
 
 private:
   typedef void* (*okra_create_context_func_t)();
-  typedef void* (*okra_create_kernel_func_t)(void*, unsigned char *, const char *);
+  typedef void* (*okra_create_kernel_func_t)(void*, unsigned char*, const char*);
   typedef bool (*okra_push_object_func_t)(void*, void*);
   typedef bool (*okra_push_boolean_func_t)(void*, jboolean);
   typedef bool (*okra_push_byte_func_t)(void*, jbyte);
@@ -197,4 +234,4 @@
   // true if safepoints are activated
   static jint _notice_safepoints;
 };
-#endif // GPU_HSAIL_HPP
+#endif // GPU_HSAIL_VM_GPU_HSAIL_HPP
--- a/src/gpu/hsail/vm/gpu_hsail_Frame.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail_Frame.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -22,8 +22,8 @@
  *
  */
 
-#ifndef GPU_HSAIL_FRAME_HPP
-#define GPU_HSAIL_FRAME_HPP
+#ifndef GPU_HSAIL_VM_GPU_HSAIL_FRAME_HPP
+#define GPU_HSAIL_VM_GPU_HSAIL_FRAME_HPP
 
 #include "graal/graalEnv.hpp"
 #include "code/debugInfo.hpp"
@@ -43,31 +43,31 @@
   jint num_s_regs() {return _num_s_regs; }
   jint num_d_regs() {return _num_d_regs; }
   jint num_stack_slots() {return _num_stack_slots; }
-  jbyte * data_start() {return (jbyte *) this  + sizeof(*this); }
+  jbyte* data_start() {return (jbyte*) this  + sizeof(*this); }
   jlong get_d_reg(int idx) {
     int ofst = num_s_regs() * 4 + idx * 8;
-    return(*(jlong *) (data_start() + ofst));
+    return(*(jlong*) (data_start() + ofst));
   }
   jint get_s_reg(int idx) {
     int ofst = idx * 4;
-    return(*(jint *) (data_start() + ofst));
+    return(*(jint*) (data_start() + ofst));
   }
   void put_d_reg(int idx, jlong val) {
     int ofst = num_s_regs() * 4 + idx * 8;
-    (*(jlong *) (data_start() + ofst)) = val;
+    (*(jlong*) (data_start() + ofst)) = val;
   }
   jint get_stackslot32(int stackOffset) {
     int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset;
-    return(*(jint *) (data_start() + ofst));
+    return(*(jint*) (data_start() + ofst));
   }
   jlong get_stackslot64(int stackOffset) {
     int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset;
-    return(*(jlong *) (data_start() + ofst));
+    return(*(jlong*) (data_start() + ofst));
   }
   void put_stackslot64(int stackOffset, jlong val) {
     int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset;
-    (*(jlong *) (data_start() + ofst)) = val;
+    (*(jlong*) (data_start() + ofst)) = val;
   }
 };
   
-#endif // GPU_HSAIL_FRAME_HPP
+#endif // GPU_HSAIL_VM_GPU_HSAIL_FRAME_HPP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
+#define GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
+
+#include "graal/graalEnv.hpp"
+#include "code/debugInfo.hpp"
+#include "code/location.hpp"
+#include "gpu_hsail.hpp"
+
+class HSAILAllocationInfo;
+
+class HSAILTlabInfo VALUE_OBJ_CLASS_SPEC {
+  friend class VMStructs;
+public:
+  // uses only the necessary fields from a full TLAB
+  HeapWord* _start;
+  HeapWord* _top;
+  HeapWord* _end;
+  HeapWord* _last_good_top;
+  HeapWord* _original_top;
+  JavaThread* _donor_thread;         // donor thread associated with this tlabInfo
+  HSAILAllocationInfo* _alloc_info;   // same as what is in HSAILDeoptimizationInfo
+
+  // Accessors
+  HeapWord* start() { return _start; }
+  HeapWord* top() { return _top; }
+  HeapWord* end() { return _end; }
+  HeapWord* last_good_top() { return _last_good_top; }
+  HeapWord* original_top() { return _original_top; }
+  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) {
+    _start = start;
+    _top = _original_top = top;
+    _end = end;
+    _donor_thread = donorThread;
+    _alloc_info = allocInfo;
+  }
+};
+
+
+class HSAILAllocationInfo : public CHeapObj<mtInternal> {
+  friend class VMStructs;
+private:
+  JavaThread** donorThreads;
+  jint _num_donor_threads;
+  size_t _tlab_align_reserve_bytes;    // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
+  HSAILTlabInfo** _cur_tlab_infos;    // array of current tlab info pointers, one per donor_thread
+  HSAILTlabInfo* _tlab_infos_pool_start;    // pool for new tlab_infos
+  HSAILTlabInfo* _tlab_infos_pool_next;     // where next will be allocated from
+  HSAILTlabInfo* _tlab_infos_pool_end;      // end of the tlab_infos pool
+
+public:
+  HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) {
+    // fill in the donorThreads array
+    objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj);
+    _num_donor_threads = donorThreadObjects->length();
+    guarantee(_num_donor_threads > 0, "need at least one donor thread");
+    donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal);
+    for (int i = 0; i < _num_donor_threads; i++) {
+      donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
+    }
+    
+    // Compute max_tlab_infos based on amount of free heap space
+    size_t max_tlab_infos;
+    {
+      JavaThread* donorThread = donorThreads[0];
+      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      size_t new_tlab_size = tlab->compute_size(0);
+      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread);
+      if (new_tlab_size != 0) {
+        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads));
+      } else {
+        max_tlab_infos = 8 * _num_donor_threads;   // an arbitrary multiple
+      }
+      if (TraceGPUInteraction) {
+        tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
+      }
+    }
+
+    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal);
+    _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
+    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads];
+    _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
+    _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();
+      
+    // we will fill the first N tlabInfos from the donor threads
+    for (int i = 0; i < _num_donor_threads; i++) {
+      JavaThread* donorThread = donorThreads[i];
+      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      if (TraceGPUInteraction) {
+        tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
+        printTlabInfoFromThread(tlab);
+      }
+      
+      // Here we try to get a new tlab if current one is null. Note:
+      // eventually we may want to test if the size is too small based
+      // on some heuristic where we see how much this kernel tends to
+      // allocate, but for now we can just let it overflow and let the
+      // GPU allocate new tlabs. Actually, if we can't prime a tlab
+      // here, it might make sense to do a gc now rather than to start
+      // the kernel and have it deoptimize.  How to do that?
+      if (tlab->end() == NULL) {
+        bool success = getNewTlabForDonorThread(tlab, i);
+        if (TraceGPUInteraction) {
+          if (success) {
+            tty->print("donorThread %d, refilled tlab, -> ", i);
+            printTlabInfoFromThread(tlab);
+          } else {
+            tty->print("donorThread %d, could not refill tlab, left as ", i);
+            printTlabInfoFromThread(tlab);
+          }
+        }
+      }
+
+      // extract the necessary tlab fields into a TlabInfo record
+      HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
+      _cur_tlab_infos[i] = pTlabInfo;
+      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this);
+    }
+  }
+
+  ~HSAILAllocationInfo() {
+    FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
+    FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
+    FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
+  }
+
+  void postKernelCleanup() {
+    // go thru all the tlabInfos, fix up any tlab tops that overflowed
+    // complete the tlabs if they overflowed
+    // update the donor threads tlabs when appropriate
+    bool anyOverflows = false;
+    size_t bytesAllocated = 0;
+    // if there was an overflow in allocating tlabInfos, correct it here
+    if (_tlab_infos_pool_next > _tlab_infos_pool_end) {
+      if (TraceGPUInteraction) {
+        int overflowAmount = _tlab_infos_pool_next - _tlab_infos_pool_end;
+        tty->print_cr("tlabInfo allocation overflowed by %d units", overflowAmount);
+      }
+      _tlab_infos_pool_next = _tlab_infos_pool_end;
+    }
+    for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) {
+      if (TraceGPUInteraction) {
+        tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, 
+                      tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
+      }
+      JavaThread* donorThread = tlabInfo->_donor_thread;
+      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      bool overflowed = false;
+      // skip a tlabInfo with NULL fields: either we could not prime it
+      // on entry, or we could not get a tlab from the gpu
+      if (tlabInfo->start() != NULL) {
+        if (tlabInfo->top() > tlabInfo->end()) {
+          anyOverflows = true;
+          overflowed = true;
+          if (TraceGPUInteraction) {
+            long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); 
+            tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top());
+          }
+          tlabInfo->_top = tlabInfo->last_good_top();
+        }
+
+        // fill the donor thread tlab with the tlabInfo information
+        // we do this even if it will get overwritten by a later tlabinfo
+        // because it helps with tlab statistics for that donor thread
+        tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());
+
+        // if there was an overflow, make it parsable with retire = true
+        if (overflowed) {
+          tlab->make_parsable(true);
+        }
+        
+        size_t delta = (long)(tlabInfo->top()) - (long)(tlabInfo->original_top());
+        if (TraceGPUInteraction) {
+          tty->print_cr("%ld bytes were allocated by tlabInfo %p (start %p, top %p, end %p)", delta, tlabInfo,
+                        tlabInfo->start(), tlabInfo->top(), tlabInfo->end());
+        }
+        bytesAllocated += delta;
+      }
+    }
+    if (TraceGPUInteraction) {
+      tty->print_cr("%ld total bytes were allocated in this kernel", bytesAllocated);
+    }
+    if (anyOverflows) {
+      // Hsail::kernelStats.incOverflows();
+    }
+  }
+
+  HSAILTlabInfo** getCurTlabInfos() {
+    return _cur_tlab_infos;
+  }
+
+private:
+  // fill and retire old tlab and get a new one
+  // if we can't get one, no problem someone will eventually do a gc
+  bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) {
+
+    tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)
+    
+    // get a size for a new tlab that is based on the desired_size
+    size_t new_tlab_size = tlab->compute_size(0);
+    if (new_tlab_size == 0) return false;
+    
+    HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
+    if (tlab_start == NULL) return false;
+    
+    // ..and clear it if required
+    if (ZeroTLAB) {
+      Copy::zero_to_words(tlab_start, new_tlab_size);
+    }
+    // and init the tlab pointers
+    tlab->fill(tlab_start, tlab_start, new_tlab_size);
+    return true;
+  }
+  
+  void printTlabInfoFromThread (ThreadLocalAllocBuffer* tlab) {
+    HeapWord* start = tlab->start();
+    HeapWord* top = tlab->top();
+    HeapWord* end = tlab->end();
+    // sizes are in bytes
+    size_t tlabFree = tlab->free() * HeapWordSize;
+    size_t tlabUsed = tlab->used() * HeapWordSize;
+    size_t tlabSize = tlabFree + tlabUsed;
+    double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
+    tty->print_cr("(%p, %p, %p), size=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
+  }
+  
+};
+  
+#endif // GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
--- a/src/gpu/hsail/vm/hsailArgumentsBase.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/hsailArgumentsBase.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -22,19 +22,16 @@
  *
  */
 
-#ifndef BASE_ARGUMENTS_HSAIL_HPP
-#define BASE_ARGUMENTS_HSAIL_HPP
+#ifndef GPU_HSAIL_VM_HSAIL_ARGUMENTS_BASE_HPP
+#define GPU_HSAIL_VM_HSAIL_ARGUMENTS_BASE_HPP
 
 #include "runtime/signature.hpp"
 
 
-/***
- * Base class which iterates thru a signature and pulls from a
- * objArrayOop of boxed values.  Used as base for HSAILKernelArguments
- * and HSAILJavaCallArguments The derived classes specify how to push
- * args onto their data structure
- ***/
-
+// Base class which iterates thru a signature and pulls from a
+// objArrayOop of boxed values.  Used as base for HSAILKernelArguments
+// and HSAILJavaCallArguments The derived classes specify how to push
+// args onto their data structure
 class HSAILArgumentsBase : public SignatureIterator {
 
 public:
@@ -49,7 +46,7 @@
   // number of parameters in the signature
   int _parameter_count;
 
-  Symbol * _signature;
+  Symbol* _signature;
   bool _is_static;
 
   // records first null parameter seen
@@ -58,8 +55,8 @@
   // Get next java argument
   oop next_arg(BasicType expectedType);
 
-    virtual char *argsBuilderName() = 0;
-    virtual void pushObject(void * obj) = 0;
+    virtual char* argsBuilderName() = 0;
+    virtual void pushObject(void* obj) = 0;
     virtual void pushBool(jboolean z) = 0;
     virtual void pushByte(jbyte b) = 0;
     virtual void pushDouble(jdouble d) = 0;
@@ -67,7 +64,7 @@
     virtual void pushInt(jint i) = 0;
     virtual void pushLong(jlong j) = 0;
     virtual void handleFinalIntParameter() = 0;
-    virtual void handleFinalObjParameter(void *obj) = 0;
+    virtual void handleFinalObjParameter(void* obj) = 0;
     virtual void pushTrailingArgs() = 0;
 
     void recordNullObjectParameter() {
@@ -143,4 +140,4 @@
 
 };
 
-#endif  // BASE_ARGUMENTS_HSAIL_HPP
+#endif  // GPU_HSAIL_VM_HSAIL_ARGUMENTS_BASE_HPP
--- a/src/gpu/hsail/vm/hsailJavaCallArguments.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/hsailJavaCallArguments.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -22,8 +22,8 @@
  *
  */
 
-#ifndef JAVACALL_ARGUMENTS_HSAIL_HPP
-#define JAVACALL_ARGUMENTS_HSAIL_HPP
+#ifndef GPU_HSAIL_VM_HSAIL_JAVACALL_ARGUMENTS_HPP
+#define GPU_HSAIL_VM_HSAIL_JAVACALL_ARGUMENTS_HPP
 
 #include "hsailArgumentsBase.hpp"
 #include "runtime/javaCalls.hpp"
@@ -33,17 +33,17 @@
 public:
 
 private:
-  // JavaCall Args to push into
-  JavaCallArguments *_javaArgs;
+  // JavaCall args to push into
+  JavaCallArguments* _javaArgs;
   int _workitemid;
  public:
-    HSAILJavaCallArguments(JavaCallArguments *javaArgs, int workitemid, Symbol* signature, objArrayOop args, bool is_static) : HSAILArgumentsBase(signature, args, is_static) {
+    HSAILJavaCallArguments(JavaCallArguments* javaArgs, int workitemid, Symbol* signature, objArrayOop args, bool is_static) : HSAILArgumentsBase(signature, args, is_static) {
         _javaArgs = javaArgs;
         _workitemid = workitemid;
         collectArgs();
     }
-    virtual char *argsBuilderName() {return (char *)"HSAILJavaCallArguments";}
-    virtual void pushObject(void *obj) { _javaArgs->push_oop((oop) obj);  }
+    virtual char* argsBuilderName() {return (char*)"HSAILJavaCallArguments";}
+    virtual void pushObject(void* obj) { _javaArgs->push_oop((oop) obj);  }
     virtual void pushBool(jboolean z) { pushInt(z); }
     virtual void pushByte(jbyte b) { pushInt(b); }
     virtual void pushDouble(jdouble d) { _javaArgs->push_double(d); }
@@ -64,7 +64,7 @@
     // stream source array (already checked in the base class) so for
     // a javacall we need to extract the correct obj from it based on
     // the workitemid
-    virtual void handleFinalObjParameter(void *arg) {
+    virtual void handleFinalObjParameter(void* arg) {
       objArrayOop objArrayArg = (objArrayOop) arg;
       oop extractedObj = objArrayArg->obj_at(_workitemid);
       if (TraceGPUInteraction) {
@@ -77,5 +77,5 @@
 
 };
 
-#endif  // JAVACALL_ARGUMENTS_HSAIL_HPP
+#endif  // GPU_HSAIL_VM_HSAIL_JAVACALL_ARGUMENTS_HPP
 
--- a/src/gpu/hsail/vm/hsailKernelArguments.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/hsailKernelArguments.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -22,8 +22,8 @@
  *
  */
 
-#ifndef KERNEL_ARGUMENTS_HSAIL_HPP
-#define KERNEL_ARGUMENTS_HSAIL_HPP
+#ifndef GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP
+#define GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP
 
 #include "gpu_hsail.hpp"
 #include "runtime/signature.hpp"
@@ -37,7 +37,7 @@
 private:
   // Kernel to push into
   address _kernel;
-  void * _exceptionHolder;
+  void* _exceptionHolder;
 
  public:
     HSAILKernelArguments(address kernel, Symbol* signature, objArrayOop args, bool is_static, void* exceptionHolder) : HSAILArgumentsBase(signature, args, is_static) {
@@ -45,8 +45,8 @@
         _exceptionHolder = exceptionHolder;
         collectArgs();
     }
-    virtual char *argsBuilderName() {return (char *)"HSAILKernelArguments";}
-    virtual void pushObject(void *obj) {
+    virtual char* argsBuilderName() {return (char*)"HSAILKernelArguments";}
+    virtual void pushObject(void* obj) {
         bool pushed = Hsail::_okra_push_object(_kernel, obj);
         assert(pushed == true, "arg push failed");
     }
@@ -98,9 +98,9 @@
 
     // for kernel arguments, final obj parameter should be an object
     // stream source array (already checked in the base class) so here we just pass it
-    virtual void handleFinalObjParameter(void *arg) {
+    virtual void handleFinalObjParameter(void* arg) {
       pushObject(arg);
     }
 };
 
-#endif  // KERNEL_ARGUMENTS_HSAIL_HPP
+#endif  // GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP
--- a/src/gpu/hsail/vm/vmStructs_hsail.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/vmStructs_hsail.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -41,16 +41,32 @@
   nonstatic_field(Hsail::HSAILKernelDeoptimization, _workitemid,                                jint)                                 \
   nonstatic_field(Hsail::HSAILKernelDeoptimization, _actionAndReason,                           jint)                                 \
                                                                                                                                       \
-  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _notice_safepoints,                      jint*)                                     \
+  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _notice_safepoints,                      jint*) \
   nonstatic_field(Hsail::HSAILDeoptimizationInfo, _deopt_occurred,                         jint)                                      \
   nonstatic_field(Hsail::HSAILDeoptimizationInfo, _deopt_next_index,                       jint)                                      \
-  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _donor_threads,                          JavaThread**)                              \
-  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _never_ran_array,                        jboolean *)                                \
+  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _cur_tlab_info,                          HSAILTlabInfo**)                           \
+  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _alloc_info,                             HSAILAllocationInfo*)                      \
+  nonstatic_field(Hsail::HSAILDeoptimizationInfo, _never_ran_array,                        jboolean*)                                 \
+                                                                                                                                      \
+  nonstatic_field(HSAILAllocationInfo, _tlab_infos_pool_start,                             HSAILTlabInfo*)                            \
+  nonstatic_field(HSAILAllocationInfo, _tlab_infos_pool_next,                              HSAILTlabInfo*)                            \
+  nonstatic_field(HSAILAllocationInfo, _tlab_infos_pool_end,                               HSAILTlabInfo*)                            \
+  nonstatic_field(HSAILAllocationInfo, _tlab_align_reserve_bytes,                          size_t)                                    \
+                                                                                                                                      \
+  nonstatic_field(HSAILTlabInfo, _start,                                                   HeapWord*)                                 \
+  nonstatic_field(HSAILTlabInfo, _top,                                                     HeapWord*)                                 \
+  nonstatic_field(HSAILTlabInfo, _end,                                                     HeapWord*)                                 \
+  nonstatic_field(HSAILTlabInfo, _last_good_top,                                           HeapWord*)                                 \
+  nonstatic_field(HSAILTlabInfo, _original_top,                                            HeapWord*)                                 \
+  nonstatic_field(HSAILTlabInfo, _donor_thread,                                            JavaThread*)                               \
+  nonstatic_field(HSAILTlabInfo, _alloc_info,                                              HSAILAllocationInfo*)                      \
 
-#define VM_TYPES_GPU_HSAIL(declare_type, declare_toplevel_type)                 \
+#define VM_TYPES_GPU_HSAIL(declare_type, declare_toplevel_type)      \
   declare_toplevel_type(HSAILFrame)                                  \
   declare_toplevel_type(HSAILFrame*)                                 \
   declare_toplevel_type(Hsail::HSAILKernelDeoptimization)            \
+  declare_toplevel_type(HSAILAllocationInfo)                         \
+  declare_toplevel_type(HSAILTlabInfo)                               \
   declare_toplevel_type(Hsail::HSAILDeoptimizationInfo)
 
 #endif // GPU_HSAIL_VM_VMSTRUCTS_HSAIL_HPP
--- a/src/share/vm/gc_interface/collectedHeap.hpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/share/vm/gc_interface/collectedHeap.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -84,7 +84,7 @@
 class CollectedHeap : public CHeapObj<mtInternal> {
   friend class VMStructs;
   friend class IsGCActiveMark; // Block structured external access to _is_gc_active
-  friend class Hsail;  // access to allocate_new_tlab
+  friend class HSAILAllocationInfo;  // access to allocate_new_tlab
 
 #ifdef ASSERT
   static int       _fire_out_of_memory_count;