changeset 16795:a29e6e7b7a86

Replace hsail donor threads with hsail tlabs
author Tom Rodriguez <tom.rodriguez@oracle.com>
date Tue, 12 Aug 2014 16:30:17 -0700
parents 74c02c90a3f9
children 0d47af538a92
files graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/DonorThreadPool.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java src/gpu/hsail/vm/gpu_hsail.cpp src/gpu/hsail/vm/gpu_hsail.hpp src/gpu/hsail/vm/gpu_hsail_Tlab.hpp src/gpu/hsail/vm/vmStructs_hsail.hpp src/share/vm/gc_interface/collectedHeap.cpp src/share/vm/memory/threadLocalAllocBuffer.cpp src/share/vm/memory/threadLocalAllocBuffer.hpp src/share/vm/runtime/thread.cpp src/share/vm/runtime/thread.hpp
diffstat 14 files changed, 167 insertions(+), 181 deletions(-) [+]
line wrap: on
line diff
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/DonorThreadPool.java	Tue Aug 12 16:12:49 2014 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-package com.oracle.graal.hotspot.hsail;
-
-import static com.oracle.graal.hotspot.hsail.HSAILHotSpotBackend.Options.*;
-
-import java.util.concurrent.*;
-
-import com.oracle.graal.hotspot.hsail.HSAILHotSpotBackend.Options;
-
-/**
- * Thread pool for HSAIL allocation support.
- */
-public class DonorThreadPool {
-
-    private final Thread[] threads;
-
-    void waitAt(CyclicBarrier barrier) {
-        try {
-            barrier.await();
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-    }
-
-    /**
-     * Creates a pool of threads whose size is given by {@link Options#HsailDonorThreads}.
-     */
-    DonorThreadPool() {
-        int size = HsailDonorThreads.getValue();
-        this.threads = new Thread[size];
-        CyclicBarrier barrier = new CyclicBarrier(size + 1);
-
-        // fill in threads
-        for (int i = 0; i < size; i++) {
-            threads[i] = new Thread(new Runnable() {
-                @Override
-                public void run() {
-                    while (true) {
-                        waitAt(barrier);
-                    }
-                }
-            }, "HsailDonorThread-" + i);
-            threads[i].setDaemon(true);
-            threads[i].start();
-        }
-        // creating thread waits at barrier to make sure others have started
-        waitAt(barrier);
-    }
-
-    public Thread[] getThreads() {
-        return threads;
-    }
-}
\ No newline at end of file
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Tue Aug 12 16:12:49 2014 -0700
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Tue Aug 12 16:30:17 2014 -0700
@@ -85,8 +85,8 @@
     public static class Options {
 
         // @formatter:off
-        @Option(help = "Number of donor threads for HSAIL kernel dispatch")
-        static public final OptionValue<Integer> HsailDonorThreads = new OptionValue<>(4);
+        @Option(help = "Number of TLABs used for HSAIL kernels which allocate")
+        static public final OptionValue<Integer> HsailKernelTlabs = new OptionValue<>(4);
         // @formatter:on
     }
 
@@ -369,25 +369,18 @@
         return result;
     }
 
-    private static final ThreadLocal<DonorThreadPool> donorThreadPool = new ThreadLocal<DonorThreadPool>() {
-        @Override
-        protected DonorThreadPool initialValue() {
-            return new DonorThreadPool();
-        }
-    };
-
     public boolean executeKernel(HotSpotInstalledCode kernel, int jobSize, Object[] args) throws InvalidInstalledCodeException {
         if (!deviceInitialized) {
             throw new GraalInternalError("Cannot execute GPU kernel if device is not initialized");
         }
         int[] oopMapArray = ((HSAILHotSpotNmethod) kernel).getOopMapArray();
 
-        // Pass donorThreadPoolArray if this kernel uses allocation, otherwise null
-        Thread[] donorThreadArray = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? donorThreadPool.get().getThreads() : null;
-        return executeKernel0(kernel, jobSize, args, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
+        // Pass HsailKernelTlabs number if this kernel uses allocation, otherwise 0
+        int numTlabs = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? HsailKernelTlabs.getValue() : 0;
+        return executeKernel0(kernel, jobSize, args, numTlabs, HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
     }
 
-    private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray)
+    private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, int numTlabs, int allocBytesPerWorkitem, int[] oopMapArray)
                     throws InvalidInstalledCodeException;
 
     /**
@@ -633,12 +626,12 @@
             RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordLIRKind);
 
             // Aliases for d17
-            RegisterValue d17_donorThreadIndex = HSAIL.d17.asValue(wordLIRKind);
-            RegisterValue d17_safepointFlagAddrIndex = d17_donorThreadIndex;
+            RegisterValue d17_tlabIndex = HSAIL.d17.asValue(wordLIRKind);
+            RegisterValue d17_safepointFlagAddrIndex = d17_tlabIndex;
 
             // Aliases for s34
             RegisterValue s34_deoptOccurred = HSAIL.s34.asValue(LIRKind.value(Kind.Int));
-            RegisterValue s34_donorThreadIndex = s34_deoptOccurred;
+            RegisterValue s34_tlabIndex = s34_deoptOccurred;
 
             asm.emitLoadKernelArg(d16_deoptInfo, asm.getDeoptInfoName(), "u64");
             asm.emitComment("// Check if a deopt or safepoint has occurred and abort if true before doing any work");
@@ -657,15 +650,15 @@
             // load thread register if this kernel performs allocation
             if (usesAllocation) {
                 RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordLIRKind);
-                assert HsailDonorThreads.getValue() > 0;
+                assert HsailKernelTlabs.getValue() > 0;
                 asm.emitLoad(wordKind, threadReg, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress());
-                if (HsailDonorThreads.getValue() != 1) {
-                    asm.emitComment("// map workitem to a donor thread");
-                    asm.emitString(String.format("rem_u32  $%s, %s, %d;", s34_donorThreadIndex.getRegister(), workItemReg, HsailDonorThreads.getValue()));
-                    asm.emitConvert(d17_donorThreadIndex, s34_donorThreadIndex, wordKind, Kind.Int);
-                    asm.emit("mad", threadReg, d17_donorThreadIndex, Constant.forInt(8), threadReg);
+                if (HsailKernelTlabs.getValue() != 1) {
+                    asm.emitComment("// map workitem to a tlab");
+                    asm.emitString(String.format("rem_u32  $%s, %s, %d;", s34_tlabIndex.getRegister(), workItemReg, HsailKernelTlabs.getValue()));
+                    asm.emitConvert(d17_tlabIndex, s34_tlabIndex, wordKind, Kind.Int);
+                    asm.emit("mad", threadReg, d17_tlabIndex, Constant.forInt(8), threadReg);
                 } else {
-                    // workitem is already mapped to solitary donor thread
+                    // workitem is already mapped to solitary tlab
                 }
                 asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem");
             }
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java	Tue Aug 12 16:12:49 2014 -0700
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java	Tue Aug 12 16:30:17 2014 -0700
@@ -50,7 +50,7 @@
     public static final LocationIdentity TLABINFO_START_LOCATION = new NamedLocationIdentity("TlabInfoStart");
     public static final LocationIdentity TLABINFO_ALLOCINFO_LOCATION = new NamedLocationIdentity("TlabInfoAllocInfo");
     public static final LocationIdentity TLABINFO_ORIGINALTOP_LOCATION = new NamedLocationIdentity("TlabInfoOriginalTop");
-    public static final LocationIdentity TLABINFO_DONORTHREAD_LOCATION = new NamedLocationIdentity("TlabInfoDonorThread");
+    public static final LocationIdentity TLABINFO_TLAB_LOCATION = new NamedLocationIdentity("TlabInfoTlab");
 
     public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLNEXT_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolNext");
     public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLEND_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolEnd");
@@ -121,12 +121,12 @@
         tlabInfo.writeWord(config().hsailTlabInfoOriginalTopOffset, val, TLABINFO_ORIGINALTOP_LOCATION);
     }
 
-    public static void writeTlabInfoDonorThread(Word tlabInfo, Word val) {
-        tlabInfo.writeWord(config().hsailTlabInfoDonorThreadOffset, val, TLABINFO_DONORTHREAD_LOCATION);
+    public static void writeTlabInfoTlab(Word tlabInfo, Word val) {
+        tlabInfo.writeWord(config().hsailTlabInfoTlabOffset, val, TLABINFO_TLAB_LOCATION);
     }
 
-    public static Word readTlabInfoDonorThread(Word tlabInfo) {
-        return tlabInfo.readWord(config().hsailTlabInfoDonorThreadOffset, TLABINFO_DONORTHREAD_LOCATION);
+    public static Word readTlabInfoTlab(Word tlabInfo) {
+        return tlabInfo.readWord(config().hsailTlabInfoTlabOffset, TLABINFO_TLAB_LOCATION);
     }
 
     public static Word readAllocInfoTlabInfosPoolEnd(Word allocInfo) {
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java	Tue Aug 12 16:12:49 2014 -0700
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java	Tue Aug 12 16:30:17 2014 -0700
@@ -97,7 +97,7 @@
         Word alignReserveBytes = readAllocInfoTlabAlignReserveBytes(allocInfo);
         writeTlabInfoEnd(newTlabInfo, tlabStart.add(newTlabSize.subtract(alignReserveBytes)));
         writeTlabInfoAllocInfo(newTlabInfo, allocInfo);
-        writeTlabInfoDonorThread(newTlabInfo, readTlabInfoDonorThread(oldTlabInfo));
+        writeTlabInfoTlab(newTlabInfo, readTlabInfoTlab(oldTlabInfo));
         return (newTlabInfo);
     }
 
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java	Tue Aug 12 16:12:49 2014 -0700
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java	Tue Aug 12 16:30:17 2014 -0700
@@ -1052,8 +1052,8 @@
     @HotSpotVMField(name = "HSAILTlabInfo::_end", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoEndOffset;
     @HotSpotVMField(name = "HSAILTlabInfo::_last_good_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoLastGoodTopOffset;
     @HotSpotVMField(name = "HSAILTlabInfo::_original_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoOriginalTopOffset;
-    @HotSpotVMField(name = "HSAILTlabInfo::_donor_thread", type = "JavaThread*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoDonorThreadOffset;
     @HotSpotVMField(name = "HSAILTlabInfo::_alloc_info", type = "HSAILAllocationInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoAllocInfoOffset;
+    @HotSpotVMField(name = "HSAILTlabInfo::_tlab", type = "ThreadLocalAllocBuffer*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoTlabOffset;
     @HotSpotVMType(name = "HSAILTlabInfo", get = HotSpotVMType.Type.SIZE) @Stable public int hsailTlabInfoSize;
 
     /**
--- a/src/gpu/hsail/vm/gpu_hsail.cpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/gpu/hsail/vm/gpu_hsail.cpp	Tue Aug 12 16:30:17 2014 -0700
@@ -66,7 +66,7 @@
 JNINativeMethod Hsail::HSAIL_methods[] = {
   {CC"initialize",       CC"()Z",                               FN_PTR(Hsail::initialize)},
   {CC"generateKernel",   CC"([B" STRING ")J",                   FN_PTR(Hsail::generate_kernel)},
-  {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["JLTHREAD"I[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
+  {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"II[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
 };
 
 void* Hsail::_device_context = NULL;
@@ -100,7 +100,7 @@
 }
 
 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args,
-                                                      jobject donor_threads, jint allocBytesPerWorkitem, jobject oop_map_array))
+                                                      jint num_tlabs, jint allocBytesPerWorkitem, jobject oop_map_array))
 
   ResourceMark rm;
   jlong nmethodValue = InstalledCode::address(kernel_handle);
@@ -116,7 +116,7 @@
     SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
   }
 
-return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
+  return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, num_tlabs, allocBytesPerWorkitem, oop_map_array, CHECK_0);
 GPU_END
 
 static void showRanges(jboolean* a, int len) {
@@ -137,14 +137,14 @@
 }
 
 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm,
-                                                jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
+                                                jint num_tlabs, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
   ResourceMark rm(THREAD);
   objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
   assert(THREAD->is_Java_thread(), "must be a JavaThread");
 
   // We avoid HSAILAllocationInfo logic if kernel does not allocate
-  // in which case the donor_thread array passed in will be null
-  HSAILAllocationInfo* allocInfo = (donor_threads == NULL ? NULL : new HSAILAllocationInfo(donor_threads, dimX, allocBytesPerWorkitem));
+  // in which case the num_tlabs passed in will be 0
+  HSAILAllocationInfo* allocInfo = (num_tlabs == 0 ? NULL : new HSAILAllocationInfo(num_tlabs, dimX, allocBytesPerWorkitem));
   
   // Reset the kernel arguments
   _okra_clear_args(kernel);
--- a/src/gpu/hsail/vm/gpu_hsail.hpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/gpu/hsail/vm/gpu_hsail.hpp	Tue Aug 12 16:30:17 2014 -0700
@@ -185,10 +185,10 @@
 
   // static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args);
   JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv* env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args,
-                                                   jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array);
+                                                   jint num_tlabs, int allocBytesPerWorkitem, jobject oop_map_array);
 
   static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm,
-                                                  jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS);
+                                                  jint num_tlabs, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS);
 
   static GraalEnv::CodeInstallResult install_code(Handle& compiled_code, CodeBlob*& cb, Handle installed_code, Handle triggered_deoptimizations);
 
--- a/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp	Tue Aug 12 16:30:17 2014 -0700
@@ -41,7 +41,7 @@
   HeapWord* _end;
   HeapWord* _last_good_top;
   HeapWord* _original_top;
-  JavaThread* _donor_thread;         // donor thread associated with this tlabInfo
+  ThreadLocalAllocBuffer* _tlab;      // tlab associated with this tlabInfo
   HSAILAllocationInfo* _alloc_info;   // same as what is in HSAILDeoptimizationInfo
 
   // Accessors
@@ -50,11 +50,12 @@
   HeapWord* end() { return _end; }
   HeapWord* last_good_top() { return _last_good_top; }
   HeapWord* original_top() { return _original_top; }
-  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) {
+  ThreadLocalAllocBuffer* tlab() { return _tlab; }
+  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, ThreadLocalAllocBuffer* tlab, HSAILAllocationInfo* allocInfo) {
     _start = start;
     _top = _original_top = top;
     _end = end;
-    _donor_thread = donorThread;
+    _tlab = tlab;
     _alloc_info = allocInfo;
   }
 };
@@ -63,54 +64,56 @@
 class HSAILAllocationInfo : public CHeapObj<mtInternal> {
   friend class VMStructs;
 private:
-  JavaThread** donorThreads;
-  jint _num_donor_threads;
-  size_t _tlab_align_reserve_bytes;    // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
-  HSAILTlabInfo** _cur_tlab_infos;    // array of current tlab info pointers, one per donor_thread
+  jint   _num_tlabs;
+  size_t _tlab_align_reserve_bytes;         // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
+  HSAILTlabInfo** _cur_tlab_infos;          // array of current tlab info pointers, one per num_tlabs
   HSAILTlabInfo* _tlab_infos_pool_start;    // pool for new tlab_infos
   HSAILTlabInfo* _tlab_infos_pool_next;     // where next will be allocated from
   HSAILTlabInfo* _tlab_infos_pool_end;      // where next will be allocated from
 
 public:
-  HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) {
-    // fill in the donorThreads array
-    objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj);
-    _num_donor_threads = donorThreadObjects->length();
-    guarantee(_num_donor_threads > 0, "need at least one donor thread");
-    donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal);
-    for (int i = 0; i < _num_donor_threads; i++) {
-      donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
+  HSAILAllocationInfo(jint num_tlabs, int dimX, int allocBytesPerWorkitem) {
+    _num_tlabs = num_tlabs;
+    // if this thread doesn't have gpu_hsail_tlabs allocated yet, do so now
+    JavaThread* thread = JavaThread::current();
+    if (thread->get_gpu_hsail_tlabs_count() == 0) {
+      thread->initialize_gpu_hsail_tlabs(num_tlabs);
+      if (TraceGPUInteraction) {
+        for (int i = 0; i < num_tlabs; i++) {
+          ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i);
+          tty->print("initialized gpu_hsail_tlab %d at %p -> ", i, tlab);
+          printTlabInfoFromThread(tlab);
+        }
+      }
     }
-    
+
     // Compute max_tlab_infos based on amount of free heap space
     size_t max_tlab_infos;
     {
-      JavaThread* donorThread = donorThreads[0];
-      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      ThreadLocalAllocBuffer* tlab = &thread->tlab();
       size_t new_tlab_size = tlab->compute_size(0);
-      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread);
+      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(thread);
       if (new_tlab_size != 0) {
-        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads));
+        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_tlabs));
       } else {
-        max_tlab_infos = 8 * _num_donor_threads;   // an arbitrary multiple
+        max_tlab_infos = 8 * _num_tlabs;   // an arbitrary multiple
       }
       if (TraceGPUInteraction) {
         tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
       }
     }
 
-    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal);
+    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_tlabs, mtInternal);
     _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
-    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads];
+    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_tlabs];
     _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
     _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();
       
-    // we will fill the first N tlabInfos from the donor threads
-    for (int i = 0; i < _num_donor_threads; i++) {
-      JavaThread* donorThread = donorThreads[i];
-      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+    // we will fill the first N tlabInfos from the gpu_hsail_tlabs
+    for (int i = 0; i < _num_tlabs; i++) {
+      ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i);
       if (TraceGPUInteraction) {
-        tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
+        tty->print("gpu_hsail_tlab %d at %p -> ", i, tlab);
         printTlabInfoFromThread(tlab);
       }
       
@@ -122,13 +125,13 @@
       // here, it might make sense to do a gc now rather than to start
       // the kernel and have it deoptimize.  How to do that?
       if (tlab->end() == NULL) {
-        bool success = getNewTlabForDonorThread(tlab, i);
+        bool success = getNewGpuHsailTlab(tlab);
         if (TraceGPUInteraction) {
           if (success) {
-            tty->print("donorThread %d, refilled tlab, -> ", i);
+            tty->print("gpu_hsail_tlab %d, refilled tlab, -> ", i);
             printTlabInfoFromThread(tlab);
           } else {
-            tty->print("donorThread %d, could not refill tlab, left as ", i);
+            tty->print("gpu_hsail_tlab %d, could not refill tlab, left as ", i);
             printTlabInfoFromThread(tlab);
           }
         }
@@ -137,26 +140,19 @@
       // extract the necessary tlab fields into a TlabInfo record
       HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
       _cur_tlab_infos[i] = pTlabInfo;
-      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this);
-
-      // reset the real tlab fields to zero so we are sure the thread doesn't use it
-      tlab->set_start(NULL);
-      tlab->set_top(NULL);
-      tlab->set_pf_top(NULL);
-      tlab->set_end(NULL);
+      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), tlab, this);
     }
   }
 
   ~HSAILAllocationInfo() {
     FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
     FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
-    FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
   }
 
   void postKernelCleanup() {
     // go thru all the tlabInfos, fix up any tlab tops that overflowed
     // complete the tlabs if they overflowed
-    // update the donor threads tlabs when appropriate
+    // update the gpu_hsail_tlabs when appropriate
     bool anyOverflows = false;
     size_t bytesAllocated = 0;
     // if there was an overflow in allocating tlabInfos, correct it here
@@ -172,8 +168,7 @@
         tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, 
                       tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
       }
-      JavaThread* donorThread = tlabInfo->_donor_thread;
-      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      ThreadLocalAllocBuffer* tlab = tlabInfo->tlab();
       bool overflowed = false;
       // if a tlabInfo has NULL fields, i.e. we could not prime it on entry,
       // or we could not get a tlab from the gpu, so ignore tlabInfo here
@@ -183,24 +178,14 @@
           overflowed = true;
           if (TraceGPUInteraction) {
             long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); 
-            tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top());
+            tty->print_cr("tlabInfo %p (tlab = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, tlab, overflowAmount, tlabInfo->last_good_top());
           }
           tlabInfo->_top = tlabInfo->last_good_top();
         }
 
-        // if the donor thread allocated anything while we were running
-        // we will retire its tlab before overwriting with our new one
-        if (tlab->top() != NULL) {
-          if (TraceGPUInteraction) {
-            tty->print("Donor Thread allocated new tlab");
-            printTlabInfoFromThread(tlab);
-          }
-          tlab->make_parsable(true);
-        }
-
-        // fill the donor thread tlab with the tlabInfo information
+        // fill the gpu_hsail_tlab with the tlabInfo information
         // we do this even if it will get overwritten by a later tlabinfo
-        // because it helps with tlab statistics for that donor thread
+        // because it helps with tlab statistics for that tlab
         tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());
 
         // if there was an overflow, make it parsable with retire = true
@@ -231,7 +216,7 @@
 private:
   // fill and retire old tlab and get a new one
   // if we can't get one, no problem someone will eventually do a gc
-  bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) {
+  bool getNewGpuHsailTlab(ThreadLocalAllocBuffer* tlab) {
 
     tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)
     
--- a/src/gpu/hsail/vm/vmStructs_hsail.hpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/gpu/hsail/vm/vmStructs_hsail.hpp	Tue Aug 12 16:30:17 2014 -0700
@@ -58,8 +58,8 @@
   nonstatic_field(HSAILTlabInfo, _end,                                                     HeapWord*)                                 \
   nonstatic_field(HSAILTlabInfo, _last_good_top,                                           HeapWord*)                                 \
   nonstatic_field(HSAILTlabInfo, _original_top,                                            HeapWord*)                                 \
-  nonstatic_field(HSAILTlabInfo, _donor_thread,                                            JavaThread*)                               \
   nonstatic_field(HSAILTlabInfo, _alloc_info,                                              HSAILAllocationInfo*)                      \
+  nonstatic_field(HSAILTlabInfo, _tlab,                                                    ThreadLocalAllocBuffer*)                   \
 
 #define VM_TYPES_GPU_HSAIL(declare_type, declare_toplevel_type)      \
   declare_toplevel_type(HSAILFrame)                                  \
--- a/src/share/vm/gc_interface/collectedHeap.cpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.cpp	Tue Aug 12 16:30:17 2014 -0700
@@ -503,7 +503,12 @@
          "Attempt to fill tlabs before main thread has been added"
          " to threads list is doomed to failure!");
   for (JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
-     if (use_tlab) thread->tlab().make_parsable(retire_tlabs);
+     if (use_tlab) {
+       thread->tlab().make_parsable(retire_tlabs);
+#ifdef GRAAL
+       thread->gpu_hsail_tlabs_make_parsable(retire_tlabs);
+#endif
+     }
 #if defined(COMPILER2) || defined(GRAAL)
      // The deferred store barriers must all have been flushed to the
      // card-table (or other remembered set structure) before GC starts
--- a/src/share/vm/memory/threadLocalAllocBuffer.cpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/share/vm/memory/threadLocalAllocBuffer.cpp	Tue Aug 12 16:30:17 2014 -0700
@@ -48,6 +48,13 @@
   for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
     thread->tlab().accumulate_statistics();
     thread->tlab().initialize_statistics();
+#ifdef GRAAL
+    for (jint i = 0; i < thread->get_gpu_hsail_tlabs_count(); i++) {
+      thread->get_gpu_hsail_tlab_at(i)->accumulate_statistics();
+      thread->get_gpu_hsail_tlab_at(i)->initialize_statistics();
+    }
+#endif
+
   }
 
   // Publish new stats if some allocation occurred.
@@ -129,6 +136,11 @@
 void ThreadLocalAllocBuffer::resize_all_tlabs() {
   for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
     thread->tlab().resize();
+#ifdef GRAAL
+    for (jint i = 0; i < thread->get_gpu_hsail_tlabs_count(); i++) {
+      thread->get_gpu_hsail_tlab_at(i)->resize();
+    }
+#endif
   }
 }
 
@@ -188,11 +200,12 @@
   invariants();
 }
 
-void ThreadLocalAllocBuffer::initialize() {
+void ThreadLocalAllocBuffer::initialize(Thread* owning_thread) {
   initialize(NULL,                    // start
              NULL,                    // top
              NULL);                   // end
 
+  _owning_thread = owning_thread;
   set_desired_size(initial_desired_size());
 
   // Following check is needed because at startup the main (primordial)
@@ -221,7 +234,7 @@
   // During jvm startup, the main (primordial) thread is initialized
   // before the heap is initialized.  So reinitialize it now.
   guarantee(Thread::current()->is_Java_thread(), "tlab initialization thread not Java thread");
-  Thread::current()->tlab().initialize();
+  Thread::current()->tlab().initialize(Thread::current());
 
   if (PrintTLAB && Verbose) {
     gclog_or_tty->print("TLAB min: " SIZE_FORMAT " initial: " SIZE_FORMAT " max: " SIZE_FORMAT "\n",
@@ -303,9 +316,7 @@
 }
 
 Thread* ThreadLocalAllocBuffer::myThread() {
-  return (Thread*)(((char *)this) +
-                   in_bytes(start_offset()) -
-                   in_bytes(Thread::tlab_start_offset()));
+  return _owning_thread;
 }
 
 
--- a/src/share/vm/memory/threadLocalAllocBuffer.hpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/share/vm/memory/threadLocalAllocBuffer.hpp	Tue Aug 12 16:30:17 2014 -0700
@@ -56,6 +56,7 @@
   unsigned  _slow_refill_waste;
   unsigned  _gc_waste;
   unsigned  _slow_allocations;
+  Thread*   _owning_thread;
 
   AdaptiveWeightedAverage _allocation_fraction;  // fraction of eden allocated in tlabs
 
@@ -156,7 +157,7 @@
   static void resize_all_tlabs();
 
   void fill(HeapWord* start, HeapWord* top, size_t new_size);
-  void initialize();
+  void initialize(Thread* owning_thread);
 
   static size_t refill_waste_limit_increment()   { return TLABWasteIncrement; }
 
--- a/src/share/vm/runtime/thread.cpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/share/vm/runtime/thread.cpp	Tue Aug 12 16:30:17 2014 -0700
@@ -1474,7 +1474,9 @@
 #ifdef GRAAL
   set_gpu_exception_bci(0);
   set_gpu_exception_method(NULL);  
-  set_gpu_hsail_deopt_info(NULL);  
+  set_gpu_hsail_deopt_info(NULL);
+  _gpu_hsail_tlabs_count = 0;
+  _gpu_hsail_tlabs = NULL;
 #endif
   set_thread_state(_thread_new);
 #if INCLUDE_NMT
@@ -1694,6 +1696,8 @@
     }
     FREE_C_HEAP_ARRAY(jlong, _graal_counters, mtInternal);
   }
+
+  delete_gpu_hsail_tlabs();
 #endif // GRAAL
 }
 
@@ -1968,7 +1972,7 @@
   remove_stack_guard_pages();
 
   if (UseTLAB) {
-    tlab().make_parsable(true);  // retire TLAB
+    tlabs_make_parsable(true);   // retire TLABs, if any
   }
 
   if (JvmtiEnv::environments_might_exist()) {
@@ -2047,7 +2051,7 @@
   remove_stack_guard_pages();
 
   if (UseTLAB) {
-    tlab().make_parsable(true);  // retire TLAB, if any
+    tlabs_make_parsable(true);   // retire TLABs, if any
   }
 
 #if INCLUDE_ALL_GCS
@@ -4792,3 +4796,54 @@
   VMThread* thread = VMThread::vm_thread();
   if (thread != NULL) thread->verify();
 }
+
+void JavaThread::tlabs_make_parsable(bool retire) {
+  // do the primary tlab for this thread
+  tlab().make_parsable(retire);
+#ifdef GRAAL
+  // do the gpu_hsail tlabs if any
+  gpu_hsail_tlabs_make_parsable(retire);
+#endif
+}
+
+
+#ifdef GRAAL
+void JavaThread::initialize_gpu_hsail_tlabs(jint count) {
+  if (!UseTLAB) return;
+  // create tlabs
+  _gpu_hsail_tlabs = NEW_C_HEAP_ARRAY(ThreadLocalAllocBuffer*, count, mtInternal);
+  // initialize
+  for (jint i = 0; i < count; i++) {
+    _gpu_hsail_tlabs[i] = new ThreadLocalAllocBuffer();
+    _gpu_hsail_tlabs[i]->initialize(Thread::current());
+  }
+  _gpu_hsail_tlabs_count = count;
+}
+
+ThreadLocalAllocBuffer* JavaThread::get_gpu_hsail_tlab_at(jint idx) {
+  assert(idx >= 0 && idx < get_gpu_hsail_tlabs_count(), "illegal gpu tlab index");
+  return _gpu_hsail_tlabs[idx];
+}
+
+void JavaThread::gpu_hsail_tlabs_make_parsable(bool retire) {
+  for (jint i = 0; i < get_gpu_hsail_tlabs_count(); i++) {
+    get_gpu_hsail_tlab_at(i)->make_parsable(retire);
+  }
+}
+
+void JavaThread::delete_gpu_hsail_tlabs() {
+  if (!UseTLAB) return;
+  if (_gpu_hsail_tlabs_count == 0) return;
+
+  gpu_hsail_tlabs_make_parsable(true);
+  for (jint i = 0; i < get_gpu_hsail_tlabs_count(); i++) {
+    delete get_gpu_hsail_tlab_at(i);
+  }
+  FREE_C_HEAP_ARRAY(ThreadLocalAllocBuffer*, _gpu_hsail_tlabs, mtInternal);
+  _gpu_hsail_tlabs = NULL;
+  _gpu_hsail_tlabs_count = 0;
+}
+
+
+#endif
+
--- a/src/share/vm/runtime/thread.hpp	Tue Aug 12 16:12:49 2014 -0700
+++ b/src/share/vm/runtime/thread.hpp	Tue Aug 12 16:30:17 2014 -0700
@@ -436,7 +436,7 @@
   ThreadLocalAllocBuffer& tlab()                 { return _tlab; }
   void initialize_tlab() {
     if (UseTLAB) {
-      tlab().initialize();
+      tlab().initialize(this);
     }
   }
 
@@ -950,6 +950,8 @@
   Method* _gpu_exception_method;
   // Record the hsailDeoptimization info so gc oops_do processing can find it
   void*   _gpu_hsail_deopt_info;
+  jint    _gpu_hsail_tlabs_count;
+  ThreadLocalAllocBuffer** _gpu_hsail_tlabs;
 #endif
 
  public:
@@ -960,9 +962,17 @@
   Method* get_gpu_exception_method()             { return _gpu_exception_method; }
   void set_gpu_hsail_deopt_info(void * deoptInfo) { _gpu_hsail_deopt_info = deoptInfo; }
   void* get_gpu_hsail_deopt_info()               { return _gpu_hsail_deopt_info; }
+  jint  get_gpu_hsail_tlabs_count()              { return _gpu_hsail_tlabs_count; }
+
+  void  initialize_gpu_hsail_tlabs(jint count);
+  ThreadLocalAllocBuffer* get_gpu_hsail_tlab_at(jint idx);
+  void gpu_hsail_tlabs_make_parsable(bool retire);
+  void  delete_gpu_hsail_tlabs();
 #endif
-  
+
  private:  
+  void tlabs_make_parsable(bool retire);
+
   // support for JNI critical regions
   jint    _jni_active_critical;                  // count of entries into JNI critical region