# HG changeset patch # User Tom Rodriguez # Date 1407886217 25200 # Node ID a29e6e7b7a861550301a5af3c710c889e45ee450 # Parent 74c02c90a3f94cff5ab4d305740974253fc5604e Replace hsail donor threads with hsail tlabs diff -r 74c02c90a3f9 -r a29e6e7b7a86 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/DonorThreadPool.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/DonorThreadPool.java Tue Aug 12 16:12:49 2014 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -package com.oracle.graal.hotspot.hsail; - -import static com.oracle.graal.hotspot.hsail.HSAILHotSpotBackend.Options.*; - -import java.util.concurrent.*; - -import com.oracle.graal.hotspot.hsail.HSAILHotSpotBackend.Options; - -/** - * Thread pool for HSAIL allocation support. - */ -public class DonorThreadPool { - - private final Thread[] threads; - - void waitAt(CyclicBarrier barrier) { - try { - barrier.await(); - } catch (Exception e) { - e.printStackTrace(); - } - } - - /** - * Creates a pool of threads whose size is given by {@link Options#HsailDonorThreads}. - */ - DonorThreadPool() { - int size = HsailDonorThreads.getValue(); - this.threads = new Thread[size]; - CyclicBarrier barrier = new CyclicBarrier(size + 1); - - // fill in threads - for (int i = 0; i < size; i++) { - threads[i] = new Thread(new Runnable() { - @Override - public void run() { - while (true) { - waitAt(barrier); - } - } - }, "HsailDonorThread-" + i); - threads[i].setDaemon(true); - threads[i].start(); - } - // creating thread waits at barrier to make sure others have started - waitAt(barrier); - } - - public Thread[] getThreads() { - return threads; - } -} \ No newline at end of file diff -r 74c02c90a3f9 -r a29e6e7b7a86 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java Tue Aug 12 16:12:49 2014 -0700 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java Tue Aug 12 16:30:17 2014 -0700 @@ -85,8 +85,8 @@ public static class Options { // @formatter:off - @Option(help = "Number of donor threads for HSAIL kernel dispatch") - static public final OptionValue HsailDonorThreads = new OptionValue<>(4); + @Option(help = "Number of TLABs used for HSAIL kernels which allocate") + static public final OptionValue HsailKernelTlabs = new OptionValue<>(4); // @formatter:on } @@ -369,25 +369,18 @@ return result; } - private static final ThreadLocal donorThreadPool = new ThreadLocal() { - @Override - protected DonorThreadPool initialValue() { - return new DonorThreadPool(); - } - }; - public boolean executeKernel(HotSpotInstalledCode kernel, int jobSize, Object[] args) throws InvalidInstalledCodeException { if (!deviceInitialized) { throw new GraalInternalError("Cannot execute GPU kernel if device is not initialized"); } int[] oopMapArray = ((HSAILHotSpotNmethod) kernel).getOopMapArray(); - // Pass donorThreadPoolArray if this kernel uses allocation, otherwise null - Thread[] donorThreadArray = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? donorThreadPool.get().getThreads() : null; - return executeKernel0(kernel, jobSize, args, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray); + // Pass HsailKernelTlabs number if this kernel uses allocation, otherwise 0 + int numTlabs = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? HsailKernelTlabs.getValue() : 0; + return executeKernel0(kernel, jobSize, args, numTlabs, HsailAllocBytesPerWorkitem.getValue(), oopMapArray); } - private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray) + private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, int numTlabs, int allocBytesPerWorkitem, int[] oopMapArray) throws InvalidInstalledCodeException; /** @@ -633,12 +626,12 @@ RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordLIRKind); // Aliases for d17 - RegisterValue d17_donorThreadIndex = HSAIL.d17.asValue(wordLIRKind); - RegisterValue d17_safepointFlagAddrIndex = d17_donorThreadIndex; + RegisterValue d17_tlabIndex = HSAIL.d17.asValue(wordLIRKind); + RegisterValue d17_safepointFlagAddrIndex = d17_tlabIndex; // Aliases for s34 RegisterValue s34_deoptOccurred = HSAIL.s34.asValue(LIRKind.value(Kind.Int)); - RegisterValue s34_donorThreadIndex = s34_deoptOccurred; + RegisterValue s34_tlabIndex = s34_deoptOccurred; asm.emitLoadKernelArg(d16_deoptInfo, asm.getDeoptInfoName(), "u64"); asm.emitComment("// Check if a deopt or safepoint has occurred and abort if true before doing any work"); @@ -657,15 +650,15 @@ // load thread register if this kernel performs allocation if (usesAllocation) { RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordLIRKind); - assert HsailDonorThreads.getValue() > 0; + assert HsailKernelTlabs.getValue() > 0; asm.emitLoad(wordKind, threadReg, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress()); - if (HsailDonorThreads.getValue() != 1) { - asm.emitComment("// map workitem to a donor thread"); - asm.emitString(String.format("rem_u32 $%s, %s, %d;", s34_donorThreadIndex.getRegister(), workItemReg, HsailDonorThreads.getValue())); - asm.emitConvert(d17_donorThreadIndex, s34_donorThreadIndex, wordKind, Kind.Int); - asm.emit("mad", threadReg, d17_donorThreadIndex, Constant.forInt(8), threadReg); + if (HsailKernelTlabs.getValue() != 1) { + asm.emitComment("// map workitem to a tlab"); + asm.emitString(String.format("rem_u32 $%s, %s, %d;", s34_tlabIndex.getRegister(), workItemReg, HsailKernelTlabs.getValue())); + asm.emitConvert(d17_tlabIndex, s34_tlabIndex, wordKind, Kind.Int); + asm.emit("mad", threadReg, d17_tlabIndex, Constant.forInt(8), threadReg); } else { - // workitem is already mapped to solitary donor thread + // workitem is already mapped to solitary tlab } asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem"); } diff -r 74c02c90a3f9 -r a29e6e7b7a86 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java Tue Aug 12 16:12:49 2014 -0700 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java Tue Aug 12 16:30:17 2014 -0700 @@ -50,7 +50,7 @@ public static final LocationIdentity TLABINFO_START_LOCATION = new NamedLocationIdentity("TlabInfoStart"); public static final LocationIdentity TLABINFO_ALLOCINFO_LOCATION = new NamedLocationIdentity("TlabInfoAllocInfo"); public static final LocationIdentity TLABINFO_ORIGINALTOP_LOCATION = new NamedLocationIdentity("TlabInfoOriginalTop"); - public static final LocationIdentity TLABINFO_DONORTHREAD_LOCATION = new NamedLocationIdentity("TlabInfoDonorThread"); + public static final LocationIdentity TLABINFO_TLAB_LOCATION = new NamedLocationIdentity("TlabInfoTlab"); public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLNEXT_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolNext"); public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLEND_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolEnd"); @@ -121,12 +121,12 @@ tlabInfo.writeWord(config().hsailTlabInfoOriginalTopOffset, val, TLABINFO_ORIGINALTOP_LOCATION); } - public static void writeTlabInfoDonorThread(Word tlabInfo, Word val) { - tlabInfo.writeWord(config().hsailTlabInfoDonorThreadOffset, val, TLABINFO_DONORTHREAD_LOCATION); + public static void writeTlabInfoTlab(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoTlabOffset, val, TLABINFO_TLAB_LOCATION); } - public static Word readTlabInfoDonorThread(Word tlabInfo) { - return tlabInfo.readWord(config().hsailTlabInfoDonorThreadOffset, TLABINFO_DONORTHREAD_LOCATION); + public static Word readTlabInfoTlab(Word tlabInfo) { + return tlabInfo.readWord(config().hsailTlabInfoTlabOffset, TLABINFO_TLAB_LOCATION); } public static Word readAllocInfoTlabInfosPoolEnd(Word allocInfo) { diff -r 74c02c90a3f9 -r a29e6e7b7a86 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java Tue Aug 12 16:12:49 2014 -0700 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java Tue Aug 12 16:30:17 2014 -0700 @@ -97,7 +97,7 @@ Word alignReserveBytes = readAllocInfoTlabAlignReserveBytes(allocInfo); writeTlabInfoEnd(newTlabInfo, tlabStart.add(newTlabSize.subtract(alignReserveBytes))); writeTlabInfoAllocInfo(newTlabInfo, allocInfo); - writeTlabInfoDonorThread(newTlabInfo, readTlabInfoDonorThread(oldTlabInfo)); + writeTlabInfoTlab(newTlabInfo, readTlabInfoTlab(oldTlabInfo)); return (newTlabInfo); } diff -r 74c02c90a3f9 -r a29e6e7b7a86 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java --- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java Tue Aug 12 16:12:49 2014 -0700 +++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java Tue Aug 12 16:30:17 2014 -0700 @@ -1052,8 +1052,8 @@ @HotSpotVMField(name = "HSAILTlabInfo::_end", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoEndOffset; @HotSpotVMField(name = "HSAILTlabInfo::_last_good_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoLastGoodTopOffset; @HotSpotVMField(name = "HSAILTlabInfo::_original_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoOriginalTopOffset; - @HotSpotVMField(name = "HSAILTlabInfo::_donor_thread", type = "JavaThread*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoDonorThreadOffset; @HotSpotVMField(name = "HSAILTlabInfo::_alloc_info", type = "HSAILAllocationInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoAllocInfoOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_tlab", type = "ThreadLocalAllocBuffer*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoTlabOffset; @HotSpotVMType(name = "HSAILTlabInfo", get = HotSpotVMType.Type.SIZE) @Stable public int hsailTlabInfoSize; /** diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/gpu/hsail/vm/gpu_hsail.cpp --- a/src/gpu/hsail/vm/gpu_hsail.cpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/gpu/hsail/vm/gpu_hsail.cpp Tue Aug 12 16:30:17 2014 -0700 @@ -66,7 +66,7 @@ JNINativeMethod Hsail::HSAIL_methods[] = { {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, - {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["JLTHREAD"I[I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, + {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"II[I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, }; void* Hsail::_device_context = NULL; @@ -100,7 +100,7 @@ } GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, - jobject donor_threads, jint allocBytesPerWorkitem, jobject oop_map_array)) + jint num_tlabs, jint allocBytesPerWorkitem, jobject oop_map_array)) ResourceMark rm; jlong nmethodValue = InstalledCode::address(kernel_handle); @@ -116,7 +116,7 @@ SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); } -return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0); +return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, num_tlabs, allocBytesPerWorkitem, oop_map_array, CHECK_0); GPU_END static void showRanges(jboolean* a, int len) { @@ -137,14 +137,14 @@ } jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, - jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) { + jint num_tlabs, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) { ResourceMark rm(THREAD); objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); assert(THREAD->is_Java_thread(), "must be a JavaThread"); // We avoid HSAILAllocationInfo logic if kernel does not allocate - // in which case the donor_thread array passed in will be null - HSAILAllocationInfo* allocInfo = (donor_threads == NULL ? NULL : new HSAILAllocationInfo(donor_threads, dimX, allocBytesPerWorkitem)); + // in which case the num_tlabs passed in will be 0 + HSAILAllocationInfo* allocInfo = (num_tlabs == 0 ? NULL : new HSAILAllocationInfo(num_tlabs, dimX, allocBytesPerWorkitem)); // Reset the kernel arguments _okra_clear_args(kernel); diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/gpu/hsail/vm/gpu_hsail.hpp --- a/src/gpu/hsail/vm/gpu_hsail.hpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/gpu/hsail/vm/gpu_hsail.hpp Tue Aug 12 16:30:17 2014 -0700 @@ -185,10 +185,10 @@ // static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args); JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv* env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args, - jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array); + jint num_tlabs, int allocBytesPerWorkitem, jobject oop_map_array); static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, - jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS); + jint num_tlabs, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS); static GraalEnv::CodeInstallResult install_code(Handle& compiled_code, CodeBlob*& cb, Handle installed_code, Handle triggered_deoptimizations); diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/gpu/hsail/vm/gpu_hsail_Tlab.hpp --- a/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp Tue Aug 12 16:30:17 2014 -0700 @@ -41,7 +41,7 @@ HeapWord* _end; HeapWord* _last_good_top; HeapWord* _original_top; - JavaThread* _donor_thread; // donor thread associated with this tlabInfo + ThreadLocalAllocBuffer* _tlab; // tlab associated with this tlabInfo HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo // Accessors @@ -50,11 +50,12 @@ HeapWord* end() { return _end; } HeapWord* last_good_top() { return _last_good_top; } HeapWord* original_top() { return _original_top; } - void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) { + ThreadLocalAllocBuffer* tlab() { return _tlab; } + void initialize(HeapWord* start, HeapWord* top, HeapWord* end, ThreadLocalAllocBuffer* tlab, HSAILAllocationInfo* allocInfo) { _start = start; _top = _original_top = top; _end = end; - _donor_thread = donorThread; + _tlab = tlab; _alloc_info = allocInfo; } }; @@ -63,54 +64,56 @@ class HSAILAllocationInfo : public CHeapObj { friend class VMStructs; private: - JavaThread** donorThreads; - jint _num_donor_threads; - size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() - HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per donor_thread + jint _num_tlabs; + size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() + HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per num_tlabs HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from HSAILTlabInfo* _tlab_infos_pool_end; // where next will be allocated from public: - HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) { - // fill in the donorThreads array - objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj); - _num_donor_threads = donorThreadObjects->length(); - guarantee(_num_donor_threads > 0, "need at least one donor thread"); - donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal); - for (int i = 0; i < _num_donor_threads; i++) { - donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); + HSAILAllocationInfo(jint num_tlabs, int dimX, int allocBytesPerWorkitem) { + _num_tlabs = num_tlabs; + // if this thread doesn't have gpu_hsail_tlabs allocated yet, do so now + JavaThread* thread = JavaThread::current(); + if (thread->get_gpu_hsail_tlabs_count() == 0) { + thread->initialize_gpu_hsail_tlabs(num_tlabs); + if (TraceGPUInteraction) { + for (int i = 0; i < num_tlabs; i++) { + ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i); + tty->print("initialized gpu_hsail_tlab %d at %p -> ", i, tlab); + printTlabInfoFromThread(tlab); + } + } } - + // Compute max_tlab_infos based on amount of free heap space size_t max_tlab_infos; { - JavaThread* donorThread = donorThreads[0]; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + ThreadLocalAllocBuffer* tlab = &thread->tlab(); size_t new_tlab_size = tlab->compute_size(0); - size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread); + size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(thread); if (new_tlab_size != 0) { - max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads)); + max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_tlabs)); } else { - max_tlab_infos = 8 * _num_donor_threads; // an arbitrary multiple + max_tlab_infos = 8 * _num_tlabs; // an arbitrary multiple } if (TraceGPUInteraction) { tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos); } } - _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal); + _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_tlabs, mtInternal); _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal); - _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads]; + _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_tlabs]; _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos]; _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes(); - // we will fill the first N tlabInfos from the donor threads - for (int i = 0; i < _num_donor_threads; i++) { - JavaThread* donorThread = donorThreads[i]; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + // we will fill the first N tlabInfos from the gpu_hsail_tlabs + for (int i = 0; i < _num_tlabs; i++) { + ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i); if (TraceGPUInteraction) { - tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); + tty->print("gpu_hsail_tlab %d at %p -> ", i, tlab); printTlabInfoFromThread(tlab); } @@ -122,13 +125,13 @@ // here, it might make sense to do a gc now rather than to start // the kernel and have it deoptimize. How to do that? if (tlab->end() == NULL) { - bool success = getNewTlabForDonorThread(tlab, i); + bool success = getNewGpuHsailTlab(tlab); if (TraceGPUInteraction) { if (success) { - tty->print("donorThread %d, refilled tlab, -> ", i); + tty->print("gpu_hsail_tlab %d, refilled tlab, -> ", i); printTlabInfoFromThread(tlab); } else { - tty->print("donorThread %d, could not refill tlab, left as ", i); + tty->print("gpu_hsail_tlab %d, could not refill tlab, left as ", i); printTlabInfoFromThread(tlab); } } @@ -137,26 +140,19 @@ // extract the necessary tlab fields into a TlabInfo record HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i]; _cur_tlab_infos[i] = pTlabInfo; - pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this); - - // reset the real tlab fields to zero so we are sure the thread doesn't use it - tlab->set_start(NULL); - tlab->set_top(NULL); - tlab->set_pf_top(NULL); - tlab->set_end(NULL); + pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), tlab, this); } } ~HSAILAllocationInfo() { FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal); FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal); - FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal); } void postKernelCleanup() { // go thru all the tlabInfos, fix up any tlab tops that overflowed // complete the tlabs if they overflowed - // update the donor threads tlabs when appropriate + // update the gpu_hsail_tlabs when appropriate bool anyOverflows = false; size_t bytesAllocated = 0; // if there was an overflow in allocating tlabInfos, correct it here @@ -172,8 +168,7 @@ tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top()); } - JavaThread* donorThread = tlabInfo->_donor_thread; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + ThreadLocalAllocBuffer* tlab = tlabInfo->tlab(); bool overflowed = false; // if a tlabInfo has NULL fields, i.e. we could not prime it on entry, // or we could not get a tlab from the gpu, so ignore tlabInfo here @@ -183,24 +178,14 @@ overflowed = true; if (TraceGPUInteraction) { long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); - tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top()); + tty->print_cr("tlabInfo %p (tlab = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, tlab, overflowAmount, tlabInfo->last_good_top()); } tlabInfo->_top = tlabInfo->last_good_top(); } - // if the donor thread allocated anything while we were running - // we will retire its tlab before overwriting with our new one - if (tlab->top() != NULL) { - if (TraceGPUInteraction) { - tty->print("Donor Thread allocated new tlab"); - printTlabInfoFromThread(tlab); - } - tlab->make_parsable(true); - } - - // fill the donor thread tlab with the tlabInfo information + // fill the gpu_hsail_tlab with the tlabInfo information // we do this even if it will get overwritten by a later tlabinfo - // because it helps with tlab statistics for that donor thread + // because it helps with tlab statistics for that tlab tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve()); // if there was an overflow, make it parsable with retire = true @@ -231,7 +216,7 @@ private: // fill and retire old tlab and get a new one // if we can't get one, no problem someone will eventually do a gc - bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) { + bool getNewGpuHsailTlab(ThreadLocalAllocBuffer* tlab) { tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/gpu/hsail/vm/vmStructs_hsail.hpp --- a/src/gpu/hsail/vm/vmStructs_hsail.hpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/gpu/hsail/vm/vmStructs_hsail.hpp Tue Aug 12 16:30:17 2014 -0700 @@ -58,8 +58,8 @@ nonstatic_field(HSAILTlabInfo, _end, HeapWord*) \ nonstatic_field(HSAILTlabInfo, _last_good_top, HeapWord*) \ nonstatic_field(HSAILTlabInfo, _original_top, HeapWord*) \ - nonstatic_field(HSAILTlabInfo, _donor_thread, JavaThread*) \ nonstatic_field(HSAILTlabInfo, _alloc_info, HSAILAllocationInfo*) \ + nonstatic_field(HSAILTlabInfo, _tlab, ThreadLocalAllocBuffer*) \ #define VM_TYPES_GPU_HSAIL(declare_type, declare_toplevel_type) \ declare_toplevel_type(HSAILFrame) \ diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/share/vm/gc_interface/collectedHeap.cpp --- a/src/share/vm/gc_interface/collectedHeap.cpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/share/vm/gc_interface/collectedHeap.cpp Tue Aug 12 16:30:17 2014 -0700 @@ -503,7 +503,12 @@ "Attempt to fill tlabs before main thread has been added" " to threads list is doomed to failure!"); for (JavaThread *thread = Threads::first(); thread; thread = thread->next()) { - if (use_tlab) thread->tlab().make_parsable(retire_tlabs); + if (use_tlab) { + thread->tlab().make_parsable(retire_tlabs); +#ifdef GRAAL + thread->gpu_hsail_tlabs_make_parsable(retire_tlabs); +#endif + } #if defined(COMPILER2) || defined(GRAAL) // The deferred store barriers must all have been flushed to the // card-table (or other remembered set structure) before GC starts diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/share/vm/memory/threadLocalAllocBuffer.cpp --- a/src/share/vm/memory/threadLocalAllocBuffer.cpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/share/vm/memory/threadLocalAllocBuffer.cpp Tue Aug 12 16:30:17 2014 -0700 @@ -48,6 +48,13 @@ for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) { thread->tlab().accumulate_statistics(); thread->tlab().initialize_statistics(); +#ifdef GRAAL + for (jint i = 0; i < thread->get_gpu_hsail_tlabs_count(); i++) { + thread->get_gpu_hsail_tlab_at(i)->accumulate_statistics(); + thread->get_gpu_hsail_tlab_at(i)->initialize_statistics(); + } +#endif + } // Publish new stats if some allocation occurred. @@ -129,6 +136,11 @@ void ThreadLocalAllocBuffer::resize_all_tlabs() { for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) { thread->tlab().resize(); +#ifdef GRAAL + for (jint i = 0; i < thread->get_gpu_hsail_tlabs_count(); i++) { + thread->get_gpu_hsail_tlab_at(i)->resize(); + } +#endif } } @@ -188,11 +200,12 @@ invariants(); } -void ThreadLocalAllocBuffer::initialize() { +void ThreadLocalAllocBuffer::initialize(Thread* owning_thread) { initialize(NULL, // start NULL, // top NULL); // end + _owning_thread = owning_thread; set_desired_size(initial_desired_size()); // Following check is needed because at startup the main (primordial) @@ -221,7 +234,7 @@ // During jvm startup, the main (primordial) thread is initialized // before the heap is initialized. So reinitialize it now. guarantee(Thread::current()->is_Java_thread(), "tlab initialization thread not Java thread"); - Thread::current()->tlab().initialize(); + Thread::current()->tlab().initialize(Thread::current()); if (PrintTLAB && Verbose) { gclog_or_tty->print("TLAB min: " SIZE_FORMAT " initial: " SIZE_FORMAT " max: " SIZE_FORMAT "\n", @@ -303,9 +316,7 @@ } Thread* ThreadLocalAllocBuffer::myThread() { - return (Thread*)(((char *)this) + - in_bytes(start_offset()) - - in_bytes(Thread::tlab_start_offset())); + return _owning_thread; } diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/share/vm/memory/threadLocalAllocBuffer.hpp --- a/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Aug 12 16:30:17 2014 -0700 @@ -56,6 +56,7 @@ unsigned _slow_refill_waste; unsigned _gc_waste; unsigned _slow_allocations; + Thread* _owning_thread; AdaptiveWeightedAverage _allocation_fraction; // fraction of eden allocated in tlabs @@ -156,7 +157,7 @@ static void resize_all_tlabs(); void fill(HeapWord* start, HeapWord* top, size_t new_size); - void initialize(); + void initialize(Thread* owning_thread); static size_t refill_waste_limit_increment() { return TLABWasteIncrement; } diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/share/vm/runtime/thread.cpp --- a/src/share/vm/runtime/thread.cpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/share/vm/runtime/thread.cpp Tue Aug 12 16:30:17 2014 -0700 @@ -1474,7 +1474,9 @@ #ifdef GRAAL set_gpu_exception_bci(0); set_gpu_exception_method(NULL); - set_gpu_hsail_deopt_info(NULL); + set_gpu_hsail_deopt_info(NULL); + _gpu_hsail_tlabs_count = 0; + _gpu_hsail_tlabs = NULL; #endif set_thread_state(_thread_new); #if INCLUDE_NMT @@ -1694,6 +1696,8 @@ } FREE_C_HEAP_ARRAY(jlong, _graal_counters, mtInternal); } + + delete_gpu_hsail_tlabs(); #endif // GRAAL } @@ -1968,7 +1972,7 @@ remove_stack_guard_pages(); if (UseTLAB) { - tlab().make_parsable(true); // retire TLAB + tlabs_make_parsable(true); // retire TLABs, if any } if (JvmtiEnv::environments_might_exist()) { @@ -2047,7 +2051,7 @@ remove_stack_guard_pages(); if (UseTLAB) { - tlab().make_parsable(true); // retire TLAB, if any + tlabs_make_parsable(true); // retire TLABs, if any } #if INCLUDE_ALL_GCS @@ -4792,3 +4796,54 @@ VMThread* thread = VMThread::vm_thread(); if (thread != NULL) thread->verify(); } + +void JavaThread::tlabs_make_parsable(bool retire) { + // do the primary tlab for this thread + tlab().make_parsable(retire); +#ifdef GRAAL + // do the gpu_hsail tlabs if any + gpu_hsail_tlabs_make_parsable(retire); +#endif +} + + +#ifdef GRAAL +void JavaThread::initialize_gpu_hsail_tlabs(jint count) { + if (!UseTLAB) return; + // create tlabs + _gpu_hsail_tlabs = NEW_C_HEAP_ARRAY(ThreadLocalAllocBuffer*, count, mtInternal); + // initialize + for (jint i = 0; i < count; i++) { + _gpu_hsail_tlabs[i] = new ThreadLocalAllocBuffer(); + _gpu_hsail_tlabs[i]->initialize(Thread::current()); + } + _gpu_hsail_tlabs_count = count; +} + +ThreadLocalAllocBuffer* JavaThread::get_gpu_hsail_tlab_at(jint idx) { + assert(idx >= 0 && idx < get_gpu_hsail_tlabs_count(), "illegal gpu tlab index"); + return _gpu_hsail_tlabs[idx]; +} + +void JavaThread::gpu_hsail_tlabs_make_parsable(bool retire) { + for (jint i = 0; i < get_gpu_hsail_tlabs_count(); i++) { + get_gpu_hsail_tlab_at(i)->make_parsable(retire); + } +} + +void JavaThread::delete_gpu_hsail_tlabs() { + if (!UseTLAB) return; + if (_gpu_hsail_tlabs_count == 0) return; + + gpu_hsail_tlabs_make_parsable(true); + for (jint i = 0; i < get_gpu_hsail_tlabs_count(); i++) { + delete get_gpu_hsail_tlab_at(i); + } + FREE_C_HEAP_ARRAY(ThreadLocalAllocBuffer*, _gpu_hsail_tlabs, mtInternal); + _gpu_hsail_tlabs = NULL; + _gpu_hsail_tlabs_count = 0; +} + + +#endif + diff -r 74c02c90a3f9 -r a29e6e7b7a86 src/share/vm/runtime/thread.hpp --- a/src/share/vm/runtime/thread.hpp Tue Aug 12 16:12:49 2014 -0700 +++ b/src/share/vm/runtime/thread.hpp Tue Aug 12 16:30:17 2014 -0700 @@ -436,7 +436,7 @@ ThreadLocalAllocBuffer& tlab() { return _tlab; } void initialize_tlab() { if (UseTLAB) { - tlab().initialize(); + tlab().initialize(this); } } @@ -950,6 +950,8 @@ Method* _gpu_exception_method; // Record the hsailDeoptimization info so gc oops_do processing can find it void* _gpu_hsail_deopt_info; + jint _gpu_hsail_tlabs_count; + ThreadLocalAllocBuffer** _gpu_hsail_tlabs; #endif public: @@ -960,9 +962,17 @@ Method* get_gpu_exception_method() { return _gpu_exception_method; } void set_gpu_hsail_deopt_info(void * deoptInfo) { _gpu_hsail_deopt_info = deoptInfo; } void* get_gpu_hsail_deopt_info() { return _gpu_hsail_deopt_info; } + jint get_gpu_hsail_tlabs_count() { return _gpu_hsail_tlabs_count; } + + void initialize_gpu_hsail_tlabs(jint count); + ThreadLocalAllocBuffer* get_gpu_hsail_tlab_at(jint idx); + void gpu_hsail_tlabs_make_parsable(bool retire); + void delete_gpu_hsail_tlabs(); #endif - + private: + void tlabs_make_parsable(bool retire); + // support for JNI critical regions jint _jni_active_critical; // count of entries into JNI critical region