Mercurial > hg > graal-jvmci-8
diff src/gpu/hsail/vm/gpu_hsail.cpp @ 15066:2cae21d9f122
HSAIL: initial support for object allocation in HSAIL kernels
Contributed-by: Tom Deneau <tom.deneau@amd.com>
author | Doug Simon <doug.simon@oracle.com> |
---|---|
date | Fri, 11 Apr 2014 17:12:08 +0200 |
parents | a6c144380ce7 |
children | 0e689f20706e |
line wrap: on
line diff
--- a/src/gpu/hsail/vm/gpu_hsail.cpp Fri Apr 11 16:42:39 2014 +0200 +++ b/src/gpu/hsail/vm/gpu_hsail.cpp Fri Apr 11 17:12:08 2014 +0200 @@ -59,16 +59,15 @@ #define OBJECT "Ljava/lang/Object;" #define STRING "Ljava/lang/String;" +#define JLTHREAD "Ljava/lang/Thread;" #define HS_INSTALLED_CODE "Lcom/oracle/graal/hotspot/meta/HotSpotInstalledCode;" #define HS_COMPILED_NMETHOD "Lcom/oracle/graal/hotspot/HotSpotCompiledNmethod;" #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;" -// public native void executeKernel(HotSpotNmethod kernel, int jobSize, int i, int j, Object[] args) throws InvalidInstalledCodeException; - JNINativeMethod Hsail::HSAIL_methods[] = { {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, - {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT")Z", FN_PTR(Hsail::execute_kernel_void_1d)}, + {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, }; void * Hsail::_device_context = NULL; @@ -86,6 +85,43 @@ Hsail::okra_clearargs_func_t Hsail::_okra_clearargs; Hsail::okra_register_heap_func_t Hsail::_okra_register_heap; +struct Stats { + int _dispatches; + int _deopts; + int _overflows; + bool _changeSeen; + +public: + Stats() { + _dispatches = _deopts = _overflows = 0; + _changeSeen = false; + } + + void incDeopts() { + _deopts++; + _changeSeen = true; + } + void incOverflows() { + _overflows++; + _changeSeen = true; + } + + void finishDispatch() { + _dispatches++; + if (_changeSeen) { + // print(); + _changeSeen = false; + } + } + + void print() { + tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows); + } + +}; + +static Stats kernelStats; + void Hsail::register_heap() { // After the okra functions are set up and the heap is initialized, register the java heap with HSA @@ -97,7 +133,8 @@ _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity()); } -GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args_handle, jobject oops_save_handle)) +GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save, + jobject donor_threads, jint allocBytesPerWorkitem)) ResourceMark rm; jlong nmethodValue = HotSpotInstalledCode::codeBlob(kernel_handle); @@ -113,7 +150,7 @@ SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); } - return execute_kernel_void_1d_internal((address) kernel, dimX, args_handle, mh, nm, oops_save_handle, CHECK_0); + return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, CHECK_0); GPU_END static void showRanges(jboolean *a, int len) { @@ -133,10 +170,80 @@ } } -jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args_handle, methodHandle& mh, nmethod *nm, jobject oops_save_handle, TRAPS) { +// fill and retire old tlab and get a new one +// if we can't get one, no problem someone will eventually do a gc +void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) { + tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) + + // get a size for a new tlab that is at least tlabMinHsail. + size_t new_tlab_size = tlab->compute_size(tlabMinHsail); + if (new_tlab_size == 0) return; + + HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size); + if (tlab_start == NULL) return; + + // ..and clear it if required + if (ZeroTLAB) { + Copy::zero_to_words(tlab_start, new_tlab_size); + } + // and init the tlab pointers + tlab->fill(tlab_start, tlab_start, new_tlab_size); +} + +static void printTlabInfo (ThreadLocalAllocBuffer* tlab) { + HeapWord *start = tlab->start(); + HeapWord *top = tlab->top(); + HeapWord *end = tlab->end(); + // sizes are in bytes + size_t tlabFree = tlab->free() * HeapWordSize; + size_t tlabUsed = tlab->used() * HeapWordSize; + size_t tlabSize = tlabFree + tlabUsed; + double freePct = 100.0 * (double) tlabFree/(double) tlabSize; + tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); +} + + +jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, + jobject donor_threads, int allocBytesPerWorkitem, TRAPS) { ResourceMark rm(THREAD); - objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args_handle); + objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); + + // TODO: avoid donor thread logic if kernel does not allocate + objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); + int numDonorThreads = donorThreadObjects->length(); + guarantee(numDonorThreads > 0, "need at least one donor thread"); + JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads); + for (int i = 0; i < numDonorThreads; i++) { + donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); + } + + + // compute tlabMinHsail based on number of workitems, number of donor + // threads, allocBytesPerWorkitem rounded up + size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads; + if (TraceGPUInteraction) { + tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail); + } + + for (int i = 0; i < numDonorThreads; i++) { + JavaThread* donorThread = donorThreads[i]; + ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + if (TraceGPUInteraction) { + tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); + printTlabInfo(tlab); + } + + // note: this used vs. free limit checking should be based on some + // heuristic where we see how much this kernel tends to allocate + if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) { + getNewTlabForDonorThread(tlab, tlabMinHsail); + if (TraceGPUInteraction) { + tty->print("donorThread %d, refilled tlab, -> ", i); + printTlabInfo(tlab); + } + } + } // Reset the kernel arguments _okra_clearargs(kernel); @@ -146,6 +253,7 @@ e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo(); e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); + e->set_donor_threads(donorThreads); } // This object sets up the kernel arguments @@ -169,6 +277,25 @@ success = _okra_execute_with_range(kernel, dimX); } + // fix up any tlab tops that overflowed + bool anyOverflows = false; + for (int i = 0; i < numDonorThreads; i++) { + JavaThread * donorThread = donorThreads[i]; + ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + if (tlab->top() > tlab->end()) { + anyOverflows = true; + long overflowAmount = (long) tlab->top() - (long) tlab->pf_top(); + // tlab->set_top is private this ugly hack gets around that + *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top(); + if (TraceGPUInteraction) { + tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top()); + } + } + } + if (anyOverflows) { + kernelStats.incOverflows(); + } + if (UseHSAILDeoptimization) { // check if any workitem requested a deopt // currently we only support at most one such workitem @@ -180,10 +307,9 @@ sprintf(msg, "deopt error detected, slot for workitem %d was not empty", -1 * (deoptcode + 1)); guarantee(deoptcode == 1, msg); } - + kernelStats.incDeopts(); { TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); - if (TraceGPUInteraction) { tty->print_cr("deopt happened."); HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0]; @@ -194,7 +320,7 @@ // the hsail frames in oops_save so they get adjusted by any // GC. Need to do this before leaving thread_in_vm mode. // resolve handle only needed once here (not exiting vm mode) - objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle); + objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); // since slots are allocated from the beginning, we know how far to look assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow"); @@ -228,7 +354,7 @@ // update the hsailFrame from the oopsSaveArray // re-resolve the handle - oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle); + oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); int dregOopMap = hsailFrame->dreg_oops_map(); for (int bit = 0; bit < 16; bit++) { @@ -296,7 +422,7 @@ JavaValue result(T_VOID); JavaCallArguments javaArgs; // re-resolve the args_handle here - objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args_handle); + objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args); // This object sets up the javaCall arguments // the way argsArray is set up, this should work for instance methods as well // (the receiver will be the first oop pushed) @@ -317,10 +443,11 @@ } // end of never-ran handling } } - + FREE_C_HEAP_ARRAY(jboolean, e->never_ran_array(), mtInternal); delete e; } + kernelStats.finishDispatch(); return success; }