Mercurial > hg > graal-jvmci-8
diff src/gpu/hsail/vm/gpu_hsail.cpp @ 16076:06eedda53e14
HSAIL: add support to allocate new TLAB from GPU
Contributed-by: Tom Deneau <tom.deneau@amd.com>
author | Doug Simon <doug.simon@oracle.com> |
---|---|
date | Tue, 10 Jun 2014 22:36:26 +0200 |
parents | 66a9286203a2 |
children | e9998e2be7f5 |
line wrap: on
line diff
--- a/src/gpu/hsail/vm/gpu_hsail.cpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/gpu_hsail.cpp Tue Jun 10 22:36:26 2014 +0200 @@ -69,8 +69,8 @@ {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, }; -void * Hsail::_device_context = NULL; -jint Hsail::_notice_safepoints = false; +void* Hsail::_device_context = NULL; +jint Hsail::_notice_safepoints = false; Hsail::okra_create_context_func_t Hsail::_okra_create_context; Hsail::okra_create_kernel_func_t Hsail::_okra_create_kernel; @@ -85,43 +85,6 @@ Hsail::okra_clearargs_func_t Hsail::_okra_clearargs; Hsail::okra_register_heap_func_t Hsail::_okra_register_heap; -struct Stats { - int _dispatches; - int _deopts; - int _overflows; - bool _changeSeen; - -public: - Stats() { - _dispatches = _deopts = _overflows = 0; - _changeSeen = false; - } - - void incDeopts() { - _deopts++; - _changeSeen = true; - } - void incOverflows() { - _overflows++; - _changeSeen = true; - } - - void finishDispatch() { - _dispatches++; - if (_changeSeen) { - // print(); - _changeSeen = false; - } - } - - void print() { - tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows); - } - -}; - -static Stats kernelStats; - //static jint in_kernel = 0; void Hsail::notice_safepoints() { @@ -165,7 +128,7 @@ return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0); GPU_END -static void showRanges(jboolean *a, int len) { +static void showRanges(jboolean* a, int len) { // show ranges bool lookFor = true; for (int i = 0; i < len; i++) { @@ -182,38 +145,6 @@ } } -// fill and retire old tlab and get a new one -// if we can't get one, no problem someone will eventually do a gc -void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) { - tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) - - // get a size for a new tlab that is at least tlabMinHsail. - size_t new_tlab_size = tlab->compute_size(tlabMinHsail); - if (new_tlab_size == 0) return; - - HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size); - if (tlab_start == NULL) return; - - // ..and clear it if required - if (ZeroTLAB) { - Copy::zero_to_words(tlab_start, new_tlab_size); - } - // and init the tlab pointers - tlab->fill(tlab_start, tlab_start, new_tlab_size); -} - -static void printTlabInfo (ThreadLocalAllocBuffer* tlab) { - HeapWord *start = tlab->start(); - HeapWord *top = tlab->top(); - HeapWord *end = tlab->end(); - // sizes are in bytes - size_t tlabFree = tlab->free() * HeapWordSize; - size_t tlabUsed = tlab->used() * HeapWordSize; - size_t tlabSize = tlabFree + tlabUsed; - double freePct = 100.0 * (double) tlabFree/(double) tlabSize; - tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); -} - class OopSaver : public StackObj { private: objArrayOop _oopsSaveArray; @@ -260,21 +191,21 @@ _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array); } - void * getOopForBit(HSAILFrame * hsailFrame, int bit) { + void* getOopForBit(HSAILFrame* hsailFrame, int bit) { assert(isOop(hsailFrame, bit), ""); - void *oop; + void* oop; if (bit < hsailFrame->num_d_regs()) { // d register oop = (void*) hsailFrame->get_d_reg(bit); } else { // stack slot int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot - oop = (void *) hsailFrame->get_stackslot64(stackOffset); + oop = (void*) hsailFrame->get_stackslot64(stackOffset); } return oop; } - void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) { + void putOopForBit(HSAILFrame* hsailFrame, int bit, void* oop) { assert(isOop(hsailFrame, bit), ""); if (bit < hsailFrame->num_d_regs()) { // d register @@ -286,7 +217,7 @@ } } - void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){ + void saveOopsFromFrame(HSAILFrame* hsailFrame, int deoptSlot){ // as used, no need to resolve arrays on each call int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); @@ -300,7 +231,7 @@ } } - void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){ + void restoreOopsToFrame(HSAILFrame* hsailFrame, int deoptSlot, int workitem){ // need to re-resolve on each restore resolveArrays(); int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); @@ -310,13 +241,13 @@ if (isOop(hsailFrame, bit)) { // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame int saveArrayIndex = deoptSlot * oopsPerDeopt + bit; - void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex); - void * oldValue = getOopForBit(hsailFrame, bit); + void* newValue = (void*) _oopsSaveArray->obj_at(saveArrayIndex); + void* oldValue = getOopForBit(hsailFrame, bit); assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved"); if (newValue != oldValue) { if (TraceGPUInteraction) { int numDRegs = hsailFrame->num_d_regs(); - const char *name = (bit < numDRegs ? "$d" : "stk"); + const char* name = (bit < numDRegs ? "$d" : "stk"); int num = (bit < numDRegs ? bit : bit - numDRegs); tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p", name, num, workitem, deoptSlot, oldValue, newValue); @@ -327,7 +258,7 @@ } } - bool isOop(HSAILFrame * hsailFrame, int bit){ + bool isOop(HSAILFrame* hsailFrame, int bit){ // re-resolve on each access resolveArrays(); if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) { @@ -347,47 +278,15 @@ }; -jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, +jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oops_save, jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) { ResourceMark rm(THREAD); objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); - // TODO: avoid donor thread logic if kernel does not allocate - objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); - int numDonorThreads = donorThreadObjects->length(); - guarantee(numDonorThreads > 0, "need at least one donor thread"); - JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads); - for (int i = 0; i < numDonorThreads; i++) { - donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); - } - - - // compute tlabMinHsail based on number of workitems, number of donor - // threads, allocBytesPerWorkitem rounded up - size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads; - if (TraceGPUInteraction) { - tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail); - } - - for (int i = 0; i < numDonorThreads; i++) { - JavaThread* donorThread = donorThreads[i]; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); - if (TraceGPUInteraction) { - tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); - printTlabInfo(tlab); - } - - // note: this used vs. free limit checking should be based on some - // heuristic where we see how much this kernel tends to allocate - if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) { - getNewTlabForDonorThread(tlab, tlabMinHsail); - if (TraceGPUInteraction) { - tty->print("donorThread %d, refilled tlab, -> ", i); - printTlabInfo(tlab); - } - } - } - + // We avoid HSAILAllocationInfo logic if kernel does not allocate + // in which case the donor_thread array passed in will be null + HSAILAllocationInfo* allocInfo = (donor_threads == NULL ? NULL : new HSAILAllocationInfo(donor_threads, dimX, allocBytesPerWorkitem)); + // Reset the kernel arguments _okra_clearargs(kernel); @@ -400,7 +299,11 @@ int numStackSlots = (saveAreaCounts >> 16); int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8; - e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, donorThreads); + e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, allocInfo); + // copy cur_tlab_infos + if (allocInfo != NULL) { + e->setCurTlabInfos(allocInfo->getCurTlabInfos()); + } } // This object sets up the kernel arguments @@ -409,8 +312,8 @@ tty->print_cr("[HSAIL] range=%d", dimX); } - // if any object passed was null, throw an exception here - // doing this means the kernel code can avoid null checks on the object parameters. + // If any object passed was null, throw an exception here. Doing this + // means the kernel code can avoid null checks on the object parameters. if (hka.getFirstNullParameterIndex() >= 0) { char buf[64]; sprintf(buf, "Null Kernel Parameter seen, Parameter Index: %d", hka.getFirstNullParameterIndex()); @@ -431,23 +334,9 @@ //in_kernel = 0; } - // fix up any tlab tops that overflowed - bool anyOverflows = false; - for (int i = 0; i < numDonorThreads; i++) { - JavaThread * donorThread = donorThreads[i]; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); - if (tlab->top() > tlab->end()) { - anyOverflows = true; - long overflowAmount = (long) tlab->top() - (long) tlab->pf_top(); - // tlab->set_top is private this ugly hack gets around that - *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top(); - if (TraceGPUInteraction) { - tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top()); - } - } - } - if (anyOverflows) { - kernelStats.incOverflows(); + // avoid HSAILAllocationInfo logic if kernel does not allocate + if (allocInfo != NULL) { + allocInfo->postKernelCleanup(); } if (UseHSAILDeoptimization) { @@ -465,13 +354,11 @@ guarantee(deoptcode == 1, msg); } } else { - kernelStats.incDeopts(); - { TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); if (TraceGPUInteraction) { tty->print_cr("deopt happened."); - HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0); + HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(0); tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); } @@ -485,7 +372,7 @@ // since slots are allocated from the beginning, we know how far to look assert(e->num_deopts() < e->num_slots(), "deopt save state overflow"); for (int k = 0; k < e->num_deopts(); k++) { - HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k); + HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k); assert (pdeopt->workitem() >= 0, "bad workitem in deopt"); // this is a workitem that deopted oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k); @@ -494,15 +381,15 @@ // Handle any deopting workitems. int count_deoptimized = 0; for (int k = 0; k < e->num_deopts(); k++) { - HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k); + HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k); jint workitem = pdeopt->workitem(); if (workitem != -1) { int deoptId = pdeopt->pc_offset(); - HSAILFrame *hsailFrame = pdeopt->first_frame(); + HSAILFrame* hsailFrame = pdeopt->first_frame(); - // update the hsailFrame from the oopsSaveArray - // will re-resolve the handles each time + // Update the hsailFrame from the oopsSaveArray + // will re-resolve the handles each time. oopSaver.restoreOopsToFrame(hsailFrame, k, workitem); JavaValue result(T_VOID); @@ -511,7 +398,7 @@ javaArgs.push_int(deoptId); javaArgs.push_long((jlong) hsailFrame); - // override the deoptimization action with Action_none until we decide + // Override the deoptimization action with Action_none until we decide // how to handle the other actions. int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none); javaArgs.push_int(myActionReason); @@ -551,7 +438,7 @@ // turn off verbose trace stuff for javacall arg setup bool savedTraceGPUInteraction = TraceGPUInteraction; TraceGPUInteraction = false; - jboolean *never_ran_array = e->never_ran_array(); + jboolean* never_ran_array = e->never_ran_array(); if (handleNeverRansHere) { for (int k = 0; k < dimX; k++) { if (never_ran_array[k]) { @@ -562,9 +449,10 @@ JavaCallArguments javaArgs; // re-resolve the args_handle here objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args); - // This object sets up the javaCall arguments - // the way argsArray is set up, this should work for instance methods as well - // (the receiver will be the first oop pushed) + + // This object sets up the javaCall arguments. The way + // argsArray is set up, this should work for instance + // methods as well (the receiver will be the first oop pushed) HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static()); if (mh->is_static()) { JavaCalls::call_static(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD); @@ -583,19 +471,19 @@ } delete e; + delete allocInfo; } - kernelStats.finishDispatch(); return success; } -GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle)) +GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle)) guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked"); ResourceMark rm; jsize name_len = env->GetStringLength(name_handle); jsize code_len = env->GetArrayLength(code_handle); char* name = NEW_RESOURCE_ARRAY(char, name_len + 1); - unsigned char *code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1); + unsigned char* code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1); code[code_len] = 0; name[name_len] = 0; @@ -631,7 +519,7 @@ return false; \ } \ -GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv *env, jclass)) +GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv* env, jclass)) if (okra_library_name == NULL) { if (TraceGPUInteraction) { tty->print_cr("Unsupported HSAIL platform"); @@ -641,14 +529,14 @@ // here we know we have a valid okra_library_name to try to load char ebuf[O_BUFLEN]; - char *okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_"); + char* okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_"); if (okra_lib_name_from_env_var != NULL) { okra_library_name = okra_lib_name_from_env_var; } if (TraceGPUInteraction) { tty->print_cr("[HSAIL] library is %s", okra_library_name); } - void *okra_lib_handle = NULL; + void* okra_lib_handle = NULL; #if defined(LINUX) // Check first if the Okra library is already loaded. // TODO: Figure out how to do this on other OSes. @@ -668,8 +556,8 @@ guarantee(_okra_create_context == NULL, "cannot repeat GPU initialization"); - // at this point we know okra_lib_handle is valid whether we loaded - // here or earlier. In either case, we can lookup the functions + // At this point we know okra_lib_handle is valid whether we loaded + // here or earlier. In either case, we can lookup the functions. LOOKUP_OKRA_FUNCTION(okra_create_context, okra_create_context); LOOKUP_OKRA_FUNCTION(okra_create_kernel, okra_create_kernel); LOOKUP_OKRA_FUNCTION(okra_push_object, okra_push_object);