Mercurial > hg > graal-compiler
comparison src/gpu/hsail/vm/gpu_hsail.cpp @ 15482:a250a512434d
HSAIL: support for object values in stack slots at deoptimization points
Contributed-by: Tom Deneau <tom.deneau@amd.com>
author | Doug Simon <doug.simon@oracle.com> |
---|---|
date | Fri, 02 May 2014 21:58:28 +0200 |
parents | 66e3af78ea96 |
children | d370d87e528f |
comparison
equal
deleted
inserted
replaced
15481:09d721bcffe2 | 15482:a250a512434d |
---|---|
65 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;" | 65 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;" |
66 | 66 |
67 JNINativeMethod Hsail::HSAIL_methods[] = { | 67 JNINativeMethod Hsail::HSAIL_methods[] = { |
68 {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, | 68 {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, |
69 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, | 69 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, |
70 {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, | 70 {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, |
71 }; | 71 }; |
72 | 72 |
73 void * Hsail::_device_context = NULL; | 73 void * Hsail::_device_context = NULL; |
74 jint Hsail::_notice_safepoints = false; | 74 jint Hsail::_notice_safepoints = false; |
75 | 75 |
145 } | 145 } |
146 _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity()); | 146 _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity()); |
147 } | 147 } |
148 | 148 |
149 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save, | 149 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save, |
150 jobject donor_threads, jint allocBytesPerWorkitem)) | 150 jobject donor_threads, jint allocBytesPerWorkitem, jobject oop_map_array)) |
151 | 151 |
152 ResourceMark rm; | 152 ResourceMark rm; |
153 jlong nmethodValue = InstalledCode::address(kernel_handle); | 153 jlong nmethodValue = InstalledCode::address(kernel_handle); |
154 if (nmethodValue == 0) { | 154 if (nmethodValue == 0) { |
155 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); | 155 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); |
161 void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle); | 161 void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle); |
162 if (kernel == NULL) { | 162 if (kernel == NULL) { |
163 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); | 163 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); |
164 } | 164 } |
165 | 165 |
166 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, CHECK_0); | 166 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0); |
167 GPU_END | 167 GPU_END |
168 | 168 |
169 static void showRanges(jboolean *a, int len) { | 169 static void showRanges(jboolean *a, int len) { |
170 // show ranges | 170 // show ranges |
171 bool lookFor = true; | 171 bool lookFor = true; |
213 size_t tlabSize = tlabFree + tlabUsed; | 213 size_t tlabSize = tlabFree + tlabUsed; |
214 double freePct = 100.0 * (double) tlabFree/(double) tlabSize; | 214 double freePct = 100.0 * (double) tlabFree/(double) tlabSize; |
215 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); | 215 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); |
216 } | 216 } |
217 | 217 |
218 | 218 class OopSaver : public StackObj { |
219 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, | 219 private: |
220 jobject donor_threads, int allocBytesPerWorkitem, TRAPS) { | 220 objArrayOop _oopsSaveArray; |
221 typeArrayOop _oopMapArray; | |
222 jobject _oops_save; | |
223 jobject _oop_map_array; | |
224 int _last_pcoffset; | |
225 int _last_idx; | |
226 int _saveAreaCounts; | |
227 | |
228 enum { | |
229 SAVEAREACOUNTS_OFST=0, | |
230 SPAN_OFST=1, | |
231 HEADERSIZE=2 | |
232 }; | |
233 int mapPcOffsetToIndex(int pcOffset) { | |
234 if (pcOffset == _last_pcoffset) { | |
235 return _last_idx; | |
236 } | |
237 int span = _oopMapArray->int_at(SPAN_OFST); | |
238 for (int idx = HEADERSIZE; idx < _oopMapArray->length(); idx += span) { | |
239 int ofst = _oopMapArray->int_at(idx); | |
240 if (ofst == pcOffset) { | |
241 _last_pcoffset = pcOffset; | |
242 _last_idx = idx + 1; | |
243 return _last_idx; | |
244 } | |
245 } | |
246 } | |
247 | |
248 public: | |
249 OopSaver(jobject oops_save, jobject oop_map_array) { | |
250 _oops_save = oops_save; | |
251 _oop_map_array = oop_map_array; | |
252 _last_pcoffset = -1; | |
253 _saveAreaCounts = getSaveAreaCounts(oop_map_array); | |
254 resolveArrays(); | |
255 } | |
256 | |
257 void resolveArrays() { | |
258 _oopsSaveArray = (objArrayOop) JNIHandles::resolve(_oops_save); | |
259 _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array); | |
260 } | |
261 | |
262 void * getOopForBit(HSAILFrame * hsailFrame, int bit) { | |
263 assert(isOop(hsailFrame, bit), ""); | |
264 void *oop; | |
265 if (bit < hsailFrame->num_d_regs()) { | |
266 // d register | |
267 oop = (void*) hsailFrame->get_d_reg(bit); | |
268 } else { | |
269 // stack slot | |
270 int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot | |
271 oop = (void *) hsailFrame->get_stackslot64(stackOffset); | |
272 } | |
273 return oop; | |
274 } | |
275 | |
276 void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) { | |
277 assert(isOop(hsailFrame, bit), ""); | |
278 if (bit < hsailFrame->num_d_regs()) { | |
279 // d register | |
280 hsailFrame->put_d_reg(bit, (jlong) oop); | |
281 } else { | |
282 // stack slot | |
283 int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot | |
284 hsailFrame->put_stackslot64(stackOffset, (jlong) oop); | |
285 } | |
286 } | |
287 | |
288 void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){ | |
289 // as used, no need to resolve arrays on each call | |
290 int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); | |
291 | |
292 // handle the dregister and stackSlot based oops | |
293 for (int bit = 0; bit < oopsPerDeopt; bit++) { | |
294 if (isOop(hsailFrame, bit)) { | |
295 void* saved_oop = getOopForBit(hsailFrame, bit); | |
296 int saveArrayIndex = deoptSlot * oopsPerDeopt + bit; | |
297 _oopsSaveArray->obj_at_put(saveArrayIndex, (oop) saved_oop); | |
298 } | |
299 } | |
300 } | |
301 | |
302 void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){ | |
303 // need to re-resolve on each restore | |
304 resolveArrays(); | |
305 int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); | |
306 | |
307 // handle the dregister and stackSlot based oops | |
308 for (int bit = 0; bit < oopsPerDeopt; bit++) { | |
309 if (isOop(hsailFrame, bit)) { | |
310 // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame | |
311 int saveArrayIndex = deoptSlot * oopsPerDeopt + bit; | |
312 void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex); | |
313 void * oldValue = getOopForBit(hsailFrame, bit); | |
314 assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved"); | |
315 if (newValue != oldValue) { | |
316 if (TraceGPUInteraction) { | |
317 int numDRegs = hsailFrame->num_d_regs(); | |
318 const char *name = (bit < numDRegs ? "$d" : "stk"); | |
319 int num = (bit < numDRegs ? bit : bit - numDRegs); | |
320 tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p", | |
321 name, num, workitem, deoptSlot, oldValue, newValue); | |
322 } | |
323 putOopForBit(hsailFrame, bit, newValue); | |
324 } | |
325 } | |
326 } | |
327 } | |
328 | |
329 bool isOop(HSAILFrame * hsailFrame, int bit){ | |
330 // re-resolve on each access | |
331 resolveArrays(); | |
332 if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) { | |
333 return false; | |
334 } | |
335 int pcOffset = hsailFrame->pc_offset(); | |
336 int bits_int_idx = mapPcOffsetToIndex(pcOffset) + (bit / 32); | |
337 int bitpos = bit % 32; | |
338 int bits = _oopMapArray->int_at(bits_int_idx); | |
339 return ((bits & (1 << bitpos)) != 0); | |
340 } | |
341 | |
342 static int getSaveAreaCounts(jobject oopMapArrayObject) { | |
343 typeArrayOop oopMapArray = (typeArrayOop) JNIHandles::resolve(oopMapArrayObject); | |
344 return oopMapArray->int_at(SAVEAREACOUNTS_OFST); | |
345 } | |
346 | |
347 }; | |
348 | |
349 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, | |
350 jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) { | |
221 ResourceMark rm(THREAD); | 351 ResourceMark rm(THREAD); |
222 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); | 352 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); |
223 | 353 |
224 // TODO: avoid donor thread logic if kernel does not allocate | 354 // TODO: avoid donor thread logic if kernel does not allocate |
225 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); | 355 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); |
258 } | 388 } |
259 | 389 |
260 // Reset the kernel arguments | 390 // Reset the kernel arguments |
261 _okra_clearargs(kernel); | 391 _okra_clearargs(kernel); |
262 | 392 |
393 // get how many bytes per deopt save area are required | |
394 int saveAreaCounts = OopSaver::getSaveAreaCounts(oop_map_array); | |
395 int numSRegs = saveAreaCounts & 0xff; | |
396 int numDRegs = (saveAreaCounts >> 8) & 0xff; | |
397 int numStackSlots = (saveAreaCounts >> 16); | |
398 int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8; | |
399 | |
263 HSAILDeoptimizationInfo* e; | 400 HSAILDeoptimizationInfo* e; |
264 if (UseHSAILDeoptimization) { | 401 if (UseHSAILDeoptimization) { |
265 e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo(); | 402 e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea); |
266 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); | 403 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); |
267 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); | 404 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); |
268 e->set_donor_threads(donorThreads); | 405 e->set_donor_threads(donorThreads); |
269 } | 406 } |
270 | 407 |
316 } | 453 } |
317 | 454 |
318 if (UseHSAILDeoptimization) { | 455 if (UseHSAILDeoptimization) { |
319 kernelStats.incDeopts(); | 456 kernelStats.incDeopts(); |
320 // check if any workitem requested a deopt | 457 // check if any workitem requested a deopt |
321 // currently we only support at most one such workitem | |
322 int deoptcode = e->deopt_occurred(); | 458 int deoptcode = e->deopt_occurred(); |
323 if (deoptcode != 1) { | 459 if (deoptcode != 1) { |
324 if (deoptcode == 0) { | 460 if (deoptcode == 0) { |
325 if (TraceGPUInteraction && _notice_safepoints != 0) { | 461 if (TraceGPUInteraction && _notice_safepoints != 0) { |
326 tty->print_cr("[HSAIL] observed safepoint during kernel"); | 462 tty->print_cr("[HSAIL] observed safepoint during kernel"); |
335 | 471 |
336 { | 472 { |
337 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); | 473 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); |
338 if (TraceGPUInteraction) { | 474 if (TraceGPUInteraction) { |
339 tty->print_cr("deopt happened."); | 475 tty->print_cr("deopt happened."); |
340 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0]; | 476 HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0); |
341 tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); | 477 tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); |
342 } | 478 } |
343 | 479 |
344 // Before handling any deopting workitems, save the pointers from | 480 // Before handling any deopting workitems, save the pointers from |
345 // the hsail frames in oops_save so they get adjusted by any | 481 // the hsail frames in oops_save so they get adjusted by any |
346 // GC. Need to do this before leaving thread_in_vm mode. | 482 // GC. Need to do this before leaving thread_in_vm mode. |
483 OopSaver oopSaver(oops_save, oop_map_array); | |
347 // resolve handle only needed once here (not exiting vm mode) | 484 // resolve handle only needed once here (not exiting vm mode) |
348 objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); | 485 oopSaver.resolveArrays(); |
349 | 486 |
350 // since slots are allocated from the beginning, we know how far to look | 487 // since slots are allocated from the beginning, we know how far to look |
351 assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow"); | 488 assert(e->num_deopts() < e->num_slots(), "deopt save state overflow"); |
352 for (int k = 0; k < e->num_deopts(); k++) { | 489 for (int k = 0; k < e->num_deopts(); k++) { |
353 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; | 490 HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k); |
354 jint workitem = pdeopt->workitem(); | 491 assert (pdeopt->workitem() >= 0, "bad workitem in deopt"); |
355 if (workitem != -1) { | 492 // this is a workitem that deopted |
356 // this is a workitem that deopted | 493 oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k); |
357 HSAILFrame *hsailFrame = pdeopt->first_frame(); | |
358 int dregOopMap = hsailFrame->dreg_oops_map(); | |
359 for (int bit = 0; bit < 16; bit++) { | |
360 if ((dregOopMap & (1 << bit)) != 0) { | |
361 // the dregister at this bit is an oop, save it in the array | |
362 int index = k * 16 + bit; | |
363 void* saved_oop = (void*) hsailFrame->get_d_reg(bit); | |
364 oopsSaveArray->obj_at_put(index, (oop) saved_oop); | |
365 } | |
366 } | |
367 } | |
368 } | 494 } |
369 | 495 |
370 // Handle any deopting workitems. | 496 // Handle any deopting workitems. |
371 int count_deoptimized = 0; | 497 int count_deoptimized = 0; |
372 for (int k = 0; k < e->num_deopts(); k++) { | 498 for (int k = 0; k < e->num_deopts(); k++) { |
373 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; | 499 HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k); |
374 | 500 |
375 jint workitem = pdeopt->workitem(); | 501 jint workitem = pdeopt->workitem(); |
376 if (workitem != -1) { | 502 if (workitem != -1) { |
377 int deoptId = pdeopt->pc_offset(); | 503 int deoptId = pdeopt->pc_offset(); |
378 HSAILFrame *hsailFrame = pdeopt->first_frame(); | 504 HSAILFrame *hsailFrame = pdeopt->first_frame(); |
379 | 505 |
380 // update the hsailFrame from the oopsSaveArray | 506 // update the hsailFrame from the oopsSaveArray |
381 // re-resolve the handle | 507 // will re-resolve the handles each time |
382 oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); | 508 oopSaver.restoreOopsToFrame(hsailFrame, k, workitem); |
383 | |
384 int dregOopMap = hsailFrame->dreg_oops_map(); | |
385 for (int bit = 0; bit < 16; bit++) { | |
386 if ((dregOopMap & (1 << bit)) != 0) { | |
387 // the dregister at this bit is an oop, retrieve it from array and put back in frame | |
388 int index = k * 16 + bit; | |
389 void * dregValue = (void *) oopsSaveArray->obj_at(index); | |
390 void * oldDregValue = (void *) hsailFrame->get_d_reg(bit); | |
391 assert((oldDregValue != 0 ? dregValue != 0 : dregValue == 0), "bad dregValue retrieved"); | |
392 if (TraceGPUInteraction) { | |
393 if (dregValue != oldDregValue) { | |
394 tty->print_cr("oop moved for $d%d, workitem %d, slot %d, old=%p, new=%p", bit, workitem, k, oldDregValue, dregValue); | |
395 } | |
396 } | |
397 hsailFrame->put_d_reg(bit, (jlong) dregValue); | |
398 } | |
399 } | |
400 | 509 |
401 JavaValue result(T_VOID); | 510 JavaValue result(T_VOID); |
402 JavaCallArguments javaArgs; | 511 JavaCallArguments javaArgs; |
403 javaArgs.set_alternative_target(nm); | 512 javaArgs.set_alternative_target(nm); |
404 javaArgs.push_int(deoptId); | 513 javaArgs.push_int(deoptId); |
408 // how to handle the other actions. | 517 // how to handle the other actions. |
409 int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none); | 518 int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none); |
410 javaArgs.push_int(myActionReason); | 519 javaArgs.push_int(myActionReason); |
411 javaArgs.push_oop((oop) NULL); | 520 javaArgs.push_oop((oop) NULL); |
412 if (TraceGPUInteraction) { | 521 if (TraceGPUInteraction) { |
413 int dregOopMap = hsailFrame->dreg_oops_map(); | 522 tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d", workitem, k, deoptId, hsailFrame, myActionReason); |
414 tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d, dregOopMap=%04x", workitem, k, deoptId, hsailFrame, myActionReason, dregOopMap); | 523 // show the $d registers or stack slots containing references |
415 // show the registers containing references | 524 int maxOopBits = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); |
416 for (int bit = 0; bit < 16; bit++) { | 525 for (int bit = 0; bit < maxOopBits; bit++) { |
417 if ((dregOopMap & (1 << bit)) != 0) { | 526 if (oopSaver.isOop(hsailFrame, bit)) { |
418 tty->print_cr(" oop $d%d = %p", bit, hsailFrame->get_d_reg(bit)); | 527 if (bit < hsailFrame->num_d_regs()) { |
528 // show $d reg oop | |
529 tty->print_cr(" oop $d%d = %p", bit, oopSaver.getOopForBit(hsailFrame, bit)); | |
530 } else { | |
531 // show stack slot oop | |
532 int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot | |
533 tty->print_cr(" oop stk:%d = %p", stackOffset, oopSaver.getOopForBit(hsailFrame, bit)); | |
534 } | |
419 } | 535 } |
420 } | 536 } |
421 } | 537 } |
422 JavaCalls::call(&result, mh, &javaArgs, THREAD); | 538 JavaCalls::call(&result, mh, &javaArgs, THREAD); |
423 count_deoptimized++; | 539 count_deoptimized++; |
459 } | 575 } |
460 count_never_ran++; | 576 count_never_ran++; |
461 } | 577 } |
462 } | 578 } |
463 TraceGPUInteraction = savedTraceGPUInteraction; | 579 TraceGPUInteraction = savedTraceGPUInteraction; |
464 if (TraceGPUInteraction) { | 580 if (TraceGPUInteraction && (count_never_ran > 0)) { |
465 tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran); | 581 tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran); |
466 showRanges(never_ran_array, dimX); | 582 showRanges(never_ran_array, dimX); |
467 } | 583 } |
468 } // end of never-ran handling | 584 } // end of never-ran handling |
469 } | 585 } |