comparison src/gpu/hsail/vm/gpu_hsail.cpp @ 15482:a250a512434d

HSAIL: support for object values in stack slots at deoptimization points Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Fri, 02 May 2014 21:58:28 +0200
parents 66e3af78ea96
children d370d87e528f
comparison
equal deleted inserted replaced
15481:09d721bcffe2 15482:a250a512434d
65 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;" 65 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;"
66 66
67 JNINativeMethod Hsail::HSAIL_methods[] = { 67 JNINativeMethod Hsail::HSAIL_methods[] = {
68 {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, 68 {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)},
69 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, 69 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)},
70 {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, 70 {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z", FN_PTR(Hsail::execute_kernel_void_1d)},
71 }; 71 };
72 72
73 void * Hsail::_device_context = NULL; 73 void * Hsail::_device_context = NULL;
74 jint Hsail::_notice_safepoints = false; 74 jint Hsail::_notice_safepoints = false;
75 75
145 } 145 }
146 _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity()); 146 _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity());
147 } 147 }
148 148
149 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save, 149 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save,
150 jobject donor_threads, jint allocBytesPerWorkitem)) 150 jobject donor_threads, jint allocBytesPerWorkitem, jobject oop_map_array))
151 151
152 ResourceMark rm; 152 ResourceMark rm;
153 jlong nmethodValue = InstalledCode::address(kernel_handle); 153 jlong nmethodValue = InstalledCode::address(kernel_handle);
154 if (nmethodValue == 0) { 154 if (nmethodValue == 0) {
155 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); 155 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
161 void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle); 161 void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle);
162 if (kernel == NULL) { 162 if (kernel == NULL) {
163 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); 163 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
164 } 164 }
165 165
166 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, CHECK_0); 166 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
167 GPU_END 167 GPU_END
168 168
169 static void showRanges(jboolean *a, int len) { 169 static void showRanges(jboolean *a, int len) {
170 // show ranges 170 // show ranges
171 bool lookFor = true; 171 bool lookFor = true;
213 size_t tlabSize = tlabFree + tlabUsed; 213 size_t tlabSize = tlabFree + tlabUsed;
214 double freePct = 100.0 * (double) tlabFree/(double) tlabSize; 214 double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
215 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); 215 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
216 } 216 }
217 217
218 218 class OopSaver : public StackObj {
219 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, 219 private:
220 jobject donor_threads, int allocBytesPerWorkitem, TRAPS) { 220 objArrayOop _oopsSaveArray;
221 typeArrayOop _oopMapArray;
222 jobject _oops_save;
223 jobject _oop_map_array;
224 int _last_pcoffset;
225 int _last_idx;
226 int _saveAreaCounts;
227
228 enum {
229 SAVEAREACOUNTS_OFST=0,
230 SPAN_OFST=1,
231 HEADERSIZE=2
232 };
233 int mapPcOffsetToIndex(int pcOffset) {
234 if (pcOffset == _last_pcoffset) {
235 return _last_idx;
236 }
237 int span = _oopMapArray->int_at(SPAN_OFST);
238 for (int idx = HEADERSIZE; idx < _oopMapArray->length(); idx += span) {
239 int ofst = _oopMapArray->int_at(idx);
240 if (ofst == pcOffset) {
241 _last_pcoffset = pcOffset;
242 _last_idx = idx + 1;
243 return _last_idx;
244 }
245 }
246 }
247
248 public:
249 OopSaver(jobject oops_save, jobject oop_map_array) {
250 _oops_save = oops_save;
251 _oop_map_array = oop_map_array;
252 _last_pcoffset = -1;
253 _saveAreaCounts = getSaveAreaCounts(oop_map_array);
254 resolveArrays();
255 }
256
257 void resolveArrays() {
258 _oopsSaveArray = (objArrayOop) JNIHandles::resolve(_oops_save);
259 _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array);
260 }
261
262 void * getOopForBit(HSAILFrame * hsailFrame, int bit) {
263 assert(isOop(hsailFrame, bit), "");
264 void *oop;
265 if (bit < hsailFrame->num_d_regs()) {
266 // d register
267 oop = (void*) hsailFrame->get_d_reg(bit);
268 } else {
269 // stack slot
270 int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot
271 oop = (void *) hsailFrame->get_stackslot64(stackOffset);
272 }
273 return oop;
274 }
275
276 void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) {
277 assert(isOop(hsailFrame, bit), "");
278 if (bit < hsailFrame->num_d_regs()) {
279 // d register
280 hsailFrame->put_d_reg(bit, (jlong) oop);
281 } else {
282 // stack slot
283 int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot
284 hsailFrame->put_stackslot64(stackOffset, (jlong) oop);
285 }
286 }
287
288 void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){
289 // as used, no need to resolve arrays on each call
290 int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
291
292 // handle the dregister and stackSlot based oops
293 for (int bit = 0; bit < oopsPerDeopt; bit++) {
294 if (isOop(hsailFrame, bit)) {
295 void* saved_oop = getOopForBit(hsailFrame, bit);
296 int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
297 _oopsSaveArray->obj_at_put(saveArrayIndex, (oop) saved_oop);
298 }
299 }
300 }
301
302 void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){
303 // need to re-resolve on each restore
304 resolveArrays();
305 int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
306
307 // handle the dregister and stackSlot based oops
308 for (int bit = 0; bit < oopsPerDeopt; bit++) {
309 if (isOop(hsailFrame, bit)) {
310 // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame
311 int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
312 void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex);
313 void * oldValue = getOopForBit(hsailFrame, bit);
314 assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved");
315 if (newValue != oldValue) {
316 if (TraceGPUInteraction) {
317 int numDRegs = hsailFrame->num_d_regs();
318 const char *name = (bit < numDRegs ? "$d" : "stk");
319 int num = (bit < numDRegs ? bit : bit - numDRegs);
320 tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p",
321 name, num, workitem, deoptSlot, oldValue, newValue);
322 }
323 putOopForBit(hsailFrame, bit, newValue);
324 }
325 }
326 }
327 }
328
329 bool isOop(HSAILFrame * hsailFrame, int bit){
330 // re-resolve on each access
331 resolveArrays();
332 if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) {
333 return false;
334 }
335 int pcOffset = hsailFrame->pc_offset();
336 int bits_int_idx = mapPcOffsetToIndex(pcOffset) + (bit / 32);
337 int bitpos = bit % 32;
338 int bits = _oopMapArray->int_at(bits_int_idx);
339 return ((bits & (1 << bitpos)) != 0);
340 }
341
342 static int getSaveAreaCounts(jobject oopMapArrayObject) {
343 typeArrayOop oopMapArray = (typeArrayOop) JNIHandles::resolve(oopMapArrayObject);
344 return oopMapArray->int_at(SAVEAREACOUNTS_OFST);
345 }
346
347 };
348
349 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save,
350 jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
221 ResourceMark rm(THREAD); 351 ResourceMark rm(THREAD);
222 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); 352 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
223 353
224 // TODO: avoid donor thread logic if kernel does not allocate 354 // TODO: avoid donor thread logic if kernel does not allocate
225 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); 355 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads);
258 } 388 }
259 389
260 // Reset the kernel arguments 390 // Reset the kernel arguments
261 _okra_clearargs(kernel); 391 _okra_clearargs(kernel);
262 392
393 // get how many bytes per deopt save area are required
394 int saveAreaCounts = OopSaver::getSaveAreaCounts(oop_map_array);
395 int numSRegs = saveAreaCounts & 0xff;
396 int numDRegs = (saveAreaCounts >> 8) & 0xff;
397 int numStackSlots = (saveAreaCounts >> 16);
398 int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8;
399
263 HSAILDeoptimizationInfo* e; 400 HSAILDeoptimizationInfo* e;
264 if (UseHSAILDeoptimization) { 401 if (UseHSAILDeoptimization) {
265 e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo(); 402 e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea);
266 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); 403 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal));
267 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); 404 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean));
268 e->set_donor_threads(donorThreads); 405 e->set_donor_threads(donorThreads);
269 } 406 }
270 407
316 } 453 }
317 454
318 if (UseHSAILDeoptimization) { 455 if (UseHSAILDeoptimization) {
319 kernelStats.incDeopts(); 456 kernelStats.incDeopts();
320 // check if any workitem requested a deopt 457 // check if any workitem requested a deopt
321 // currently we only support at most one such workitem
322 int deoptcode = e->deopt_occurred(); 458 int deoptcode = e->deopt_occurred();
323 if (deoptcode != 1) { 459 if (deoptcode != 1) {
324 if (deoptcode == 0) { 460 if (deoptcode == 0) {
325 if (TraceGPUInteraction && _notice_safepoints != 0) { 461 if (TraceGPUInteraction && _notice_safepoints != 0) {
326 tty->print_cr("[HSAIL] observed safepoint during kernel"); 462 tty->print_cr("[HSAIL] observed safepoint during kernel");
335 471
336 { 472 {
337 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); 473 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction);
338 if (TraceGPUInteraction) { 474 if (TraceGPUInteraction) {
339 tty->print_cr("deopt happened."); 475 tty->print_cr("deopt happened.");
340 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0]; 476 HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0);
341 tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); 477 tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
342 } 478 }
343 479
344 // Before handling any deopting workitems, save the pointers from 480 // Before handling any deopting workitems, save the pointers from
345 // the hsail frames in oops_save so they get adjusted by any 481 // the hsail frames in oops_save so they get adjusted by any
346 // GC. Need to do this before leaving thread_in_vm mode. 482 // GC. Need to do this before leaving thread_in_vm mode.
483 OopSaver oopSaver(oops_save, oop_map_array);
347 // resolve handle only needed once here (not exiting vm mode) 484 // resolve handle only needed once here (not exiting vm mode)
348 objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); 485 oopSaver.resolveArrays();
349 486
350 // since slots are allocated from the beginning, we know how far to look 487 // since slots are allocated from the beginning, we know how far to look
351 assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow"); 488 assert(e->num_deopts() < e->num_slots(), "deopt save state overflow");
352 for (int k = 0; k < e->num_deopts(); k++) { 489 for (int k = 0; k < e->num_deopts(); k++) {
353 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; 490 HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
354 jint workitem = pdeopt->workitem(); 491 assert (pdeopt->workitem() >= 0, "bad workitem in deopt");
355 if (workitem != -1) { 492 // this is a workitem that deopted
356 // this is a workitem that deopted 493 oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k);
357 HSAILFrame *hsailFrame = pdeopt->first_frame();
358 int dregOopMap = hsailFrame->dreg_oops_map();
359 for (int bit = 0; bit < 16; bit++) {
360 if ((dregOopMap & (1 << bit)) != 0) {
361 // the dregister at this bit is an oop, save it in the array
362 int index = k * 16 + bit;
363 void* saved_oop = (void*) hsailFrame->get_d_reg(bit);
364 oopsSaveArray->obj_at_put(index, (oop) saved_oop);
365 }
366 }
367 }
368 } 494 }
369 495
370 // Handle any deopting workitems. 496 // Handle any deopting workitems.
371 int count_deoptimized = 0; 497 int count_deoptimized = 0;
372 for (int k = 0; k < e->num_deopts(); k++) { 498 for (int k = 0; k < e->num_deopts(); k++) {
373 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; 499 HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
374 500
375 jint workitem = pdeopt->workitem(); 501 jint workitem = pdeopt->workitem();
376 if (workitem != -1) { 502 if (workitem != -1) {
377 int deoptId = pdeopt->pc_offset(); 503 int deoptId = pdeopt->pc_offset();
378 HSAILFrame *hsailFrame = pdeopt->first_frame(); 504 HSAILFrame *hsailFrame = pdeopt->first_frame();
379 505
380 // update the hsailFrame from the oopsSaveArray 506 // update the hsailFrame from the oopsSaveArray
381 // re-resolve the handle 507 // will re-resolve the handles each time
382 oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); 508 oopSaver.restoreOopsToFrame(hsailFrame, k, workitem);
383
384 int dregOopMap = hsailFrame->dreg_oops_map();
385 for (int bit = 0; bit < 16; bit++) {
386 if ((dregOopMap & (1 << bit)) != 0) {
387 // the dregister at this bit is an oop, retrieve it from array and put back in frame
388 int index = k * 16 + bit;
389 void * dregValue = (void *) oopsSaveArray->obj_at(index);
390 void * oldDregValue = (void *) hsailFrame->get_d_reg(bit);
391 assert((oldDregValue != 0 ? dregValue != 0 : dregValue == 0), "bad dregValue retrieved");
392 if (TraceGPUInteraction) {
393 if (dregValue != oldDregValue) {
394 tty->print_cr("oop moved for $d%d, workitem %d, slot %d, old=%p, new=%p", bit, workitem, k, oldDregValue, dregValue);
395 }
396 }
397 hsailFrame->put_d_reg(bit, (jlong) dregValue);
398 }
399 }
400 509
401 JavaValue result(T_VOID); 510 JavaValue result(T_VOID);
402 JavaCallArguments javaArgs; 511 JavaCallArguments javaArgs;
403 javaArgs.set_alternative_target(nm); 512 javaArgs.set_alternative_target(nm);
404 javaArgs.push_int(deoptId); 513 javaArgs.push_int(deoptId);
408 // how to handle the other actions. 517 // how to handle the other actions.
409 int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none); 518 int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none);
410 javaArgs.push_int(myActionReason); 519 javaArgs.push_int(myActionReason);
411 javaArgs.push_oop((oop) NULL); 520 javaArgs.push_oop((oop) NULL);
412 if (TraceGPUInteraction) { 521 if (TraceGPUInteraction) {
413 int dregOopMap = hsailFrame->dreg_oops_map(); 522 tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d", workitem, k, deoptId, hsailFrame, myActionReason);
414 tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d, dregOopMap=%04x", workitem, k, deoptId, hsailFrame, myActionReason, dregOopMap); 523 // show the $d registers or stack slots containing references
415 // show the registers containing references 524 int maxOopBits = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
416 for (int bit = 0; bit < 16; bit++) { 525 for (int bit = 0; bit < maxOopBits; bit++) {
417 if ((dregOopMap & (1 << bit)) != 0) { 526 if (oopSaver.isOop(hsailFrame, bit)) {
418 tty->print_cr(" oop $d%d = %p", bit, hsailFrame->get_d_reg(bit)); 527 if (bit < hsailFrame->num_d_regs()) {
528 // show $d reg oop
529 tty->print_cr(" oop $d%d = %p", bit, oopSaver.getOopForBit(hsailFrame, bit));
530 } else {
531 // show stack slot oop
532 int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot
533 tty->print_cr(" oop stk:%d = %p", stackOffset, oopSaver.getOopForBit(hsailFrame, bit));
534 }
419 } 535 }
420 } 536 }
421 } 537 }
422 JavaCalls::call(&result, mh, &javaArgs, THREAD); 538 JavaCalls::call(&result, mh, &javaArgs, THREAD);
423 count_deoptimized++; 539 count_deoptimized++;
459 } 575 }
460 count_never_ran++; 576 count_never_ran++;
461 } 577 }
462 } 578 }
463 TraceGPUInteraction = savedTraceGPUInteraction; 579 TraceGPUInteraction = savedTraceGPUInteraction;
464 if (TraceGPUInteraction) { 580 if (TraceGPUInteraction && (count_never_ran > 0)) {
465 tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran); 581 tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran);
466 showRanges(never_ran_array, dimX); 582 showRanges(never_ran_array, dimX);
467 } 583 }
468 } // end of never-ran handling 584 } // end of never-ran handling
469 } 585 }