Mercurial > hg > truffle
comparison src/cpu/x86/vm/x86_64.ad @ 8873:e961c11b85fe
8011102: Clear AVX registers after return from JNI call
Summary: Execute the vzeroupper instruction after JNI calls and on exits from JIT-compiled code that uses 256-bit vectors.
Reviewed-by: roland
author | kvn |
---|---|
date | Wed, 03 Apr 2013 11:12:57 -0700 |
parents | b30b3c2a0cf2 |
children | 8be1318fbe77 a6e09d6dd8e5 |
comparison
equal
deleted
inserted
replaced
8872:53028d751155 | 8873:e961c11b85fe |
---|---|
397 #define __ _masm. | 397 #define __ _masm. |
398 | 398 |
399 static int preserve_SP_size() { | 399 static int preserve_SP_size() { |
400 return 3; // rex.w, op, rm(reg/reg) | 400 return 3; // rex.w, op, rm(reg/reg) |
401 } | 401 } |
402 static int clear_avx_size() { | |
403 return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper | |
404 } | |
402 | 405 |
403 // !!!!! Special hack to get all types of calls to specify the byte offset | 406 // !!!!! Special hack to get all types of calls to specify the byte offset |
404 // from the start of the call to the point where the return address | 407 // from the start of the call to the point where the return address |
405 // will point. | 408 // will point. |
406 int MachCallStaticJavaNode::ret_addr_offset() | 409 int MachCallStaticJavaNode::ret_addr_offset() |
407 { | 410 { |
408 int offset = 5; // 5 bytes from start of call to where return address points | 411 int offset = 5; // 5 bytes from start of call to where return address points |
412 offset += clear_avx_size(); | |
409 if (_method_handle_invoke) | 413 if (_method_handle_invoke) |
410 offset += preserve_SP_size(); | 414 offset += preserve_SP_size(); |
411 return offset; | 415 return offset; |
412 } | 416 } |
413 | 417 |
414 int MachCallDynamicJavaNode::ret_addr_offset() | 418 int MachCallDynamicJavaNode::ret_addr_offset() |
415 { | 419 { |
416 return 15; // 15 bytes from start of call to where return address points | 420 int offset = 15; // 15 bytes from start of call to where return address points |
421 offset += clear_avx_size(); | |
422 return offset; | |
417 } | 423 } |
418 | 424 |
419 // In os_cpu .ad file | 425 int MachCallRuntimeNode::ret_addr_offset() { |
420 // int MachCallRuntimeNode::ret_addr_offset() | 426 int offset = 13; // movq r10,#addr; callq (r10) |
427 offset += clear_avx_size(); | |
428 return offset; | |
429 } | |
421 | 430 |
422 // Indicate if the safepoint node needs the polling page as an input, | 431 // Indicate if the safepoint node needs the polling page as an input, |
423 // it does if the polling page is more than disp32 away. | 432 // it does if the polling page is more than disp32 away. |
424 bool SafePointNode::needs_polling_address_input() | 433 bool SafePointNode::needs_polling_address_input() |
425 { | 434 { |
432 | 441 |
433 // The address of the call instruction needs to be 4-byte aligned to | 442 // The address of the call instruction needs to be 4-byte aligned to |
434 // ensure that it does not span a cache line so that it can be patched. | 443 // ensure that it does not span a cache line so that it can be patched. |
435 int CallStaticJavaDirectNode::compute_padding(int current_offset) const | 444 int CallStaticJavaDirectNode::compute_padding(int current_offset) const |
436 { | 445 { |
446 current_offset += clear_avx_size(); // skip vzeroupper | |
437 current_offset += 1; // skip call opcode byte | 447 current_offset += 1; // skip call opcode byte |
438 return round_to(current_offset, alignment_required()) - current_offset; | 448 return round_to(current_offset, alignment_required()) - current_offset; |
439 } | 449 } |
440 | 450 |
441 // The address of the call instruction needs to be 4-byte aligned to | 451 // The address of the call instruction needs to be 4-byte aligned to |
442 // ensure that it does not span a cache line so that it can be patched. | 452 // ensure that it does not span a cache line so that it can be patched. |
443 int CallStaticJavaHandleNode::compute_padding(int current_offset) const | 453 int CallStaticJavaHandleNode::compute_padding(int current_offset) const |
444 { | 454 { |
445 current_offset += preserve_SP_size(); // skip mov rbp, rsp | 455 current_offset += preserve_SP_size(); // skip mov rbp, rsp |
456 current_offset += clear_avx_size(); // skip vzeroupper | |
446 current_offset += 1; // skip call opcode byte | 457 current_offset += 1; // skip call opcode byte |
447 return round_to(current_offset, alignment_required()) - current_offset; | 458 return round_to(current_offset, alignment_required()) - current_offset; |
448 } | 459 } |
449 | 460 |
450 // The address of the call instruction needs to be 4-byte aligned to | 461 // The address of the call instruction needs to be 4-byte aligned to |
451 // ensure that it does not span a cache line so that it can be patched. | 462 // ensure that it does not span a cache line so that it can be patched. |
452 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const | 463 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const |
453 { | 464 { |
465 current_offset += clear_avx_size(); // skip vzeroupper | |
454 current_offset += 11; // skip movq instruction + call opcode byte | 466 current_offset += 11; // skip movq instruction + call opcode byte |
455 return round_to(current_offset, alignment_required()) - current_offset; | 467 return round_to(current_offset, alignment_required()) - current_offset; |
456 } | 468 } |
457 | 469 |
458 // EMIT_RM() | 470 // EMIT_RM() |
762 //============================================================================= | 774 //============================================================================= |
763 #ifndef PRODUCT | 775 #ifndef PRODUCT |
764 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const | 776 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const |
765 { | 777 { |
766 Compile* C = ra_->C; | 778 Compile* C = ra_->C; |
779 if (C->max_vector_size() > 16) { | |
780 st->print("vzeroupper"); | |
781 st->cr(); st->print("\t"); | |
782 } | |
783 | |
767 int framesize = C->frame_slots() << LogBytesPerInt; | 784 int framesize = C->frame_slots() << LogBytesPerInt; |
768 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 785 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
769 // Remove word for return adr already pushed | 786 // Remove word for return adr already pushed |
770 // and RBP | 787 // and RBP |
771 framesize -= 2*wordSize; | 788 framesize -= 2*wordSize; |
791 #endif | 808 #endif |
792 | 809 |
793 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const | 810 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const |
794 { | 811 { |
795 Compile* C = ra_->C; | 812 Compile* C = ra_->C; |
813 if (C->max_vector_size() > 16) { | |
814 // Clear upper bits of YMM registers when current compiled code uses | |
815 // wide vectors to avoid AVX <-> SSE transition penalty during call. | |
816 MacroAssembler _masm(&cbuf); | |
817 __ vzeroupper(); | |
818 } | |
819 | |
796 int framesize = C->frame_slots() << LogBytesPerInt; | 820 int framesize = C->frame_slots() << LogBytesPerInt; |
797 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 821 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
798 // Remove word for return adr already pushed | 822 // Remove word for return adr already pushed |
799 // and RBP | 823 // and RBP |
800 framesize -= 2*wordSize; | 824 framesize -= 2*wordSize; |
2006 __ xorptr(Rrdi, Rrdi); | 2030 __ xorptr(Rrdi, Rrdi); |
2007 } | 2031 } |
2008 __ bind(miss); | 2032 __ bind(miss); |
2009 %} | 2033 %} |
2010 | 2034 |
2035 enc_class clear_avx %{ | |
2036 debug_only(int off0 = cbuf.insts_size()); | |
2037 if (ra_->C->max_vector_size() > 16) { | |
2038 // Clear upper bits of YMM registers when current compiled code uses | |
2039 // wide vectors to avoid AVX <-> SSE transition penalty during call. | |
2040 MacroAssembler _masm(&cbuf); | |
2041 __ vzeroupper(); | |
2042 } | |
2043 debug_only(int off1 = cbuf.insts_size()); | |
2044 assert(off1 - off0 == clear_avx_size(), "correct size prediction"); | |
2045 %} | |
2046 | |
2047 enc_class Java_To_Runtime(method meth) %{ | |
2048 // No relocation needed | |
2049 MacroAssembler _masm(&cbuf); | |
2050 __ mov64(r10, (int64_t) $meth$$method); | |
2051 __ call(r10); | |
2052 %} | |
2053 | |
2011 enc_class Java_To_Interpreter(method meth) | 2054 enc_class Java_To_Interpreter(method meth) |
2012 %{ | 2055 %{ |
2013 // CALL Java_To_Interpreter | 2056 // CALL Java_To_Interpreter |
2014 // This is the instruction starting address for relocation info. | 2057 // This is the instruction starting address for relocation info. |
2015 cbuf.set_insts_mark(); | 2058 cbuf.set_insts_mark(); |
11364 effect(USE meth); | 11407 effect(USE meth); |
11365 | 11408 |
11366 ins_cost(300); | 11409 ins_cost(300); |
11367 format %{ "call,static " %} | 11410 format %{ "call,static " %} |
11368 opcode(0xE8); /* E8 cd */ | 11411 opcode(0xE8); /* E8 cd */ |
11369 ins_encode(Java_Static_Call(meth), call_epilog); | 11412 ins_encode(clear_avx, Java_Static_Call(meth), call_epilog); |
11370 ins_pipe(pipe_slow); | 11413 ins_pipe(pipe_slow); |
11371 ins_alignment(4); | 11414 ins_alignment(4); |
11372 %} | 11415 %} |
11373 | 11416 |
11374 // Call Java Static Instruction (method handle version) | 11417 // Call Java Static Instruction (method handle version) |
11382 // We use it here for a similar purpose, in {preserve,restore}_SP. | 11425 // We use it here for a similar purpose, in {preserve,restore}_SP. |
11383 | 11426 |
11384 ins_cost(300); | 11427 ins_cost(300); |
11385 format %{ "call,static/MethodHandle " %} | 11428 format %{ "call,static/MethodHandle " %} |
11386 opcode(0xE8); /* E8 cd */ | 11429 opcode(0xE8); /* E8 cd */ |
11387 ins_encode(preserve_SP, | 11430 ins_encode(clear_avx, preserve_SP, |
11388 Java_Static_Call(meth), | 11431 Java_Static_Call(meth), |
11389 restore_SP, | 11432 restore_SP, |
11390 call_epilog); | 11433 call_epilog); |
11391 ins_pipe(pipe_slow); | 11434 ins_pipe(pipe_slow); |
11392 ins_alignment(4); | 11435 ins_alignment(4); |
11401 effect(USE meth); | 11444 effect(USE meth); |
11402 | 11445 |
11403 ins_cost(300); | 11446 ins_cost(300); |
11404 format %{ "movq rax, #Universe::non_oop_word()\n\t" | 11447 format %{ "movq rax, #Universe::non_oop_word()\n\t" |
11405 "call,dynamic " %} | 11448 "call,dynamic " %} |
11406 ins_encode(Java_Dynamic_Call(meth), call_epilog); | 11449 ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog); |
11407 ins_pipe(pipe_slow); | 11450 ins_pipe(pipe_slow); |
11408 ins_alignment(4); | 11451 ins_alignment(4); |
11409 %} | 11452 %} |
11410 | 11453 |
11411 // Call Runtime Instruction | 11454 // Call Runtime Instruction |
11414 match(CallRuntime); | 11457 match(CallRuntime); |
11415 effect(USE meth); | 11458 effect(USE meth); |
11416 | 11459 |
11417 ins_cost(300); | 11460 ins_cost(300); |
11418 format %{ "call,runtime " %} | 11461 format %{ "call,runtime " %} |
11419 opcode(0xE8); /* E8 cd */ | 11462 ins_encode(clear_avx, Java_To_Runtime(meth)); |
11420 ins_encode(Java_To_Runtime(meth)); | |
11421 ins_pipe(pipe_slow); | 11463 ins_pipe(pipe_slow); |
11422 %} | 11464 %} |
11423 | 11465 |
11424 // Call runtime without safepoint | 11466 // Call runtime without safepoint |
11425 instruct CallLeafDirect(method meth) | 11467 instruct CallLeafDirect(method meth) |
11427 match(CallLeaf); | 11469 match(CallLeaf); |
11428 effect(USE meth); | 11470 effect(USE meth); |
11429 | 11471 |
11430 ins_cost(300); | 11472 ins_cost(300); |
11431 format %{ "call_leaf,runtime " %} | 11473 format %{ "call_leaf,runtime " %} |
11432 opcode(0xE8); /* E8 cd */ | 11474 ins_encode(clear_avx, Java_To_Runtime(meth)); |
11433 ins_encode(Java_To_Runtime(meth)); | |
11434 ins_pipe(pipe_slow); | 11475 ins_pipe(pipe_slow); |
11435 %} | 11476 %} |
11436 | 11477 |
11437 // Call runtime without safepoint | 11478 // Call runtime without safepoint |
11438 instruct CallLeafNoFPDirect(method meth) | 11479 instruct CallLeafNoFPDirect(method meth) |
11440 match(CallLeafNoFP); | 11481 match(CallLeafNoFP); |
11441 effect(USE meth); | 11482 effect(USE meth); |
11442 | 11483 |
11443 ins_cost(300); | 11484 ins_cost(300); |
11444 format %{ "call_leaf_nofp,runtime " %} | 11485 format %{ "call_leaf_nofp,runtime " %} |
11445 opcode(0xE8); /* E8 cd */ | |
11446 ins_encode(Java_To_Runtime(meth)); | 11486 ins_encode(Java_To_Runtime(meth)); |
11447 ins_pipe(pipe_slow); | 11487 ins_pipe(pipe_slow); |
11448 %} | 11488 %} |
11449 | 11489 |
11450 // Return Instruction | 11490 // Return Instruction |