Mercurial > hg > truffle
comparison src/cpu/x86/vm/x86_32.ad @ 8873:e961c11b85fe
8011102: Clear AVX registers after return from JNI call
Summary: Execute the vzeroupper instruction after a JNI call and on exits from JIT-compiled code that uses 256-bit vectors.
Reviewed-by: roland
author | kvn |
---|---|
date | Wed, 03 Apr 2013 11:12:57 -0700 |
parents | b30b3c2a0cf2 |
children | 886d1fd67dc3 a6e09d6dd8e5 |
comparison
equal
deleted
inserted
replaced
8872:53028d751155 | 8873:e961c11b85fe |
---|---|
226 static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF)); | 226 static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF)); |
227 static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000)); | 227 static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000)); |
228 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000)); | 228 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000)); |
229 | 229 |
230 // Offset hacking within calls. | 230 // Offset hacking within calls. |
231 static int pre_call_FPU_size() { | 231 static int pre_call_resets_size() { |
232 if (Compile::current()->in_24_bit_fp_mode()) | 232 int size = 0; |
233 return 6; // fldcw | 233 Compile* C = Compile::current(); |
234 return 0; | 234 if (C->in_24_bit_fp_mode()) { |
235 size += 6; // fldcw | |
236 } | |
237 if (C->max_vector_size() > 16) { | |
238 size += 3; // vzeroupper | |
239 } | |
240 return size; | |
235 } | 241 } |
236 | 242 |
237 static int preserve_SP_size() { | 243 static int preserve_SP_size() { |
238 return 2; // op, rm(reg/reg) | 244 return 2; // op, rm(reg/reg) |
239 } | 245 } |
240 | 246 |
241 // !!!!! Special hack to get all type of calls to specify the byte offset | 247 // !!!!! Special hack to get all type of calls to specify the byte offset |
242 // from the start of the call to the point where the return address | 248 // from the start of the call to the point where the return address |
243 // will point. | 249 // will point. |
244 int MachCallStaticJavaNode::ret_addr_offset() { | 250 int MachCallStaticJavaNode::ret_addr_offset() { |
245 int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points | 251 int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points |
246 if (_method_handle_invoke) | 252 if (_method_handle_invoke) |
247 offset += preserve_SP_size(); | 253 offset += preserve_SP_size(); |
248 return offset; | 254 return offset; |
249 } | 255 } |
250 | 256 |
251 int MachCallDynamicJavaNode::ret_addr_offset() { | 257 int MachCallDynamicJavaNode::ret_addr_offset() { |
252 return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points | 258 return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points |
253 } | 259 } |
254 | 260 |
255 static int sizeof_FFree_Float_Stack_All = -1; | 261 static int sizeof_FFree_Float_Stack_All = -1; |
256 | 262 |
257 int MachCallRuntimeNode::ret_addr_offset() { | 263 int MachCallRuntimeNode::ret_addr_offset() { |
258 assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already"); | 264 assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already"); |
259 return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size(); | 265 return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size(); |
260 } | 266 } |
261 | 267 |
262 // Indicate if the safepoint node needs the polling page as an input. | 268 // Indicate if the safepoint node needs the polling page as an input. |
263 // Since x86 does have absolute addressing, it doesn't. | 269 // Since x86 does have absolute addressing, it doesn't. |
264 bool SafePointNode::needs_polling_address_input() { | 270 bool SafePointNode::needs_polling_address_input() { |
270 // | 276 // |
271 | 277 |
272 // The address of the call instruction needs to be 4-byte aligned to | 278 // The address of the call instruction needs to be 4-byte aligned to |
273 // ensure that it does not span a cache line so that it can be patched. | 279 // ensure that it does not span a cache line so that it can be patched. |
274 int CallStaticJavaDirectNode::compute_padding(int current_offset) const { | 280 int CallStaticJavaDirectNode::compute_padding(int current_offset) const { |
275 current_offset += pre_call_FPU_size(); // skip fldcw, if any | 281 current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any |
276 current_offset += 1; // skip call opcode byte | 282 current_offset += 1; // skip call opcode byte |
277 return round_to(current_offset, alignment_required()) - current_offset; | 283 return round_to(current_offset, alignment_required()) - current_offset; |
278 } | 284 } |
279 | 285 |
280 // The address of the call instruction needs to be 4-byte aligned to | 286 // The address of the call instruction needs to be 4-byte aligned to |
281 // ensure that it does not span a cache line so that it can be patched. | 287 // ensure that it does not span a cache line so that it can be patched. |
282 int CallStaticJavaHandleNode::compute_padding(int current_offset) const { | 288 int CallStaticJavaHandleNode::compute_padding(int current_offset) const { |
283 current_offset += pre_call_FPU_size(); // skip fldcw, if any | 289 current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any |
284 current_offset += preserve_SP_size(); // skip mov rbp, rsp | 290 current_offset += preserve_SP_size(); // skip mov rbp, rsp |
285 current_offset += 1; // skip call opcode byte | 291 current_offset += 1; // skip call opcode byte |
286 return round_to(current_offset, alignment_required()) - current_offset; | 292 return round_to(current_offset, alignment_required()) - current_offset; |
287 } | 293 } |
288 | 294 |
289 // The address of the call instruction needs to be 4-byte aligned to | 295 // The address of the call instruction needs to be 4-byte aligned to |
290 // ensure that it does not span a cache line so that it can be patched. | 296 // ensure that it does not span a cache line so that it can be patched. |
291 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { | 297 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { |
292 current_offset += pre_call_FPU_size(); // skip fldcw, if any | 298 current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any |
293 current_offset += 5; // skip MOV instruction | 299 current_offset += 5; // skip MOV instruction |
294 current_offset += 1; // skip call opcode byte | 300 current_offset += 1; // skip call opcode byte |
295 return round_to(current_offset, alignment_required()) - current_offset; | 301 return round_to(current_offset, alignment_required()) - current_offset; |
296 } | 302 } |
297 | 303 |
581 int framesize = C->frame_slots() << LogBytesPerInt; | 587 int framesize = C->frame_slots() << LogBytesPerInt; |
582 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 588 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
583 // Remove two words for return addr and rbp, | 589 // Remove two words for return addr and rbp, |
584 framesize -= 2*wordSize; | 590 framesize -= 2*wordSize; |
585 | 591 |
586 if( C->in_24_bit_fp_mode() ) { | 592 if (C->max_vector_size() > 16) { |
593 st->print("VZEROUPPER"); | |
594 st->cr(); st->print("\t"); | |
595 } | |
596 if (C->in_24_bit_fp_mode()) { | |
587 st->print("FLDCW standard control word"); | 597 st->print("FLDCW standard control word"); |
588 st->cr(); st->print("\t"); | 598 st->cr(); st->print("\t"); |
589 } | 599 } |
590 if( framesize ) { | 600 if (framesize) { |
591 st->print("ADD ESP,%d\t# Destroy frame",framesize); | 601 st->print("ADD ESP,%d\t# Destroy frame",framesize); |
592 st->cr(); st->print("\t"); | 602 st->cr(); st->print("\t"); |
593 } | 603 } |
594 st->print_cr("POPL EBP"); st->print("\t"); | 604 st->print_cr("POPL EBP"); st->print("\t"); |
595 if( do_polling() && C->is_method_compilation() ) { | 605 if (do_polling() && C->is_method_compilation()) { |
596 st->print("TEST PollPage,EAX\t! Poll Safepoint"); | 606 st->print("TEST PollPage,EAX\t! Poll Safepoint"); |
597 st->cr(); st->print("\t"); | 607 st->cr(); st->print("\t"); |
598 } | 608 } |
599 } | 609 } |
600 #endif | 610 #endif |
601 | 611 |
602 void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { | 612 void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { |
603 Compile *C = ra_->C; | 613 Compile *C = ra_->C; |
604 | 614 |
615 if (C->max_vector_size() > 16) { | |
616 // Clear upper bits of YMM registers when current compiled code uses | |
617 // wide vectors to avoid AVX <-> SSE transition penalty during call. | |
618 MacroAssembler masm(&cbuf); | |
619 masm.vzeroupper(); | |
620 } | |
605 // If method set FPU control word, restore to standard control word | 621 // If method set FPU control word, restore to standard control word |
606 if( C->in_24_bit_fp_mode() ) { | 622 if (C->in_24_bit_fp_mode()) { |
607 MacroAssembler masm(&cbuf); | 623 MacroAssembler masm(&cbuf); |
608 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); | 624 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); |
609 } | 625 } |
610 | 626 |
611 int framesize = C->frame_slots() << LogBytesPerInt; | 627 int framesize = C->frame_slots() << LogBytesPerInt; |
613 // Remove two words for return addr and rbp, | 629 // Remove two words for return addr and rbp, |
614 framesize -= 2*wordSize; | 630 framesize -= 2*wordSize; |
615 | 631 |
616 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here | 632 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here |
617 | 633 |
618 if( framesize >= 128 ) { | 634 if (framesize >= 128) { |
619 emit_opcode(cbuf, 0x81); // add SP, #framesize | 635 emit_opcode(cbuf, 0x81); // add SP, #framesize |
620 emit_rm(cbuf, 0x3, 0x00, ESP_enc); | 636 emit_rm(cbuf, 0x3, 0x00, ESP_enc); |
621 emit_d32(cbuf, framesize); | 637 emit_d32(cbuf, framesize); |
622 } | 638 } else if (framesize) { |
623 else if( framesize ) { | |
624 emit_opcode(cbuf, 0x83); // add SP, #framesize | 639 emit_opcode(cbuf, 0x83); // add SP, #framesize |
625 emit_rm(cbuf, 0x3, 0x00, ESP_enc); | 640 emit_rm(cbuf, 0x3, 0x00, ESP_enc); |
626 emit_d8(cbuf, framesize); | 641 emit_d8(cbuf, framesize); |
627 } | 642 } |
628 | 643 |
629 emit_opcode(cbuf, 0x58 | EBP_enc); | 644 emit_opcode(cbuf, 0x58 | EBP_enc); |
630 | 645 |
631 if( do_polling() && C->is_method_compilation() ) { | 646 if (do_polling() && C->is_method_compilation()) { |
632 cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0); | 647 cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0); |
633 emit_opcode(cbuf,0x85); | 648 emit_opcode(cbuf,0x85); |
634 emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX | 649 emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX |
635 emit_d32(cbuf, (intptr_t)os::get_polling_page()); | 650 emit_d32(cbuf, (intptr_t)os::get_polling_page()); |
636 } | 651 } |
638 | 653 |
639 uint MachEpilogNode::size(PhaseRegAlloc *ra_) const { | 654 uint MachEpilogNode::size(PhaseRegAlloc *ra_) const { |
640 Compile *C = ra_->C; | 655 Compile *C = ra_->C; |
641 // If method set FPU control word, restore to standard control word | 656 // If method set FPU control word, restore to standard control word |
642 int size = C->in_24_bit_fp_mode() ? 6 : 0; | 657 int size = C->in_24_bit_fp_mode() ? 6 : 0; |
643 if( do_polling() && C->is_method_compilation() ) size += 6; | 658 if (C->max_vector_size() > 16) size += 3; // vzeroupper |
659 if (do_polling() && C->is_method_compilation()) size += 6; | |
644 | 660 |
645 int framesize = C->frame_slots() << LogBytesPerInt; | 661 int framesize = C->frame_slots() << LogBytesPerInt; |
646 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 662 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
647 // Remove two words for return addr and rbp, | 663 // Remove two words for return addr and rbp, |
648 framesize -= 2*wordSize; | 664 framesize -= 2*wordSize; |
649 | 665 |
650 size++; // popl rbp, | 666 size++; // popl rbp, |
651 | 667 |
652 if( framesize >= 128 ) { | 668 if (framesize >= 128) { |
653 size += 6; | 669 size += 6; |
654 } else { | 670 } else { |
655 size += framesize ? 3 : 0; | 671 size += framesize ? 3 : 0; |
656 } | 672 } |
657 return size; | 673 return size; |
1851 } | 1867 } |
1852 } | 1868 } |
1853 %} | 1869 %} |
1854 | 1870 |
1855 | 1871 |
1856 enc_class pre_call_FPU %{ | 1872 enc_class pre_call_resets %{ |
1857 // If method sets FPU control word restore it here | 1873 // If method sets FPU control word restore it here |
1858 debug_only(int off0 = cbuf.insts_size()); | 1874 debug_only(int off0 = cbuf.insts_size()); |
1859 if( Compile::current()->in_24_bit_fp_mode() ) { | 1875 if (ra_->C->in_24_bit_fp_mode()) { |
1860 MacroAssembler masm(&cbuf); | 1876 MacroAssembler _masm(&cbuf); |
1861 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); | 1877 __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); |
1878 } | |
1879 if (ra_->C->max_vector_size() > 16) { | |
1880 // Clear upper bits of YMM registers when current compiled code uses | |
1881 // wide vectors to avoid AVX <-> SSE transition penalty during call. | |
1882 MacroAssembler _masm(&cbuf); | |
1883 __ vzeroupper(); | |
1862 } | 1884 } |
1863 debug_only(int off1 = cbuf.insts_size()); | 1885 debug_only(int off1 = cbuf.insts_size()); |
1864 assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction"); | 1886 assert(off1 - off0 == pre_call_resets_size(), "correct size prediction"); |
1865 %} | 1887 %} |
1866 | 1888 |
1867 enc_class post_call_FPU %{ | 1889 enc_class post_call_FPU %{ |
1868 // If method sets FPU control word do it here also | 1890 // If method sets FPU control word do it here also |
1869 if( Compile::current()->in_24_bit_fp_mode() ) { | 1891 if (Compile::current()->in_24_bit_fp_mode()) { |
1870 MacroAssembler masm(&cbuf); | 1892 MacroAssembler masm(&cbuf); |
1871 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); | 1893 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); |
1872 } | 1894 } |
1873 %} | 1895 %} |
1874 | 1896 |
1875 enc_class Java_Static_Call (method meth) %{ // JAVA STATIC CALL | 1897 enc_class Java_Static_Call (method meth) %{ // JAVA STATIC CALL |
1876 // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine | 1898 // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine |
1877 // who we intended to call. | 1899 // who we intended to call. |
1878 cbuf.set_insts_mark(); | 1900 cbuf.set_insts_mark(); |
1879 $$$emit8$primary; | 1901 $$$emit8$primary; |
1880 if ( !_method ) { | 1902 if (!_method) { |
1881 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), | 1903 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), |
1882 runtime_call_Relocation::spec(), RELOC_IMM32 ); | 1904 runtime_call_Relocation::spec(), RELOC_IMM32 ); |
1883 } else if(_optimized_virtual) { | 1905 } else if (_optimized_virtual) { |
1884 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), | 1906 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), |
1885 opt_virtual_call_Relocation::spec(), RELOC_IMM32 ); | 1907 opt_virtual_call_Relocation::spec(), RELOC_IMM32 ); |
1886 } else { | 1908 } else { |
1887 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), | 1909 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), |
1888 static_call_Relocation::spec(), RELOC_IMM32 ); | 1910 static_call_Relocation::spec(), RELOC_IMM32 ); |
1889 } | 1911 } |
1890 if( _method ) { // Emit stub for static call | 1912 if (_method) { // Emit stub for static call |
1891 emit_java_to_interp(cbuf); | 1913 emit_java_to_interp(cbuf); |
1892 } | 1914 } |
1893 %} | 1915 %} |
1894 | 1916 |
1895 enc_class Java_Dynamic_Call (method meth) %{ // JAVA DYNAMIC CALL | 1917 enc_class Java_Dynamic_Call (method meth) %{ // JAVA DYNAMIC CALL |
12826 effect(USE meth); | 12848 effect(USE meth); |
12827 | 12849 |
12828 ins_cost(300); | 12850 ins_cost(300); |
12829 format %{ "CALL,static " %} | 12851 format %{ "CALL,static " %} |
12830 opcode(0xE8); /* E8 cd */ | 12852 opcode(0xE8); /* E8 cd */ |
12831 ins_encode( pre_call_FPU, | 12853 ins_encode( pre_call_resets, |
12832 Java_Static_Call( meth ), | 12854 Java_Static_Call( meth ), |
12833 call_epilog, | 12855 call_epilog, |
12834 post_call_FPU ); | 12856 post_call_FPU ); |
12835 ins_pipe( pipe_slow ); | 12857 ins_pipe( pipe_slow ); |
12836 ins_alignment(4); | 12858 ins_alignment(4); |
12847 // We use it here for a similar purpose, in {preserve,restore}_SP. | 12869 // We use it here for a similar purpose, in {preserve,restore}_SP. |
12848 | 12870 |
12849 ins_cost(300); | 12871 ins_cost(300); |
12850 format %{ "CALL,static/MethodHandle " %} | 12872 format %{ "CALL,static/MethodHandle " %} |
12851 opcode(0xE8); /* E8 cd */ | 12873 opcode(0xE8); /* E8 cd */ |
12852 ins_encode( pre_call_FPU, | 12874 ins_encode( pre_call_resets, |
12853 preserve_SP, | 12875 preserve_SP, |
12854 Java_Static_Call( meth ), | 12876 Java_Static_Call( meth ), |
12855 restore_SP, | 12877 restore_SP, |
12856 call_epilog, | 12878 call_epilog, |
12857 post_call_FPU ); | 12879 post_call_FPU ); |
12868 | 12890 |
12869 ins_cost(300); | 12891 ins_cost(300); |
12870 format %{ "MOV EAX,(oop)-1\n\t" | 12892 format %{ "MOV EAX,(oop)-1\n\t" |
12871 "CALL,dynamic" %} | 12893 "CALL,dynamic" %} |
12872 opcode(0xE8); /* E8 cd */ | 12894 opcode(0xE8); /* E8 cd */ |
12873 ins_encode( pre_call_FPU, | 12895 ins_encode( pre_call_resets, |
12874 Java_Dynamic_Call( meth ), | 12896 Java_Dynamic_Call( meth ), |
12875 call_epilog, | 12897 call_epilog, |
12876 post_call_FPU ); | 12898 post_call_FPU ); |
12877 ins_pipe( pipe_slow ); | 12899 ins_pipe( pipe_slow ); |
12878 ins_alignment(4); | 12900 ins_alignment(4); |
12885 | 12907 |
12886 ins_cost(300); | 12908 ins_cost(300); |
12887 format %{ "CALL,runtime " %} | 12909 format %{ "CALL,runtime " %} |
12888 opcode(0xE8); /* E8 cd */ | 12910 opcode(0xE8); /* E8 cd */ |
12889 // Use FFREEs to clear entries in float stack | 12911 // Use FFREEs to clear entries in float stack |
12890 ins_encode( pre_call_FPU, | 12912 ins_encode( pre_call_resets, |
12891 FFree_Float_Stack_All, | 12913 FFree_Float_Stack_All, |
12892 Java_To_Runtime( meth ), | 12914 Java_To_Runtime( meth ), |
12893 post_call_FPU ); | 12915 post_call_FPU ); |
12894 ins_pipe( pipe_slow ); | 12916 ins_pipe( pipe_slow ); |
12895 %} | 12917 %} |
12900 effect(USE meth); | 12922 effect(USE meth); |
12901 | 12923 |
12902 ins_cost(300); | 12924 ins_cost(300); |
12903 format %{ "CALL_LEAF,runtime " %} | 12925 format %{ "CALL_LEAF,runtime " %} |
12904 opcode(0xE8); /* E8 cd */ | 12926 opcode(0xE8); /* E8 cd */ |
12905 ins_encode( pre_call_FPU, | 12927 ins_encode( pre_call_resets, |
12906 FFree_Float_Stack_All, | 12928 FFree_Float_Stack_All, |
12907 Java_To_Runtime( meth ), | 12929 Java_To_Runtime( meth ), |
12908 Verify_FPU_For_Leaf, post_call_FPU ); | 12930 Verify_FPU_For_Leaf, post_call_FPU ); |
12909 ins_pipe( pipe_slow ); | 12931 ins_pipe( pipe_slow ); |
12910 %} | 12932 %} |