comparison src/cpu/x86/vm/x86_32.ad @ 8873:e961c11b85fe

8011102: Clear AVX registers after return from JNI call
Summary: Execute vzeroupper instruction after JNI call and on exits in jit compiled code which use 256bit vectors.
Reviewed-by: roland
author kvn
date Wed, 03 Apr 2013 11:12:57 -0700
parents b30b3c2a0cf2
children 886d1fd67dc3 a6e09d6dd8e5
comparison
equal deleted inserted replaced
8872:53028d751155 8873:e961c11b85fe
226 static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF)); 226 static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF));
227 static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000)); 227 static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000));
228 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000)); 228 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
229 229
230 // Offset hacking within calls. 230 // Offset hacking within calls.
231 static int pre_call_FPU_size() { 231 static int pre_call_resets_size() {
232 if (Compile::current()->in_24_bit_fp_mode()) 232 int size = 0;
233 return 6; // fldcw 233 Compile* C = Compile::current();
234 return 0; 234 if (C->in_24_bit_fp_mode()) {
235 size += 6; // fldcw
236 }
237 if (C->max_vector_size() > 16) {
238 size += 3; // vzeroupper
239 }
240 return size;
235 } 241 }
236 242
237 static int preserve_SP_size() { 243 static int preserve_SP_size() {
238 return 2; // op, rm(reg/reg) 244 return 2; // op, rm(reg/reg)
239 } 245 }
240 246
241 // !!!!! Special hack to get all type of calls to specify the byte offset 247 // !!!!! Special hack to get all type of calls to specify the byte offset
242 // from the start of the call to the point where the return address 248 // from the start of the call to the point where the return address
243 // will point. 249 // will point.
244 int MachCallStaticJavaNode::ret_addr_offset() { 250 int MachCallStaticJavaNode::ret_addr_offset() {
245 int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points 251 int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
246 if (_method_handle_invoke) 252 if (_method_handle_invoke)
247 offset += preserve_SP_size(); 253 offset += preserve_SP_size();
248 return offset; 254 return offset;
249 } 255 }
250 256
251 int MachCallDynamicJavaNode::ret_addr_offset() { 257 int MachCallDynamicJavaNode::ret_addr_offset() {
252 return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points 258 return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points
253 } 259 }
254 260
255 static int sizeof_FFree_Float_Stack_All = -1; 261 static int sizeof_FFree_Float_Stack_All = -1;
256 262
257 int MachCallRuntimeNode::ret_addr_offset() { 263 int MachCallRuntimeNode::ret_addr_offset() {
258 assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already"); 264 assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
259 return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size(); 265 return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
260 } 266 }
261 267
262 // Indicate if the safepoint node needs the polling page as an input. 268 // Indicate if the safepoint node needs the polling page as an input.
263 // Since x86 does have absolute addressing, it doesn't. 269 // Since x86 does have absolute addressing, it doesn't.
264 bool SafePointNode::needs_polling_address_input() { 270 bool SafePointNode::needs_polling_address_input() {
270 // 276 //
271 277
272 // The address of the call instruction needs to be 4-byte aligned to 278 // The address of the call instruction needs to be 4-byte aligned to
273 // ensure that it does not span a cache line so that it can be patched. 279 // ensure that it does not span a cache line so that it can be patched.
274 int CallStaticJavaDirectNode::compute_padding(int current_offset) const { 280 int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
275 current_offset += pre_call_FPU_size(); // skip fldcw, if any 281 current_offset += pre_call_resets_size(); // skip fldcw, if any
276 current_offset += 1; // skip call opcode byte 282 current_offset += 1; // skip call opcode byte
277 return round_to(current_offset, alignment_required()) - current_offset; 283 return round_to(current_offset, alignment_required()) - current_offset;
278 } 284 }
279 285
280 // The address of the call instruction needs to be 4-byte aligned to 286 // The address of the call instruction needs to be 4-byte aligned to
281 // ensure that it does not span a cache line so that it can be patched. 287 // ensure that it does not span a cache line so that it can be patched.
282 int CallStaticJavaHandleNode::compute_padding(int current_offset) const { 288 int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
283 current_offset += pre_call_FPU_size(); // skip fldcw, if any 289 current_offset += pre_call_resets_size(); // skip fldcw, if any
284 current_offset += preserve_SP_size(); // skip mov rbp, rsp 290 current_offset += preserve_SP_size(); // skip mov rbp, rsp
285 current_offset += 1; // skip call opcode byte 291 current_offset += 1; // skip call opcode byte
286 return round_to(current_offset, alignment_required()) - current_offset; 292 return round_to(current_offset, alignment_required()) - current_offset;
287 } 293 }
288 294
289 // The address of the call instruction needs to be 4-byte aligned to 295 // The address of the call instruction needs to be 4-byte aligned to
290 // ensure that it does not span a cache line so that it can be patched. 296 // ensure that it does not span a cache line so that it can be patched.
291 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { 297 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
292 current_offset += pre_call_FPU_size(); // skip fldcw, if any 298 current_offset += pre_call_resets_size(); // skip fldcw, if any
293 current_offset += 5; // skip MOV instruction 299 current_offset += 5; // skip MOV instruction
294 current_offset += 1; // skip call opcode byte 300 current_offset += 1; // skip call opcode byte
295 return round_to(current_offset, alignment_required()) - current_offset; 301 return round_to(current_offset, alignment_required()) - current_offset;
296 } 302 }
297 303
581 int framesize = C->frame_slots() << LogBytesPerInt; 587 int framesize = C->frame_slots() << LogBytesPerInt;
582 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 588 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
583 // Remove two words for return addr and rbp, 589 // Remove two words for return addr and rbp,
584 framesize -= 2*wordSize; 590 framesize -= 2*wordSize;
585 591
586 if( C->in_24_bit_fp_mode() ) { 592 if (C->max_vector_size() > 16) {
593 st->print("VZEROUPPER");
594 st->cr(); st->print("\t");
595 }
596 if (C->in_24_bit_fp_mode()) {
587 st->print("FLDCW standard control word"); 597 st->print("FLDCW standard control word");
588 st->cr(); st->print("\t"); 598 st->cr(); st->print("\t");
589 } 599 }
590 if( framesize ) { 600 if (framesize) {
591 st->print("ADD ESP,%d\t# Destroy frame",framesize); 601 st->print("ADD ESP,%d\t# Destroy frame",framesize);
592 st->cr(); st->print("\t"); 602 st->cr(); st->print("\t");
593 } 603 }
594 st->print_cr("POPL EBP"); st->print("\t"); 604 st->print_cr("POPL EBP"); st->print("\t");
595 if( do_polling() && C->is_method_compilation() ) { 605 if (do_polling() && C->is_method_compilation()) {
596 st->print("TEST PollPage,EAX\t! Poll Safepoint"); 606 st->print("TEST PollPage,EAX\t! Poll Safepoint");
597 st->cr(); st->print("\t"); 607 st->cr(); st->print("\t");
598 } 608 }
599 } 609 }
600 #endif 610 #endif
601 611
602 void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { 612 void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
603 Compile *C = ra_->C; 613 Compile *C = ra_->C;
604 614
615 if (C->max_vector_size() > 16) {
616 // Clear upper bits of YMM registers when current compiled code uses
617 // wide vectors to avoid AVX <-> SSE transition penalty during call.
618 MacroAssembler masm(&cbuf);
619 masm.vzeroupper();
620 }
605 // If method set FPU control word, restore to standard control word 621 // If method set FPU control word, restore to standard control word
606 if( C->in_24_bit_fp_mode() ) { 622 if (C->in_24_bit_fp_mode()) {
607 MacroAssembler masm(&cbuf); 623 MacroAssembler masm(&cbuf);
608 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); 624 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
609 } 625 }
610 626
611 int framesize = C->frame_slots() << LogBytesPerInt; 627 int framesize = C->frame_slots() << LogBytesPerInt;
613 // Remove two words for return addr and rbp, 629 // Remove two words for return addr and rbp,
614 framesize -= 2*wordSize; 630 framesize -= 2*wordSize;
615 631
616 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here 632 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
617 633
618 if( framesize >= 128 ) { 634 if (framesize >= 128) {
619 emit_opcode(cbuf, 0x81); // add SP, #framesize 635 emit_opcode(cbuf, 0x81); // add SP, #framesize
620 emit_rm(cbuf, 0x3, 0x00, ESP_enc); 636 emit_rm(cbuf, 0x3, 0x00, ESP_enc);
621 emit_d32(cbuf, framesize); 637 emit_d32(cbuf, framesize);
622 } 638 } else if (framesize) {
623 else if( framesize ) {
624 emit_opcode(cbuf, 0x83); // add SP, #framesize 639 emit_opcode(cbuf, 0x83); // add SP, #framesize
625 emit_rm(cbuf, 0x3, 0x00, ESP_enc); 640 emit_rm(cbuf, 0x3, 0x00, ESP_enc);
626 emit_d8(cbuf, framesize); 641 emit_d8(cbuf, framesize);
627 } 642 }
628 643
629 emit_opcode(cbuf, 0x58 | EBP_enc); 644 emit_opcode(cbuf, 0x58 | EBP_enc);
630 645
631 if( do_polling() && C->is_method_compilation() ) { 646 if (do_polling() && C->is_method_compilation()) {
632 cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0); 647 cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
633 emit_opcode(cbuf,0x85); 648 emit_opcode(cbuf,0x85);
634 emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX 649 emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
635 emit_d32(cbuf, (intptr_t)os::get_polling_page()); 650 emit_d32(cbuf, (intptr_t)os::get_polling_page());
636 } 651 }
638 653
639 uint MachEpilogNode::size(PhaseRegAlloc *ra_) const { 654 uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
640 Compile *C = ra_->C; 655 Compile *C = ra_->C;
641 // If method set FPU control word, restore to standard control word 656 // If method set FPU control word, restore to standard control word
642 int size = C->in_24_bit_fp_mode() ? 6 : 0; 657 int size = C->in_24_bit_fp_mode() ? 6 : 0;
643 if( do_polling() && C->is_method_compilation() ) size += 6; 658 if (C->max_vector_size() > 16) size += 3; // vzeroupper
659 if (do_polling() && C->is_method_compilation()) size += 6;
644 660
645 int framesize = C->frame_slots() << LogBytesPerInt; 661 int framesize = C->frame_slots() << LogBytesPerInt;
646 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 662 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
647 // Remove two words for return addr and rbp, 663 // Remove two words for return addr and rbp,
648 framesize -= 2*wordSize; 664 framesize -= 2*wordSize;
649 665
650 size++; // popl rbp, 666 size++; // popl rbp,
651 667
652 if( framesize >= 128 ) { 668 if (framesize >= 128) {
653 size += 6; 669 size += 6;
654 } else { 670 } else {
655 size += framesize ? 3 : 0; 671 size += framesize ? 3 : 0;
656 } 672 }
657 return size; 673 return size;
1851 } 1867 }
1852 } 1868 }
1853 %} 1869 %}
1854 1870
1855 1871
1856 enc_class pre_call_FPU %{ 1872 enc_class pre_call_resets %{
1857 // If method sets FPU control word restore it here 1873 // If method sets FPU control word restore it here
1858 debug_only(int off0 = cbuf.insts_size()); 1874 debug_only(int off0 = cbuf.insts_size());
1859 if( Compile::current()->in_24_bit_fp_mode() ) { 1875 if (ra_->C->in_24_bit_fp_mode()) {
1860 MacroAssembler masm(&cbuf); 1876 MacroAssembler _masm(&cbuf);
1861 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); 1877 __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
1878 }
1879 if (ra_->C->max_vector_size() > 16) {
1880 // Clear upper bits of YMM registers when current compiled code uses
1881 // wide vectors to avoid AVX <-> SSE transition penalty during call.
1882 MacroAssembler _masm(&cbuf);
1883 __ vzeroupper();
1862 } 1884 }
1863 debug_only(int off1 = cbuf.insts_size()); 1885 debug_only(int off1 = cbuf.insts_size());
1864 assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction"); 1886 assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
1865 %} 1887 %}
1866 1888
1867 enc_class post_call_FPU %{ 1889 enc_class post_call_FPU %{
1868 // If method sets FPU control word do it here also 1890 // If method sets FPU control word do it here also
1869 if( Compile::current()->in_24_bit_fp_mode() ) { 1891 if (Compile::current()->in_24_bit_fp_mode()) {
1870 MacroAssembler masm(&cbuf); 1892 MacroAssembler masm(&cbuf);
1871 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); 1893 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
1872 } 1894 }
1873 %} 1895 %}
1874 1896
1875 enc_class Java_Static_Call (method meth) %{ // JAVA STATIC CALL 1897 enc_class Java_Static_Call (method meth) %{ // JAVA STATIC CALL
1876 // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine 1898 // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine
1877 // who we intended to call. 1899 // who we intended to call.
1878 cbuf.set_insts_mark(); 1900 cbuf.set_insts_mark();
1879 $$$emit8$primary; 1901 $$$emit8$primary;
1880 if ( !_method ) { 1902 if (!_method) {
1881 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), 1903 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
1882 runtime_call_Relocation::spec(), RELOC_IMM32 ); 1904 runtime_call_Relocation::spec(), RELOC_IMM32 );
1883 } else if(_optimized_virtual) { 1905 } else if (_optimized_virtual) {
1884 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), 1906 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
1885 opt_virtual_call_Relocation::spec(), RELOC_IMM32 ); 1907 opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
1886 } else { 1908 } else {
1887 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), 1909 emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
1888 static_call_Relocation::spec(), RELOC_IMM32 ); 1910 static_call_Relocation::spec(), RELOC_IMM32 );
1889 } 1911 }
1890 if( _method ) { // Emit stub for static call 1912 if (_method) { // Emit stub for static call
1891 emit_java_to_interp(cbuf); 1913 emit_java_to_interp(cbuf);
1892 } 1914 }
1893 %} 1915 %}
1894 1916
1895 enc_class Java_Dynamic_Call (method meth) %{ // JAVA DYNAMIC CALL 1917 enc_class Java_Dynamic_Call (method meth) %{ // JAVA DYNAMIC CALL
12826 effect(USE meth); 12848 effect(USE meth);
12827 12849
12828 ins_cost(300); 12850 ins_cost(300);
12829 format %{ "CALL,static " %} 12851 format %{ "CALL,static " %}
12830 opcode(0xE8); /* E8 cd */ 12852 opcode(0xE8); /* E8 cd */
12831 ins_encode( pre_call_FPU, 12853 ins_encode( pre_call_resets,
12832 Java_Static_Call( meth ), 12854 Java_Static_Call( meth ),
12833 call_epilog, 12855 call_epilog,
12834 post_call_FPU ); 12856 post_call_FPU );
12835 ins_pipe( pipe_slow ); 12857 ins_pipe( pipe_slow );
12836 ins_alignment(4); 12858 ins_alignment(4);
12847 // We use it here for a similar purpose, in {preserve,restore}_SP. 12869 // We use it here for a similar purpose, in {preserve,restore}_SP.
12848 12870
12849 ins_cost(300); 12871 ins_cost(300);
12850 format %{ "CALL,static/MethodHandle " %} 12872 format %{ "CALL,static/MethodHandle " %}
12851 opcode(0xE8); /* E8 cd */ 12873 opcode(0xE8); /* E8 cd */
12852 ins_encode( pre_call_FPU, 12874 ins_encode( pre_call_resets,
12853 preserve_SP, 12875 preserve_SP,
12854 Java_Static_Call( meth ), 12876 Java_Static_Call( meth ),
12855 restore_SP, 12877 restore_SP,
12856 call_epilog, 12878 call_epilog,
12857 post_call_FPU ); 12879 post_call_FPU );
12868 12890
12869 ins_cost(300); 12891 ins_cost(300);
12870 format %{ "MOV EAX,(oop)-1\n\t" 12892 format %{ "MOV EAX,(oop)-1\n\t"
12871 "CALL,dynamic" %} 12893 "CALL,dynamic" %}
12872 opcode(0xE8); /* E8 cd */ 12894 opcode(0xE8); /* E8 cd */
12873 ins_encode( pre_call_FPU, 12895 ins_encode( pre_call_resets,
12874 Java_Dynamic_Call( meth ), 12896 Java_Dynamic_Call( meth ),
12875 call_epilog, 12897 call_epilog,
12876 post_call_FPU ); 12898 post_call_FPU );
12877 ins_pipe( pipe_slow ); 12899 ins_pipe( pipe_slow );
12878 ins_alignment(4); 12900 ins_alignment(4);
12885 12907
12886 ins_cost(300); 12908 ins_cost(300);
12887 format %{ "CALL,runtime " %} 12909 format %{ "CALL,runtime " %}
12888 opcode(0xE8); /* E8 cd */ 12910 opcode(0xE8); /* E8 cd */
12889 // Use FFREEs to clear entries in float stack 12911 // Use FFREEs to clear entries in float stack
12890 ins_encode( pre_call_FPU, 12912 ins_encode( pre_call_resets,
12891 FFree_Float_Stack_All, 12913 FFree_Float_Stack_All,
12892 Java_To_Runtime( meth ), 12914 Java_To_Runtime( meth ),
12893 post_call_FPU ); 12915 post_call_FPU );
12894 ins_pipe( pipe_slow ); 12916 ins_pipe( pipe_slow );
12895 %} 12917 %}
12900 effect(USE meth); 12922 effect(USE meth);
12901 12923
12902 ins_cost(300); 12924 ins_cost(300);
12903 format %{ "CALL_LEAF,runtime " %} 12925 format %{ "CALL_LEAF,runtime " %}
12904 opcode(0xE8); /* E8 cd */ 12926 opcode(0xE8); /* E8 cd */
12905 ins_encode( pre_call_FPU, 12927 ins_encode( pre_call_resets,
12906 FFree_Float_Stack_All, 12928 FFree_Float_Stack_All,
12907 Java_To_Runtime( meth ), 12929 Java_To_Runtime( meth ),
12908 Verify_FPU_For_Leaf, post_call_FPU ); 12930 Verify_FPU_For_Leaf, post_call_FPU );
12909 ins_pipe( pipe_slow ); 12931 ins_pipe( pipe_slow );
12910 %} 12932 %}