comparison src/cpu/x86/vm/x86_64.ad @ 8873:e961c11b85fe

8011102: Clear AVX registers after return from JNI call
Summary: Execute the vzeroupper instruction after JNI calls and on exits from JIT-compiled code that uses 256-bit vectors.
Reviewed-by: roland
author kvn
date Wed, 03 Apr 2013 11:12:57 -0700
parents b30b3c2a0cf2
children 8be1318fbe77 a6e09d6dd8e5
comparison
equal deleted inserted replaced
8872:53028d751155 8873:e961c11b85fe
397 #define __ _masm. 397 #define __ _masm.
398 398
399 static int preserve_SP_size() { 399 static int preserve_SP_size() {
400 return 3; // rex.w, op, rm(reg/reg) 400 return 3; // rex.w, op, rm(reg/reg)
401 } 401 }
402 static int clear_avx_size() {
403 return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
404 }
402 405
403 // !!!!! Special hack to get all types of calls to specify the byte offset 406 // !!!!! Special hack to get all types of calls to specify the byte offset
404 // from the start of the call to the point where the return address 407 // from the start of the call to the point where the return address
405 // will point. 408 // will point.
406 int MachCallStaticJavaNode::ret_addr_offset() 409 int MachCallStaticJavaNode::ret_addr_offset()
407 { 410 {
408 int offset = 5; // 5 bytes from start of call to where return address points 411 int offset = 5; // 5 bytes from start of call to where return address points
412 offset += clear_avx_size();
409 if (_method_handle_invoke) 413 if (_method_handle_invoke)
410 offset += preserve_SP_size(); 414 offset += preserve_SP_size();
411 return offset; 415 return offset;
412 } 416 }
413 417
414 int MachCallDynamicJavaNode::ret_addr_offset() 418 int MachCallDynamicJavaNode::ret_addr_offset()
415 { 419 {
416 return 15; // 15 bytes from start of call to where return address points 420 int offset = 15; // 15 bytes from start of call to where return address points
421 offset += clear_avx_size();
422 return offset;
417 } 423 }
418 424
419 // In os_cpu .ad file 425 int MachCallRuntimeNode::ret_addr_offset() {
420 // int MachCallRuntimeNode::ret_addr_offset() 426 int offset = 13; // movq r10,#addr; callq (r10)
427 offset += clear_avx_size();
428 return offset;
429 }
421 430
422 // Indicate if the safepoint node needs the polling page as an input, 431 // Indicate if the safepoint node needs the polling page as an input,
423 // it does if the polling page is more than disp32 away. 432 // it does if the polling page is more than disp32 away.
424 bool SafePointNode::needs_polling_address_input() 433 bool SafePointNode::needs_polling_address_input()
425 { 434 {
432 441
433 // The address of the call instruction needs to be 4-byte aligned to 442 // The address of the call instruction needs to be 4-byte aligned to
434 // ensure that it does not span a cache line so that it can be patched. 443 // ensure that it does not span a cache line so that it can be patched.
435 int CallStaticJavaDirectNode::compute_padding(int current_offset) const 444 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
436 { 445 {
446 current_offset += clear_avx_size(); // skip vzeroupper
437 current_offset += 1; // skip call opcode byte 447 current_offset += 1; // skip call opcode byte
438 return round_to(current_offset, alignment_required()) - current_offset; 448 return round_to(current_offset, alignment_required()) - current_offset;
439 } 449 }
440 450
441 // The address of the call instruction needs to be 4-byte aligned to 451 // The address of the call instruction needs to be 4-byte aligned to
442 // ensure that it does not span a cache line so that it can be patched. 452 // ensure that it does not span a cache line so that it can be patched.
443 int CallStaticJavaHandleNode::compute_padding(int current_offset) const 453 int CallStaticJavaHandleNode::compute_padding(int current_offset) const
444 { 454 {
445 current_offset += preserve_SP_size(); // skip mov rbp, rsp 455 current_offset += preserve_SP_size(); // skip mov rbp, rsp
456 current_offset += clear_avx_size(); // skip vzeroupper
446 current_offset += 1; // skip call opcode byte 457 current_offset += 1; // skip call opcode byte
447 return round_to(current_offset, alignment_required()) - current_offset; 458 return round_to(current_offset, alignment_required()) - current_offset;
448 } 459 }
449 460
450 // The address of the call instruction needs to be 4-byte aligned to 461 // The address of the call instruction needs to be 4-byte aligned to
451 // ensure that it does not span a cache line so that it can be patched. 462 // ensure that it does not span a cache line so that it can be patched.
452 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const 463 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
453 { 464 {
465 current_offset += clear_avx_size(); // skip vzeroupper
454 current_offset += 11; // skip movq instruction + call opcode byte 466 current_offset += 11; // skip movq instruction + call opcode byte
455 return round_to(current_offset, alignment_required()) - current_offset; 467 return round_to(current_offset, alignment_required()) - current_offset;
456 } 468 }
457 469
458 // EMIT_RM() 470 // EMIT_RM()
762 //============================================================================= 774 //=============================================================================
763 #ifndef PRODUCT 775 #ifndef PRODUCT
764 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const 776 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
765 { 777 {
766 Compile* C = ra_->C; 778 Compile* C = ra_->C;
779 if (C->max_vector_size() > 16) {
780 st->print("vzeroupper");
781 st->cr(); st->print("\t");
782 }
783
767 int framesize = C->frame_slots() << LogBytesPerInt; 784 int framesize = C->frame_slots() << LogBytesPerInt;
768 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 785 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
769 // Remove word for return adr already pushed 786 // Remove word for return adr already pushed
770 // and RBP 787 // and RBP
771 framesize -= 2*wordSize; 788 framesize -= 2*wordSize;
791 #endif 808 #endif
792 809
793 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const 810 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
794 { 811 {
795 Compile* C = ra_->C; 812 Compile* C = ra_->C;
813 if (C->max_vector_size() > 16) {
814 // Clear upper bits of YMM registers when current compiled code uses
815 // wide vectors to avoid AVX <-> SSE transition penalty during call.
816 MacroAssembler _masm(&cbuf);
817 __ vzeroupper();
818 }
819
796 int framesize = C->frame_slots() << LogBytesPerInt; 820 int framesize = C->frame_slots() << LogBytesPerInt;
797 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 821 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
798 // Remove word for return adr already pushed 822 // Remove word for return adr already pushed
799 // and RBP 823 // and RBP
800 framesize -= 2*wordSize; 824 framesize -= 2*wordSize;
2006 __ xorptr(Rrdi, Rrdi); 2030 __ xorptr(Rrdi, Rrdi);
2007 } 2031 }
2008 __ bind(miss); 2032 __ bind(miss);
2009 %} 2033 %}
2010 2034
2035 enc_class clear_avx %{
2036 debug_only(int off0 = cbuf.insts_size());
2037 if (ra_->C->max_vector_size() > 16) {
2038 // Clear upper bits of YMM registers when current compiled code uses
2039 // wide vectors to avoid AVX <-> SSE transition penalty during call.
2040 MacroAssembler _masm(&cbuf);
2041 __ vzeroupper();
2042 }
2043 debug_only(int off1 = cbuf.insts_size());
2044 assert(off1 - off0 == clear_avx_size(), "correct size prediction");
2045 %}
2046
2047 enc_class Java_To_Runtime(method meth) %{
2048 // No relocation needed
2049 MacroAssembler _masm(&cbuf);
2050 __ mov64(r10, (int64_t) $meth$$method);
2051 __ call(r10);
2052 %}
2053
2011 enc_class Java_To_Interpreter(method meth) 2054 enc_class Java_To_Interpreter(method meth)
2012 %{ 2055 %{
2013 // CALL Java_To_Interpreter 2056 // CALL Java_To_Interpreter
2014 // This is the instruction starting address for relocation info. 2057 // This is the instruction starting address for relocation info.
2015 cbuf.set_insts_mark(); 2058 cbuf.set_insts_mark();
11364 effect(USE meth); 11407 effect(USE meth);
11365 11408
11366 ins_cost(300); 11409 ins_cost(300);
11367 format %{ "call,static " %} 11410 format %{ "call,static " %}
11368 opcode(0xE8); /* E8 cd */ 11411 opcode(0xE8); /* E8 cd */
11369 ins_encode(Java_Static_Call(meth), call_epilog); 11412 ins_encode(clear_avx, Java_Static_Call(meth), call_epilog);
11370 ins_pipe(pipe_slow); 11413 ins_pipe(pipe_slow);
11371 ins_alignment(4); 11414 ins_alignment(4);
11372 %} 11415 %}
11373 11416
11374 // Call Java Static Instruction (method handle version) 11417 // Call Java Static Instruction (method handle version)
11382 // We use it here for a similar purpose, in {preserve,restore}_SP. 11425 // We use it here for a similar purpose, in {preserve,restore}_SP.
11383 11426
11384 ins_cost(300); 11427 ins_cost(300);
11385 format %{ "call,static/MethodHandle " %} 11428 format %{ "call,static/MethodHandle " %}
11386 opcode(0xE8); /* E8 cd */ 11429 opcode(0xE8); /* E8 cd */
11387 ins_encode(preserve_SP, 11430 ins_encode(clear_avx, preserve_SP,
11388 Java_Static_Call(meth), 11431 Java_Static_Call(meth),
11389 restore_SP, 11432 restore_SP,
11390 call_epilog); 11433 call_epilog);
11391 ins_pipe(pipe_slow); 11434 ins_pipe(pipe_slow);
11392 ins_alignment(4); 11435 ins_alignment(4);
11401 effect(USE meth); 11444 effect(USE meth);
11402 11445
11403 ins_cost(300); 11446 ins_cost(300);
11404 format %{ "movq rax, #Universe::non_oop_word()\n\t" 11447 format %{ "movq rax, #Universe::non_oop_word()\n\t"
11405 "call,dynamic " %} 11448 "call,dynamic " %}
11406 ins_encode(Java_Dynamic_Call(meth), call_epilog); 11449 ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog);
11407 ins_pipe(pipe_slow); 11450 ins_pipe(pipe_slow);
11408 ins_alignment(4); 11451 ins_alignment(4);
11409 %} 11452 %}
11410 11453
11411 // Call Runtime Instruction 11454 // Call Runtime Instruction
11414 match(CallRuntime); 11457 match(CallRuntime);
11415 effect(USE meth); 11458 effect(USE meth);
11416 11459
11417 ins_cost(300); 11460 ins_cost(300);
11418 format %{ "call,runtime " %} 11461 format %{ "call,runtime " %}
11419 opcode(0xE8); /* E8 cd */ 11462 ins_encode(clear_avx, Java_To_Runtime(meth));
11420 ins_encode(Java_To_Runtime(meth));
11421 ins_pipe(pipe_slow); 11463 ins_pipe(pipe_slow);
11422 %} 11464 %}
11423 11465
11424 // Call runtime without safepoint 11466 // Call runtime without safepoint
11425 instruct CallLeafDirect(method meth) 11467 instruct CallLeafDirect(method meth)
11427 match(CallLeaf); 11469 match(CallLeaf);
11428 effect(USE meth); 11470 effect(USE meth);
11429 11471
11430 ins_cost(300); 11472 ins_cost(300);
11431 format %{ "call_leaf,runtime " %} 11473 format %{ "call_leaf,runtime " %}
11432 opcode(0xE8); /* E8 cd */ 11474 ins_encode(clear_avx, Java_To_Runtime(meth));
11433 ins_encode(Java_To_Runtime(meth));
11434 ins_pipe(pipe_slow); 11475 ins_pipe(pipe_slow);
11435 %} 11476 %}
11436 11477
11437 // Call runtime without safepoint 11478 // Call runtime without safepoint
11438 instruct CallLeafNoFPDirect(method meth) 11479 instruct CallLeafNoFPDirect(method meth)
11440 match(CallLeafNoFP); 11481 match(CallLeafNoFP);
11441 effect(USE meth); 11482 effect(USE meth);
11442 11483
11443 ins_cost(300); 11484 ins_cost(300);
11444 format %{ "call_leaf_nofp,runtime " %} 11485 format %{ "call_leaf_nofp,runtime " %}
11445 opcode(0xE8); /* E8 cd */
11446 ins_encode(Java_To_Runtime(meth)); 11486 ins_encode(Java_To_Runtime(meth));
11447 ins_pipe(pipe_slow); 11487 ins_pipe(pipe_slow);
11448 %} 11488 %}
11449 11489
11450 // Return Instruction 11490 // Return Instruction