# HG changeset patch # User kvn # Date 1365012777 25200 # Node ID e961c11b85fe2b977b4b6a7def649dbf6ace4241 # Parent 53028d7511557e0295ac650dd7c4a4f12db3d027 8011102: Clear AVX registers after return from JNI call Summary: Execute vzeroupper instruction after JNI call and on exits in jit compiled code which use 256bit vectors. Reviewed-by: roland diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/cppInterpreter_x86.cpp --- a/src/cpu/x86/vm/cppInterpreter_x86.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/cppInterpreter_x86.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -1299,25 +1299,8 @@ __ push(rdx); #endif // _LP64 - // Either restore the MXCSR register after returning from the JNI Call - // or verify that it wasn't changed. - if (VM_Version::supports_sse()) { - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); - } - else if (CheckJNICalls ) { - __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); - } - } - -#ifndef _LP64 - // Either restore the x87 floating pointer control word after returning - // from the JNI call or verify that it wasn't changed. - if (CheckJNICalls) { - __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); - } -#endif // _LP64 - + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // change thread state __ movl(Address(thread, JavaThread::thread_state_offset()), _thread_in_native_trans); diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/macroAssembler_x86.cpp --- a/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -4765,6 +4765,31 @@ pop_CPU_state(); } +void MacroAssembler::restore_cpu_control_state_after_jni() { + // Either restore the MXCSR register after returning from the JNI Call + // or verify that it wasn't changed (with -Xcheck:jni flag). 
+ if (VM_Version::supports_sse()) { + if (RestoreMXCSROnJNICalls) { + ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); + } else if (CheckJNICalls) { + call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); + } + } + if (VM_Version::supports_avx()) { + // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. + vzeroupper(); + } + +#ifndef _LP64 + // Either restore the x87 floating point control word after returning + // from the JNI call or verify that it wasn't changed. + if (CheckJNICalls) { + call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); + } +#endif // _LP64 +} + + void MacroAssembler::load_klass(Register dst, Register src) { #ifdef _LP64 if (UseCompressedKlassPointers) { @@ -5759,6 +5784,8 @@ addptr(result, stride2); subl(cnt2, stride2); jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); + // clean upper bits of YMM registers + vzeroupper(); // compare wide vectors tail bind(COMPARE_WIDE_TAIL); @@ -5772,6 +5799,8 @@ // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 
bind(VECTOR_NOT_EQUAL); + // clean upper bits of YMM registers + vzeroupper(); lea(str1, Address(str1, result, scale)); lea(str2, Address(str2, result, scale)); jmp(COMPARE_16_CHARS); @@ -6028,6 +6057,10 @@ // That's it bind(DONE); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + vzeroupper(); + } } void MacroAssembler::generate_fill(BasicType t, bool aligned, @@ -6157,6 +6190,10 @@ vmovdqu(Address(to, 0), xtmp); addptr(to, 32); subl(count, 8 << shift); + + BIND(L_check_fill_8_bytes); + // clean upper bits of YMM registers + vzeroupper(); } else { // Fill 32-byte chunks pshufd(xtmp, xtmp, 0); @@ -6180,8 +6217,9 @@ addptr(to, 32); subl(count, 8 << shift); jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); + + BIND(L_check_fill_8_bytes); } - BIND(L_check_fill_8_bytes); addl(count, 8 << shift); jccb(Assembler::zero, L_exit); jmpb(L_fill_8_bytes); @@ -6316,6 +6354,10 @@ jccb(Assembler::lessEqual, L_copy_16_chars); bind(L_copy_16_chars_exit); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + vzeroupper(); + } subptr(len, 8); jccb(Assembler::greater, L_copy_8_chars_exit); diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/macroAssembler_x86.hpp --- a/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Apr 03 11:12:57 2013 -0700 @@ -582,6 +582,9 @@ // only if +VerifyFPU void verify_FPU(int stack_depth, const char* s = "illegal FPU state"); + // Verify or restore cpu control state after JNI call + void restore_cpu_control_state_after_jni(); + // prints msg, dumps registers and stops execution void stop(const char* msg); diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/sharedRuntime_x86_32.cpp --- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -2065,6 +2065,9 @@ __ call(RuntimeAddress(native_func)); + // Verify or restore cpu control state after JNI call + __ 
restore_cpu_control_state_after_jni(); + // WARNING - on Windows Java Natives use pascal calling convention and pop the // arguments off of the stack. We could just re-adjust the stack pointer here // and continue to do SP relative addressing but we instead switch to FP diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/sharedRuntime_x86_64.cpp --- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -2315,16 +2315,8 @@ __ call(RuntimeAddress(native_func)); - // Either restore the MXCSR register after returning from the JNI Call - // or verify that it wasn't changed. - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std())); - - } - else if (CheckJNICalls ) { - __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry()))); - } - + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // Unpack native results. 
switch (ret_type) { diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -835,6 +835,11 @@ __ BIND(L_copy_64_bytes); __ subl(qword_count, 8); __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); + + if (UseUnalignedLoadStores && (UseAVX >= 2)) { + // clean upper bits of YMM registers + __ vzeroupper(); + } __ addl(qword_count, 8); __ jccb(Assembler::zero, L_exit); // diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -1331,6 +1331,10 @@ } __ addptr(qword_count, 4); __ BIND(L_end); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + __ vzeroupper(); + } } else { // Copy 32-bytes per iteration __ BIND(L_loop); @@ -1404,6 +1408,10 @@ } __ subptr(qword_count, 4); __ BIND(L_end); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + __ vzeroupper(); + } } else { // Copy 32-bytes per iteration __ BIND(L_loop); diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/templateInterpreter_x86_32.cpp --- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -1080,22 +1080,8 @@ // result potentially in rdx:rax or ST0 - // Either restore the MXCSR register after returning from the JNI Call - // or verify that it wasn't changed. - if (VM_Version::supports_sse()) { - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); - } - else if (CheckJNICalls ) { - __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); - } - } - - // Either restore the x87 floating pointer control word after returning - // from the JNI call or verify that it wasn't changed. 
- if (CheckJNICalls) { - __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); - } + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // save potential result in ST(0) & rdx:rax // (if result handler is the T_FLOAT or T_DOUBLE handler, result must be in ST0 - diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/templateInterpreter_x86_64.cpp --- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Wed Apr 03 11:12:57 2013 -0700 @@ -1079,15 +1079,8 @@ __ call(rax); // result potentially in rax or xmm0 - // Depending on runtime options, either restore the MXCSR - // register after returning from the JNI Call or verify that - // it wasn't changed during -Xcheck:jni. - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std())); - } - else if (CheckJNICalls) { - __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry()))); - } + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // NOTE: The order of these pushes is known to frame::interpreter_frame_result // in order to extract the result of a method call. If the order of these diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/x86_32.ad --- a/src/cpu/x86/vm/x86_32.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/x86_32.ad Wed Apr 03 11:12:57 2013 -0700 @@ -228,10 +228,16 @@ static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000)); // Offset hacking within calls. 
-static int pre_call_FPU_size() { - if (Compile::current()->in_24_bit_fp_mode()) - return 6; // fldcw - return 0; +static int pre_call_resets_size() { + int size = 0; + Compile* C = Compile::current(); + if (C->in_24_bit_fp_mode()) { + size += 6; // fldcw + } + if (C->max_vector_size() > 16) { + size += 3; // vzeroupper + } + return size; } static int preserve_SP_size() { @@ -242,21 +248,21 @@ // from the start of the call to the point where the return address // will point. int MachCallStaticJavaNode::ret_addr_offset() { - int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points + int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points if (_method_handle_invoke) offset += preserve_SP_size(); return offset; } int MachCallDynamicJavaNode::ret_addr_offset() { - return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points + return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points } static int sizeof_FFree_Float_Stack_All = -1; int MachCallRuntimeNode::ret_addr_offset() { assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already"); - return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size(); + return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size(); } // Indicate if the safepoint node needs the polling page as an input. @@ -272,7 +278,7 @@ // The address of the call instruction needs to be 4-byte aligned to // ensure that it does not span a cache line so that it can be patched. 
int CallStaticJavaDirectNode::compute_padding(int current_offset) const { - current_offset += pre_call_FPU_size(); // skip fldcw, if any + current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -280,7 +286,7 @@ // The address of the call instruction needs to be 4-byte aligned to // ensure that it does not span a cache line so that it can be patched. int CallStaticJavaHandleNode::compute_padding(int current_offset) const { - current_offset += pre_call_FPU_size(); // skip fldcw, if any + current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any current_offset += preserve_SP_size(); // skip mov rbp, rsp current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; @@ -289,7 +295,7 @@ // The address of the call instruction needs to be 4-byte aligned to // ensure that it does not span a cache line so that it can be patched. 
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { - current_offset += pre_call_FPU_size(); // skip fldcw, if any + current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any current_offset += 5; // skip MOV instruction current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; @@ -583,16 +589,20 @@ // Remove two words for return addr and rbp, framesize -= 2*wordSize; - if( C->in_24_bit_fp_mode() ) { + if (C->max_vector_size() > 16) { + st->print("VZEROUPPER"); + st->cr(); st->print("\t"); + } + if (C->in_24_bit_fp_mode()) { st->print("FLDCW standard control word"); st->cr(); st->print("\t"); } - if( framesize ) { + if (framesize) { st->print("ADD ESP,%d\t# Destroy frame",framesize); st->cr(); st->print("\t"); } st->print_cr("POPL EBP"); st->print("\t"); - if( do_polling() && C->is_method_compilation() ) { + if (do_polling() && C->is_method_compilation()) { st->print("TEST PollPage,EAX\t! Poll Safepoint"); st->cr(); st->print("\t"); } @@ -602,8 +612,14 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { Compile *C = ra_->C; + if (C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. 
+ MacroAssembler masm(&cbuf); + masm.vzeroupper(); + } // If method set FPU control word, restore to standard control word - if( C->in_24_bit_fp_mode() ) { + if (C->in_24_bit_fp_mode()) { MacroAssembler masm(&cbuf); masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); } @@ -615,12 +631,11 @@ // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here - if( framesize >= 128 ) { + if (framesize >= 128) { emit_opcode(cbuf, 0x81); // add SP, #framesize emit_rm(cbuf, 0x3, 0x00, ESP_enc); emit_d32(cbuf, framesize); - } - else if( framesize ) { + } else if (framesize) { emit_opcode(cbuf, 0x83); // add SP, #framesize emit_rm(cbuf, 0x3, 0x00, ESP_enc); emit_d8(cbuf, framesize); @@ -628,7 +643,7 @@ emit_opcode(cbuf, 0x58 | EBP_enc); - if( do_polling() && C->is_method_compilation() ) { + if (do_polling() && C->is_method_compilation()) { cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0); emit_opcode(cbuf,0x85); emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX @@ -640,7 +655,8 @@ Compile *C = ra_->C; // If method set FPU control word, restore to standard control word int size = C->in_24_bit_fp_mode() ? 6 : 0; - if( do_polling() && C->is_method_compilation() ) size += 6; + if (C->max_vector_size() > 16) size += 3; // vzeroupper + if (do_polling() && C->is_method_compilation()) size += 6; int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); @@ -649,7 +665,7 @@ size++; // popl rbp, - if( framesize >= 128 ) { + if (framesize >= 128) { size += 6; } else { size += framesize ? 
3 : 0; @@ -1853,20 +1869,26 @@ %} - enc_class pre_call_FPU %{ + enc_class pre_call_resets %{ // If method sets FPU control word restore it here debug_only(int off0 = cbuf.insts_size()); - if( Compile::current()->in_24_bit_fp_mode() ) { - MacroAssembler masm(&cbuf); - masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); + if (ra_->C->in_24_bit_fp_mode()) { + MacroAssembler _masm(&cbuf); + __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); + } + if (ra_->C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); } debug_only(int off1 = cbuf.insts_size()); - assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction"); + assert(off1 - off0 == pre_call_resets_size(), "correct size prediction"); %} enc_class post_call_FPU %{ // If method sets FPU control word do it here also - if( Compile::current()->in_24_bit_fp_mode() ) { + if (Compile::current()->in_24_bit_fp_mode()) { MacroAssembler masm(&cbuf); masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); } @@ -1877,17 +1899,17 @@ // who we intended to call. 
cbuf.set_insts_mark(); $$$emit8$primary; - if ( !_method ) { + if (!_method) { emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), runtime_call_Relocation::spec(), RELOC_IMM32 ); - } else if(_optimized_virtual) { + } else if (_optimized_virtual) { emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), opt_virtual_call_Relocation::spec(), RELOC_IMM32 ); } else { emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), static_call_Relocation::spec(), RELOC_IMM32 ); } - if( _method ) { // Emit stub for static call + if (_method) { // Emit stub for static call emit_java_to_interp(cbuf); } %} @@ -12828,7 +12850,7 @@ ins_cost(300); format %{ "CALL,static " %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, Java_Static_Call( meth ), call_epilog, post_call_FPU ); @@ -12849,7 +12871,7 @@ ins_cost(300); format %{ "CALL,static/MethodHandle " %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, preserve_SP, Java_Static_Call( meth ), restore_SP, @@ -12870,7 +12892,7 @@ format %{ "MOV EAX,(oop)-1\n\t" "CALL,dynamic" %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, Java_Dynamic_Call( meth ), call_epilog, post_call_FPU ); @@ -12887,7 +12909,7 @@ format %{ "CALL,runtime " %} opcode(0xE8); /* E8 cd */ // Use FFREEs to clear entries in float stack - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, FFree_Float_Stack_All, Java_To_Runtime( meth ), post_call_FPU ); @@ -12902,7 +12924,7 @@ ins_cost(300); format %{ "CALL_LEAF,runtime " %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, FFree_Float_Stack_All, Java_To_Runtime( meth ), Verify_FPU_For_Leaf, post_call_FPU ); diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/x86_64.ad --- a/src/cpu/x86/vm/x86_64.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/x86_64.ad Wed Apr 03 11:12:57 2013 -0700 @@ -399,6 +399,9 @@ static int 
preserve_SP_size() { return 3; // rex.w, op, rm(reg/reg) } +static int clear_avx_size() { + return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper +} // !!!!! Special hack to get all types of calls to specify the byte offset // from the start of the call to the point where the return address @@ -406,6 +409,7 @@ int MachCallStaticJavaNode::ret_addr_offset() { int offset = 5; // 5 bytes from start of call to where return address points + offset += clear_avx_size(); if (_method_handle_invoke) offset += preserve_SP_size(); return offset; @@ -413,11 +417,16 @@ int MachCallDynamicJavaNode::ret_addr_offset() { - return 15; // 15 bytes from start of call to where return address points + int offset = 15; // 15 bytes from start of call to where return address points + offset += clear_avx_size(); + return offset; } -// In os_cpu .ad file -// int MachCallRuntimeNode::ret_addr_offset() +int MachCallRuntimeNode::ret_addr_offset() { + int offset = 13; // movq r10,#addr; callq (r10) + offset += clear_avx_size(); + return offset; +} // Indicate if the safepoint node needs the polling page as an input, // it does if the polling page is more than disp32 away. @@ -434,6 +443,7 @@ // ensure that it does not span a cache line so that it can be patched. int CallStaticJavaDirectNode::compute_padding(int current_offset) const { + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -443,6 +453,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const { current_offset += preserve_SP_size(); // skip mov rbp, rsp + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -451,6 +462,7 @@ // ensure that it does not span a cache line so that it can be patched. 
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 11; // skip movq instruction + call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -764,6 +776,11 @@ void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const { Compile* C = ra_->C; + if (C->max_vector_size() > 16) { + st->print("vzeroupper"); + st->cr(); st->print("\t"); + } + int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return adr already pushed @@ -793,6 +810,13 @@ void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const { Compile* C = ra_->C; + if (C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); + } + int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return adr already pushed @@ -2008,6 +2032,25 @@ __ bind(miss); %} + enc_class clear_avx %{ + debug_only(int off0 = cbuf.insts_size()); + if (ra_->C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. 
+ MacroAssembler _masm(&cbuf); + __ vzeroupper(); + } + debug_only(int off1 = cbuf.insts_size()); + assert(off1 - off0 == clear_avx_size(), "correct size prediction"); + %} + + enc_class Java_To_Runtime(method meth) %{ + // No relocation needed + MacroAssembler _masm(&cbuf); + __ mov64(r10, (int64_t) $meth$$method); + __ call(r10); + %} + enc_class Java_To_Interpreter(method meth) %{ // CALL Java_To_Interpreter @@ -11366,7 +11409,7 @@ ins_cost(300); format %{ "call,static " %} opcode(0xE8); /* E8 cd */ - ins_encode(Java_Static_Call(meth), call_epilog); + ins_encode(clear_avx, Java_Static_Call(meth), call_epilog); ins_pipe(pipe_slow); ins_alignment(4); %} @@ -11384,7 +11427,7 @@ ins_cost(300); format %{ "call,static/MethodHandle " %} opcode(0xE8); /* E8 cd */ - ins_encode(preserve_SP, + ins_encode(clear_avx, preserve_SP, Java_Static_Call(meth), restore_SP, call_epilog); @@ -11403,7 +11446,7 @@ ins_cost(300); format %{ "movq rax, #Universe::non_oop_word()\n\t" "call,dynamic " %} - ins_encode(Java_Dynamic_Call(meth), call_epilog); + ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog); ins_pipe(pipe_slow); ins_alignment(4); %} @@ -11416,8 +11459,7 @@ ins_cost(300); format %{ "call,runtime " %} - opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} @@ -11429,8 +11471,7 @@ ins_cost(300); format %{ "call_leaf,runtime " %} - opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} @@ -11442,7 +11483,6 @@ ins_cost(300); format %{ "call_leaf_nofp,runtime " %} - opcode(0xE8); /* E8 cd */ ins_encode(Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} diff -r 53028d751155 -r e961c11b85fe src/os_cpu/bsd_x86/vm/bsd_x86_64.ad --- a/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad Wed Apr 03 11:12:57 2013 -0700 @@ -55,20 +55,6 @@ // adding a syntax that 
specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - enc_class Java_To_Runtime(method meth) %{ - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - %} @@ -76,8 +62,4 @@ source %{ -int MachCallRuntimeNode::ret_addr_offset() { - return 13; // movq r10,#addr; callq (r10) -} - %} diff -r 53028d751155 -r e961c11b85fe src/os_cpu/linux_x86/vm/linux_x86_64.ad --- a/src/os_cpu/linux_x86/vm/linux_x86_64.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/os_cpu/linux_x86/vm/linux_x86_64.ad Wed Apr 03 11:12:57 2013 -0700 @@ -55,20 +55,6 @@ // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - enc_class Java_To_Runtime(method meth) %{ - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - %} @@ -76,8 +62,4 @@ source %{ -int MachCallRuntimeNode::ret_addr_offset() { - return 13; // movq r10,#addr; callq (r10) -} - %} diff -r 53028d751155 -r e961c11b85fe src/os_cpu/solaris_x86/vm/solaris_x86_64.ad --- a/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad Wed Apr 03 11:12:57 2013 -0700 @@ -54,39 +54,10 @@ // main source block for now. 
In future, we can generalize this by // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - - enc_class Java_To_Runtime(method meth) %{ - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - - enc_class post_call_verify_mxcsr %{ - MacroAssembler _masm(&cbuf); - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std())); - } - else if (CheckJNICalls) { - __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::amd64::verify_mxcsr_entry()))); - } - %} %} // Platform dependent source source %{ - -int MachCallRuntimeNode::ret_addr_offset() { - return 13; // movq r10,#addr; callq (r10) -} - %} diff -r 53028d751155 -r e961c11b85fe src/os_cpu/windows_x86/vm/windows_x86_64.ad --- a/src/os_cpu/windows_x86/vm/windows_x86_64.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/os_cpu/windows_x86/vm/windows_x86_64.ad Wed Apr 03 11:12:57 2013 -0700 @@ -53,30 +53,11 @@ // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime - // No relocation needed +%} + - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); +// Platform dependent source - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} +source %{ %} - -// -// Platform dependent source -// -source %{ - -int MachCallRuntimeNode::ret_addr_offset() -{ - return 13; // movq r10,#addr; callq (r10) -} - -%}