# HG changeset patch
# User kvn
# Date 1365012777 25200
# Node ID e961c11b85fe2b977b4b6a7def649dbf6ace4241
# Parent  53028d7511557e0295ac650dd7c4a4f12db3d027
8011102: Clear AVX registers after return from JNI call
Summary: Execute vzeroupper instruction after JNI call and on exits in jit compiled code which use 256bit vectors.
Reviewed-by: roland

diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/cppInterpreter_x86.cpp
--- a/src/cpu/x86/vm/cppInterpreter_x86.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/cppInterpreter_x86.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -1299,25 +1299,8 @@
   __ push(rdx);
 #endif // _LP64
 
-  // Either restore the MXCSR register after returning from the JNI Call
-  // or verify that it wasn't changed.
-  if (VM_Version::supports_sse()) {
-    if (RestoreMXCSROnJNICalls) {
-      __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
-    }
-    else if (CheckJNICalls ) {
-      __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
-    }
-  }
-
-#ifndef _LP64
-  // Either restore the x87 floating pointer control word after returning
-  // from the JNI call or verify that it wasn't changed.
-  if (CheckJNICalls) {
-    __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
-  }
-#endif // _LP64
-
+  // Verify or restore cpu control state after JNI call
+  __ restore_cpu_control_state_after_jni();
 
   // change thread state
   __ movl(Address(thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/macroAssembler_x86.cpp
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -4765,6 +4765,31 @@
   pop_CPU_state();
 }
 
+void MacroAssembler::restore_cpu_control_state_after_jni() {
+  // Either restore the MXCSR register after returning from the JNI Call
+  // or verify that it wasn't changed (with -Xcheck:jni flag).
+  if (VM_Version::supports_sse()) {
+    if (RestoreMXCSROnJNICalls) {
+      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
+    } else if (CheckJNICalls) {
+      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
+    }
+  }
+  if (VM_Version::supports_avx()) {
+    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
+    vzeroupper();
+  }
+
+#ifndef _LP64
+  // Either restore the x87 floating pointer control word after returning
+  // from the JNI call or verify that it wasn't changed.
+  if (CheckJNICalls) {
+    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
+  }
+#endif // _LP64
+}
+
+
 void MacroAssembler::load_klass(Register dst, Register src) {
 #ifdef _LP64
   if (UseCompressedKlassPointers) {
@@ -5759,6 +5784,8 @@
     addptr(result, stride2);
     subl(cnt2, stride2);
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
+    // clean upper bits of YMM registers
+    vzeroupper();
 
     // compare wide vectors tail
     bind(COMPARE_WIDE_TAIL);
@@ -5772,6 +5799,8 @@
 
     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
     bind(VECTOR_NOT_EQUAL);
+    // clean upper bits of YMM registers
+    vzeroupper();
     lea(str1, Address(str1, result, scale));
     lea(str2, Address(str2, result, scale));
     jmp(COMPARE_16_CHARS);
@@ -6028,6 +6057,10 @@
 
   // That's it
   bind(DONE);
+  if (UseAVX >= 2) {
+    // clean upper bits of YMM registers
+    vzeroupper();
+  }
 }
 
 void MacroAssembler::generate_fill(BasicType t, bool aligned,
@@ -6157,6 +6190,10 @@
         vmovdqu(Address(to, 0), xtmp);
         addptr(to, 32);
         subl(count, 8 << shift);
+
+        BIND(L_check_fill_8_bytes);
+        // clean upper bits of YMM registers
+        vzeroupper();
       } else {
         // Fill 32-byte chunks
         pshufd(xtmp, xtmp, 0);
@@ -6180,8 +6217,9 @@
         addptr(to, 32);
         subl(count, 8 << shift);
         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
+
+        BIND(L_check_fill_8_bytes);
       }
-      BIND(L_check_fill_8_bytes);
       addl(count, 8 << shift);
       jccb(Assembler::zero, L_exit);
       jmpb(L_fill_8_bytes);
@@ -6316,6 +6354,10 @@
     jccb(Assembler::lessEqual, L_copy_16_chars);
 
     bind(L_copy_16_chars_exit);
+    if (UseAVX >= 2) {
+      // clean upper bits of YMM registers
+      vzeroupper();
+    }
     subptr(len, 8);
     jccb(Assembler::greater, L_copy_8_chars_exit);
 
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/macroAssembler_x86.hpp
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Wed Apr 03 11:12:57 2013 -0700
@@ -582,6 +582,9 @@
   // only if +VerifyFPU
   void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
 
+  // Verify or restore cpu control state after JNI call
+  void restore_cpu_control_state_after_jni();
+
   // prints msg, dumps registers and stops execution
   void stop(const char* msg);
 
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/sharedRuntime_x86_32.cpp
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -2065,6 +2065,9 @@
 
   __ call(RuntimeAddress(native_func));
 
+  // Verify or restore cpu control state after JNI call
+  __ restore_cpu_control_state_after_jni();
+
   // WARNING - on Windows Java Natives use pascal calling convention and pop the
   // arguments off of the stack. We could just re-adjust the stack pointer here
   // and continue to do SP relative addressing but we instead switch to FP
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/sharedRuntime_x86_64.cpp
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -2315,16 +2315,8 @@
 
   __ call(RuntimeAddress(native_func));
 
-    // Either restore the MXCSR register after returning from the JNI Call
-    // or verify that it wasn't changed.
-    if (RestoreMXCSROnJNICalls) {
-      __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
-
-    }
-    else if (CheckJNICalls ) {
-      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
-    }
-
+  // Verify or restore cpu control state after JNI call
+  __ restore_cpu_control_state_after_jni();
 
   // Unpack native results.
   switch (ret_type) {
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/stubGenerator_x86_32.cpp
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -835,6 +835,11 @@
   __ BIND(L_copy_64_bytes);
     __ subl(qword_count, 8);
     __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+
+    if (UseUnalignedLoadStores && (UseAVX >= 2)) {
+      // clean upper bits of YMM registers
+      __ vzeroupper();
+    }
     __ addl(qword_count, 8);
     __ jccb(Assembler::zero, L_exit);
     //
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -1331,6 +1331,10 @@
       }
       __ addptr(qword_count, 4);
       __ BIND(L_end);
+      if (UseAVX >= 2) {
+        // clean upper bits of YMM registers
+        __ vzeroupper();
+      }
     } else {
       // Copy 32-bytes per iteration
       __ BIND(L_loop);
@@ -1404,6 +1408,10 @@
       }
       __ subptr(qword_count, 4);
       __ BIND(L_end);
+      if (UseAVX >= 2) {
+        // clean upper bits of YMM registers
+        __ vzeroupper();
+      }
     } else {
       // Copy 32-bytes per iteration
       __ BIND(L_loop);
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/templateInterpreter_x86_32.cpp
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -1080,22 +1080,8 @@
 
   // result potentially in rdx:rax or ST0
 
-  // Either restore the MXCSR register after returning from the JNI Call
-  // or verify that it wasn't changed.
-  if (VM_Version::supports_sse()) {
-    if (RestoreMXCSROnJNICalls) {
-      __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
-    }
-    else if (CheckJNICalls ) {
-      __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
-    }
-  }
-
-  // Either restore the x87 floating pointer control word after returning
-  // from the JNI call or verify that it wasn't changed.
-  if (CheckJNICalls) {
-    __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
-  }
+  // Verify or restore cpu control state after JNI call
+  __ restore_cpu_control_state_after_jni();
 
   // save potential result in ST(0) & rdx:rax
   // (if result handler is the T_FLOAT or T_DOUBLE handler, result must be in ST0 -
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/templateInterpreter_x86_64.cpp
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Wed Apr 03 11:12:57 2013 -0700
@@ -1079,15 +1079,8 @@
   __ call(rax);
   // result potentially in rax or xmm0
 
-  // Depending on runtime options, either restore the MXCSR
-  // register after returning from the JNI Call or verify that
-  // it wasn't changed during -Xcheck:jni.
-  if (RestoreMXCSROnJNICalls) {
-    __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
-  }
-  else if (CheckJNICalls) {
-    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
-  }
+  // Verify or restore cpu control state after JNI call
+  __ restore_cpu_control_state_after_jni();
 
   // NOTE: The order of these pushes is known to frame::interpreter_frame_result
   // in order to extract the result of a method call. If the order of these
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/x86_32.ad
--- a/src/cpu/x86/vm/x86_32.ad	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/x86_32.ad	Wed Apr 03 11:12:57 2013 -0700
@@ -228,10 +228,16 @@
 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
 
 // Offset hacking within calls.
-static int pre_call_FPU_size() {
-  if (Compile::current()->in_24_bit_fp_mode())
-    return 6; // fldcw
-  return 0;
+static int pre_call_resets_size() {
+  int size = 0;
+  Compile* C = Compile::current();
+  if (C->in_24_bit_fp_mode()) {
+    size += 6; // fldcw
+  }
+  if (C->max_vector_size() > 16) {
+    size += 3; // vzeroupper
+  }
+  return size;
 }
 
 static int preserve_SP_size() {
@@ -242,21 +248,21 @@
 //       from the start of the call to the point where the return address
 //       will point.
 int MachCallStaticJavaNode::ret_addr_offset() {
-  int offset = 5 + pre_call_FPU_size();  // 5 bytes from start of call to where return address points
+  int offset = 5 + pre_call_resets_size();  // 5 bytes from start of call to where return address points
   if (_method_handle_invoke)
     offset += preserve_SP_size();
   return offset;
 }
 
 int MachCallDynamicJavaNode::ret_addr_offset() {
-  return 10 + pre_call_FPU_size();  // 10 bytes from start of call to where return address points
+  return 10 + pre_call_resets_size();  // 10 bytes from start of call to where return address points
 }
 
 static int sizeof_FFree_Float_Stack_All = -1;
 
 int MachCallRuntimeNode::ret_addr_offset() {
   assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
-  return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size();
+  return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
 }
 
 // Indicate if the safepoint node needs the polling page as an input.
@@ -272,7 +278,7 @@
 // The address of the call instruction needs to be 4-byte aligned to
 // ensure that it does not span a cache line so that it can be patched.
 int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
-  current_offset += pre_call_FPU_size();  // skip fldcw, if any
+  current_offset += pre_call_resets_size();  // skip fldcw, if any
   current_offset += 1;      // skip call opcode byte
   return round_to(current_offset, alignment_required()) - current_offset;
 }
@@ -280,7 +286,7 @@
 // The address of the call instruction needs to be 4-byte aligned to
 // ensure that it does not span a cache line so that it can be patched.
 int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
-  current_offset += pre_call_FPU_size();  // skip fldcw, if any
+  current_offset += pre_call_resets_size();  // skip fldcw, if any
   current_offset += preserve_SP_size();   // skip mov rbp, rsp
   current_offset += 1;      // skip call opcode byte
   return round_to(current_offset, alignment_required()) - current_offset;
@@ -289,7 +295,7 @@
 // The address of the call instruction needs to be 4-byte aligned to
 // ensure that it does not span a cache line so that it can be patched.
 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
-  current_offset += pre_call_FPU_size();  // skip fldcw, if any
+  current_offset += pre_call_resets_size();  // skip fldcw, if any
   current_offset += 5;      // skip MOV instruction
   current_offset += 1;      // skip call opcode byte
   return round_to(current_offset, alignment_required()) - current_offset;
@@ -583,16 +589,20 @@
   // Remove two words for return addr and rbp,
   framesize -= 2*wordSize;
 
-  if( C->in_24_bit_fp_mode() ) {
+  if (C->max_vector_size() > 16) {
+    st->print("VZEROUPPER");
+    st->cr(); st->print("\t");
+  }
+  if (C->in_24_bit_fp_mode()) {
     st->print("FLDCW  standard control word");
     st->cr(); st->print("\t");
   }
-  if( framesize ) {
+  if (framesize) {
     st->print("ADD    ESP,%d\t# Destroy frame",framesize);
     st->cr(); st->print("\t");
   }
   st->print_cr("POPL   EBP"); st->print("\t");
-  if( do_polling() && C->is_method_compilation() ) {
+  if (do_polling() && C->is_method_compilation()) {
     st->print("TEST   PollPage,EAX\t! Poll Safepoint");
     st->cr(); st->print("\t");
   }
@@ -602,8 +612,14 @@
 void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   Compile *C = ra_->C;
 
+  if (C->max_vector_size() > 16) {
+    // Clear upper bits of YMM registers when current compiled code uses
+    // wide vectors to avoid AVX <-> SSE transition penalty during call.
+    MacroAssembler masm(&cbuf);
+    masm.vzeroupper();
+  }
   // If method set FPU control word, restore to standard control word
-  if( C->in_24_bit_fp_mode() ) {
+  if (C->in_24_bit_fp_mode()) {
     MacroAssembler masm(&cbuf);
     masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   }
@@ -615,12 +631,11 @@
 
   // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
 
-  if( framesize >= 128 ) {
+  if (framesize >= 128) {
     emit_opcode(cbuf, 0x81); // add  SP, #framesize
     emit_rm(cbuf, 0x3, 0x00, ESP_enc);
     emit_d32(cbuf, framesize);
-  }
-  else if( framesize ) {
+  } else if (framesize) {
     emit_opcode(cbuf, 0x83); // add  SP, #framesize
     emit_rm(cbuf, 0x3, 0x00, ESP_enc);
     emit_d8(cbuf, framesize);
@@ -628,7 +643,7 @@
 
   emit_opcode(cbuf, 0x58 | EBP_enc);
 
-  if( do_polling() && C->is_method_compilation() ) {
+  if (do_polling() && C->is_method_compilation()) {
     cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
     emit_opcode(cbuf,0x85);
     emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
@@ -640,7 +655,8 @@
   Compile *C = ra_->C;
   // If method set FPU control word, restore to standard control word
   int size = C->in_24_bit_fp_mode() ? 6 : 0;
-  if( do_polling() && C->is_method_compilation() ) size += 6;
+  if (C->max_vector_size() > 16) size += 3; // vzeroupper
+  if (do_polling() && C->is_method_compilation()) size += 6;
 
   int framesize = C->frame_slots() << LogBytesPerInt;
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
@@ -649,7 +665,7 @@
 
   size++; // popl rbp,
 
-  if( framesize >= 128 ) {
+  if (framesize >= 128) {
     size += 6;
   } else {
     size += framesize ? 3 : 0;
@@ -1853,20 +1869,26 @@
   %}
 
 
-  enc_class pre_call_FPU %{
+  enc_class pre_call_resets %{
     // If method sets FPU control word restore it here
     debug_only(int off0 = cbuf.insts_size());
-    if( Compile::current()->in_24_bit_fp_mode() ) {
-      MacroAssembler masm(&cbuf);
-      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+    if (ra_->C->in_24_bit_fp_mode()) {
+      MacroAssembler _masm(&cbuf);
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+    }
+    if (ra_->C->max_vector_size() > 16) {
+      // Clear upper bits of YMM registers when current compiled code uses
+      // wide vectors to avoid AVX <-> SSE transition penalty during call.
+      MacroAssembler _masm(&cbuf);
+      __ vzeroupper();
     }
     debug_only(int off1 = cbuf.insts_size());
-    assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction");
+    assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
   %}
 
   enc_class post_call_FPU %{
     // If method sets FPU control word do it here also
-    if( Compile::current()->in_24_bit_fp_mode() ) {
+    if (Compile::current()->in_24_bit_fp_mode()) {
       MacroAssembler masm(&cbuf);
       masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
     }
@@ -1877,17 +1899,17 @@
     // who we intended to call.
     cbuf.set_insts_mark();
     $$$emit8$primary;
-    if ( !_method ) {
+    if (!_method) {
       emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                      runtime_call_Relocation::spec(), RELOC_IMM32 );
-    } else if(_optimized_virtual) {
+    } else if (_optimized_virtual) {
       emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                      opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
     } else {
       emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                      static_call_Relocation::spec(), RELOC_IMM32 );
     }
-    if( _method ) {  // Emit stub for static call
+    if (_method) {  // Emit stub for static call
       emit_java_to_interp(cbuf);
     }
   %}
@@ -12828,7 +12850,7 @@
   ins_cost(300);
   format %{ "CALL,static " %}
   opcode(0xE8); /* E8 cd */
-  ins_encode( pre_call_FPU,
+  ins_encode( pre_call_resets,
               Java_Static_Call( meth ),
               call_epilog,
               post_call_FPU );
@@ -12849,7 +12871,7 @@
   ins_cost(300);
   format %{ "CALL,static/MethodHandle " %}
   opcode(0xE8); /* E8 cd */
-  ins_encode( pre_call_FPU,
+  ins_encode( pre_call_resets,
               preserve_SP,
               Java_Static_Call( meth ),
               restore_SP,
@@ -12870,7 +12892,7 @@
   format %{ "MOV    EAX,(oop)-1\n\t"
             "CALL,dynamic" %}
   opcode(0xE8); /* E8 cd */
-  ins_encode( pre_call_FPU,
+  ins_encode( pre_call_resets,
               Java_Dynamic_Call( meth ),
               call_epilog,
               post_call_FPU );
@@ -12887,7 +12909,7 @@
   format %{ "CALL,runtime " %}
   opcode(0xE8); /* E8 cd */
   // Use FFREEs to clear entries in float stack
-  ins_encode( pre_call_FPU,
+  ins_encode( pre_call_resets,
               FFree_Float_Stack_All,
               Java_To_Runtime( meth ),
               post_call_FPU );
@@ -12902,7 +12924,7 @@
   ins_cost(300);
   format %{ "CALL_LEAF,runtime " %}
   opcode(0xE8); /* E8 cd */
-  ins_encode( pre_call_FPU,
+  ins_encode( pre_call_resets,
               FFree_Float_Stack_All,
               Java_To_Runtime( meth ),
               Verify_FPU_For_Leaf, post_call_FPU );
diff -r 53028d751155 -r e961c11b85fe src/cpu/x86/vm/x86_64.ad
--- a/src/cpu/x86/vm/x86_64.ad	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/cpu/x86/vm/x86_64.ad	Wed Apr 03 11:12:57 2013 -0700
@@ -399,6 +399,9 @@
 static int preserve_SP_size() {
   return 3;  // rex.w, op, rm(reg/reg)
 }
+static int clear_avx_size() {
+  return (Compile::current()->max_vector_size() > 16) ? 3 : 0;  // vzeroupper
+}
 
 // !!!!! Special hack to get all types of calls to specify the byte offset
 //       from the start of the call to the point where the return address
@@ -406,6 +409,7 @@
 int MachCallStaticJavaNode::ret_addr_offset()
 {
   int offset = 5; // 5 bytes from start of call to where return address points
+  offset += clear_avx_size();
   if (_method_handle_invoke)
     offset += preserve_SP_size();
   return offset;
@@ -413,11 +417,16 @@
 
 int MachCallDynamicJavaNode::ret_addr_offset()
 {
-  return 15; // 15 bytes from start of call to where return address points
+  int offset = 15; // 15 bytes from start of call to where return address points
+  offset += clear_avx_size();
+  return offset;
 }
 
-// In os_cpu .ad file
-// int MachCallRuntimeNode::ret_addr_offset()
+int MachCallRuntimeNode::ret_addr_offset() {
+  int offset = 13; // movq r10,#addr; callq (r10)
+  offset += clear_avx_size();
+  return offset;
+}
 
 // Indicate if the safepoint node needs the polling page as an input,
 // it does if the polling page is more than disp32 away.
@@ -434,6 +443,7 @@
 // ensure that it does not span a cache line so that it can be patched.
 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
 {
+  current_offset += clear_avx_size(); // skip vzeroupper
   current_offset += 1; // skip call opcode byte
   return round_to(current_offset, alignment_required()) - current_offset;
 }
@@ -443,6 +453,7 @@
 int CallStaticJavaHandleNode::compute_padding(int current_offset) const
 {
   current_offset += preserve_SP_size();   // skip mov rbp, rsp
+  current_offset += clear_avx_size(); // skip vzeroupper
   current_offset += 1; // skip call opcode byte
   return round_to(current_offset, alignment_required()) - current_offset;
 }
@@ -451,6 +462,7 @@
 // ensure that it does not span a cache line so that it can be patched.
 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
 {
+  current_offset += clear_avx_size(); // skip vzeroupper
   current_offset += 11; // skip movq instruction + call opcode byte
   return round_to(current_offset, alignment_required()) - current_offset;
 }
@@ -764,6 +776,11 @@
 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 {
   Compile* C = ra_->C;
+  if (C->max_vector_size() > 16) {
+    st->print("vzeroupper");
+    st->cr(); st->print("\t");
+  }
+
   int framesize = C->frame_slots() << LogBytesPerInt;
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
   // Remove word for return adr already pushed
@@ -793,6 +810,13 @@
 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 {
   Compile* C = ra_->C;
+  if (C->max_vector_size() > 16) {
+    // Clear upper bits of YMM registers when current compiled code uses
+    // wide vectors to avoid AVX <-> SSE transition penalty during call.
+    MacroAssembler _masm(&cbuf);
+    __ vzeroupper();
+  }
+
   int framesize = C->frame_slots() << LogBytesPerInt;
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
   // Remove word for return adr already pushed
@@ -2008,6 +2032,25 @@
     __ bind(miss);
   %}
 
+  enc_class clear_avx %{
+    debug_only(int off0 = cbuf.insts_size());
+    if (ra_->C->max_vector_size() > 16) {
+      // Clear upper bits of YMM registers when current compiled code uses
+      // wide vectors to avoid AVX <-> SSE transition penalty during call.
+      MacroAssembler _masm(&cbuf);
+      __ vzeroupper();
+    }
+    debug_only(int off1 = cbuf.insts_size());
+    assert(off1 - off0 == clear_avx_size(), "correct size prediction");
+  %}
+
+  enc_class Java_To_Runtime(method meth) %{
+    // No relocation needed
+    MacroAssembler _masm(&cbuf);
+    __ mov64(r10, (int64_t) $meth$$method);
+    __ call(r10);
+  %}
+
   enc_class Java_To_Interpreter(method meth)
   %{
     // CALL Java_To_Interpreter
@@ -11366,7 +11409,7 @@
   ins_cost(300);
   format %{ "call,static " %}
   opcode(0xE8); /* E8 cd */
-  ins_encode(Java_Static_Call(meth), call_epilog);
+  ins_encode(clear_avx, Java_Static_Call(meth), call_epilog);
   ins_pipe(pipe_slow);
   ins_alignment(4);
 %}
@@ -11384,7 +11427,7 @@
   ins_cost(300);
   format %{ "call,static/MethodHandle " %}
   opcode(0xE8); /* E8 cd */
-  ins_encode(preserve_SP,
+  ins_encode(clear_avx, preserve_SP,
              Java_Static_Call(meth),
              restore_SP,
              call_epilog);
@@ -11403,7 +11446,7 @@
   ins_cost(300);
   format %{ "movq    rax, #Universe::non_oop_word()\n\t"
             "call,dynamic " %}
-  ins_encode(Java_Dynamic_Call(meth), call_epilog);
+  ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog);
   ins_pipe(pipe_slow);
   ins_alignment(4);
 %}
@@ -11416,8 +11459,7 @@
 
   ins_cost(300);
   format %{ "call,runtime " %}
-  opcode(0xE8); /* E8 cd */
-  ins_encode(Java_To_Runtime(meth));
+  ins_encode(clear_avx, Java_To_Runtime(meth));
   ins_pipe(pipe_slow);
 %}
 
@@ -11429,8 +11471,7 @@
 
   ins_cost(300);
   format %{ "call_leaf,runtime " %}
-  opcode(0xE8); /* E8 cd */
-  ins_encode(Java_To_Runtime(meth));
+  ins_encode(clear_avx, Java_To_Runtime(meth));
   ins_pipe(pipe_slow);
 %}
 
@@ -11442,7 +11483,6 @@
 
   ins_cost(300);
   format %{ "call_leaf_nofp,runtime " %}
-  opcode(0xE8); /* E8 cd */
   ins_encode(Java_To_Runtime(meth));
   ins_pipe(pipe_slow);
 %}
diff -r 53028d751155 -r e961c11b85fe src/os_cpu/bsd_x86/vm/bsd_x86_64.ad
--- a/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad	Wed Apr 03 11:12:57 2013 -0700
@@ -55,20 +55,6 @@
   // adding a syntax that specifies the sizes of fields in an order,
   // so that the adlc can build the emit functions automagically
 
-  enc_class Java_To_Runtime(method meth) %{
-    // No relocation needed
-
-    // movq r10, <meth>
-    emit_opcode(cbuf, Assembler::REX_WB);
-    emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
-    emit_d64(cbuf, (int64_t) $meth$$method);
-
-    // call (r10)
-    emit_opcode(cbuf, Assembler::REX_B);
-    emit_opcode(cbuf, 0xFF);
-    emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
-  %}
-
 %}
 
 
@@ -76,8 +62,4 @@
 
 source %{
 
-int MachCallRuntimeNode::ret_addr_offset() {
-  return 13; // movq r10,#addr; callq (r10)
-}
-
 %}
diff -r 53028d751155 -r e961c11b85fe src/os_cpu/linux_x86/vm/linux_x86_64.ad
--- a/src/os_cpu/linux_x86/vm/linux_x86_64.ad	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/os_cpu/linux_x86/vm/linux_x86_64.ad	Wed Apr 03 11:12:57 2013 -0700
@@ -55,20 +55,6 @@
   // adding a syntax that specifies the sizes of fields in an order,
   // so that the adlc can build the emit functions automagically
 
-  enc_class Java_To_Runtime(method meth) %{
-    // No relocation needed
-
-    // movq r10, <meth>
-    emit_opcode(cbuf, Assembler::REX_WB);
-    emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
-    emit_d64(cbuf, (int64_t) $meth$$method);
-
-    // call (r10)
-    emit_opcode(cbuf, Assembler::REX_B);
-    emit_opcode(cbuf, 0xFF);
-    emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
-  %}
-
 %}
 
 
@@ -76,8 +62,4 @@
 
 source %{
 
-int MachCallRuntimeNode::ret_addr_offset() {
-  return 13; // movq r10,#addr; callq (r10)
-}
-
 %}
diff -r 53028d751155 -r e961c11b85fe src/os_cpu/solaris_x86/vm/solaris_x86_64.ad
--- a/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad	Wed Apr 03 11:12:57 2013 -0700
@@ -54,39 +54,10 @@
   // main source block for now.  In future, we can generalize this by
   // adding a syntax that specifies the sizes of fields in an order,
   // so that the adlc can build the emit functions automagically
-
-  enc_class Java_To_Runtime(method meth) %{
-    // No relocation needed
-
-    // movq r10, <meth>
-    emit_opcode(cbuf, Assembler::REX_WB);
-    emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
-    emit_d64(cbuf, (int64_t) $meth$$method);
-
-    // call (r10)
-    emit_opcode(cbuf, Assembler::REX_B);
-    emit_opcode(cbuf, 0xFF);
-    emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
-  %}
-
-  enc_class post_call_verify_mxcsr %{
-    MacroAssembler _masm(&cbuf);
-    if (RestoreMXCSROnJNICalls) {
-      __ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std()));
-    }
-    else if (CheckJNICalls) {
-      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::amd64::verify_mxcsr_entry())));
-    }
-  %}
 %}
 
 
 // Platform dependent source
 
 source %{
-
-int MachCallRuntimeNode::ret_addr_offset() {
-  return 13; // movq r10,#addr; callq (r10)
-}
-
 %}
diff -r 53028d751155 -r e961c11b85fe src/os_cpu/windows_x86/vm/windows_x86_64.ad
--- a/src/os_cpu/windows_x86/vm/windows_x86_64.ad	Tue Apr 02 09:30:07 2013 +0200
+++ b/src/os_cpu/windows_x86/vm/windows_x86_64.ad	Wed Apr 03 11:12:57 2013 -0700
@@ -53,30 +53,11 @@
   // adding a syntax that specifies the sizes of fields in an order,
   // so that the adlc can build the emit functions automagically
 
-  enc_class Java_To_Runtime (method meth) %{    // CALL Java_To_Runtime
-    // No relocation needed
+%}
+
 
-    // movq r10, <meth>
-    emit_opcode(cbuf, Assembler::REX_WB);
-    emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
-    emit_d64(cbuf, (int64_t) $meth$$method);
+// Platform dependent source
 
-    // call (r10)
-    emit_opcode(cbuf, Assembler::REX_B);
-    emit_opcode(cbuf, 0xFF);
-    emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
-  %}
+source %{
 
 %}
-
-//
-// Platform dependent source
-//
-source %{
-
-int MachCallRuntimeNode::ret_addr_offset()
-{
-  return 13; // movq r10,#addr; callq (r10)
-}
-
-%}