Mercurial > hg > truffle
diff src/cpu/x86/vm/x86_64.ad @ 8873:e961c11b85fe
8011102: Clear AVX registers after return from JNI call
Summary: Execute vzeroupper instruction after JNI call and on exits in jit compiled code which use 256bit vectors.
Reviewed-by: roland
author | kvn |
---|---|
date | Wed, 03 Apr 2013 11:12:57 -0700 |
parents | b30b3c2a0cf2 |
children | 8be1318fbe77 a6e09d6dd8e5 |
line wrap: on
line diff
--- a/src/cpu/x86/vm/x86_64.ad Tue Apr 02 09:30:07 2013 +0200 +++ b/src/cpu/x86/vm/x86_64.ad Wed Apr 03 11:12:57 2013 -0700 @@ -399,6 +399,9 @@ static int preserve_SP_size() { return 3; // rex.w, op, rm(reg/reg) } +static int clear_avx_size() { + return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper +} // !!!!! Special hack to get all types of calls to specify the byte offset // from the start of the call to the point where the return address @@ -406,6 +409,7 @@ int MachCallStaticJavaNode::ret_addr_offset() { int offset = 5; // 5 bytes from start of call to where return address points + offset += clear_avx_size(); if (_method_handle_invoke) offset += preserve_SP_size(); return offset; @@ -413,11 +417,16 @@ int MachCallDynamicJavaNode::ret_addr_offset() { - return 15; // 15 bytes from start of call to where return address points + int offset = 15; // 15 bytes from start of call to where return address points + offset += clear_avx_size(); + return offset; } -// In os_cpu .ad file -// int MachCallRuntimeNode::ret_addr_offset() +int MachCallRuntimeNode::ret_addr_offset() { + int offset = 13; // movq r10,#addr; callq (r10) + offset += clear_avx_size(); + return offset; +} // Indicate if the safepoint node needs the polling page as an input, // it does if the polling page is more than disp32 away. @@ -434,6 +443,7 @@ // ensure that it does not span a cache line so that it can be patched. int CallStaticJavaDirectNode::compute_padding(int current_offset) const { + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -443,6 +453,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const { current_offset += preserve_SP_size(); // skip mov rbp, rsp + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -451,6 +462,7 @@ // ensure that it does not span a cache line so that it can be patched. int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 11; // skip movq instruction + call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -764,6 +776,11 @@ void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const { Compile* C = ra_->C; + if (C->max_vector_size() > 16) { + st->print("vzeroupper"); + st->cr(); st->print("\t"); + } + int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return adr already pushed @@ -793,6 +810,13 @@ void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const { Compile* C = ra_->C; + if (C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); + } + int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return adr already pushed @@ -2008,6 +2032,25 @@ __ bind(miss); %} + enc_class clear_avx %{ + debug_only(int off0 = cbuf.insts_size()); + if (ra_->C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); + } + debug_only(int off1 = cbuf.insts_size()); + assert(off1 - off0 == clear_avx_size(), "correct size prediction"); + %} + + enc_class Java_To_Runtime(method meth) %{ + // No relocation needed + MacroAssembler _masm(&cbuf); + __ mov64(r10, (int64_t) $meth$$method); + __ call(r10); + %} + enc_class Java_To_Interpreter(method meth) %{ // CALL Java_To_Interpreter @@ -11366,7 +11409,7 @@ ins_cost(300); format %{ "call,static " %} opcode(0xE8); /* E8 cd */ - ins_encode(Java_Static_Call(meth), call_epilog); + ins_encode(clear_avx, Java_Static_Call(meth), call_epilog); ins_pipe(pipe_slow); ins_alignment(4); %} @@ -11384,7 +11427,7 @@ ins_cost(300); format %{ "call,static/MethodHandle " %} opcode(0xE8); /* E8 cd */ - ins_encode(preserve_SP, + ins_encode(clear_avx, preserve_SP, Java_Static_Call(meth), restore_SP, call_epilog); @@ -11403,7 +11446,7 @@ ins_cost(300); format %{ "movq rax, #Universe::non_oop_word()\n\t" "call,dynamic " %} - ins_encode(Java_Dynamic_Call(meth), call_epilog); + ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog); ins_pipe(pipe_slow); ins_alignment(4); %} @@ -11416,8 +11459,7 @@ ins_cost(300); format %{ "call,runtime " %} - opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} @@ -11429,8 +11471,7 @@ ins_cost(300); format %{ "call_leaf,runtime " %} - opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} @@ -11442,7 +11483,6 @@ ins_cost(300); format %{ "call_leaf_nofp,runtime " %} - opcode(0xE8); /* E8 cd */ ins_encode(Java_To_Runtime(meth)); ins_pipe(pipe_slow); %}