# HG changeset patch # User kvn # Date 1329370669 28800 # Node ID fd8114661503c7f45c5f364fc246f1d22e0e13b0 # Parent 69333a2fbae24a72a51a7c1a7dd0fac627748958 7125136: SIGILL on linux amd64 in gc/ArrayJuggle/Juggle29 Summary: For C2 moved saving EBP after ESP adjustment. For C1 generated 5 byte nop instruction first if needed. Reviewed-by: never, twisti, azeemj diff -r 69333a2fbae2 -r fd8114661503 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Wed Feb 15 16:29:40 2012 -0800 +++ b/src/cpu/x86/vm/assembler_x86.cpp Wed Feb 15 21:37:49 2012 -0800 @@ -236,6 +236,16 @@ } } +// Force generation of a 4 byte immediate value even if it fits into 8bit +void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) { + assert(isByte(op1) && isByte(op2), "wrong opcode"); + assert((op1 & 0x01) == 1, "should be 32bit operation"); + assert((op1 & 0x02) == 0, "sign-extension bit should not be set"); + emit_byte(op1); + emit_byte(op2 | encode(dst)); + emit_long(imm32); +} + // immediate-to-memory forms void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) { assert((op1 & 0x01) == 1, "should be 32bit operation"); @@ -939,6 +949,7 @@ } void Assembler::addr_nop_4() { + assert(UseAddressNop, "no CPU support"); // 4 bytes: NOP DWORD PTR [EAX+0] emit_byte(0x0F); emit_byte(0x1F); @@ -947,6 +958,7 @@ } void Assembler::addr_nop_5() { + assert(UseAddressNop, "no CPU support"); // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset emit_byte(0x0F); emit_byte(0x1F); @@ -956,6 +968,7 @@ } void Assembler::addr_nop_7() { + assert(UseAddressNop, "no CPU support"); // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset emit_byte(0x0F); emit_byte(0x1F); @@ -964,6 +977,7 @@ } void Assembler::addr_nop_8() { + assert(UseAddressNop, "no CPU support"); // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset emit_byte(0x0F); emit_byte(0x1F); @@ -2769,6 +2783,12 @@ emit_arith(0x81, 0xE8, dst, imm32); } +// Force generation of a 4 byte immediate value even if it fits into 8bit +void Assembler::subl_imm32(Register dst, int32_t imm32) { + prefix(dst); + emit_arith_imm32(0x81, 0xE8, dst, imm32); +} + void Assembler::subl(Register dst, Address src) { InstructionMark im(this); prefix(src, dst); @@ -4760,6 +4780,12 @@ emit_arith(0x81, 0xE8, dst, imm32); } +// Force generation of a 4 byte immediate value even if it fits into 8bit +void Assembler::subq_imm32(Register dst, int32_t imm32) { + (void) prefixq_and_encode(dst->encoding()); + emit_arith_imm32(0x81, 0xE8, dst, imm32); +} + void Assembler::subq(Register dst, Address src) { InstructionMark im(this); prefixq(src, dst); @@ -5101,15 +5127,6 @@ } } -void MacroAssembler::fat_nop() { - // A 5 byte nop that is safe for patching (see patch_verified_entry) - emit_byte(0x26); // es: - emit_byte(0x2e); // cs: - emit_byte(0x64); // fs: - emit_byte(0x65); // gs: - emit_byte(0x90); -} - void MacroAssembler::jC2(Register tmp, Label& L) { // set parity bit if FPU flag C2 is set (via rax) save_rax(tmp); @@ -5704,17 +5721,6 @@ /* else */ { subq(dst, value) ; return; } } -void MacroAssembler::fat_nop() { - // A 5 byte nop that is safe for patching (see patch_verified_entry) - // Recommened sequence from 'Software Optimization Guide for the AMD - // Hammer Processor' - emit_byte(0x66); - emit_byte(0x66); - emit_byte(0x90); - emit_byte(0x66); - emit_byte(0x90); -} - void MacroAssembler::incrementq(Register reg, int value) { if (value == min_jint) { addq(reg, value); return; } if (value < 0) { decrementq(reg, -value); return; } @@ -6766,6 +6772,19 @@ mov(rbp, rsp); } +// A 5 byte nop that is safe for patching (see patch_verified_entry) +void MacroAssembler::fat_nop() { + if (UseAddressNop) { + addr_nop_5(); + } else { + emit_byte(0x26); // es: + emit_byte(0x2e); // cs: + emit_byte(0x64); // fs: + emit_byte(0x65); // gs: + emit_byte(0x90); + } +} + void MacroAssembler::fcmp(Register tmp) { fcmp(tmp, 1, true, true); } @@ -7825,6 +7844,11 @@ LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); } +// Force generation of a 4 byte immediate value even if it fits into 8bit +void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { + LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); +} + void MacroAssembler::subptr(Register dst, Register src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); } @@ -9292,6 +9316,80 @@ } #endif // _LP64 + +// C2 compiled method's prolog code. +void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) { + + // WARNING: Initial instruction MUST be 5 bytes or longer so that + // NativeJump::patch_verified_entry will be able to patch out the entry + // code safely. The push to verify stack depth is ok at 5 bytes, + // the frame allocation can be either 3 or 6 bytes. So if we don't do + // stack bang then we must use the 6 byte frame allocation even if + // we have no frame. :-( + + assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); + // Remove word for return addr + framesize -= wordSize; + + // Calls to C2R adapters often do not accept exceptional returns. + // We require that their callers must bang for them. But be careful, because + // some VM calls (such as call site linkage) can use several kilobytes of + // stack. But the stack safety zone should account for that. + // See bugs 4446381, 4468289, 4497237. + if (stack_bang) { + generate_stack_overflow_check(framesize); + + // We always push rbp, so that on return to interpreter rbp, will be + // restored correctly and we can correct the stack. + push(rbp); + // Remove word for ebp + framesize -= wordSize; + + // Create frame + if (framesize) { + subptr(rsp, framesize); + } + } else { + // Create frame (force generation of a 4 byte immediate value) + subptr_imm32(rsp, framesize); + + // Save RBP register now. + framesize -= wordSize; + movptr(Address(rsp, framesize), rbp); + } + + if (VerifyStackAtCalls) { // Majik cookie to verify stack depth + framesize -= wordSize; + movptr(Address(rsp, framesize), (int32_t)0xbadb100d); + } + +#ifndef _LP64 + // If method sets FPU control word do it now + if (fp_mode_24b) { + fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); + } + if (UseSSE >= 2 && VerifyFPU) { + verify_FPU(0, "FPU stack must be clean on entry"); + } +#endif + +#ifdef ASSERT + if (VerifyStackAtCalls) { + Label L; + push(rax); + mov(rax, rsp); + andptr(rax, StackAlignmentInBytes-1); + cmpptr(rax, StackAlignmentInBytes-wordSize); + pop(rax); + jcc(Assembler::equal, L); + stop("Stack is not properly aligned!"); + bind(L); + } +#endif + +} + + // IndexOf for constant substrings with size >= 8 chars // which don't need to be loaded through stack. void MacroAssembler::string_indexofC8(Register str1, Register str2, diff -r 69333a2fbae2 -r fd8114661503 src/cpu/x86/vm/assembler_x86.hpp --- a/src/cpu/x86/vm/assembler_x86.hpp Wed Feb 15 16:29:40 2012 -0800 +++ b/src/cpu/x86/vm/assembler_x86.hpp Wed Feb 15 21:37:49 2012 -0800 @@ -667,6 +667,8 @@ void emit_arith_b(int op1, int op2, Register dst, int imm8); void emit_arith(int op1, int op2, Register dst, int32_t imm32); + // Force generation of a 4 byte immediate value even if it fits into 8bit + void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32); // only 32bit?? void emit_arith(int op1, int op2, Register dst, jobject obj); void emit_arith(int op1, int op2, Register dst, Register src); @@ -1526,6 +1528,9 @@ void subq(Register dst, Address src); void subq(Register dst, Register src); + // Force generation of a 4 byte immediate value even if it fits into 8bit + void subl_imm32(Register dst, int32_t imm32); + void subq_imm32(Register dst, int32_t imm32); // Subtract Scalar Double-Precision Floating-Point Values void subsd(XMMRegister dst, Address src); @@ -1763,8 +1768,8 @@ // Alignment void align(int modulus); - // Misc - void fat_nop(); // 5 byte nop + // A 5 byte nop that is safe for patching (see patch_verified_entry) + void fat_nop(); // Stack frame creation/removal void enter(); @@ -2275,6 +2280,8 @@ void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); } void subptr(Register dst, int32_t src); + // Force generation of a 4 byte immediate value even if it fits into 8bit + void subptr_imm32(Register dst, int32_t src); void subptr(Register dst, Register src); void subptr(Register dst, RegisterOrConstant src) { if (src.is_constant()) subptr(dst, (int) src.as_constant()); @@ -2566,6 +2573,9 @@ void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); } void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); } + // C2 compiled method's prolog code. + void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b); + // IndexOf strings. // Small strings are loaded through stack if they cross page boundary. void string_indexof(Register str1, Register str2, diff -r 69333a2fbae2 -r fd8114661503 src/cpu/x86/vm/c1_MacroAssembler_x86.cpp --- a/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp Wed Feb 15 16:29:40 2012 -0800 +++ b/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp Wed Feb 15 21:37:49 2012 -0800 @@ -381,6 +381,16 @@ void C1_MacroAssembler::verified_entry() { + if (C1Breakpoint || VerifyFPU || !UseStackBanging) { + // Verified Entry first instruction should be 5 bytes long for correct + // patching by patch_verified_entry(). + // + // C1Breakpoint and VerifyFPU have one byte first instruction. + // Also first instruction will be one byte "push(rbp)" if stack banging + // code is not generated (see build_frame() above). + // For all these cases generate long instruction first. + fat_nop(); + } if (C1Breakpoint)int3(); // build frame verify_FPU(0, "method_entry"); diff -r 69333a2fbae2 -r fd8114661503 src/cpu/x86/vm/x86_32.ad --- a/src/cpu/x86/vm/x86_32.ad Wed Feb 15 16:29:40 2012 -0800 +++ b/src/cpu/x86/vm/x86_32.ad Wed Feb 15 21:37:49 2012 -0800 @@ -550,118 +550,66 @@ //============================================================================= #ifndef PRODUCT -void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream* st ) const { +void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const { Compile* C = ra_->C; - if( C->in_24_bit_fp_mode() ) { - st->print("FLDCW 24 bit fpu control word"); - st->print_cr(""); st->print("\t"); - } int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); - // Remove two words for return addr and rbp, - framesize -= 2*wordSize; - - // Calls to C2R adapters often do not accept exceptional returns. - // We require that their callers must bang for them. But be careful, because - // some VM calls (such as call site linkage) can use several kilobytes of - // stack. But the stack safety zone should account for that. - // See bugs 4446381, 4468289, 4497237. + // Remove wordSize for return addr which is already pushed. + framesize -= wordSize; + if (C->need_stack_bang(framesize)) { - st->print_cr("# stack bang"); st->print("\t"); - } - st->print_cr("PUSHL EBP"); st->print("\t"); - - if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth - st->print("PUSH 0xBADB100D\t# Majik cookie for stack depth check"); - st->print_cr(""); st->print("\t"); framesize -= wordSize; - } - - if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) { + st->print("# stack bang"); + st->print("\n\t"); + st->print("PUSH EBP\t# Save EBP"); if (framesize) { - st->print("SUB ESP,%d\t# Create frame",framesize); + st->print("\n\t"); + st->print("SUB ESP, #%d\t# Create frame",framesize); } } else { - st->print("SUB ESP,%d\t# Create frame",framesize); + st->print("SUB ESP, #%d\t# Create frame",framesize); + st->print("\n\t"); + framesize -= wordSize; + st->print("MOV [ESP + #%d], EBP\t# Save EBP",framesize); + } + + if (VerifyStackAtCalls) { + st->print("\n\t"); + framesize -= wordSize; + st->print("MOV [ESP + #%d], 0xBADB100D\t# Majik cookie for stack depth check",framesize); } + + if( C->in_24_bit_fp_mode() ) { + st->print("\n\t"); + st->print("FLDCW \t# load 24 bit fpu control word"); + } + if (UseSSE >= 2 && VerifyFPU) { + st->print("\n\t"); + st->print("# verify FPU stack (must be clean on entry)"); + } + +#ifdef ASSERT + if (VerifyStackAtCalls) { + st->print("\n\t"); + st->print("# stack alignment check"); + } +#endif + st->cr(); } #endif void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { Compile* C = ra_->C; - - if (UseSSE >= 2 && VerifyFPU) { - MacroAssembler masm(&cbuf); - masm.verify_FPU(0, "FPU stack must be clean on entry"); - } - - // WARNING: Initial instruction MUST be 5 bytes or longer so that - // NativeJump::patch_verified_entry will be able to patch out the entry - // code safely. The fldcw is ok at 6 bytes, the push to verify stack - // depth is ok at 5 bytes, the frame allocation can be either 3 or - // 6 bytes. So if we don't do the fldcw or the push then we must - // use the 6 byte frame allocation even if we have no frame. :-( - // If method sets FPU control word do it now - if( C->in_24_bit_fp_mode() ) { - MacroAssembler masm(&cbuf); - masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); - } + MacroAssembler _masm(&cbuf); int framesize = C->frame_slots() << LogBytesPerInt; - assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); - // Remove two words for return addr and rbp, - framesize -= 2*wordSize; - - // Calls to C2R adapters often do not accept exceptional returns. - // We require that their callers must bang for them. But be careful, because - // some VM calls (such as call site linkage) can use several kilobytes of - // stack. But the stack safety zone should account for that. - // See bugs 4446381, 4468289, 4497237. - if (C->need_stack_bang(framesize)) { - MacroAssembler masm(&cbuf); - masm.generate_stack_overflow_check(framesize); - } - - // We always push rbp, so that on return to interpreter rbp, will be - // restored correctly and we can correct the stack. - emit_opcode(cbuf, 0x50 | EBP_enc); - - if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth - emit_opcode(cbuf, 0x68); // push 0xbadb100d - emit_d32(cbuf, 0xbadb100d); - framesize -= wordSize; - } - - if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) { - if (framesize) { - emit_opcode(cbuf, 0x83); // sub SP,#framesize - emit_rm(cbuf, 0x3, 0x05, ESP_enc); - emit_d8(cbuf, framesize); - } - } else { - emit_opcode(cbuf, 0x81); // sub SP,#framesize - emit_rm(cbuf, 0x3, 0x05, ESP_enc); - emit_d32(cbuf, framesize); - } + + __ verified_entry(framesize, C->need_stack_bang(framesize), C->in_24_bit_fp_mode()); + C->set_frame_complete(cbuf.insts_size()); -#ifdef ASSERT - if (VerifyStackAtCalls) { - Label L; - MacroAssembler masm(&cbuf); - masm.push(rax); - masm.mov(rax, rsp); - masm.andptr(rax, StackAlignmentInBytes-1); - masm.cmpptr(rax, StackAlignmentInBytes-wordSize); - masm.pop(rax); - masm.jcc(Assembler::equal, L); - masm.stop("Stack is not properly aligned!"); - masm.bind(L); - } -#endif - if (C->has_mach_constant_base_node()) { // NOTE: We set the table base offset here because users might be // emitted before MachConstantBaseNode. diff -r 69333a2fbae2 -r fd8114661503 src/cpu/x86/vm/x86_64.ad --- a/src/cpu/x86/vm/x86_64.ad Wed Feb 15 16:29:40 2012 -0800 +++ b/src/cpu/x86/vm/x86_64.ad Wed Feb 15 21:37:49 2012 -0800 @@ -853,121 +853,53 @@ //============================================================================= #ifndef PRODUCT -void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const -{ +void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const { Compile* C = ra_->C; int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); - // Remove wordSize for return adr already pushed - // and another for the RBP we are going to save - framesize -= 2*wordSize; - bool need_nop = true; - - // Calls to C2R adapters often do not accept exceptional returns. - // We require that their callers must bang for them. But be - // careful, because some VM calls (such as call site linkage) can - // use several kilobytes of stack. But the stack safety zone should - // account for that. See bugs 4446381, 4468289, 4497237. + // Remove wordSize for return addr which is already pushed. + framesize -= wordSize; + if (C->need_stack_bang(framesize)) { - st->print_cr("# stack bang"); st->print("\t"); - need_nop = false; + framesize -= wordSize; + st->print("# stack bang"); + st->print("\n\t"); + st->print("pushq rbp\t# Save rbp"); + if (framesize) { + st->print("\n\t"); + st->print("subq rsp, #%d\t# Create frame",framesize); + } + } else { + st->print("subq rsp, #%d\t# Create frame",framesize); + st->print("\n\t"); + framesize -= wordSize; + st->print("movq [rsp + #%d], rbp\t# Save rbp",framesize); } - st->print_cr("pushq rbp"); st->print("\t"); if (VerifyStackAtCalls) { - // Majik cookie to verify stack depth - st->print_cr("pushq 0xffffffffbadb100d" - "\t# Majik cookie for stack depth check"); - st->print("\t"); - framesize -= wordSize; // Remove 2 for cookie - need_nop = false; + st->print("\n\t"); + framesize -= wordSize; + st->print("movq [rsp + #%d], 0xbadb100d\t# Majik cookie for stack depth check",framesize); +#ifdef ASSERT + st->print("\n\t"); + st->print("# stack alignment check"); +#endif } - - if (framesize) { - st->print("subq rsp, #%d\t# Create frame", framesize); - if (framesize < 0x80 && need_nop) { - st->print("\n\tnop\t# nop for patch_verified_entry"); - } - } + st->cr(); } #endif -void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const -{ +void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { Compile* C = ra_->C; - - // WARNING: Initial instruction MUST be 5 bytes or longer so that - // NativeJump::patch_verified_entry will be able to patch out the entry - // code safely. The fldcw is ok at 6 bytes, the push to verify stack - // depth is ok at 5 bytes, the frame allocation can be either 3 or - // 6 bytes. So if we don't do the fldcw or the push then we must - // use the 6 byte frame allocation even if we have no frame. :-( - // If method sets FPU control word do it now + MacroAssembler _masm(&cbuf); int framesize = C->frame_slots() << LogBytesPerInt; - assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); - // Remove wordSize for return adr already pushed - // and another for the RBP we are going to save - framesize -= 2*wordSize; - bool need_nop = true; - - // Calls to C2R adapters often do not accept exceptional returns. - // We require that their callers must bang for them. But be - // careful, because some VM calls (such as call site linkage) can - // use several kilobytes of stack. But the stack safety zone should - // account for that. See bugs 4446381, 4468289, 4497237. - if (C->need_stack_bang(framesize)) { - MacroAssembler masm(&cbuf); - masm.generate_stack_overflow_check(framesize); - need_nop = false; - } - - // We always push rbp so that on return to interpreter rbp will be - // restored correctly and we can correct the stack. - emit_opcode(cbuf, 0x50 | RBP_enc); - - if (VerifyStackAtCalls) { - // Majik cookie to verify stack depth - emit_opcode(cbuf, 0x68); // pushq (sign-extended) 0xbadb100d - emit_d32(cbuf, 0xbadb100d); - framesize -= wordSize; // Remove 2 for cookie - need_nop = false; - } - - if (framesize) { - emit_opcode(cbuf, Assembler::REX_W); - if (framesize < 0x80) { - emit_opcode(cbuf, 0x83); // sub SP,#framesize - emit_rm(cbuf, 0x3, 0x05, RSP_enc); - emit_d8(cbuf, framesize); - if (need_nop) { - emit_opcode(cbuf, 0x90); // nop - } - } else { - emit_opcode(cbuf, 0x81); // sub SP,#framesize - emit_rm(cbuf, 0x3, 0x05, RSP_enc); - emit_d32(cbuf, framesize); - } - } + + __ verified_entry(framesize, C->need_stack_bang(framesize), false); C->set_frame_complete(cbuf.insts_size()); -#ifdef ASSERT - if (VerifyStackAtCalls) { - Label L; - MacroAssembler masm(&cbuf); - masm.push(rax); - masm.mov(rax, rsp); - masm.andptr(rax, StackAlignmentInBytes-1); - masm.cmpptr(rax, StackAlignmentInBytes-wordSize); - masm.pop(rax); - masm.jcc(Assembler::equal, L); - masm.stop("Stack is not properly aligned!"); - masm.bind(L); - } -#endif - if (C->has_mach_constant_base_node()) { // NOTE: We set the table base offset here because users might be // emitted before MachConstantBaseNode. diff -r 69333a2fbae2 -r fd8114661503 src/share/vm/opto/output.cpp --- a/src/share/vm/opto/output.cpp Wed Feb 15 16:29:40 2012 -0800 +++ b/src/share/vm/opto/output.cpp Wed Feb 15 21:37:49 2012 -0800 @@ -167,7 +167,7 @@ // Determine if we need to generate a stack overflow check. // Do it if the method is not a stub function and // has java calls or has frame size > vm_page_size/8. - return (stub_function() == NULL && + return (UseStackBanging && stub_function() == NULL && (has_java_calls() || frame_size_in_bytes > os::vm_page_size()>>3)); }