Mercurial > hg > truffle
diff src/cpu/x86/vm/assembler_x86.cpp @ 6275:957c266d8bc5
Merge with http://hg.openjdk.java.net/hsx/hsx24/hotspot/
author | Doug Simon <doug.simon@oracle.com> |
---|---|
date | Tue, 21 Aug 2012 10:39:19 +0200 |
parents | 33df1aeaebbf 1d7922586cf6 |
children | c38f13903fdf |
line wrap: on
line diff
--- a/src/cpu/x86/vm/assembler_x86.cpp Mon Aug 20 15:21:31 2012 +0200 +++ b/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 21 10:39:19 2012 +0200 @@ -41,6 +41,15 @@ #include "gc_implementation/g1/heapRegion.hpp" #endif +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#define STOP(error) stop(error) +#else +#define BLOCK_COMMENT(str) block_comment(str) +#define STOP(error) block_comment(error); stop(error) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") // Implementation of AddressLiteral AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) { @@ -528,10 +537,12 @@ if (which == end_pc_operand) return ip + (is_64bit ? 8 : 4); // these asserts are somewhat nonsensical #ifndef _LP64 - assert(which == imm_operand || which == disp32_operand, ""); + assert(which == imm_operand || which == disp32_operand, + err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip)); #else assert((which == call32_operand || which == imm_operand) && is_64bit || - which == narrow_oop_operand && !is_64bit, ""); + which == narrow_oop_operand && !is_64bit, + err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip)); #endif // _LP64 return ip; @@ -1635,6 +1646,13 @@ emit_byte(0xC0 | encode); } +void Assembler::movlhps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE); + emit_byte(0x16); + emit_byte(0xC0 | encode); +} + void Assembler::movb(Register dst, Address src) { NOT_LP64(assert(dst->has_byte_register(), "must have byte register")); InstructionMark im(this); @@ -1684,6 +1702,14 @@ emit_operand(dst, src); } +void Assembler::movdl(Address dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionMark im(this); + simd_prefix(dst, src, VEX_SIMD_66); + emit_byte(0x7E); + emit_operand(src, dst); +} + void Assembler::movdqa(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66); @@ -1714,6 +1740,35 @@ emit_operand(src, dst); } +// Move Unaligned 256bit Vector +void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { + assert(UseAVX, ""); + bool vector256 = true; + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256); + emit_byte(0x6F); + emit_byte(0xC0 | encode); +} + +void Assembler::vmovdqu(XMMRegister dst, Address src) { + assert(UseAVX, ""); + InstructionMark im(this); + bool vector256 = true; + vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256); + emit_byte(0x6F); + emit_operand(dst, src); +} + +void Assembler::vmovdqu(Address dst, XMMRegister src) { + assert(UseAVX, ""); + InstructionMark im(this); + bool vector256 = true; + // swap src<->dst for encoding + assert(src != xnoreg, "sanity"); + vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256); + emit_byte(0x7F); + emit_operand(src, dst); +} + // Uses zero extension on 64bit void Assembler::movl(Register dst, int32_t imm32) { @@ -2527,6 +2582,13 @@ emit_byte(0xC0 | encode); } +void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); + emit_byte(0x6C); + emit_byte(0xC0 | encode); +} + void Assembler::push(int32_t imm32) { // in 64bits we push 64bits onto the stack but only // take a 32bit immediate @@ -3110,6 +3172,13 @@ emit_operand(dst, src); } +void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256); + emit_byte(0x57); + emit_byte(0xC0 | encode); +} + void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); @@ -3118,6 +3187,48 @@ emit_operand(dst, src); } +void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256); + emit_byte(0x57); + emit_byte(0xC0 | encode); +} + +void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx2() || (!vector256) && VM_Version::supports_avx(), ""); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256); + emit_byte(0xEF); + emit_byte(0xC0 | encode); +} + +void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { + assert(VM_Version::supports_avx(), ""); + bool vector256 = true; + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + emit_byte(0x18); + emit_byte(0xC0 | encode); + // 0x00 - insert into lower 128 bits + // 0x01 - insert into upper 128 bits + emit_byte(0x01); +} + +void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { + assert(VM_Version::supports_avx2(), ""); + bool vector256 = true; + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + emit_byte(0x38); + emit_byte(0xC0 | encode); + // 0x00 - insert into lower 128 bits + // 0x01 - insert into upper 128 bits + emit_byte(0x01); +} + +void Assembler::vzeroupper() { + assert(VM_Version::supports_avx(), ""); + (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE); + emit_byte(0x77); +} + #ifndef _LP64 // 32bit only pieces of the assembler @@ -3576,6 +3687,21 @@ emit_byte(0xF1); } +void Assembler::frndint() { + emit_byte(0xD9); + emit_byte(0xFC); +} + +void Assembler::f2xm1() { + emit_byte(0xD9); + emit_byte(0xF0); +} + +void Assembler::fldl2e() { + emit_byte(0xD9); + emit_byte(0xEA); +} + // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding. static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 }; // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding. @@ -5391,23 +5517,7 @@ // To see where a verify_oop failed, get $ebx+40/X for this frame. // This is the value of eip which points to where verify_oop will return. if (os::message_box(msg, "Execution stopped, print registers?")) { - ttyLocker ttyl; - tty->print_cr("eip = 0x%08x", eip); -#ifndef PRODUCT - if ((WizardMode || Verbose) && PrintMiscellaneous) { - tty->cr(); - findpc(eip); - tty->cr(); - } -#endif - tty->print_cr("rax = 0x%08x", rax); - tty->print_cr("rbx = 0x%08x", rbx); - tty->print_cr("rcx = 0x%08x", rcx); - tty->print_cr("rdx = 0x%08x", rdx); - tty->print_cr("rdi = 0x%08x", rdi); - tty->print_cr("rsi = 0x%08x", rsi); - tty->print_cr("rbp = 0x%08x", rbp); - tty->print_cr("rsp = 0x%08x", rsp); + print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip); BREAKPOINT; assert(false, "start up GDB"); } @@ -5419,12 +5529,53 @@ ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); } +void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) { + ttyLocker ttyl; + FlagSetting fs(Debugging, true); + tty->print_cr("eip = 0x%08x", eip); +#ifndef PRODUCT + if ((WizardMode || Verbose) && PrintMiscellaneous) { + tty->cr(); + findpc(eip); + tty->cr(); + } +#endif +#define PRINT_REG(rax) \ + { tty->print("%s = ", #rax); os::print_location(tty, rax); } + PRINT_REG(rax); + PRINT_REG(rbx); + PRINT_REG(rcx); + PRINT_REG(rdx); + PRINT_REG(rdi); + PRINT_REG(rsi); + PRINT_REG(rbp); + PRINT_REG(rsp); +#undef PRINT_REG + // Print some words near top of staack. + int* dump_sp = (int*) rsp; + for (int col1 = 0; col1 < 8; col1++) { + tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); + os::print_location(tty, *dump_sp++); + } + for (int row = 0; row < 16; row++) { + tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); + for (int col = 0; col < 8; col++) { + tty->print(" 0x%08x", *dump_sp++); + } + tty->cr(); + } + // Print some instructions around pc: + Disassembler::decode((address)eip-64, (address)eip); + tty->print_cr("--------"); + Disassembler::decode((address)eip, (address)eip+32); +} + void MacroAssembler::stop(const char* msg) { ExternalAddress message((address)msg); // push address of message pushptr(message.addr()); { Label L; call(L, relocInfo::none); bind(L); } // push eip - pusha(); // push registers + pusha(); // push registers call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32))); hlt(); } @@ -5441,6 +5592,18 @@ pop_CPU_state(); } +void MacroAssembler::print_state() { + { Label L; call(L, relocInfo::none); bind(L); } // push eip + pusha(); // push registers + + push_CPU_state(); + call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32))); + pop_CPU_state(); + + popa(); + addl(rsp, wordSize); +} + #else // _LP64 // 64 bit versions @@ -5906,14 +6069,33 @@ } void MacroAssembler::warn(const char* msg) { - push(rsp); + push(rbp); + movq(rbp, rsp); andq(rsp, -16); // align stack as required by push_CPU_state and call - push_CPU_state(); // keeps alignment at 16 bytes lea(c_rarg0, ExternalAddress((address) msg)); call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0); pop_CPU_state(); - pop(rsp); + mov(rsp, rbp); + pop(rbp); +} + +void MacroAssembler::print_state() { + address rip = pc(); + pusha(); // get regs on stack + push(rbp); + movq(rbp, rsp); + andq(rsp, -16); // align stack as required by push_CPU_state and call + push_CPU_state(); // keeps alignment at 16 bytes + + lea(c_rarg0, InternalAddress(rip)); + lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array + call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1); + + pop_CPU_state(); + mov(rsp, rbp); + pop(rbp); + popa(); } #ifndef PRODUCT @@ -5922,7 +6104,7 @@ void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) { // In order to get locks to work, we need to fake a in_VM state - if (ShowMessageBoxOnError ) { + if (ShowMessageBoxOnError) { JavaThread* thread = JavaThread::current(); JavaThreadState saved_state = thread->thread_state(); thread->set_thread_state(_thread_in_vm); @@ -5936,30 +6118,9 @@ // XXX correct this offset for amd64 // This is the value of eip which points to where verify_oop will return. if (os::message_box(msg, "Execution stopped, print registers?")) { - ttyLocker ttyl; - tty->print_cr("rip = 0x%016lx", pc); -#ifndef PRODUCT - tty->cr(); - findpc(pc); - tty->cr(); -#endif - tty->print_cr("rax = 0x%016lx", regs[15]); - tty->print_cr("rbx = 0x%016lx", regs[12]); - tty->print_cr("rcx = 0x%016lx", regs[14]); - tty->print_cr("rdx = 0x%016lx", regs[13]); - tty->print_cr("rdi = 0x%016lx", regs[8]); - tty->print_cr("rsi = 0x%016lx", regs[9]); - tty->print_cr("rbp = 0x%016lx", regs[10]); - tty->print_cr("rsp = 0x%016lx", regs[11]); - tty->print_cr("r8 = 0x%016lx", regs[7]); - tty->print_cr("r9 = 0x%016lx", regs[6]); - tty->print_cr("r10 = 0x%016lx", regs[5]); - tty->print_cr("r11 = 0x%016lx", regs[4]); - tty->print_cr("r12 = 0x%016lx", regs[3]); - tty->print_cr("r13 = 0x%016lx", regs[2]); - tty->print_cr("r14 = 0x%016lx", regs[1]); - tty->print_cr("r15 = 0x%016lx", regs[0]); + print_state64(pc, regs); BREAKPOINT; + assert(false, "start up GDB"); } ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); } else { @@ -5970,6 +6131,54 @@ } } +void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) { + ttyLocker ttyl; + FlagSetting fs(Debugging, true); + tty->print_cr("rip = 0x%016lx", pc); +#ifndef PRODUCT + tty->cr(); + findpc(pc); + tty->cr(); +#endif +#define PRINT_REG(rax, value) \ + { tty->print("%s = ", #rax); os::print_location(tty, value); } + PRINT_REG(rax, regs[15]); + PRINT_REG(rbx, regs[12]); + PRINT_REG(rcx, regs[14]); + PRINT_REG(rdx, regs[13]); + PRINT_REG(rdi, regs[8]); + PRINT_REG(rsi, regs[9]); + PRINT_REG(rbp, regs[10]); + PRINT_REG(rsp, regs[11]); + PRINT_REG(r8 , regs[7]); + PRINT_REG(r9 , regs[6]); + PRINT_REG(r10, regs[5]); + PRINT_REG(r11, regs[4]); + PRINT_REG(r12, regs[3]); + PRINT_REG(r13, regs[2]); + PRINT_REG(r14, regs[1]); + PRINT_REG(r15, regs[0]); +#undef PRINT_REG + // Print some words near top of staack. + int64_t* rsp = (int64_t*) regs[11]; + int64_t* dump_sp = rsp; + for (int col1 = 0; col1 < 8; col1++) { + tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp); + os::print_location(tty, *dump_sp++); + } + for (int row = 0; row < 25; row++) { + tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp); + for (int col = 0; col < 4; col++) { + tty->print(" 0x%016lx", *dump_sp++); + } + tty->cr(); + } + // Print some instructions around pc: + Disassembler::decode((address)pc-64, (address)pc); + tty->print_cr("--------"); + Disassembler::decode((address)pc, (address)pc+32); +} + #endif // _LP64 // Now versions that are common to 32/64 bit @@ -6339,7 +6548,7 @@ get_thread(rax); cmpptr(java_thread, rax); jcc(Assembler::equal, L); - stop("MacroAssembler::call_VM_base: rdi not callee saved?"); + STOP("MacroAssembler::call_VM_base: rdi not callee saved?"); bind(L); } pop(rax); @@ -6866,6 +7075,264 @@ Assembler::fldcw(as_Address(src)); } +void MacroAssembler::pow_exp_core_encoding() { + // kills rax, rcx, rdx + subptr(rsp,sizeof(jdouble)); + // computes 2^X. Stack: X ... + // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and + // keep it on the thread's stack to compute 2^int(X) later + // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1) + // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X)) + fld_s(0); // Stack: X X ... + frndint(); // Stack: int(X) X ... + fsuba(1); // Stack: int(X) X-int(X) ... + fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ... + f2xm1(); // Stack: 2^(X-int(X))-1 ... + fld1(); // Stack: 1 2^(X-int(X))-1 ... + faddp(1); // Stack: 2^(X-int(X)) + // computes 2^(int(X)): add exponent bias (1023) to int(X), then + // shift int(X)+1023 to exponent position. + // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11 + // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent + // values so detect them and set result to NaN. + movl(rax,Address(rsp,0)); + movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding + addl(rax, 1023); + movl(rdx,rax); + shll(rax,20); + // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN. + addl(rdx,1); + // Check that 1 < int(X)+1023+1 < 2048 + // in 3 steps: + // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048 + // 2- (int(X)+1023+1)&-2048 != 0 + // 3- (int(X)+1023+1)&-2048 != 1 + // Do 2- first because addl just updated the flags. + cmov32(Assembler::equal,rax,rcx); + cmpl(rdx,1); + cmov32(Assembler::equal,rax,rcx); + testl(rdx,rcx); + cmov32(Assembler::notEqual,rax,rcx); + movl(Address(rsp,4),rax); + movl(Address(rsp,0),0); + fmul_d(Address(rsp,0)); // Stack: 2^X ... + addptr(rsp,sizeof(jdouble)); +} + +void MacroAssembler::increase_precision() { + subptr(rsp, BytesPerWord); + fnstcw(Address(rsp, 0)); + movl(rax, Address(rsp, 0)); + orl(rax, 0x300); + push(rax); + fldcw(Address(rsp, 0)); + pop(rax); +} + +void MacroAssembler::restore_precision() { + fldcw(Address(rsp, 0)); + addptr(rsp, BytesPerWord); +} + +void MacroAssembler::fast_pow() { + // computes X^Y = 2^(Y * log2(X)) + // if fast computation is not possible, result is NaN. Requires + // fallback from user of this macro. + // increase precision for intermediate steps of the computation + increase_precision(); + fyl2x(); // Stack: (Y*log2(X)) ... + pow_exp_core_encoding(); // Stack: exp(X) ... + restore_precision(); +} + +void MacroAssembler::fast_exp() { + // computes exp(X) = 2^(X * log2(e)) + // if fast computation is not possible, result is NaN. Requires + // fallback from user of this macro. + // increase precision for intermediate steps of the computation + increase_precision(); + fldl2e(); // Stack: log2(e) X ... + fmulp(1); // Stack: (X*log2(e)) ... + pow_exp_core_encoding(); // Stack: exp(X) ... + restore_precision(); +} + +void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) { + // kills rax, rcx, rdx + // pow and exp needs 2 extra registers on the fpu stack. + Label slow_case, done; + Register tmp = noreg; + if (!VM_Version::supports_cmov()) { + // fcmp needs a temporary so preserve rdx, + tmp = rdx; + } + Register tmp2 = rax; + Register tmp3 = rcx; + + if (is_exp) { + // Stack: X + fld_s(0); // duplicate argument for runtime call. Stack: X X + fast_exp(); // Stack: exp(X) X + fcmp(tmp, 0, false, false); // Stack: exp(X) X + // exp(X) not equal to itself: exp(X) is NaN go to slow case. + jcc(Assembler::parity, slow_case); + // get rid of duplicate argument. Stack: exp(X) + if (num_fpu_regs_in_use > 0) { + fxch(); + fpop(); + } else { + ffree(1); + } + jmp(done); + } else { + // Stack: X Y + Label x_negative, y_odd; + + fldz(); // Stack: 0 X Y + fcmp(tmp, 1, true, false); // Stack: X Y + jcc(Assembler::above, x_negative); + + // X >= 0 + + fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y + fld_s(1); // Stack: X Y X Y + fast_pow(); // Stack: X^Y X Y + fcmp(tmp, 0, false, false); // Stack: X^Y X Y + // X^Y not equal to itself: X^Y is NaN go to slow case. + jcc(Assembler::parity, slow_case); + // get rid of duplicate arguments. Stack: X^Y + if (num_fpu_regs_in_use > 0) { + fxch(); fpop(); + fxch(); fpop(); + } else { + ffree(2); + ffree(1); + } + jmp(done); + + // X <= 0 + bind(x_negative); + + fld_s(1); // Stack: Y X Y + frndint(); // Stack: int(Y) X Y + fcmp(tmp, 2, false, false); // Stack: int(Y) X Y + jcc(Assembler::notEqual, slow_case); + + subptr(rsp, 8); + + // For X^Y, when X < 0, Y has to be an integer and the final + // result depends on whether it's odd or even. We just checked + // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit + // integer to test its parity. If int(Y) is huge and doesn't fit + // in the 64 bit integer range, the integer indefinite value will + // end up in the gp registers. Huge numbers are all even, the + // integer indefinite number is even so it's fine. + +#ifdef ASSERT + // Let's check we don't end up with an integer indefinite number + // when not expected. First test for huge numbers: check whether + // int(Y)+1 == int(Y) which is true for very large numbers and + // those are all even. A 64 bit integer is guaranteed to not + // overflow for numbers where y+1 != y (when precision is set to + // double precision). + Label y_not_huge; + + fld1(); // Stack: 1 int(Y) X Y + fadd(1); // Stack: 1+int(Y) int(Y) X Y + +#ifdef _LP64 + // trip to memory to force the precision down from double extended + // precision + fstp_d(Address(rsp, 0)); + fld_d(Address(rsp, 0)); +#endif + + fcmp(tmp, 1, true, false); // Stack: int(Y) X Y +#endif + + // move int(Y) as 64 bit integer to thread's stack + fistp_d(Address(rsp,0)); // Stack: X Y + +#ifdef ASSERT + jcc(Assembler::notEqual, y_not_huge); + + // Y is huge so we know it's even. It may not fit in a 64 bit + // integer and we don't want the debug code below to see the + // integer indefinite value so overwrite int(Y) on the thread's + // stack with 0. + movl(Address(rsp, 0), 0); + movl(Address(rsp, 4), 0); + + bind(y_not_huge); +#endif + + fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y + fld_s(1); // Stack: X Y X Y + fabs(); // Stack: abs(X) Y X Y + fast_pow(); // Stack: abs(X)^Y X Y + fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y + // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case. + + pop(tmp2); + NOT_LP64(pop(tmp3)); + jcc(Assembler::parity, slow_case); + +#ifdef ASSERT + // Check that int(Y) is not integer indefinite value (int + // overflow). Shouldn't happen because for values that would + // overflow, 1+int(Y)==Y which was tested earlier. +#ifndef _LP64 + { + Label integer; + testl(tmp2, tmp2); + jcc(Assembler::notZero, integer); + cmpl(tmp3, 0x80000000); + jcc(Assembler::notZero, integer); + STOP("integer indefinite value shouldn't be seen here"); + bind(integer); + } +#else + { + Label integer; + mov(tmp3, tmp2); // preserve tmp2 for parity check below + shlq(tmp3, 1); + jcc(Assembler::carryClear, integer); + jcc(Assembler::notZero, integer); + STOP("integer indefinite value shouldn't be seen here"); + bind(integer); + } +#endif +#endif + + // get rid of duplicate arguments. Stack: X^Y + if (num_fpu_regs_in_use > 0) { + fxch(); fpop(); + fxch(); fpop(); + } else { + ffree(2); + ffree(1); + } + + testl(tmp2, 1); + jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y + // X <= 0, Y even: X^Y = -abs(X)^Y + + fchs(); // Stack: -abs(X)^Y Y + jmp(done); + } + + // slow case: runtime call + bind(slow_case); + + fpop(); // pop incorrect result or int(Y) + + fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow), + is_exp ? 1 : 2, num_fpu_regs_in_use); + + // Come here with result in F-TOS + bind(done); +} + void MacroAssembler::fpop() { ffree(); fincstp(); @@ -7130,6 +7597,24 @@ movb(as_Address(dst), src); } +void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) { + if (reachable(src)) { + movdl(dst, as_Address(src)); + } else { + lea(rscratch1, src); + movdl(dst, Address(rscratch1, 0)); + } +} + +void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) { + if (reachable(src)) { + movq(dst, as_Address(src)); + } else { + lea(rscratch1, src); + movq(dst, Address(rscratch1, 0)); + } +} + void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) { if (reachable(src)) { if (UseXmmLoadAndClearUpper) { @@ -7995,7 +8480,7 @@ shlptr(tsize, LogHeapWordSize); cmpptr(t1, tsize); jcc(Assembler::equal, ok); - stop("assert(t1 != tlab size)"); + STOP("assert(t1 != tlab size)"); should_not_reach_here(); bind(ok); @@ -8043,6 +8528,144 @@ #endif } +void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) { + pusha(); + + // if we are coming from c1, xmm registers may be live + if (UseSSE >= 1) { + subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8)); + } + int off = 0; + if (UseSSE == 1) { + movflt(Address(rsp,off++*sizeof(jdouble)),xmm0); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm1); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm2); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm3); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm4); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm5); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm6); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm7); + } else if (UseSSE >= 2) { + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7); +#ifdef _LP64 + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15); +#endif + } + + // Preserve registers across runtime call + int incoming_argument_and_return_value_offset = -1; + if (num_fpu_regs_in_use > 1) { + // Must preserve all other FPU regs (could alternatively convert + // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash + // FPU state, but can not trust C compiler) + NEEDS_CLEANUP; + // NOTE that in this case we also push the incoming argument(s) to + // the stack and restore it later; we also use this stack slot to + // hold the return value from dsin, dcos etc. + for (int i = 0; i < num_fpu_regs_in_use; i++) { + subptr(rsp, sizeof(jdouble)); + fstp_d(Address(rsp, 0)); + } + incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1); + for (int i = nb_args-1; i >= 0; i--) { + fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble))); + } + } + + subptr(rsp, nb_args*sizeof(jdouble)); + for (int i = 0; i < nb_args; i++) { + fstp_d(Address(rsp, i*sizeof(jdouble))); + } + +#ifdef _LP64 + if (nb_args > 0) { + movdbl(xmm0, Address(rsp, 0)); + } + if (nb_args > 1) { + movdbl(xmm1, Address(rsp, sizeof(jdouble))); + } + assert(nb_args <= 2, "unsupported number of args"); +#endif // _LP64 + + // NOTE: we must not use call_VM_leaf here because that requires a + // complete interpreter frame in debug mode -- same bug as 4387334 + // MacroAssembler::call_VM_leaf_base is perfectly safe and will + // do proper 64bit abi + + NEEDS_CLEANUP; + // Need to add stack banging before this runtime call if it needs to + // be taken; however, there is no generic stack banging routine at + // the MacroAssembler level + + MacroAssembler::call_VM_leaf_base(runtime_entry, 0); + +#ifdef _LP64 + movsd(Address(rsp, 0), xmm0); + fld_d(Address(rsp, 0)); +#endif // _LP64 + addptr(rsp, sizeof(jdouble) * nb_args); + if (num_fpu_regs_in_use > 1) { + // Must save return value to stack and then restore entire FPU + // stack except incoming arguments + fstp_d(Address(rsp, incoming_argument_and_return_value_offset)); + for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) { + fld_d(Address(rsp, 0)); + addptr(rsp, sizeof(jdouble)); + } + fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble))); + addptr(rsp, sizeof(jdouble) * nb_args); + } + + off = 0; + if (UseSSE == 1) { + movflt(xmm0, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm1, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm2, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm3, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm4, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm5, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm6, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm7, Address(rsp,off++*sizeof(jdouble))); + } else if (UseSSE >= 2) { + movdbl(xmm0, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm1, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm2, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm3, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm4, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm5, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm6, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm7, Address(rsp,off++*sizeof(jdouble))); +#ifdef _LP64 + movdbl(xmm8, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm9, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm10, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm11, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm12, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm13, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm14, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm15, Address(rsp,off++*sizeof(jdouble))); +#endif + } + if (UseSSE >= 1) { + addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8)); + } + popa(); +} + static const double pi_4 = 0.7853981633974483; void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) { @@ -8090,73 +8713,27 @@ // slow case: runtime call bind(slow_case); - // Preserve registers across runtime call - pusha(); - int incoming_argument_and_return_value_offset = -1; - if (num_fpu_regs_in_use > 1) { - // Must preserve all other FPU regs (could alternatively convert - // SharedRuntime::dsin and dcos into assembly routines known not to trash - // FPU state, but can not trust C compiler) - NEEDS_CLEANUP; - // NOTE that in this case we also push the incoming argument to - // the stack and restore it later; we also use this stack slot to - // hold the return value from dsin or dcos. - for (int i = 0; i < num_fpu_regs_in_use; i++) { - subptr(rsp, sizeof(jdouble)); - fstp_d(Address(rsp, 0)); - } - incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1); - fld_d(Address(rsp, incoming_argument_and_return_value_offset)); - } - subptr(rsp, sizeof(jdouble)); - fstp_d(Address(rsp, 0)); -#ifdef _LP64 - movdbl(xmm0, Address(rsp, 0)); -#endif // _LP64 - - // NOTE: we must not use call_VM_leaf here because that requires a - // complete interpreter frame in debug mode -- same bug as 4387334 - // MacroAssembler::call_VM_leaf_base is perfectly safe and will - // do proper 64bit abi - - NEEDS_CLEANUP; - // Need to add stack banging before this runtime call if it needs to - // be taken; however, there is no generic stack banging routine at - // the MacroAssembler level + switch(trig) { case 's': { - MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use); } break; case 'c': { - MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use); } break; case 't': { - MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use); } break; default: assert(false, "bad intrinsic"); break; } -#ifdef _LP64 - movsd(Address(rsp, 0), xmm0); - fld_d(Address(rsp, 0)); -#endif // _LP64 - addptr(rsp, sizeof(jdouble)); - if (num_fpu_regs_in_use > 1) { - // Must save return value to stack and then restore entire FPU stack - fstp_d(Address(rsp, incoming_argument_and_return_value_offset)); - for (int i = 0; i < num_fpu_regs_in_use; i++) { - fld_d(Address(rsp, 0)); - addptr(rsp, sizeof(jdouble)); - } - } - popa(); // Come here with result in F-TOS bind(done); @@ -8242,6 +8819,19 @@ } +// virtual method calling +void MacroAssembler::lookup_virtual_method(Register recv_klass, + RegisterOrConstant vtable_index, + Register method_result) { + const int base = instanceKlass::vtable_start_offset() * wordSize; + assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below"); + Address vtable_entry_addr(recv_klass, + vtable_index, Address::times_ptr, + base + vtableEntry::method_offset_in_bytes()); + movptr(method_result, vtable_entry_addr); +} + + void MacroAssembler::check_klass_subtype(Register sub_klass, Register super_klass, Register temp_reg, @@ -8491,6 +9081,7 @@ // Pass register number to verify_oop_subroutine char* b = new char[strlen(s) + 50]; sprintf(b, "verify_oop: %s: %s", reg->name(), s); + BLOCK_COMMENT("verify_oop {"); #ifdef _LP64 push(rscratch1); // save r10, trashed by movptr() #endif @@ -8505,6 +9096,7 @@ movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); call(rax); // Caller pops the arguments (oop, message) and restores rax, r10 + BLOCK_COMMENT("} verify_oop"); } @@ -8525,7 +9117,7 @@ jcc(Assembler::notZero, L); char* buf = new char[40]; sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]); - stop(buf); + STOP(buf); } else { jccb(Assembler::notZero, L); hlt(); @@ -8541,60 +9133,6 @@ } -// registers on entry: -// - rax ('check' register): required MethodType -// - rcx: method handle -// - rdx, rsi, or ?: killable temp -void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, - Register temp_reg, - Label& wrong_method_type) { - Address type_addr(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg)); - // compare method type against that of the receiver - if (UseCompressedOops) { - load_heap_oop(temp_reg, type_addr); - cmpptr(mtype_reg, temp_reg); - } else { - cmpptr(mtype_reg, type_addr); - } - jcc(Assembler::notEqual, wrong_method_type); -} - - -// A method handle has a "vmslots" field which gives the size of its -// argument list in JVM stack slots. This field is either located directly -// in every method handle, or else is indirectly accessed through the -// method handle's MethodType. This macro hides the distinction. -void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg, - Register temp_reg) { - assert_different_registers(vmslots_reg, mh_reg, temp_reg); - // load mh.type.form.vmslots - Register temp2_reg = vmslots_reg; - load_heap_oop(temp2_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg))); - load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, temp_reg))); - movl(vmslots_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, temp_reg))); -} - - -// registers on entry: -// - rcx: method handle -// - rdx: killable temp (interpreted only) -// - rax: killable temp (compiled only) -void MacroAssembler::jump_to_method_handle_entry(Register mh_reg, Register temp_reg) { - assert(mh_reg == rcx, "caller must put MH object in rcx"); - assert_different_registers(mh_reg, temp_reg); - - // pick out the interpreted side of the handler - // NOTE: vmentry is not an oop! - movptr(temp_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmentry_offset_in_bytes, temp_reg))); - - // off we go... - jmp(Address(temp_reg, MethodHandleEntry::from_interpreted_entry_offset_in_bytes())); - - // for the various stubs which take control at this point, - // see MethodHandles::generate_method_handle_stub -} - - Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, int extra_slot_offset) { // cf. TemplateTable::prepare_invoke(), if (load_receiver). @@ -8667,14 +9205,14 @@ movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); jcc(Assembler::aboveEqual, next); - stop("assert(top >= start)"); + STOP("assert(top >= start)"); should_not_reach_here(); bind(next); movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); jcc(Assembler::aboveEqual, ok); - stop("assert(top <= end)"); + STOP("assert(top <= end)"); should_not_reach_here(); bind(ok); @@ -9107,6 +9645,25 @@ movptr(dst, src); } +void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) { + assert_different_registers(src1, tmp); +#ifdef _LP64 + if (UseCompressedOops) { + bool did_push = false; + if (tmp == noreg) { + tmp = rax; + push(tmp); + did_push = true; + assert(!src2.uses(rsp), "can't push"); + } + load_heap_oop(tmp, src2); + cmpptr(src1, tmp); + if (did_push) pop(tmp); + } else +#endif + cmpptr(src1, src2); +} + // Used for storing NULLs. void MacroAssembler::store_heap_oop_null(Address dst) { #ifdef _LP64 @@ -9137,7 +9694,7 @@ push(rscratch1); // cmpptr trashes rscratch1 cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr())); jcc(Assembler::equal, ok); - stop(msg); + STOP(msg); bind(ok); pop(rscratch1); } @@ -9170,7 +9727,7 @@ Label ok; testq(r, r); jcc(Assembler::notEqual, ok); - stop("null oop passed to encode_heap_oop_not_null"); + STOP("null oop passed to encode_heap_oop_not_null"); bind(ok); } #endif @@ -9191,7 +9748,7 @@ Label ok; testq(src, src); jcc(Assembler::notEqual, ok); - stop("null oop passed to encode_heap_oop_not_null2"); + STOP("null oop passed to encode_heap_oop_not_null2"); bind(ok); } #endif @@ -9382,7 +9939,7 @@ cmpptr(rax, StackAlignmentInBytes-wordSize); pop(rax); jcc(Assembler::equal, L); - stop("Stack is not properly aligned!"); + STOP("Stack is not properly aligned!"); bind(L); } #endif @@ -10056,13 +10613,6 @@ bind(DONE); } -#ifdef PRODUCT -#define BLOCK_COMMENT(str) /* nothing */ -#else -#define BLOCK_COMMENT(str) block_comment(str) -#endif - -#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") void MacroAssembler::generate_fill(BasicType t, bool aligned, Register to, Register value, Register count, Register rtmp, XMMRegister xtmp) {