graal-compiler: comparison src/cpu/x86/vm/x86_32.ad @ 18041:52b4284cb496
Merge with jdk8u20-b26
author    Gilles Duboscq <duboscq@ssw.jku.at>
date      Wed, 15 Oct 2014 16:02:50 +0200
parents   89152779163c 0bf37f737702
children  (none)
17606:45d7b2c7029d (old) | 18041:52b4284cb496 (new) |
485 | 485 |
486 int Compile::ConstantTable::calculate_table_base_offset() const { | 486 int Compile::ConstantTable::calculate_table_base_offset() const { |
487 return 0; // absolute addressing, no offset | 487 return 0; // absolute addressing, no offset |
488 } | 488 } |
489 | 489 |
490 bool MachConstantBaseNode::requires_postalloc_expand() const { return false; } | |
491 void MachConstantBaseNode::postalloc_expand(GrowableArray <Node *> *nodes, PhaseRegAlloc *ra_) { | |
492 ShouldNotReachHere(); | |
493 } | |
494 | |
490 void MachConstantBaseNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const { | 495 void MachConstantBaseNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const { |
491 // Empty encoding | 496 // Empty encoding |
492 } | 497 } |
493 | 498 |
494 uint MachConstantBaseNode::size(PhaseRegAlloc* ra_) const { | 499 uint MachConstantBaseNode::size(PhaseRegAlloc* ra_) const { |
505 //============================================================================= | 510 //============================================================================= |
506 #ifndef PRODUCT | 511 #ifndef PRODUCT |
507 void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const { | 512 void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const { |
508 Compile* C = ra_->C; | 513 Compile* C = ra_->C; |
509 | 514 |
510 int framesize = C->frame_slots() << LogBytesPerInt; | 515 int framesize = C->frame_size_in_bytes(); |
516 int bangsize = C->bang_size_in_bytes(); | |
511 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 517 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
512 // Remove wordSize for return addr which is already pushed. | 518 // Remove wordSize for return addr which is already pushed. |
513 framesize -= wordSize; | 519 framesize -= wordSize; |
514 | 520 |
515 if (C->need_stack_bang(framesize)) { | 521 if (C->need_stack_bang(bangsize)) { |
516 framesize -= wordSize; | 522 framesize -= wordSize; |
517 st->print("# stack bang"); | 523 st->print("# stack bang (%d bytes)", bangsize); |
518 st->print("\n\t"); | 524 st->print("\n\t"); |
519 st->print("PUSH EBP\t# Save EBP"); | 525 st->print("PUSH EBP\t# Save EBP"); |
520 if (framesize) { | 526 if (framesize) { |
521 st->print("\n\t"); | 527 st->print("\n\t"); |
522 st->print("SUB ESP, #%d\t# Create frame",framesize); | 528 st->print("SUB ESP, #%d\t# Create frame",framesize); |
556 | 562 |
557 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { | 563 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { |
558 Compile* C = ra_->C; | 564 Compile* C = ra_->C; |
559 MacroAssembler _masm(&cbuf); | 565 MacroAssembler _masm(&cbuf); |
560 | 566 |
561 int framesize = C->frame_slots() << LogBytesPerInt; | 567 int framesize = C->frame_size_in_bytes(); |
562 | 568 int bangsize = C->bang_size_in_bytes(); |
563 __ verified_entry(framesize, C->need_stack_bang(framesize), C->in_24_bit_fp_mode()); | 569 |
570 __ verified_entry(framesize, C->need_stack_bang(bangsize)?bangsize:0, C->in_24_bit_fp_mode()); | |
564 | 571 |
565 C->set_frame_complete(cbuf.insts_size()); | 572 C->set_frame_complete(cbuf.insts_size()); |
566 | 573 |
567 if (C->has_mach_constant_base_node()) { | 574 if (C->has_mach_constant_base_node()) { |
568 // NOTE: We set the table base offset here because users might be | 575 // NOTE: We set the table base offset here because users might be |
582 | 589 |
583 //============================================================================= | 590 //============================================================================= |
584 #ifndef PRODUCT | 591 #ifndef PRODUCT |
585 void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const { | 592 void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const { |
586 Compile *C = ra_->C; | 593 Compile *C = ra_->C; |
587 int framesize = C->frame_slots() << LogBytesPerInt; | 594 int framesize = C->frame_size_in_bytes(); |
588 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 595 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
589 // Remove two words for return addr and rbp, | 596 // Remove two words for return addr and rbp, |
590 framesize -= 2*wordSize; | 597 framesize -= 2*wordSize; |
591 | 598 |
592 if (C->max_vector_size() > 16) { | 599 if (C->max_vector_size() > 16) { |
622 if (C->in_24_bit_fp_mode()) { | 629 if (C->in_24_bit_fp_mode()) { |
623 MacroAssembler masm(&cbuf); | 630 MacroAssembler masm(&cbuf); |
624 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); | 631 masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); |
625 } | 632 } |
626 | 633 |
627 int framesize = C->frame_slots() << LogBytesPerInt; | 634 int framesize = C->frame_size_in_bytes(); |
628 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 635 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
629 // Remove two words for return addr and rbp, | 636 // Remove two words for return addr and rbp, |
630 framesize -= 2*wordSize; | 637 framesize -= 2*wordSize; |
631 | 638 |
632 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here | 639 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here |
656 // If method set FPU control word, restore to standard control word | 663 // If method set FPU control word, restore to standard control word |
657 int size = C->in_24_bit_fp_mode() ? 6 : 0; | 664 int size = C->in_24_bit_fp_mode() ? 6 : 0; |
658 if (C->max_vector_size() > 16) size += 3; // vzeroupper | 665 if (C->max_vector_size() > 16) size += 3; // vzeroupper |
659 if (do_polling() && C->is_method_compilation()) size += 6; | 666 if (do_polling() && C->is_method_compilation()) size += 6; |
660 | 667 |
661 int framesize = C->frame_slots() << LogBytesPerInt; | 668 int framesize = C->frame_size_in_bytes(); |
662 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); | 669 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); |
663 // Remove two words for return addr and rbp, | 670 // Remove two words for return addr and rbp, |
664 framesize -= 2*wordSize; | 671 framesize -= 2*wordSize; |
665 | 672 |
666 size++; // popl rbp, | 673 size++; // popl rbp, |
1290 return OptoBreakpoint ? 11 : 12; | 1297 return OptoBreakpoint ? 11 : 12; |
1291 } | 1298 } |
1292 | 1299 |
1293 | 1300 |
1294 //============================================================================= | 1301 //============================================================================= |
1295 uint size_exception_handler() { | |
1296 // NativeCall instruction size is the same as NativeJump. | |
1297 // exception handler starts out as a jump and can be patched to |
1298 // a call by deoptimization. (4932387) |
1299 // Note that this value is also credited (in output.cpp) to | |
1300 // the size of the code section. | |
1301 return NativeJump::instruction_size; | |
1302 } | |
1303 | |
1304 // Emit exception handler code. Stuff framesize into a register | |
1305 // and call a VM stub routine. | |
1306 int emit_exception_handler(CodeBuffer& cbuf) { | |
1307 | |
1308 // Note that the code buffer's insts_mark is always relative to insts. | |
1309 // That's why we must use the macroassembler to generate a handler. | |
1310 MacroAssembler _masm(&cbuf); | |
1311 address base = | |
1312 __ start_a_stub(size_exception_handler()); | |
1313 if (base == NULL) return 0; // CodeBuffer::expand failed | |
1314 int offset = __ offset(); | |
1315 __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point())); | |
1316 assert(__ offset() - offset <= (int) size_exception_handler(), "overflow"); | |
1317 __ end_a_stub(); | |
1318 return offset; | |
1319 } | |
1320 | |
1321 uint size_deopt_handler() { | |
1322 // NativeCall instruction size is the same as NativeJump. | |
1323 // exception handler starts out as a jump and can be patched to |
1324 // a call by deoptimization. (4932387) |
1325 // Note that this value is also credited (in output.cpp) to | |
1326 // the size of the code section. | |
1327 return 5 + NativeJump::instruction_size; // pushl(); jmp; | |
1328 } | |
1329 | |
1330 // Emit deopt handler code. | |
1331 int emit_deopt_handler(CodeBuffer& cbuf) { | |
1332 | |
1333 // Note that the code buffer's insts_mark is always relative to insts. | |
1334 // That's why we must use the macroassembler to generate a handler. | |
1335 MacroAssembler _masm(&cbuf); | |
1336 address base = | |
1337 __ start_a_stub(size_exception_handler()); | |
1338 if (base == NULL) return 0; // CodeBuffer::expand failed | |
1339 int offset = __ offset(); | |
1340 InternalAddress here(__ pc()); | |
1341 __ pushptr(here.addr()); | |
1342 | |
1343 __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack())); | |
1344 assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow"); | |
1345 __ end_a_stub(); | |
1346 return offset; | |
1347 } | |
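The two handler emitters shown above on the left are gone from this spot in the newer revision. Their sizing is simple byte accounting: the deopt handler is a pushl of its own address followed by a jump to the deopt blob's unpack entry, and size_deopt_handler() budgets 5 + NativeJump::instruction_size bytes for it. A small sanity-check sketch of that arithmetic, assuming the usual 32-bit x86 encodings (push imm32 and jmp rel32 are 5 bytes each):

    // Byte budget for the emitted deopt handler (illustrative check only).
    enum {
      kPushImm32Bytes = 5,   // 0x68 + 4-byte immediate  (pushptr of the handler's own address)
      kJmpRel32Bytes  = 5    // 0xE9 + 4-byte offset     (NativeJump::instruction_size on x86)
    };
    static_assert(kPushImm32Bytes + kJmpRel32Bytes == 10,
                  "matches size_deopt_handler() = 5 + NativeJump::instruction_size");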
1348 | 1302 |
1349 int Matcher::regnum_to_fpu_offset(int regnum) { | 1303 int Matcher::regnum_to_fpu_offset(int regnum) { |
1350 return regnum - 32; // The FP registers are in the second chunk | 1304 return regnum - 32; // The FP registers are in the second chunk |
1351 } | 1305 } |
1352 | 1306 |
1386 // Needs 2 CMOV's for longs. | 1340 // Needs 2 CMOV's for longs. |
1387 const int Matcher::long_cmove_cost() { return 1; } | 1341 const int Matcher::long_cmove_cost() { return 1; } |
1388 | 1342 |
1389 // No CMOVF/CMOVD with SSE/SSE2 | 1343 // No CMOVF/CMOVD with SSE/SSE2 |
1390 const int Matcher::float_cmove_cost() { return (UseSSE>=1) ? ConditionalMoveLimit : 0; } | 1344 const int Matcher::float_cmove_cost() { return (UseSSE>=1) ? ConditionalMoveLimit : 0; } |
1345 | |
1346 // Does the CPU require late expand (see block.cpp for description of late expand)? | |
1347 const bool Matcher::require_postalloc_expand = false; | |
1391 | 1348 |
1392 // Should the Matcher clone shifts on addressing modes, expecting them to | 1349 // Should the Matcher clone shifts on addressing modes, expecting them to |
1393 // be subsumed into complex addressing expressions or compute them into | 1350 // be subsumed into complex addressing expressions or compute them into |
1394 // registers? True for Intel but false for most RISCs | 1351 // registers? True for Intel but false for most RISCs |
1395 const bool Matcher::clone_shift_expressions = true; | 1352 const bool Matcher::clone_shift_expressions = true; |
1530 return RegMask(); | 1487 return RegMask(); |
1531 } | 1488 } |
1532 | 1489 |
1533 const RegMask Matcher::method_handle_invoke_SP_save_mask() { | 1490 const RegMask Matcher::method_handle_invoke_SP_save_mask() { |
1534 return EBP_REG_mask(); | 1491 return EBP_REG_mask(); |
1535 } | |
1536 | |
1537 const RegMask Matcher::mathExactI_result_proj_mask() { | |
1538 return EAX_REG_mask(); | |
1539 } | |
1540 | |
1541 const RegMask Matcher::mathExactL_result_proj_mask() { | |
1542 ShouldNotReachHere(); | |
1543 return RegMask(); | |
1544 } | |
1545 | |
1546 const RegMask Matcher::mathExactI_flags_proj_mask() { | |
1547 return INT_FLAGS_mask(); | |
1548 } | 1492 } |
1549 | 1493 |
1550 // Returns true if the high 32 bits of the value is known to be zero. | 1494 // Returns true if the high 32 bits of the value is known to be zero. |
1551 bool is_operand_hi32_zero(Node* n) { | 1495 bool is_operand_hi32_zero(Node* n) { |
1552 int opc = n->Opcode(); | 1496 int opc = n->Opcode(); |
2907 emit_rm (cbuf,0x3, 0x3, $dst$$reg ); | 2851 emit_rm (cbuf,0x3, 0x3, $dst$$reg ); |
2908 emit_opcode(cbuf,0x83); // SBB hi,0 | 2852 emit_opcode(cbuf,0x83); // SBB hi,0 |
2909 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg)); | 2853 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg)); |
2910 emit_d8 (cbuf,0 ); | 2854 emit_d8 (cbuf,0 ); |
2911 %} | 2855 %} |
2912 | |
2913 | |
2914 // Because the transitions from emitted code to the runtime | |
2915 // monitorenter/exit helper stubs are so slow it's critical that | |
2916 // we inline both the stack-locking fast-path and the inflated fast path. | |
2917 // | |
2918 // See also: cmpFastLock and cmpFastUnlock. | |
2919 // | |
2920 // What follows is a specialized inline transliteration of the code | |
2921 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat | |
2922 // another option would be to emit TrySlowEnter and TrySlowExit methods | |
2923 // at startup-time. These methods would accept arguments as | |
2924 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure | |
2925 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply | |
2926 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. | |
2927 // In practice, however, the # of lock sites is bounded and is usually small. | |
2928 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer | |
2929 // if the processor uses simple bimodal branch predictors keyed by EIP, |
2930 // since the helper routines would be called from multiple synchronization |
2931 // sites. |
2932 // | |
2933 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()" |
2934 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites | |
2935 // to those specialized methods. That'd give us a mostly platform-independent | |
2936 // implementation that the JITs could optimize and inline at their pleasure. | |
2937 // Done correctly, the only time we'd need to cross to native code would be |
2938 // to park() or unpark() threads. We'd also need a few more unsafe operators | |
2939 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and | |
2940 // (b) explicit barriers or fence operations. | |
2941 // | |
2942 // TODO: | |
2943 // | |
2944 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr). | |
2945 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals. | |
2946 // Given TLAB allocation, Self is usually manifested in a register, so passing it into | |
2947 // the lock operators would typically be faster than reifying Self. | |
2948 // | |
2949 // * Ideally I'd define the primitives as: | |
2950 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. | |
2951 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED | |
2952 // Unfortunately ADLC bugs prevent us from expressing the ideal form. | |
2953 // Instead, we're stuck with rather awkward and brittle register assignments below. |
2954 // Furthermore the register assignments are overconstrained, possibly resulting in | |
2955 // sub-optimal code near the synchronization site. | |
2956 // | |
2957 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. | |
2958 // Alternately, use a better sp-proximity test. | |
2959 // | |
2960 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. | |
2961 // Either one is sufficient to uniquely identify a thread. | |
2962 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. | |
2963 // | |
2964 // * Intrinsify notify() and notifyAll() for the common cases where the | |
2965 // object is locked by the calling thread but the waitlist is empty. | |
2966 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). | |
2967 // | |
2968 // * use jccb and jmpb instead of jcc and jmp to improve code density. | |
2969 // But beware of excessive branch density on AMD Opterons. | |
2970 // | |
2971 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success | |
2972 // or failure of the fast-path. If the fast-path fails then we pass | |
2973 // control to the slow-path, typically in C. In Fast_Lock and | |
2974 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2 | |
2975 // will emit a conditional branch immediately after the node. | |
2976 // So we have branches to branches and lots of ICC.ZF games. | |
2977 // Instead, it might be better to have C2 pass a "FailureLabel" | |
2978 // into Fast_Lock and Fast_Unlock. In the case of success, control | |
2979 // will drop through the node. ICC.ZF is undefined at exit. | |
2980 // In the case of failure, the node will branch directly to the | |
2981 // FailureLabel | |
2982 | |
2983 | |
2984 // obj: object to lock | |
2985 // box: on-stack box address (displaced header location) - KILLED | |
2986 // rax,: tmp -- KILLED | |
2987 // scr: tmp -- KILLED | |
2988 enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{ | |
2989 | |
2990 Register objReg = as_Register($obj$$reg); | |
2991 Register boxReg = as_Register($box$$reg); | |
2992 Register tmpReg = as_Register($tmp$$reg); | |
2993 Register scrReg = as_Register($scr$$reg); | |
2994 | |
2995 // Ensure the register assignments are disjoint |
2996 guarantee (objReg != boxReg, "") ; | |
2997 guarantee (objReg != tmpReg, "") ; | |
2998 guarantee (objReg != scrReg, "") ; | |
2999 guarantee (boxReg != tmpReg, "") ; | |
3000 guarantee (boxReg != scrReg, "") ; | |
3001 guarantee (tmpReg == as_Register(EAX_enc), "") ; | |
3002 | |
3003 MacroAssembler masm(&cbuf); | |
3004 | |
3005 if (_counters != NULL) { | |
3006 masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr())); | |
3007 } | |
3008 if (EmitSync & 1) { | |
3009 // set box->dhw = unused_mark (3) | |
3010 // Force all sync thru slow-path: slow_enter() and slow_exit() | |
3011 masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ; | |
3012 masm.cmpptr (rsp, (int32_t)0) ; | |
3013 } else | |
3014 if (EmitSync & 2) { | |
3015 Label DONE_LABEL ; | |
3016 if (UseBiasedLocking) { | |
3017 // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument. | |
3018 masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters); | |
3019 } | |
3020 | |
3021 masm.movptr(tmpReg, Address(objReg, 0)) ; // fetch markword | |
3022 masm.orptr (tmpReg, 0x1); | |
3023 masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS | |
3024 if (os::is_MP()) { masm.lock(); } | |
3025 masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg | |
3026 masm.jcc(Assembler::equal, DONE_LABEL); | |
3027 // Recursive locking | |
3028 masm.subptr(tmpReg, rsp); | |
3029 masm.andptr(tmpReg, (int32_t) 0xFFFFF003 ); | |
3030 masm.movptr(Address(boxReg, 0), tmpReg); | |
3031 masm.bind(DONE_LABEL) ; | |
3032 } else { | |
3033 // Possible cases that we'll encounter in fast_lock | |
3034 // ------------------------------------------------ | |
3035 // * Inflated | |
3036 // -- unlocked | |
3037 // -- Locked | |
3038 // = by self | |
3039 // = by other | |
3040 // * biased | |
3041 // -- by Self | |
3042 // -- by other | |
3043 // * neutral | |
3044 // * stack-locked | |
3045 // -- by self | |
3046 // = sp-proximity test hits | |
3047 // = sp-proximity test generates false-negative | |
3048 // -- by other | |
3049 // | |
3050 | |
3051 Label IsInflated, DONE_LABEL, PopDone ; | |
3052 | |
3053 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage | |
3054 // order to reduce the number of conditional branches in the most common cases. | |
3055 // Beware -- there's a subtle invariant that fetch of the markword | |
3056 // at [FETCH], below, will never observe a biased encoding (*101b). | |
3057 // If this invariant is not held we risk exclusion (safety) failure. | |
3058 if (UseBiasedLocking && !UseOptoBiasInlining) { | |
3059 masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters); | |
3060 } | |
3061 | |
3062 masm.movptr(tmpReg, Address(objReg, 0)) ; // [FETCH] | |
3063 masm.testptr(tmpReg, 0x02) ; // Inflated v (Stack-locked or neutral) | |
3064 masm.jccb (Assembler::notZero, IsInflated) ; | |
3065 | |
3066 // Attempt stack-locking ... | |
3067 masm.orptr (tmpReg, 0x1); | |
3068 masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS | |
3069 if (os::is_MP()) { masm.lock(); } | |
3070 masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg | |
3071 if (_counters != NULL) { | |
3072 masm.cond_inc32(Assembler::equal, | |
3073 ExternalAddress((address)_counters->fast_path_entry_count_addr())); | |
3074 } | |
3075 masm.jccb (Assembler::equal, DONE_LABEL); | |
3076 | |
3077 // Recursive locking | |
3078 masm.subptr(tmpReg, rsp); | |
3079 masm.andptr(tmpReg, 0xFFFFF003 ); | |
3080 masm.movptr(Address(boxReg, 0), tmpReg); | |
3081 if (_counters != NULL) { | |
3082 masm.cond_inc32(Assembler::equal, | |
3083 ExternalAddress((address)_counters->fast_path_entry_count_addr())); | |
3084 } | |
3085 masm.jmp (DONE_LABEL) ; | |
3086 | |
3087 masm.bind (IsInflated) ; | |
3088 | |
3089 // The object is inflated. | |
3090 // | |
3091 // TODO-FIXME: eliminate the ugly use of manifest constants: | |
3092 // Use markOopDesc::monitor_value instead of "2". | |
3093 // use markOop::unused_mark() instead of "3". | |
3094 // The tmpReg value is an objectMonitor reference ORed with | |
3095 // markOopDesc::monitor_value (2). We can either convert tmpReg to an | |
3096 // objectmonitor pointer by masking off the "2" bit or we can just | |
3097 // use tmpReg as an objectmonitor pointer but bias the objectmonitor | |
3098 // field offsets with "-2" to compensate for and annul the low-order tag bit. | |
3099 // | |
3100 // I use the latter as it avoids AGI stalls. | |
3101 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]" | |
3102 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]". | |
3103 // | |
3104 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2) | |
3105 | |
3106 // boxReg refers to the on-stack BasicLock in the current frame. | |
3107 // We'd like to write: | |
3108 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices. | |
3109 // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers |
3110 // additional latency as we have another ST in the store buffer that must drain. | |
3111 | |
3112 if (EmitSync & 8192) { | |
3113 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty | |
3114 masm.get_thread (scrReg) ; | |
3115 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] | |
3116 masm.movptr(tmpReg, NULL_WORD); // consider: xor vs mov | |
3117 if (os::is_MP()) { masm.lock(); } | |
3118 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; | |
3119 } else | |
3120 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS | |
3121 masm.movptr(scrReg, boxReg) ; | |
3122 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] | |
3123 | |
3124 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes | |
3125 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { | |
3126 // prefetchw [eax + Offset(_owner)-2] | |
3127 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2)); | |
3128 } | |
3129 | |
3130 if ((EmitSync & 64) == 0) { | |
3131 // Optimistic form: consider XORL tmpReg,tmpReg | |
3132 masm.movptr(tmpReg, NULL_WORD) ; | |
3133 } else { | |
3134 // Can suffer RTS->RTO upgrades on shared or cold $ lines | |
3135 // Test-And-CAS instead of CAS | |
3136 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner | |
3137 masm.testptr(tmpReg, tmpReg) ; // Locked ? | |
3138 masm.jccb (Assembler::notZero, DONE_LABEL) ; | |
3139 } | |
3140 | |
3141 // Appears unlocked - try to swing _owner from null to non-null. | |
3142 // Ideally, I'd manifest "Self" with get_thread and then attempt | |
3143 // to CAS the register containing Self into m->Owner. | |
3144 // But we don't have enough registers, so instead we can either try to CAS | |
3145 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds | |
3146 // we later store "Self" into m->Owner. Transiently storing a stack address | |
3147 // (rsp or the address of the box) into m->owner is harmless. | |
3148 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. | |
3149 if (os::is_MP()) { masm.lock(); } | |
3150 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; | |
3151 masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3 | |
3152 masm.jccb (Assembler::notZero, DONE_LABEL) ; | |
3153 masm.get_thread (scrReg) ; // beware: clobbers ICCs | |
3154 masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ; | |
3155 masm.xorptr(boxReg, boxReg) ; // set icc.ZFlag = 1 to indicate success | |
3156 | |
3157 // If the CAS fails we can either retry or pass control to the slow-path. | |
3158 // We use the latter tactic. | |
3159 // Pass the CAS result in the icc.ZFlag into DONE_LABEL | |
3160 // If the CAS was successful ... | |
3161 // Self has acquired the lock | |
3162 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. | |
3163 // Intentional fall-through into DONE_LABEL ... | |
3164 } else { | |
3165 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty | |
3166 masm.movptr(boxReg, tmpReg) ; | |
3167 | |
3168 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes | |
3169 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { | |
3170 // prefetchw [eax + Offset(_owner)-2] | |
3171 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2)); | |
3172 } | |
3173 | |
3174 if ((EmitSync & 64) == 0) { | |
3175 // Optimistic form | |
3176 masm.xorptr (tmpReg, tmpReg) ; | |
3177 } else { | |
3178 // Can suffer RTS->RTO upgrades on shared or cold $ lines | |
3179 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner | |
3180 masm.testptr(tmpReg, tmpReg) ; // Locked ? | |
3181 masm.jccb (Assembler::notZero, DONE_LABEL) ; | |
3182 } | |
3183 | |
3184 // Appears unlocked - try to swing _owner from null to non-null. | |
3185 // Use either "Self" (in scr) or rsp as thread identity in _owner. | |
3186 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. | |
3187 masm.get_thread (scrReg) ; | |
3188 if (os::is_MP()) { masm.lock(); } | |
3189 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; | |
3190 | |
3191 // If the CAS fails we can either retry or pass control to the slow-path. | |
3192 // We use the latter tactic. | |
3193 // Pass the CAS result in the icc.ZFlag into DONE_LABEL | |
3194 // If the CAS was successful ... | |
3195 // Self has acquired the lock | |
3196 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. | |
3197 // Intentional fall-through into DONE_LABEL ... | |
3198 } | |
3199 | |
3200 // DONE_LABEL is a hot target - we'd really like to place it at the | |
3201 // start of cache line by padding with NOPs. | |
3202 // See the AMD and Intel software optimization manuals for the | |
3203 // most efficient "long" NOP encodings. | |
3204 // Unfortunately none of our alignment mechanisms suffice. | |
3205 masm.bind(DONE_LABEL); | |
3206 | |
3207 // Avoid branch-to-branch on AMD processors | |
3208 // This appears to be superstition. | |
3209 if (EmitSync & 32) masm.nop() ; | |
3210 | |
3211 | |
3212 // At DONE_LABEL the icc ZFlag is set as follows ... | |
3213 // Fast_Unlock uses the same protocol. | |
3214 // ZFlag == 1 -> Success | |
3215 // ZFlag == 0 -> Failure - force control through the slow-path | |
3216 } | |
3217 %} | |
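The inflated-lock path in Fast_Lock above leans on one small arithmetic trick: the markword of an inflated object is the ObjectMonitor address tagged with markOopDesc::monitor_value (2) in its low bits, and rather than masking the tag off, the generated code folds a -2 into every monitor field displacement. A standalone sketch of why the two addressing forms agree (the field offset parameter is a placeholder for illustration):

    #include <cstdint>

    static const intptr_t kMonitorTag = 2;   // markOopDesc::monitor_value

    inline char* monitor_field(intptr_t markword, intptr_t field_offset) {
      char* untagged = (char*)(markword & ~kMonitorTag) + field_offset;   // mask the tag, then add the offset
      char* skewed   = (char*)markword + field_offset - kMonitorTag;      // what Fast_Lock emits: [tmp + offset - 2]
      // The monitor pointer is word aligned, so only bit 1 is set by the tag
      // and both expressions name the same byte.
      return (untagged == skewed) ? skewed : untagged;
    }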
3218 | |
3219 // obj: object to unlock | |
3220 // box: box address (displaced header location), killed. Must be EAX. | |
3221 // rbx,: killed tmp; cannot be obj nor box. | |
3222 // | |
3223 // Some commentary on balanced locking: | |
3224 // | |
3225 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. | |
3226 // Methods that don't have provably balanced locking are forced to run in the | |
3227 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. | |
3228 // The interpreter provides two properties: | |
3229 // I1: At return-time the interpreter automatically and quietly unlocks any | |
3230 // objects acquired by the current activation (frame). Recall that the |
3231 // interpreter maintains an on-stack list of locks currently held by | |
3232 // a frame. | |
3233 // I2: If a method attempts to unlock an object that is not held by |
3234 // the frame, the interpreter throws IMSX. |
3235 // | |
3236 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). | |
3237 // B() doesn't have provably balanced locking so it runs in the interpreter. | |
3238 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O | |
3239 // is still locked by A(). | |
3240 // | |
3241 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: | |
3242 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter | |
3243 // should not be unlocked by "normal" java-level locking and vice-versa. The specification | |
3244 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. | |
3245 | |
3246 enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{ | |
3247 | |
3248 Register objReg = as_Register($obj$$reg); | |
3249 Register boxReg = as_Register($box$$reg); | |
3250 Register tmpReg = as_Register($tmp$$reg); | |
3251 | |
3252 guarantee (objReg != boxReg, "") ; | |
3253 guarantee (objReg != tmpReg, "") ; | |
3254 guarantee (boxReg != tmpReg, "") ; | |
3255 guarantee (boxReg == as_Register(EAX_enc), "") ; | |
3256 MacroAssembler masm(&cbuf); | |
3257 | |
3258 if (EmitSync & 4) { | |
3259 // Disable - inhibit all inlining. Force control through the slow-path | |
3260 masm.cmpptr (rsp, 0) ; | |
3261 } else | |
3262 if (EmitSync & 8) { | |
3263 Label DONE_LABEL ; | |
3264 if (UseBiasedLocking) { | |
3265 masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL); | |
3266 } | |
3267 // classic stack-locking code ... | |
3268 masm.movptr(tmpReg, Address(boxReg, 0)) ; | |
3269 masm.testptr(tmpReg, tmpReg) ; | |
3270 masm.jcc (Assembler::zero, DONE_LABEL) ; | |
3271 if (os::is_MP()) { masm.lock(); } | |
3272 masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box | |
3273 masm.bind(DONE_LABEL); | |
3274 } else { | |
3275 Label DONE_LABEL, Stacked, CheckSucc, Inflated ; | |
3276 | |
3277 // Critically, the biased locking test must have precedence over | |
3278 // and appear before the (box->dhw == 0) recursive stack-lock test. | |
3279 if (UseBiasedLocking && !UseOptoBiasInlining) { | |
3280 masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL); | |
3281 } | |
3282 | |
3283 masm.cmpptr(Address(boxReg, 0), 0) ; // Examine the displaced header | |
3284 masm.movptr(tmpReg, Address(objReg, 0)) ; // Examine the object's markword | |
3285 masm.jccb (Assembler::zero, DONE_LABEL) ; // 0 indicates recursive stack-lock | |
3286 | |
3287 masm.testptr(tmpReg, 0x02) ; // Inflated? | |
3288 masm.jccb (Assembler::zero, Stacked) ; | |
3289 | |
3290 masm.bind (Inflated) ; | |
3291 // It's inflated. | |
3292 // Despite our balanced locking property we still check that m->_owner == Self | |
3293 // as java routines or native JNI code called by this thread might | |
3294 // have released the lock. | |
3295 // Refer to the comments in synchronizer.cpp for how we might encode extra | |
3296 // state in _succ so we can avoid fetching EntryList|cxq. | |
3297 // | |
3298 // I'd like to add more cases in fast_lock() and fast_unlock() -- | |
3299 // such as recursive enter and exit -- but we have to be wary of | |
3300 // I$ bloat, T$ effects and BP$ effects. | |
3301 // | |
3302 // If there's no contention try a 1-0 exit. That is, exit without | |
3303 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how | |
3304 // we detect and recover from the race that the 1-0 exit admits. | |
3305 // | |
3306 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier | |
3307 // before it STs null into _owner, releasing the lock. Updates | |
3308 // to data protected by the critical section must be visible before | |
3309 // we drop the lock (and thus before any other thread could acquire | |
3310 // the lock and observe the fields protected by the lock). | |
3311 // IA32's memory-model is SPO, so STs are ordered with respect to | |
3312 // each other and there's no need for an explicit barrier (fence). | |
3313 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. | |
3314 | |
3315 masm.get_thread (boxReg) ; | |
3316 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { | |
3317 // prefetchw [ebx + Offset(_owner)-2] | |
3318 masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2)); | |
3319 } | |
3320 | |
3321 // Note that we could employ various encoding schemes to reduce | |
3322 // the number of loads below (currently 4) to just 2 or 3. | |
3323 // Refer to the comments in synchronizer.cpp. | |
3324 // In practice the chain of fetches doesn't seem to impact performance, however. | |
3325 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { | |
3326 // Attempt to reduce branch density - AMD's branch predictor. | |
3327 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; | |
3328 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ; | |
3329 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; | |
3330 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; | |
3331 masm.jccb (Assembler::notZero, DONE_LABEL) ; | |
3332 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; | |
3333 masm.jmpb (DONE_LABEL) ; | |
3334 } else { | |
3335 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; | |
3336 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ; | |
3337 masm.jccb (Assembler::notZero, DONE_LABEL) ; | |
3338 masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; | |
3339 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; | |
3340 masm.jccb (Assembler::notZero, CheckSucc) ; | |
3341 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; | |
3342 masm.jmpb (DONE_LABEL) ; | |
3343 } | |
3344 | |
3345 // The following code fragment (EmitSync & 65536) improves the performance of |
3346 // contended applications and contended synchronization microbenchmarks. | |
3347 // Unfortunately the emission of the code - even though not executed - causes regressions | |
3348 // in scimark and jetstream, evidently because of $ effects. Replacing the code | |
3349 // with an equal number of never-executed NOPs results in the same regression. | |
3350 // We leave it off by default. | |
3351 | |
3352 if ((EmitSync & 65536) != 0) { | |
3353 Label LSuccess, LGoSlowPath ; | |
3354 | |
3355 masm.bind (CheckSucc) ; | |
3356 | |
3357 // Optional pre-test ... it's safe to elide this | |
3358 if ((EmitSync & 16) == 0) { | |
3359 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; | |
3360 masm.jccb (Assembler::zero, LGoSlowPath) ; | |
3361 } | |
3362 | |
3363 // We have a classic Dekker-style idiom: | |
3364 // ST m->_owner = 0 ; MEMBAR; LD m->_succ | |
3365 // There are a number of ways to implement the barrier: | |
3366 // (1) lock:andl &m->_owner, 0 | |
3367 // is fast, but masm doesn't currently support the "ANDL M,IMM32" form. |
3368 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0 | |
3369 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8 | |
3370 // (2) If supported, an explicit MFENCE is appealing. | |
3371 // In older IA32 processors MFENCE is slower than lock:add or xchg | |
3372 // particularly if the write-buffer is full as might be the case if | |
3373 // stores closely precede the fence or fence-equivalent instruction. |
3374 // In more modern implementations MFENCE appears faster, however. | |
3375 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack | |
3376 // The $lines underlying the top-of-stack should be in M-state. | |
3377 // The locked add instruction is serializing, of course. | |
3378 // (4) Use xchg, which is serializing | |
3379 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works | |
3380 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0. | |
3381 // The integer condition codes will tell us if succ was 0. | |
3382 // Since _succ and _owner should reside in the same $line and | |
3383 // we just stored into _owner, it's likely that the $line | |
3384 // remains in M-state for the lock:orl. | |
3385 // | |
3386 // We currently use (3), although it's likely that switching to (2) | |
3387 // is correct for the future. | |
3388 | |
3389 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; | |
3390 if (os::is_MP()) { | |
3391 if (VM_Version::supports_sse2() && 1 == FenceInstruction) { | |
3392 masm.mfence(); | |
3393 } else { | |
3394 masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; | |
3395 } | |
3396 } | |
3397 // Ratify _succ remains non-null | |
3398 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; | |
3399 masm.jccb (Assembler::notZero, LSuccess) ; | |
3400 | |
3401 masm.xorptr(boxReg, boxReg) ; // box is really EAX | |
3402 if (os::is_MP()) { masm.lock(); } | |
3403 masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
3404 masm.jccb (Assembler::notEqual, LSuccess) ; | |
3405 // Since we're low on registers we installed rsp as a placeholder in _owner. |
3406 // Now install Self over rsp. This is safe as we're transitioning from | |
3407 // non-null to non-null |
3408 masm.get_thread (boxReg) ; | |
3409 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ; | |
3410 // Intentional fall-through into LGoSlowPath ... | |
3411 | |
3412 masm.bind (LGoSlowPath) ; | |
3413 masm.orptr(boxReg, 1) ; // set ICC.ZF=0 to indicate failure | |
3414 masm.jmpb (DONE_LABEL) ; | |
3415 | |
3416 masm.bind (LSuccess) ; | |
3417 masm.xorptr(boxReg, boxReg) ; // set ICC.ZF=1 to indicate success | |
3418 masm.jmpb (DONE_LABEL) ; | |
3419 } | |
3420 | |
3421 masm.bind (Stacked) ; | |
3422 // It's not inflated and it's not recursively stack-locked and it's not biased. | |
3423 // It must be stack-locked. | |
3424 // Try to reset the header to displaced header. | |
3425 // The "box" value on the stack is stable, so we can reload | |
3426 // and be assured we observe the same value as above. | |
3427 masm.movptr(tmpReg, Address(boxReg, 0)) ; | |
3428 if (os::is_MP()) { masm.lock(); } | |
3429 masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box | |
3430 // Intentional fall-through into DONE_LABEL |
3431 | |
3432 | |
3433 // DONE_LABEL is a hot target - we'd really like to place it at the | |
3434 // start of cache line by padding with NOPs. | |
3435 // See the AMD and Intel software optimization manuals for the | |
3436 // most efficient "long" NOP encodings. | |
3437 // Unfortunately none of our alignment mechanisms suffice. | |
3438 if ((EmitSync & 65536) == 0) { | |
3439 masm.bind (CheckSucc) ; | |
3440 } | |
3441 masm.bind(DONE_LABEL); | |
3442 | |
3443 // Avoid branch to branch on AMD processors | |
3444 if (EmitSync & 32768) { masm.nop() ; } | |
3445 } | |
3446 %} | |
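The commentary inside Fast_Unlock above describes the 1-0 exit as a Dekker-style idiom: store NULL into _owner, force a StoreLoad barrier, then load _succ to decide whether a successor still needs waking, with the barrier implemented as lock addl [esp], 0 (or mfence). A compact analogy in C++11 atomics, offered only to make the required ordering explicit; the Monitor struct and field names here are placeholders, not the VM's types:

    #include <atomic>

    struct Monitor {
      std::atomic<void*> owner;
      std::atomic<void*> succ;
    };

    inline bool one_zero_exit_has_successor(Monitor* m) {
      m->owner.store(nullptr, std::memory_order_release);        // ST m->_owner = NULL (release the lock)
      std::atomic_thread_fence(std::memory_order_seq_cst);       // StoreLoad barrier (x86: lock addl [esp],0 or mfence)
      return m->succ.load(std::memory_order_relaxed) != nullptr; // LD m->_succ
    }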
3447 | |
3448 | 2856 |
3449 enc_class enc_pop_rdx() %{ | 2857 enc_class enc_pop_rdx() %{ |
3450 emit_opcode(cbuf,0x5A); | 2858 emit_opcode(cbuf,0x5A); |
3451 %} | 2859 %} |
3452 | 2860 |
3766 // offsets are based on outgoing arguments, i.e. a CALLER setting up | 3174 // offsets are based on outgoing arguments, i.e. a CALLER setting up |
3767 // arguments for a CALLEE. Incoming stack arguments are | 3175 // arguments for a CALLEE. Incoming stack arguments are |
3768 // automatically biased by the preserve_stack_slots field above. | 3176 // automatically biased by the preserve_stack_slots field above. |
3769 c_calling_convention %{ | 3177 c_calling_convention %{ |
3770 // This is obviously always outgoing | 3178 // This is obviously always outgoing |
3771 (void) SharedRuntime::c_calling_convention(sig_bt, regs, length); | 3179 (void) SharedRuntime::c_calling_convention(sig_bt, regs, /*regs2=*/NULL, length); |
3772 %} | 3180 %} |
3773 | 3181 |
3774 // Location of C & interpreter return values | 3182 // Location of C & interpreter return values |
3775 c_return_value %{ | 3183 c_return_value %{ |
3776 assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" ); | 3184 assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" ); |
5702 %} | 5110 %} |
5703 ins_pipe(ialu_reg); | 5111 ins_pipe(ialu_reg); |
5704 %} | 5112 %} |
5705 | 5113 |
5706 instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{ | 5114 instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{ |
5115 predicate(UseCountTrailingZerosInstruction); | |
5116 match(Set dst (CountTrailingZerosI src)); | |
5117 effect(KILL cr); | |
5118 | |
5119 format %{ "TZCNT $dst, $src\t# count trailing zeros (int)" %} | |
5120 ins_encode %{ | |
5121 __ tzcntl($dst$$Register, $src$$Register); | |
5122 %} | |
5123 ins_pipe(ialu_reg); | |
5124 %} | |
5125 | |
5126 instruct countTrailingZerosI_bsf(rRegI dst, rRegI src, eFlagsReg cr) %{ | |
5127 predicate(!UseCountTrailingZerosInstruction); | |
5707 match(Set dst (CountTrailingZerosI src)); | 5128 match(Set dst (CountTrailingZerosI src)); |
5708 effect(KILL cr); | 5129 effect(KILL cr); |
5709 | 5130 |
5710 format %{ "BSF $dst, $src\t# count trailing zeros (int)\n\t" | 5131 format %{ "BSF $dst, $src\t# count trailing zeros (int)\n\t" |
5711 "JNZ done\n\t" | 5132 "JNZ done\n\t" |
5721 %} | 5142 %} |
5722 ins_pipe(ialu_reg); | 5143 ins_pipe(ialu_reg); |
5723 %} | 5144 %} |
5724 | 5145 |
5725 instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{ | 5146 instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{ |
5147 predicate(UseCountTrailingZerosInstruction); | |
5148 match(Set dst (CountTrailingZerosL src)); | |
5149 effect(TEMP dst, KILL cr); | |
5150 | |
5151 format %{ "TZCNT $dst, $src.lo\t# count trailing zeros (long) \n\t" | |
5152 "JNC done\n\t" | |
5153 "TZCNT $dst, $src.hi\n\t" | |
5154 "ADD $dst, 32\n" | |
5155 "done:" %} | |
5156 ins_encode %{ | |
5157 Register Rdst = $dst$$Register; | |
5158 Register Rsrc = $src$$Register; | |
5159 Label done; | |
5160 __ tzcntl(Rdst, Rsrc); | |
5161 __ jccb(Assembler::carryClear, done); | |
5162 __ tzcntl(Rdst, HIGH_FROM_LOW(Rsrc)); | |
5163 __ addl(Rdst, BitsPerInt); | |
5164 __ bind(done); | |
5165 %} | |
5166 ins_pipe(ialu_reg); | |
5167 %} | |
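The pair of rules above splits CountTrailingZeros by predicate: with UseCountTrailingZerosInstruction the single TZCNT form is used, because TZCNT is defined for a zero input (it returns the operand width and sets CF), while the _bsf fallback needs an explicit branch since BSF leaves its destination undefined on zero. For the long variant the low half is counted first and the high half only if the low half was all zeros, mirroring the TZCNT/JNC/TZCNT/ADD sequence. A portable sketch of that split (the helper loop is illustrative, not generated code):

    #include <cstdint>

    static unsigned tz32(uint32_t x) {            // defined for x == 0, like TZCNT
      unsigned n = 0;
      while (n < 32 && ((x >> n) & 1u) == 0) ++n;
      return n;
    }

    inline unsigned count_trailing_zeros64(uint32_t lo, uint32_t hi) {
      unsigned n = tz32(lo);                      // TZCNT $dst, $src.lo
      if (n < 32) return n;                       // JNC done  (CF clear: lo was non-zero)
      return 32 + tz32(hi);                       // TZCNT $dst, $src.hi ; ADD $dst, 32
    }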
5168 | |
5169 instruct countTrailingZerosL_bsf(rRegI dst, eRegL src, eFlagsReg cr) %{ | |
5170 predicate(!UseCountTrailingZerosInstruction); | |
5726 match(Set dst (CountTrailingZerosL src)); | 5171 match(Set dst (CountTrailingZerosL src)); |
5727 effect(TEMP dst, KILL cr); | 5172 effect(TEMP dst, KILL cr); |
5728 | 5173 |
5729 format %{ "BSF $dst, $src.lo\t# count trailing zeros (long)\n\t" | 5174 format %{ "BSF $dst, $src.lo\t# count trailing zeros (long)\n\t" |
5730 "JNZ done\n\t" | 5175 "JNZ done\n\t" |
7097 //----------MemBar Instructions----------------------------------------------- | 6542 //----------MemBar Instructions----------------------------------------------- |
7098 // Memory barrier flavors | 6543 // Memory barrier flavors |
7099 | 6544 |
7100 instruct membar_acquire() %{ | 6545 instruct membar_acquire() %{ |
7101 match(MemBarAcquire); | 6546 match(MemBarAcquire); |
6547 match(LoadFence); | |
7102 ins_cost(400); | 6548 ins_cost(400); |
7103 | 6549 |
7104 size(0); | 6550 size(0); |
7105 format %{ "MEMBAR-acquire ! (empty encoding)" %} | 6551 format %{ "MEMBAR-acquire ! (empty encoding)" %} |
7106 ins_encode(); | 6552 ins_encode(); |
7117 ins_pipe(empty); | 6563 ins_pipe(empty); |
7118 %} | 6564 %} |
7119 | 6565 |
7120 instruct membar_release() %{ | 6566 instruct membar_release() %{ |
7121 match(MemBarRelease); | 6567 match(MemBarRelease); |
6568 match(StoreFence); | |
7122 ins_cost(400); | 6569 ins_cost(400); |
7123 | 6570 |
7124 size(0); | 6571 size(0); |
7125 format %{ "MEMBAR-release ! (empty encoding)" %} | 6572 format %{ "MEMBAR-release ! (empty encoding)" %} |
7126 ins_encode( ); | 6573 ins_encode( ); |
7533 %} | 6980 %} |
7534 | 6981 |
7535 //----------Arithmetic Instructions-------------------------------------------- | 6982 //----------Arithmetic Instructions-------------------------------------------- |
7536 //----------Addition Instructions---------------------------------------------- | 6983 //----------Addition Instructions---------------------------------------------- |
7537 | 6984 |
7538 instruct addExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr) | |
7539 %{ | |
7540 match(AddExactI dst src); | |
7541 effect(DEF cr); | |
7542 | |
7543 format %{ "ADD $dst, $src\t# addExact int" %} | |
7544 ins_encode %{ | |
7545 __ addl($dst$$Register, $src$$Register); | |
7546 %} | |
7547 ins_pipe(ialu_reg_reg); | |
7548 %} | |
7549 | |
7550 instruct addExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr) | |
7551 %{ | |
7552 match(AddExactI dst src); | |
7553 effect(DEF cr); | |
7554 | |
7555 format %{ "ADD $dst, $src\t# addExact int" %} | |
7556 ins_encode %{ | |
7557 __ addl($dst$$Register, $src$$constant); | |
7558 %} | |
7559 ins_pipe(ialu_reg_reg); | |
7560 %} | |
7561 | |
7562 instruct addExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr) | |
7563 %{ | |
7564 match(AddExactI dst (LoadI src)); | |
7565 effect(DEF cr); | |
7566 | |
7567 ins_cost(125); | |
7568 format %{ "ADD $dst,$src\t# addExact int" %} | |
7569 ins_encode %{ | |
7570 __ addl($dst$$Register, $src$$Address); | |
7571 %} | |
7572 ins_pipe( ialu_reg_mem ); | |
7573 %} | |
7574 | |
7575 | |
7576 // Integer Addition Instructions | 6985 // Integer Addition Instructions |
7577 instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ | 6986 instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ |
7578 match(Set dst (AddI dst src)); | 6987 match(Set dst (AddI dst src)); |
7579 effect(KILL cr); | 6988 effect(KILL cr); |
7580 | 6989 |
7880 ins_pipe( pipe_cmpxchg ); | 7289 ins_pipe( pipe_cmpxchg ); |
7881 %} | 7290 %} |
7882 | 7291 |
7883 //----------Subtraction Instructions------------------------------------------- | 7292 //----------Subtraction Instructions------------------------------------------- |
7884 | 7293 |
7885 instruct subExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr) | |
7886 %{ | |
7887 match(SubExactI dst src); | |
7888 effect(DEF cr); | |
7889 | |
7890 format %{ "SUB $dst, $src\t# subExact int" %} | |
7891 ins_encode %{ | |
7892 __ subl($dst$$Register, $src$$Register); | |
7893 %} | |
7894 ins_pipe(ialu_reg_reg); | |
7895 %} | |
7896 | |
7897 instruct subExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr) | |
7898 %{ | |
7899 match(SubExactI dst src); | |
7900 effect(DEF cr); | |
7901 | |
7902 format %{ "SUB $dst, $src\t# subExact int" %} | |
7903 ins_encode %{ | |
7904 __ subl($dst$$Register, $src$$constant); | |
7905 %} | |
7906 ins_pipe(ialu_reg_reg); | |
7907 %} | |
7908 | |
7909 instruct subExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr) | |
7910 %{ | |
7911 match(SubExactI dst (LoadI src)); | |
7912 effect(DEF cr); | |
7913 | |
7914 ins_cost(125); | |
7915 format %{ "SUB $dst,$src\t# subExact int" %} | |
7916 ins_encode %{ | |
7917 __ subl($dst$$Register, $src$$Address); | |
7918 %} | |
7919 ins_pipe( ialu_reg_mem ); | |
7920 %} | |
7921 | |
7922 // Integer Subtraction Instructions | 7294 // Integer Subtraction Instructions |
7923 instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ | 7295 instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ |
7924 match(Set dst (SubI dst src)); | 7296 match(Set dst (SubI dst src)); |
7925 effect(KILL cr); | 7297 effect(KILL cr); |
7926 | 7298 |
7983 size(2); | 7355 size(2); |
7984 format %{ "NEG $dst" %} | 7356 format %{ "NEG $dst" %} |
7985 opcode(0xF7,0x03); // Opcode F7 /3 | 7357 opcode(0xF7,0x03); // Opcode F7 /3 |
7986 ins_encode( OpcP, RegOpc( dst ) ); | 7358 ins_encode( OpcP, RegOpc( dst ) ); |
7987 ins_pipe( ialu_reg ); | 7359 ins_pipe( ialu_reg ); |
7988 %} | |
7989 | |
7990 instruct negExactI_eReg(eAXRegI dst, eFlagsReg cr) %{ | |
7991 match(NegExactI dst); | |
7992 effect(DEF cr); | |
7993 | |
7994 format %{ "NEG $dst\t# negExact int"%} | |
7995 ins_encode %{ | |
7996 __ negl($dst$$Register); | |
7997 %} | |
7998 ins_pipe(ialu_reg); | |
7999 %} | 7360 %} |
8000 | 7361 |
8001 //----------Multiplication/Division Instructions------------------------------- | 7362 //----------Multiplication/Division Instructions------------------------------- |
8002 // Integer Multiplication Instructions | 7363 // Integer Multiplication Instructions |
8003 // Multiply Register | 7364 // Multiply Register |
8206 "MUL EDX\t# EDX*EAX -> EDX:EAX\n\t" | 7567 "MUL EDX\t# EDX*EAX -> EDX:EAX\n\t" |
8207 "ADD EDX,$tmp" %} | 7568 "ADD EDX,$tmp" %} |
8208 ins_encode( long_multiply_con( dst, src, tmp ) ); | 7569 ins_encode( long_multiply_con( dst, src, tmp ) ); |
8209 ins_pipe( pipe_slow ); | 7570 ins_pipe( pipe_slow ); |
8210 %} | 7571 %} |
8211 | |
8212 instruct mulExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr) | |
8213 %{ | |
8214 match(MulExactI dst src); | |
8215 effect(DEF cr); | |
8216 | |
8217 ins_cost(300); | |
8218 format %{ "IMUL $dst, $src\t# mulExact int" %} | |
8219 ins_encode %{ | |
8220 __ imull($dst$$Register, $src$$Register); | |
8221 %} | |
8222 ins_pipe(ialu_reg_reg_alu0); | |
8223 %} | |
8224 | |
8225 instruct mulExactI_eReg_imm(eAXRegI dst, rRegI src, immI imm, eFlagsReg cr) | |
8226 %{ | |
8227 match(MulExactI src imm); | |
8228 effect(DEF cr); | |
8229 | |
8230 ins_cost(300); | |
8231 format %{ "IMUL $dst, $src, $imm\t# mulExact int" %} | |
8232 ins_encode %{ | |
8233 __ imull($dst$$Register, $src$$Register, $imm$$constant); | |
8234 %} | |
8235 ins_pipe(ialu_reg_reg_alu0); | |
8236 %} | |
8237 | |
8238 instruct mulExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr) | |
8239 %{ | |
8240 match(MulExactI dst (LoadI src)); | |
8241 effect(DEF cr); | |
8242 | |
8243 ins_cost(350); | |
8244 format %{ "IMUL $dst, $src\t# mulExact int" %} | |
8245 ins_encode %{ | |
8246 __ imull($dst$$Register, $src$$Address); | |
8247 %} | |
8248 ins_pipe(ialu_reg_mem_alu0); | |
8249 %} | |
8250 | |
8251 | 7572 |
8252 // Integer DIV with Register | 7573 // Integer DIV with Register |
8253 instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{ | 7574 instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{ |
8254 match(Set rax (DivI rax div)); | 7575 match(Set rax (DivI rax div)); |
8255 effect(KILL rdx, KILL cr); | 7576 effect(KILL rdx, KILL cr); |
8690 // ins_encode( MemImm( dst, src) ); | 8011 // ins_encode( MemImm( dst, src) ); |
8691 ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) ); | 8012 ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) ); |
8692 ins_pipe( ialu_mem_imm ); | 8013 ins_pipe( ialu_mem_imm ); |
8693 %} | 8014 %} |
8694 | 8015 |
8016 // BMI1 instructions | |
8017 instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1, eFlagsReg cr) %{ | |
8018 match(Set dst (AndI (XorI src1 minus_1) src2)); | |
8019 predicate(UseBMI1Instructions); | |
8020 effect(KILL cr); | |
8021 | |
8022 format %{ "ANDNL $dst, $src1, $src2" %} | |
8023 | |
8024 ins_encode %{ | |
8025 __ andnl($dst$$Register, $src1$$Register, $src2$$Register); | |
8026 %} | |
8027 ins_pipe(ialu_reg); | |
8028 %} | |
8029 | |
8030 instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1, eFlagsReg cr) %{ | |
8031 match(Set dst (AndI (XorI src1 minus_1) (LoadI src2) )); | |
8032 predicate(UseBMI1Instructions); | |
8033 effect(KILL cr); | |
8034 | |
8035 ins_cost(125); | |
8036 format %{ "ANDNL $dst, $src1, $src2" %} | |
8037 | |
8038 ins_encode %{ | |
8039 __ andnl($dst$$Register, $src1$$Register, $src2$$Address); | |
8040 %} | |
8041 ins_pipe(ialu_reg_mem); | |
8042 %} | |
8043 | |
8044 instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{ | |
8045 match(Set dst (AndI (SubI imm_zero src) src)); | |
8046 predicate(UseBMI1Instructions); | |
8047 effect(KILL cr); | |
8048 | |
8049 format %{ "BLSIL $dst, $src" %} | |
8050 | |
8051 ins_encode %{ | |
8052 __ blsil($dst$$Register, $src$$Register); | |
8053 %} | |
8054 ins_pipe(ialu_reg); | |
8055 %} | |
8056 | |
8057 instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, eFlagsReg cr) %{ | |
8058 match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) )); | |
8059 predicate(UseBMI1Instructions); | |
8060 effect(KILL cr); | |
8061 | |
8062 ins_cost(125); | |
8063 format %{ "BLSIL $dst, $src" %} | |
8064 | |
8065 ins_encode %{ | |
8066 __ blsil($dst$$Register, $src$$Address); | |
8067 %} | |
8068 ins_pipe(ialu_reg_mem); | |
8069 %} | |
8070 | |
8071 instruct blsmskI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr) | |
8072 %{ | |
8073 match(Set dst (XorI (AddI src minus_1) src)); | |
8074 predicate(UseBMI1Instructions); | |
8075 effect(KILL cr); | |
8076 | |
8077 format %{ "BLSMSKL $dst, $src" %} | |
8078 | |
8079 ins_encode %{ | |
8080 __ blsmskl($dst$$Register, $src$$Register); | |
8081 %} | |
8082 | |
8083 ins_pipe(ialu_reg); | |
8084 %} | |
8085 | |
8086 instruct blsmskI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr) | |
8087 %{ | |
8088 match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) )); | |
8089 predicate(UseBMI1Instructions); | |
8090 effect(KILL cr); | |
8091 | |
8092 ins_cost(125); | |
8093 format %{ "BLSMSKL $dst, $src" %} | |
8094 | |
8095 ins_encode %{ | |
8096 __ blsmskl($dst$$Register, $src$$Address); | |
8097 %} | |
8098 | |
8099 ins_pipe(ialu_reg_mem); | |
8100 %} | |
8101 | |
8102 instruct blsrI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr) | |
8103 %{ | |
8104 match(Set dst (AndI (AddI src minus_1) src) ); | |
8105 predicate(UseBMI1Instructions); | |
8106 effect(KILL cr); | |
8107 | |
8108 format %{ "BLSRL $dst, $src" %} | |
8109 | |
8110 ins_encode %{ | |
8111 __ blsrl($dst$$Register, $src$$Register); | |
8112 %} | |
8113 | |
8114 ins_pipe(ialu_reg); | |
8115 %} | |
8116 | |
8117 instruct blsrI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr) | |
8118 %{ | |
8119 match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) )); | |
8120 predicate(UseBMI1Instructions); | |
8121 effect(KILL cr); | |
8122 | |
8123 ins_cost(125); | |
8124 format %{ "BLSRL $dst, $src" %} | |
8125 | |
8126 ins_encode %{ | |
8127 __ blsrl($dst$$Register, $src$$Address); | |
8128 %} | |
8129 | |
8130 ins_pipe(ialu_reg_mem); | |
8131 %} | |
8132 | |
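The BMI1 rules above don't expose new intrinsics; guarded by UseBMI1Instructions, they pattern-match ordinary integer expressions whose shapes coincide with what ANDN, BLSI, BLSMSK and BLSR compute. The underlying scalar identities, as a plain C++ sketch:

    #include <cstdint>

    inline uint32_t andn_  (uint32_t a, uint32_t b) { return ~a & b; }      // ANDNL:   (XorI a -1) AndI b
    inline uint32_t blsi_  (uint32_t x)             { return x & (0 - x); } // BLSIL:   isolate lowest set bit
    inline uint32_t blsmsk_(uint32_t x)             { return x ^ (x - 1); } // BLSMSKL: mask up to lowest set bit
    inline uint32_t blsr_  (uint32_t x)             { return x & (x - 1); } // BLSRL:   clear lowest set bit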
8695 // Or Instructions | 8133 // Or Instructions |
8696 // Or Register with Register | 8134 // Or Register with Register |
8697 instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ | 8135 instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ |
8698 match(Set dst (OrI dst src)); | 8136 match(Set dst (OrI dst src)); |
8699 effect(KILL cr); | 8137 effect(KILL cr); |
9112 | 8550 |
9113 /* If I enable this, I encourage spilling in the inner loop of compress. | 8551 /* If I enable this, I encourage spilling in the inner loop of compress. |
9114 instruct cadd_cmpLTMask_mem(ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr) %{ | 8552 instruct cadd_cmpLTMask_mem(ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr) %{ |
9115 match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q))); | 8553 match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q))); |
9116 */ | 8554 */ |
8555 //----------Overflow Math Instructions----------------------------------------- | |
8556 | |
8557 instruct overflowAddI_eReg(eFlagsReg cr, eAXRegI op1, rRegI op2) | |
8558 %{ | |
8559 match(Set cr (OverflowAddI op1 op2)); | |
8560 effect(DEF cr, USE_KILL op1, USE op2); | |
8561 | |
8562 format %{ "ADD $op1, $op2\t# overflow check int" %} | |
8563 | |
8564 ins_encode %{ | |
8565 __ addl($op1$$Register, $op2$$Register); | |
8566 %} | |
8567 ins_pipe(ialu_reg_reg); | |
8568 %} | |
8569 | |
8570 instruct overflowAddI_rReg_imm(eFlagsReg cr, eAXRegI op1, immI op2) | |
8571 %{ | |
8572 match(Set cr (OverflowAddI op1 op2)); | |
8573 effect(DEF cr, USE_KILL op1, USE op2); | |
8574 | |
8575 format %{ "ADD $op1, $op2\t# overflow check int" %} | |
8576 | |
8577 ins_encode %{ | |
8578 __ addl($op1$$Register, $op2$$constant); | |
8579 %} | |
8580 ins_pipe(ialu_reg_reg); | |
8581 %} | |
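The overflowAddI_* rules only produce condition codes: the ADD sets OF, and a later branch (emitted elsewhere) takes the slow/deoptimization path on overflow. At the Java level such OverflowAddI nodes typically originate from the Math.*Exact intrinsics. A rough C++ model of the check, using a 64-bit widen instead of the OF flag (names ours, illustrative only):

#include <cstdint>

// Illustrative only: signed 32-bit add with overflow detection, mirroring
// what "ADD op1, op2" plus a jump on the OF flag accomplishes above.
static bool add_overflows(int32_t a, int32_t b, int32_t* out) {
  int64_t wide = (int64_t)a + (int64_t)b;
  *out = (int32_t)wide;
  return wide != (int64_t)*out;   // true when the 32-bit result wrapped
}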
8582 | |
8583 instruct overflowSubI_rReg(eFlagsReg cr, rRegI op1, rRegI op2) | |
8584 %{ | |
8585 match(Set cr (OverflowSubI op1 op2)); | |
8586 | |
8587 format %{ "CMP $op1, $op2\t# overflow check int" %} | |
8588 ins_encode %{ | |
8589 __ cmpl($op1$$Register, $op2$$Register); | |
8590 %} | |
8591 ins_pipe(ialu_reg_reg); | |
8592 %} | |
8593 | |
8594 instruct overflowSubI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2) | |
8595 %{ | |
8596 match(Set cr (OverflowSubI op1 op2)); | |
8597 | |
8598 format %{ "CMP $op1, $op2\t# overflow check int" %} | |
8599 ins_encode %{ | |
8600 __ cmpl($op1$$Register, $op2$$constant); | |
8601 %} | |
8602 ins_pipe(ialu_reg_reg); | |
8603 %} | |
8604 | |
8605 instruct overflowNegI_rReg(eFlagsReg cr, immI0 zero, eAXRegI op2) | |
8606 %{ | |
8607 match(Set cr (OverflowSubI zero op2)); | |
8608 effect(DEF cr, USE_KILL op2); | |
8609 | |
8610 format %{ "NEG $op2\t# overflow check int" %} | |
8611 ins_encode %{ | |
8612 __ negl($op2$$Register); | |
8613 %} | |
8614 ins_pipe(ialu_reg_reg); | |
8615 %} | |
8616 | |
8617 instruct overflowMulI_rReg(eFlagsReg cr, eAXRegI op1, rRegI op2) | |
8618 %{ | |
8619 match(Set cr (OverflowMulI op1 op2)); | |
8620 effect(DEF cr, USE_KILL op1, USE op2); | |
8621 | |
8622 format %{ "IMUL $op1, $op2\t# overflow check int" %} | |
8623 ins_encode %{ | |
8624 __ imull($op1$$Register, $op2$$Register); | |
8625 %} | |
8626 ins_pipe(ialu_reg_reg_alu0); | |
8627 %} | |
8628 | |
8629 instruct overflowMulI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2, rRegI tmp) | |
8630 %{ | |
8631 match(Set cr (OverflowMulI op1 op2)); | |
8632 effect(DEF cr, TEMP tmp, USE op1, USE op2); | |
8633 | |
8634 format %{ "IMUL $tmp, $op1, $op2\t# overflow check int" %} | |
8635 ins_encode %{ | |
8636 __ imull($tmp$$Register, $op1$$Register, $op2$$constant); | |
8637 %} | |
8638 ins_pipe(ialu_reg_reg_alu0); | |
8639 %} | |
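The subtract form above uses CMP, which sets the flags without modifying op1 (so no USE_KILL), while the negate form and the register-register multiply clobber an operand and the immediate multiply writes a TEMP instead. A corresponding C++ model for the multiply check (names ours, illustrative only):

#include <cstdint>

// Illustrative only: signed 32-bit multiply with overflow detection, mirroring
// the IMUL + OF-flag check performed by the overflowMulI_* rules above.
static bool mul_overflows(int32_t a, int32_t b, int32_t* out) {
  int64_t wide = (int64_t)a * (int64_t)b;
  *out = (int32_t)wide;
  return wide != (int64_t)*out;
}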
9117 | 8640 |
9118 //----------Long Instructions------------------------------------------------ | 8641 //----------Long Instructions------------------------------------------------ |
9119 // Add Long Register with Register | 8642 // Add Long Register with Register |
9120 instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ | 8643 instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ |
9121 match(Set dst (AddL dst src)); | 8644 match(Set dst (AddL dst src)); |
9225 format %{ "AND $dst.lo,$mem\n\t" | 8748 format %{ "AND $dst.lo,$mem\n\t" |
9226 "AND $dst.hi,$mem+4" %} | 8749 "AND $dst.hi,$mem+4" %} |
9227 opcode(0x23, 0x23); | 8750 opcode(0x23, 0x23); |
9228 ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) ); | 8751 ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) ); |
9229 ins_pipe( ialu_reg_long_mem ); | 8752 ins_pipe( ialu_reg_long_mem ); |
8753 %} | |
8754 | |
8755 // BMI1 instructions | |
8756 instruct andnL_eReg_eReg_eReg(eRegL dst, eRegL src1, eRegL src2, immL_M1 minus_1, eFlagsReg cr) %{ | |
8757 match(Set dst (AndL (XorL src1 minus_1) src2)); | |
8758 predicate(UseBMI1Instructions); | |
8759 effect(KILL cr, TEMP dst); | |
8760 | |
8761 format %{ "ANDNL $dst.lo, $src1.lo, $src2.lo\n\t" | |
8762 "ANDNL $dst.hi, $src1.hi, $src2.hi" | |
8763 %} | |
8764 | |
8765 ins_encode %{ | |
8766 Register Rdst = $dst$$Register; | |
8767 Register Rsrc1 = $src1$$Register; | |
8768 Register Rsrc2 = $src2$$Register; | |
8769 __ andnl(Rdst, Rsrc1, Rsrc2); | |
8770 __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), HIGH_FROM_LOW(Rsrc2)); | |
8771 %} | |
8772 ins_pipe(ialu_reg_reg_long); | |
8773 %} | |
8774 | |
8775 instruct andnL_eReg_eReg_mem(eRegL dst, eRegL src1, memory src2, immL_M1 minus_1, eFlagsReg cr) %{ | |
8776 match(Set dst (AndL (XorL src1 minus_1) (LoadL src2) )); | |
8777 predicate(UseBMI1Instructions); | |
8778 effect(KILL cr, TEMP dst); | |
8779 | |
8780 ins_cost(125); | |
8781 format %{ "ANDNL $dst.lo, $src1.lo, $src2\n\t" | |
8782 "ANDNL $dst.hi, $src1.hi, $src2+4" | |
8783 %} | |
8784 | |
8785 ins_encode %{ | |
8786 Register Rdst = $dst$$Register; | |
8787 Register Rsrc1 = $src1$$Register; | |
8788 Address src2_hi = Address::make_raw($src2$$base, $src2$$index, $src2$$scale, $src2$$disp + 4, relocInfo::none); | |
8789 | |
8790 __ andnl(Rdst, Rsrc1, $src2$$Address); | |
8791 __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), src2_hi); | |
8792 %} | |
8793 ins_pipe(ialu_reg_mem); | |
8794 %} | |
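On 32-bit x86 a long lives in a register pair, so the andnL_* rules above emit two ANDNL instructions, one per 32-bit half. A C++ sketch of the computation (function name ours):

#include <cstdint>

// Sketch of what the two ANDNL emissions compute on a register pair:
// a 64-bit "andn" (~a & b) split into independent low and high halves.
static inline uint64_t andn64(uint64_t a, uint64_t b) {
  uint32_t lo = ~(uint32_t)a         & (uint32_t)b;
  uint32_t hi = ~(uint32_t)(a >> 32) & (uint32_t)(b >> 32);
  return ((uint64_t)hi << 32) | lo;
}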
8795 | |
8796 instruct blsiL_eReg_eReg(eRegL dst, eRegL src, immL0 imm_zero, eFlagsReg cr) %{ | |
8797 match(Set dst (AndL (SubL imm_zero src) src)); | |
8798 predicate(UseBMI1Instructions); | |
8799 effect(KILL cr, TEMP dst); | |
8800 | |
8801 format %{ "MOVL $dst.hi, 0\n\t" | |
8802 "BLSIL $dst.lo, $src.lo\n\t" | |
8803 "JNZ done\n\t" | |
8804 "BLSIL $dst.hi, $src.hi\n" | |
8805 "done:" | |
8806 %} | |
8807 | |
8808 ins_encode %{ | |
8809 Label done; | |
8810 Register Rdst = $dst$$Register; | |
8811 Register Rsrc = $src$$Register; | |
8812 __ movl(HIGH_FROM_LOW(Rdst), 0); | |
8813 __ blsil(Rdst, Rsrc); | |
8814 __ jccb(Assembler::notZero, done); | |
8815 __ blsil(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); | |
8816 __ bind(done); | |
8817 %} | |
8818 ins_pipe(ialu_reg); | |
8819 %} | |
8820 | |
8821 instruct blsiL_eReg_mem(eRegL dst, memory src, immL0 imm_zero, eFlagsReg cr) %{ | |
8822 match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) )); | |
8823 predicate(UseBMI1Instructions); | |
8824 effect(KILL cr, TEMP dst); | |
8825 | |
8826 ins_cost(125); | |
8827 format %{ "MOVL $dst.hi, 0\n\t" | |
8828 "BLSIL $dst.lo, $src\n\t" | |
8829 "JNZ done\n\t" | |
8830 "BLSIL $dst.hi, $src+4\n" | |
8831 "done:" | |
8832 %} | |
8833 | |
8834 ins_encode %{ | |
8835 Label done; | |
8836 Register Rdst = $dst$$Register; | |
8837 Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none); | |
8838 | |
8839 __ movl(HIGH_FROM_LOW(Rdst), 0); | |
8840 __ blsil(Rdst, $src$$Address); | |
8841 __ jccb(Assembler::notZero, done); | |
8842 __ blsil(HIGH_FROM_LOW(Rdst), src_hi); | |
8843 __ bind(done); | |
8844 %} | |
8845 ins_pipe(ialu_reg_mem); | |
8846 %} | |
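The blsiL_* rules cannot simply apply BLSI to each half: the lowest set bit of a 64-bit value lives in exactly one half, so the high result must stay zero whenever the low half contains a set bit. That is what the pre-zeroed dst.hi and the JNZ skip accomplish. A rough C++ model of that control flow (names ours; flag behaviour paraphrased in the comments):

#include <cstdint>

// Rough model of the blsiL_* control flow: isolate the lowest set bit of a
// 64-bit value using only 32-bit operations on the lo/hi halves.
static inline uint64_t blsi64(uint64_t x) {
  uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
  uint32_t rhi = 0;                  // MOVL dst.hi, 0
  uint32_t rlo = lo & (0u - lo);     // BLSIL dst.lo, src.lo
  if (rlo == 0) {                    // JNZ done skips this when the low half held a set bit
    rhi = hi & (0u - hi);            // BLSIL dst.hi, src.hi
  }
  return ((uint64_t)rhi << 32) | rlo;
}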
8847 | |
8848 instruct blsmskL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr) | |
8849 %{ | |
8850 match(Set dst (XorL (AddL src minus_1) src)); | |
8851 predicate(UseBMI1Instructions); | |
8852 effect(KILL cr, TEMP dst); | |
8853 | |
8854 format %{ "MOVL $dst.hi, 0\n\t" | |
8855 "BLSMSKL $dst.lo, $src.lo\n\t" | |
8856 "JNC done\n\t" | |
8857 "BLSMSKL $dst.hi, $src.hi\n" | |
8858 "done:" | |
8859 %} | |
8860 | |
8861 ins_encode %{ | |
8862 Label done; | |
8863 Register Rdst = $dst$$Register; | |
8864 Register Rsrc = $src$$Register; | |
8865 __ movl(HIGH_FROM_LOW(Rdst), 0); | |
8866 __ blsmskl(Rdst, Rsrc); | |
8867 __ jccb(Assembler::carryClear, done); | |
8868 __ blsmskl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); | |
8869 __ bind(done); | |
8870 %} | |
8871 | |
8872 ins_pipe(ialu_reg); | |
8873 %} | |
8874 | |
8875 instruct blsmskL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr) | |
8876 %{ | |
8877 match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) )); | |
8878 predicate(UseBMI1Instructions); | |
8879 effect(KILL cr, TEMP dst); | |
8880 | |
8881 ins_cost(125); | |
8882 format %{ "MOVL $dst.hi, 0\n\t" | |
8883 "BLSMSKL $dst.lo, $src\n\t" | |
8884 "JNC done\n\t" | |
8885 "BLSMSKL $dst.hi, $src+4\n" | |
8886 "done:" | |
8887 %} | |
8888 | |
8889 ins_encode %{ | |
8890 Label done; | |
8891 Register Rdst = $dst$$Register; | |
8892 Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none); | |
8893 | |
8894 __ movl(HIGH_FROM_LOW(Rdst), 0); | |
8895 __ blsmskl(Rdst, $src$$Address); | |
8896 __ jccb(Assembler::carryClear, done); | |
8897 __ blsmskl(HIGH_FROM_LOW(Rdst), src_hi); | |
8898 __ bind(done); | |
8899 %} | |
8900 | |
8901 ins_pipe(ialu_reg_mem); | |
8902 %} | |
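The blsmskL_* rules rely on the carry flag instead: BLSMSK sets CF when its source is zero, so JNC skips the high-half step whenever the lowest set bit was already found in the low half. Rough C++ model (names ours):

#include <cstdint>

// Rough model of the blsmskL_* control flow: build the "mask up to the lowest
// set bit" of a 64-bit value from two 32-bit BLSMSK-style steps.
static inline uint64_t blsmsk64(uint64_t x) {
  uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
  uint32_t rhi = 0;                  // MOVL dst.hi, 0
  uint32_t rlo = lo ^ (lo - 1);      // BLSMSKL dst.lo, src.lo
  if (lo == 0) {                     // reached only when BLSMSKL set CF; JNC skips otherwise
    rhi = hi ^ (hi - 1);             // BLSMSKL dst.hi, src.hi
  }
  return ((uint64_t)rhi << 32) | rlo;
}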
8903 | |
8904 instruct blsrL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr) | |
8905 %{ | |
8906 match(Set dst (AndL (AddL src minus_1) src) ); | |
8907 predicate(UseBMI1Instructions); | |
8908 effect(KILL cr, TEMP dst); | |
8909 | |
8910 format %{ "MOVL $dst.hi, $src.hi\n\t" | |
8911 "BLSRL $dst.lo, $src.lo\n\t" | |
8912 "JNC done\n\t" | |
8913 "BLSRL $dst.hi, $src.hi\n" | |
8914 "done:" | |
8915 %} | |
8916 | |
8917 ins_encode %{ | |
8918 Label done; | |
8919 Register Rdst = $dst$$Register; | |
8920 Register Rsrc = $src$$Register; | |
8921 __ movl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); | |
8922 __ blsrl(Rdst, Rsrc); | |
8923 __ jccb(Assembler::carryClear, done); | |
8924 __ blsrl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); | |
8925 __ bind(done); | |
8926 %} | |
8927 | |
8928 ins_pipe(ialu_reg); | |
8929 %} | |
8930 | |
8931 instruct blsrL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr) | |
8932 %{ | |
8933 match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src) )); | |
8934 predicate(UseBMI1Instructions); | |
8935 effect(KILL cr, TEMP dst); | |
8936 | |
8937 ins_cost(125); | |
8938 format %{ "MOVL $dst.hi, $src+4\n\t" | |
8939 "BLSRL $dst.lo, $src\n\t" | |
8940 "JNC done\n\t" | |
8941 "BLSRL $dst.hi, $src+4\n" | |
8942 "done:" | |
8943 %} | |
8944 | |
8945 ins_encode %{ | |
8946 Label done; | |
8947 Register Rdst = $dst$$Register; | |
8948 Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none); | |
8949 __ movl(HIGH_FROM_LOW(Rdst), src_hi); | |
8950 __ blsrl(Rdst, $src$$Address); | |
8951 __ jccb(Assembler::carryClear, done); | |
8952 __ blsrl(HIGH_FROM_LOW(Rdst), src_hi); | |
8953 __ bind(done); | |
8954 %} | |
8955 | |
8956 ins_pipe(ialu_reg_mem); | |
9230 %} | 8957 %} |
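Finally, blsrL_* starts from a copy of the high half and recomputes it only when clearing the lowest set bit borrows out of the low word, again signalled through CF. Rough C++ model (names ours):

#include <cstdint>

// Rough model of the blsrL_* control flow: clear the lowest set bit of a
// 64-bit value using two 32-bit BLSR-style steps on the lo/hi halves.
static inline uint64_t blsr64(uint64_t x) {
  uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
  uint32_t rhi = hi;                 // MOVL dst.hi, src.hi
  uint32_t rlo = lo & (lo - 1);      // BLSRL dst.lo, src.lo
  if (lo == 0) {                     // BLSRL set CF because the low half was zero; JNC skips otherwise
    rhi = hi & (hi - 1);             // BLSRL dst.hi, src.hi
  }
  return ((uint64_t)rhi << 32) | rlo;
}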
9231 | 8958 |
9232 // Or Long Register with Register | 8959 // Or Long Register with Register |
9233 instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ | 8960 instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ |
9234 match(Set dst (OrL dst src)); | 8961 match(Set dst (OrL dst src)); |
13145 ins_pipe( pipe_jmp ); | 12872 ins_pipe( pipe_jmp ); |
13146 %} | 12873 %} |
13147 | 12874 |
13148 // inlined locking and unlocking | 12875 // inlined locking and unlocking |
13149 | 12876 |
13150 | 12877 instruct cmpFastLockRTM(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eDXRegI scr, rRegI cx1, rRegI cx2) %{ |
13151 instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{ | 12878 predicate(Compile::current()->use_rtm()); |
13152 match( Set cr (FastLock object box) ); | 12879 match(Set cr (FastLock object box)); |
13153 effect( TEMP tmp, TEMP scr, USE_KILL box ); | 12880 effect(TEMP tmp, TEMP scr, TEMP cx1, TEMP cx2, USE_KILL box); |
12881 ins_cost(300); | |
12882 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr,$cx1,$cx2" %} | |
12883 ins_encode %{ | |
12884 __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, | |
12885 $scr$$Register, $cx1$$Register, $cx2$$Register, | |
12886 _counters, _rtm_counters, _stack_rtm_counters, | |
12887 ((Method*)(ra_->C->method()->constant_encoding()))->method_data(), | |
12888 true, ra_->C->profile_rtm()); | |
12889 %} | |
12890 ins_pipe(pipe_slow); | |
12891 %} | |
12892 | |
12893 instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{ | |
12894 predicate(!Compile::current()->use_rtm()); | |
12895 match(Set cr (FastLock object box)); | |
12896 effect(TEMP tmp, TEMP scr, USE_KILL box); | |
13154 ins_cost(300); | 12897 ins_cost(300); |
13155 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %} | 12898 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %} |
13156 ins_encode( Fast_Lock(object,box,tmp,scr) ); | 12899 ins_encode %{ |
13157 ins_pipe( pipe_slow ); | 12900 __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, |
13158 %} | 12901 $scr$$Register, noreg, noreg, _counters, NULL, NULL, NULL, false, false); |
13159 | 12902 %} |
13160 instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{ | 12903 ins_pipe(pipe_slow); |
13161 match( Set cr (FastUnlock object box) ); | 12904 %} |
13162 effect( TEMP tmp, USE_KILL box ); | 12905 |
12906 instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{ | |
12907 match(Set cr (FastUnlock object box)); | |
12908 effect(TEMP tmp, USE_KILL box); | |
13163 ins_cost(300); | 12909 ins_cost(300); |
13164 format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %} | 12910 format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %} |
13165 ins_encode( Fast_Unlock(object,box,tmp) ); | 12911 ins_encode %{ |
13166 ins_pipe( pipe_slow ); | 12912 __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register, ra_->C->use_rtm()); |
12913 %} | |
12914 ins_pipe(pipe_slow); | |
13167 %} | 12915 %} |
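The FastLock rules are now split on Compile::current()->use_rtm(): the RTM variant passes two extra scratch registers plus the RTM profiling counters into MacroAssembler::fast_lock(), the non-RTM variant passes noreg/NULL placeholders, and fast_unlock() receives the same use_rtm() flag. As a very rough illustration of the general RTM fast-path shape only (not HotSpot code; it uses the documented _xbegin/_xend/_xabort intrinsics, all other names are ours):

#include <immintrin.h>   // assumes a compiler/target with RTM support (-mrtm)

// Very rough illustration of an RTM lock fast path with a conventional
// fallback; HotSpot's fast_lock() is considerably more involved.
template <typename Fallback>
void rtm_lock(volatile int* lock_word, Fallback acquire_fallback) {
  if (_xbegin() == _XBEGIN_STARTED) {
    if (*lock_word == 0) return;   // lock observed free: run elided, _xend() happens on unlock
    _xabort(0xff);                 // lock held: abort, execution resumes after _xbegin()
  }
  acquire_fallback();              // transaction aborted or never started
}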
13168 | 12916 |
13169 | 12917 |
13170 | 12918 |
13171 // ============================================================================ | 12919 // ============================================================================ |