comparison src/cpu/x86/vm/x86_32.ad @ 17810:62c54fcc0a35

Merge
author kvn
date Tue, 25 Mar 2014 17:07:36 -0700
parents a433eb716ce1 606acabe7b5c
children 0bf37f737702
comparison
equal deleted inserted replaced
17809:a433eb716ce1 17810:62c54fcc0a35
1487 1487
1488 const RegMask Matcher::method_handle_invoke_SP_save_mask() { 1488 const RegMask Matcher::method_handle_invoke_SP_save_mask() {
1489 return EBP_REG_mask(); 1489 return EBP_REG_mask();
1490 } 1490 }
1491 1491
1492 const RegMask Matcher::mathExactI_result_proj_mask() {
1493 return EAX_REG_mask();
1494 }
1495
1496 const RegMask Matcher::mathExactL_result_proj_mask() {
1497 ShouldNotReachHere();
1498 return RegMask();
1499 }
1500
1501 const RegMask Matcher::mathExactI_flags_proj_mask() {
1502 return INT_FLAGS_mask();
1503 }
1504
1505 // Returns true if the high 32 bits of the value are known to be zero. 1492 // Returns true if the high 32 bits of the value are known to be zero.
1506 bool is_operand_hi32_zero(Node* n) { 1493 bool is_operand_hi32_zero(Node* n) {
1507 int opc = n->Opcode(); 1494 int opc = n->Opcode();
1508 if (opc == Op_AndL) { 1495 if (opc == Op_AndL) {
1509 Node* o2 = n->in(2); 1496 Node* o2 = n->in(2);
2862 emit_rm (cbuf,0x3, 0x3, $dst$$reg ); 2849 emit_rm (cbuf,0x3, 0x3, $dst$$reg );
2863 emit_opcode(cbuf,0x83); // SBB hi,0 2850 emit_opcode(cbuf,0x83); // SBB hi,0
2864 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg)); 2851 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
2865 emit_d8 (cbuf,0 ); 2852 emit_d8 (cbuf,0 );
2866 %} 2853 %}
2867
2868
2869 // Because the transitions from emitted code to the runtime
2870 // monitorenter/exit helper stubs are so slow it's critical that
2871 // we inline both the stack-locking fast-path and the inflated fast path.
2872 //
2873 // See also: cmpFastLock and cmpFastUnlock.
2874 //
2875 // What follows is a specialized inline transliteration of the code
2876 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat,
2877 // another option would be to emit TrySlowEnter and TrySlowExit methods
2878 // at startup-time. These methods would accept arguments as
2879 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
2880 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
2881 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
2882 // In practice, however, the # of lock sites is bounded and is usually small.
2883 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
2884 // if the processor uses simple bimodal branch predictors keyed by EIP,
2885 // since the helper routines would be called from multiple synchronization
2886 // sites.
2887 //
2888 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
2889 // in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
2890 // to those specialized methods. That'd give us a mostly platform-independent
2891 // implementation that the JITs could optimize and inline at their pleasure.
2892 // Done correctly, the only time we'd need to cross to native code would be
2893 // to park() or unpark() threads. We'd also need a few more unsafe operators
2894 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
2895 // (b) provide explicit barriers or fence operations.
2896 //
2897 // TODO:
2898 //
2899 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
2900 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
2901 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
2902 // the lock operators would typically be faster than reifying Self.
2903 //
2904 // * Ideally I'd define the primitives as:
2905 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
2906 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
2907 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
2908 // Instead, we're stuck with the rather awkward and brittle register assignments below.
2909 // Furthermore the register assignments are overconstrained, possibly resulting in
2910 // sub-optimal code near the synchronization site.
2911 //
2912 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
2913 // Alternatively, use a better sp-proximity test.
2914 //
2915 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
2916 // Either one is sufficient to uniquely identify a thread.
2917 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
2918 //
2919 // * Intrinsify notify() and notifyAll() for the common cases where the
2920 // object is locked by the calling thread but the waitlist is empty,
2921 // avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
2922 //
2923 // * Use jccb and jmpb instead of jcc and jmp to improve code density.
2924 // But beware of excessive branch density on AMD Opterons.
2925 //
2926 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
2927 // or failure of the fast-path. If the fast-path fails then we pass
2928 // control to the slow-path, typically in C. In Fast_Lock and
2929 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
2930 // will emit a conditional branch immediately after the node.
2931 // So we have branches to branches and lots of ICC.ZF games.
2932 // Instead, it might be better to have C2 pass a "FailureLabel"
2933 // into Fast_Lock and Fast_Unlock. In the case of success, control
2934 // will drop through the node. ICC.ZF is undefined at exit.
2935 // In the case of failure, the node will branch directly to the
2936 // FailureLabel.
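//
// A hedged sketch of the current contract (illustration only, not code
// emitted verbatim): after the FastLock node, C2 plants something like
//
//   <Fast_Lock body>      ; sets ICC.ZF
//   jne  slow_path_stub   ; ZF == 0 -> fall into the runtime helper
//
// so success is the fall-through and every failure costs an extra taken
// branch -- exactly the branch-to-branch pattern described above.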
2937
2938
2939 // obj: object to lock
2940 // box: on-stack box address (displaced header location) - KILLED
2941 // rax: tmp -- KILLED
2942 // scr: tmp -- KILLED
2943 enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
2944
2945 Register objReg = as_Register($obj$$reg);
2946 Register boxReg = as_Register($box$$reg);
2947 Register tmpReg = as_Register($tmp$$reg);
2948 Register scrReg = as_Register($scr$$reg);
2949
2950 // Ensure the register assignments are disjoint
2951 guarantee (objReg != boxReg, "") ;
2952 guarantee (objReg != tmpReg, "") ;
2953 guarantee (objReg != scrReg, "") ;
2954 guarantee (boxReg != tmpReg, "") ;
2955 guarantee (boxReg != scrReg, "") ;
2956 guarantee (tmpReg == as_Register(EAX_enc), "") ;
2957
2958 MacroAssembler masm(&cbuf);
2959
2960 if (_counters != NULL) {
2961 masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
2962 }
2963 if (EmitSync & 1) {
2964 // set box->dhw = unused_mark (3)
2965 // Force all sync thru slow-path: slow_enter() and slow_exit()
2966 masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
2967 masm.cmpptr (rsp, (int32_t)0) ;
2968 } else
2969 if (EmitSync & 2) {
2970 Label DONE_LABEL ;
2971 if (UseBiasedLocking) {
2972 // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
2973 masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
2974 }
2975
2976 masm.movptr(tmpReg, Address(objReg, 0)) ; // fetch markword
2977 masm.orptr (tmpReg, 0x1);
2978 masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
2979 if (os::is_MP()) { masm.lock(); }
2980 masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
2981 masm.jcc(Assembler::equal, DONE_LABEL);
2982 // Recursive locking
2983 masm.subptr(tmpReg, rsp);
2984 masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
2985 masm.movptr(Address(boxReg, 0), tmpReg);
2986 masm.bind(DONE_LABEL) ;
2987 } else {
2988 // Possible cases that we'll encounter in fast_lock
2989 // ------------------------------------------------
2990 // * Inflated
2991 // -- unlocked
2992 // -- Locked
2993 // = by self
2994 // = by other
2995 // * biased
2996 // -- by Self
2997 // -- by other
2998 // * neutral
2999 // * stack-locked
3000 // -- by self
3001 // = sp-proximity test hits
3002 // = sp-proximity test generates false-negative
3003 // -- by other
3004 //
3005
3006 Label IsInflated, DONE_LABEL, PopDone ;
3007
3008 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
3009 // order to reduce the number of conditional branches in the most common cases.
3010 // Beware -- there's a subtle invariant that the fetch of the markword
3011 // at [FETCH], below, will never observe a biased encoding (*101b).
3012 // If this invariant is not held we risk exclusion (safety) failure.
3013 if (UseBiasedLocking && !UseOptoBiasInlining) {
3014 masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
3015 }
3016
3017 masm.movptr(tmpReg, Address(objReg, 0)) ; // [FETCH]
3018 masm.testptr(tmpReg, 0x02) ; // Inflated v (Stack-locked or neutral)
3019 masm.jccb (Assembler::notZero, IsInflated) ;
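// Hedged reminder of the low-order mark-bit encodings relied on here:
// ..00 = stack-locked, ..01 = neutral/unlocked, ..10 = inflated (monitor),
// *101 = biased. Testing bit 1 (0x02) therefore separates "inflated" from
// "stack-locked or neutral"; biased marks were already filtered out above,
// per the [FETCH] invariant.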
3020
3021 // Attempt stack-locking ...
3022 masm.orptr (tmpReg, 0x1);
3023 masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
3024 if (os::is_MP()) { masm.lock(); }
3025 masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
3026 if (_counters != NULL) {
3027 masm.cond_inc32(Assembler::equal,
3028 ExternalAddress((address)_counters->fast_path_entry_count_addr()));
3029 }
3030 masm.jccb (Assembler::equal, DONE_LABEL);
3031
3032 // Recursive locking
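// Hedged explanation of the mask below (assuming 4K pages): after the
// failed CAS, tmpReg holds the current markword. If we already hold the
// stack-lock, that markword is an address on our own stack, so
// (markword - rsp) & 0xFFFFF003 == 0 exactly when the markword is
// word-aligned and within one page above rsp; the resulting 0 stored
// into the box denotes a recursive stack-lock.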
3033 masm.subptr(tmpReg, rsp);
3034 masm.andptr(tmpReg, 0xFFFFF003 );
3035 masm.movptr(Address(boxReg, 0), tmpReg);
3036 if (_counters != NULL) {
3037 masm.cond_inc32(Assembler::equal,
3038 ExternalAddress((address)_counters->fast_path_entry_count_addr()));
3039 }
3040 masm.jmp (DONE_LABEL) ;
3041
3042 masm.bind (IsInflated) ;
3043
3044 // The object is inflated.
3045 //
3046 // TODO-FIXME: eliminate the ugly use of manifest constants:
3047 // Use markOopDesc::monitor_value instead of "2".
3048 // Use markOop::unused_mark() instead of "3".
3049 // The tmpReg value is an ObjectMonitor reference ORed with
3050 // markOopDesc::monitor_value (2). We can either convert tmpReg to an
3051 // ObjectMonitor pointer by masking off the "2" bit or we can just
3052 // use tmpReg as an ObjectMonitor pointer but bias the ObjectMonitor
3053 // field offsets with "-2" to compensate for and annul the low-order tag bit.
3054 //
3055 // I use the latter as it avoids AGI stalls.
3056 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3057 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3058 //
3059 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
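// For illustration (hedged sketch, not emitted code; r is a placeholder
// register), the two equivalent forms are:
//
//   masm.andptr(tmpReg, ~markOopDesc::monitor_value);   // untag first, then
//   masm.movptr(r, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()));
//
// versus the single skewed load actually used below:
//
//   masm.movptr(r, Address(tmpReg, OFFSET_SKEWED(owner)));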
3060
3061 // boxReg refers to the on-stack BasicLock in the current frame.
3062 // We'd like to write:
3063 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
3064 // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
3065 // additional latency as we have another ST in the store buffer that must drain.
3066
3067 if (EmitSync & 8192) {
3068 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
3069 masm.get_thread (scrReg) ;
3070 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3071 masm.movptr(tmpReg, NULL_WORD); // consider: xor vs mov
3072 if (os::is_MP()) { masm.lock(); }
3073 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3074 } else
3075 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
3076 masm.movptr(scrReg, boxReg) ;
3077 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3078
3079 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3080 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3081 // prefetchw [eax + Offset(_owner)-2]
3082 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3083 }
3084
3085 if ((EmitSync & 64) == 0) {
3086 // Optimistic form: consider XORL tmpReg,tmpReg
3087 masm.movptr(tmpReg, NULL_WORD) ;
3088 } else {
3089 // Can suffer RTS->RTO upgrades on shared or cold $ lines
3090 // Test-And-CAS instead of CAS
3091 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
3092 masm.testptr(tmpReg, tmpReg) ; // Locked ?
3093 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3094 }
3095
3096 // Appears unlocked - try to swing _owner from null to non-null.
3097 // Ideally, I'd manifest "Self" with get_thread and then attempt
3098 // to CAS the register containing Self into m->Owner.
3099 // But we don't have enough registers, so instead we can either try to CAS
3100 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
3101 // we later store "Self" into m->Owner. Transiently storing a stack address
3102 // (rsp or the address of the box) into m->owner is harmless.
3103 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
3104 if (os::is_MP()) { masm.lock(); }
3105 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3106 masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3
3107 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3108 masm.get_thread (scrReg) ; // beware: clobbers ICCs
3109 masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
3110 masm.xorptr(boxReg, boxReg) ; // set icc.ZFlag = 1 to indicate success
3111
3112 // If the CAS fails we can either retry or pass control to the slow-path.
3113 // We use the latter tactic.
3114 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
3115 // If the CAS was successful ...
3116 // Self has acquired the lock
3117 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
3118 // Intentional fall-through into DONE_LABEL ...
3119 } else {
3120 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
3121 masm.movptr(boxReg, tmpReg) ;
3122
3123 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3124 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3125 // prefetchw [eax + Offset(_owner)-2]
3126 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3127 }
3128
3129 if ((EmitSync & 64) == 0) {
3130 // Optimistic form
3131 masm.xorptr (tmpReg, tmpReg) ;
3132 } else {
3133 // Can suffer RTS->RTO upgrades on shared or cold $ lines
3134 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
3135 masm.testptr(tmpReg, tmpReg) ; // Locked ?
3136 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3137 }
3138
3139 // Appears unlocked - try to swing _owner from null to non-null.
3140 // Use either "Self" (in scr) or rsp as thread identity in _owner.
3141 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
3142 masm.get_thread (scrReg) ;
3143 if (os::is_MP()) { masm.lock(); }
3144 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3145
3146 // If the CAS fails we can either retry or pass control to the slow-path.
3147 // We use the latter tactic.
3148 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
3149 // If the CAS was successful ...
3150 // Self has acquired the lock
3151 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
3152 // Intentional fall-through into DONE_LABEL ...
3153 }
3154
3155 // DONE_LABEL is a hot target - we'd really like to place it at the
3156 // start of a cache line by padding with NOPs.
3157 // See the AMD and Intel software optimization manuals for the
3158 // most efficient "long" NOP encodings.
3159 // Unfortunately none of our alignment mechanisms suffice.
3160 masm.bind(DONE_LABEL);
3161
3162 // Avoid branch-to-branch on AMD processors
3163 // This appears to be superstition.
3164 if (EmitSync & 32) masm.nop() ;
3165
3166
3167 // At DONE_LABEL the icc ZFlag is set as follows ...
3168 // Fast_Unlock uses the same protocol.
3169 // ZFlag == 1 -> Success
3170 // ZFlag == 0 -> Failure - force control through the slow-path
3171 }
3172 %}
3173
3174 // obj: object to unlock
3175 // box: box address (displaced header location), killed. Must be EAX.
3176 // rbx: killed tmp; cannot be obj or box.
3177 //
3178 // Some commentary on balanced locking:
3179 //
3180 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
3181 // Methods that don't have provably balanced locking are forced to run in the
3182 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
3183 // The interpreter provides two properties:
3184 // I1: At return-time the interpreter automatically and quietly unlocks any
3185 // objects acquired by the current activation (frame). Recall that the
3186 // interpreter maintains an on-stack list of locks currently held by
3187 // a frame.
3188 // I2: If a method attempts to unlock an object that is not held by the
3189 // frame, the interpreter throws IMSX.
3190 //
3191 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
3192 // B() doesn't have provably balanced locking so it runs in the interpreter.
3193 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
3194 // is still locked by A().
3195 //
3196 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
3197 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
3198 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
3199 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
3200
3201 enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
3202
3203 Register objReg = as_Register($obj$$reg);
3204 Register boxReg = as_Register($box$$reg);
3205 Register tmpReg = as_Register($tmp$$reg);
3206
3207 guarantee (objReg != boxReg, "") ;
3208 guarantee (objReg != tmpReg, "") ;
3209 guarantee (boxReg != tmpReg, "") ;
3210 guarantee (boxReg == as_Register(EAX_enc), "") ;
3211 MacroAssembler masm(&cbuf);
3212
3213 if (EmitSync & 4) {
3214 // Disable - inhibit all inlining. Force control through the slow-path
3215 masm.cmpptr (rsp, 0) ;
3216 } else
3217 if (EmitSync & 8) {
3218 Label DONE_LABEL ;
3219 if (UseBiasedLocking) {
3220 masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3221 }
3222 // classic stack-locking code ...
3223 masm.movptr(tmpReg, Address(boxReg, 0)) ;
3224 masm.testptr(tmpReg, tmpReg) ;
3225 masm.jcc (Assembler::zero, DONE_LABEL) ;
3226 if (os::is_MP()) { masm.lock(); }
3227 masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
3228 masm.bind(DONE_LABEL);
3229 } else {
3230 Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
3231
3232 // Critically, the biased locking test must have precedence over
3233 // and appear before the (box->dhw == 0) recursive stack-lock test.
3234 if (UseBiasedLocking && !UseOptoBiasInlining) {
3235 masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3236 }
3237
3238 masm.cmpptr(Address(boxReg, 0), 0) ; // Examine the displaced header
3239 masm.movptr(tmpReg, Address(objReg, 0)) ; // Examine the object's markword
3240 masm.jccb (Assembler::zero, DONE_LABEL) ; // 0 indicates recursive stack-lock
3241
3242 masm.testptr(tmpReg, 0x02) ; // Inflated?
3243 masm.jccb (Assembler::zero, Stacked) ;
3244
3245 masm.bind (Inflated) ;
3246 // It's inflated.
3247 // Despite our balanced locking property we still check that m->_owner == Self
3248 // as Java routines or native JNI code called by this thread might
3249 // have released the lock.
3250 // Refer to the comments in synchronizer.cpp for how we might encode extra
3251 // state in _succ so we can avoid fetching EntryList|cxq.
3252 //
3253 // I'd like to add more cases in fast_lock() and fast_unlock() --
3254 // such as recursive enter and exit -- but we have to be wary of
3255 // I$ bloat, T$ effects and BP$ effects.
3256 //
3257 // If there's no contention, try a 1-0 exit. That is, exit without
3258 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
3259 // we detect and recover from the race that the 1-0 exit admits.
3260 //
3261 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
3262 // before it STs null into _owner, releasing the lock. Updates
3263 // to data protected by the critical section must be visible before
3264 // we drop the lock (and thus before any other thread could acquire
3265 // the lock and observe the fields protected by the lock).
3266 // IA32's memory model is TSO, so STs are ordered with respect to
3267 // each other and there's no need for an explicit barrier (fence).
3268 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3269
3270 masm.get_thread (boxReg) ;
3271 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3272 // prefetchw [ebx + Offset(_owner)-2]
3273 masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3274 }
3275
3276 // Note that we could employ various encoding schemes to reduce
3277 // the number of loads below (currently 4) to just 2 or 3.
3278 // Refer to the comments in synchronizer.cpp.
3279 // In practice the chain of fetches doesn't seem to impact performance, however.
3280 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3281 // Attempt to reduce branch density - AMD's branch predictor.
3282 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3283 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3284 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3285 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3286 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3287 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3288 masm.jmpb (DONE_LABEL) ;
3289 } else {
3290 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3291 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3292 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3293 masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3294 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3295 masm.jccb (Assembler::notZero, CheckSucc) ;
3296 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3297 masm.jmpb (DONE_LABEL) ;
3298 }
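// Hedged summary of the two fragments above: XORing boxReg (Self) with
// _owner yields zero only if the calling thread owns the monitor, and
// ORing in _recursions keeps it zero only for a non-recursive lock. The
// first form also ORs in EntryList|cxq and gives up on any nonzero
// result; the second instead branches to CheckSucc when waiters exist so
// a successor can be considered. A zero result permits the 1-0 exit: a
// plain store of NULL into _owner.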
3299
3300 // The following code fragment (EmitSync & 65536) improves the performance of
3301 // contended applications and contended synchronization microbenchmarks.
3302 // Unfortunately the emission of the code - even though not executed - causes regressions
3303 // in scimark and jetstream, evidently because of $ effects. Replacing the code
3304 // with an equal number of never-executed NOPs results in the same regression.
3305 // We leave it off by default.
3306
3307 if ((EmitSync & 65536) != 0) {
3308 Label LSuccess, LGoSlowPath ;
3309
3310 masm.bind (CheckSucc) ;
3311
3312 // Optional pre-test ... it's safe to elide this
3313 if ((EmitSync & 16) == 0) {
3314 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3315 masm.jccb (Assembler::zero, LGoSlowPath) ;
3316 }
3317
3318 // We have a classic Dekker-style idiom:
3319 // ST m->_owner = 0 ; MEMBAR; LD m->_succ
3320 // There are a number of ways to implement the barrier:
3321 // (1) lock:andl &m->_owner, 0
3322 // is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
3323 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
3324 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3325 // (2) If supported, an explicit MFENCE is appealing.
3326 // In older IA32 processors MFENCE is slower than lock:add or xchg
3327 // particularly if the write-buffer is full as might be the case if
3328 // stores closely precede the fence or fence-equivalent instruction.
3329 // In more modern implementations MFENCE appears faster, however.
3330 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3331 // The $lines underlying the top-of-stack should be in M-state.
3332 // The locked add instruction is serializing, of course.
3333 // (4) Use xchg, which is serializing
3334 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3335 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3336 // The integer condition codes will tell us if succ was 0.
3337 // Since _succ and _owner should reside in the same $line and
3338 // we just stored into _owner, it's likely that the $line
3339 // remains in M-state for the lock:orl.
3340 //
3341 // We currently use (3), although it's likely that switching to (2)
3342 // is correct for the future.
3343
3344 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3345 if (os::is_MP()) {
3346 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
3347 masm.mfence();
3348 } else {
3349 masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
3350 }
3351 }
3352 // Ratify _succ remains non-null
3353 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3354 masm.jccb (Assembler::notZero, LSuccess) ;
3355
3356 masm.xorptr(boxReg, boxReg) ; // box is really EAX
3357 if (os::is_MP()) { masm.lock(); }
3358 masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3359 masm.jccb (Assembler::notEqual, LSuccess) ;
3360 // Since we're low on registers we installed rsp as a placeholder in _owner.
3361 // Now install Self over rsp. This is safe as we're transitioning from
3362 // non-null to non-null.
3363 masm.get_thread (boxReg) ;
3364 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
3365 // Intentional fall-through into LGoSlowPath ...
3366
3367 masm.bind (LGoSlowPath) ;
3368 masm.orptr(boxReg, 1) ; // set ICC.ZF=0 to indicate failure
3369 masm.jmpb (DONE_LABEL) ;
3370
3371 masm.bind (LSuccess) ;
3372 masm.xorptr(boxReg, boxReg) ; // set ICC.ZF=1 to indicate success
3373 masm.jmpb (DONE_LABEL) ;
3374 }
3375
3376 masm.bind (Stacked) ;
3377 // It's not inflated and it's not recursively stack-locked and it's not biased.
3378 // It must be stack-locked.
3379 // Try to reset the header to displaced header.
3380 // The "box" value on the stack is stable, so we can reload
3381 // and be assured we observe the same value as above.
3382 masm.movptr(tmpReg, Address(boxReg, 0)) ;
3383 if (os::is_MP()) { masm.lock(); }
3384 masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
3385 // Intentional fall-through into DONE_LABEL
3386
3387
3388 // DONE_LABEL is a hot target - we'd really like to place it at the
3389 // start of a cache line by padding with NOPs.
3390 // See the AMD and Intel software optimization manuals for the
3391 // most efficient "long" NOP encodings.
3392 // Unfortunately none of our alignment mechanisms suffice.
3393 if ((EmitSync & 65536) == 0) {
3394 masm.bind (CheckSucc) ;
3395 }
3396 masm.bind(DONE_LABEL);
3397
3398 // Avoid branch to branch on AMD processors
3399 if (EmitSync & 32768) { masm.nop() ; }
3400 }
3401 %}
3402
3403 2854
3404 enc_class enc_pop_rdx() %{ 2855 enc_class enc_pop_rdx() %{
3405 emit_opcode(cbuf,0x5A); 2856 emit_opcode(cbuf,0x5A);
3406 %} 2857 %}
3407 2858
5657 %} 5108 %}
5658 ins_pipe(ialu_reg); 5109 ins_pipe(ialu_reg);
5659 %} 5110 %}
5660 5111
5661 instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{ 5112 instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
5113 predicate(UseCountTrailingZerosInstruction);
5114 match(Set dst (CountTrailingZerosI src));
5115 effect(KILL cr);
5116
5117 format %{ "TZCNT $dst, $src\t# count trailing zeros (int)" %}
5118 ins_encode %{
5119 __ tzcntl($dst$$Register, $src$$Register);
5120 %}
5121 ins_pipe(ialu_reg);
5122 %}
5123
5124 instruct countTrailingZerosI_bsf(rRegI dst, rRegI src, eFlagsReg cr) %{
5125 predicate(!UseCountTrailingZerosInstruction);
5662 match(Set dst (CountTrailingZerosI src)); 5126 match(Set dst (CountTrailingZerosI src));
5663 effect(KILL cr); 5127 effect(KILL cr);
5664 5128
5665 format %{ "BSF $dst, $src\t# count trailing zeros (int)\n\t" 5129 format %{ "BSF $dst, $src\t# count trailing zeros (int)\n\t"
5666 "JNZ done\n\t" 5130 "JNZ done\n\t"
5676 %} 5140 %}
5677 ins_pipe(ialu_reg); 5141 ins_pipe(ialu_reg);
5678 %} 5142 %}
5679 5143
5680 instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{ 5144 instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
5145 predicate(UseCountTrailingZerosInstruction);
5146 match(Set dst (CountTrailingZerosL src));
5147 effect(TEMP dst, KILL cr);
5148
5149 format %{ "TZCNT $dst, $src.lo\t# count trailing zeros (long) \n\t"
5150 "JNC done\n\t"
5151 "TZCNT $dst, $src.hi\n\t"
5152 "ADD $dst, 32\n"
5153 "done:" %}
5154 ins_encode %{
5155 Register Rdst = $dst$$Register;
5156 Register Rsrc = $src$$Register;
5157 Label done;
5158 __ tzcntl(Rdst, Rsrc);
5159 __ jccb(Assembler::carryClear, done);
5160 __ tzcntl(Rdst, HIGH_FROM_LOW(Rsrc));
5161 __ addl(Rdst, BitsPerInt);
5162 __ bind(done);
5163 %}
5164 ins_pipe(ialu_reg);
5165 %}
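// A hedged C-level sketch of the long TZCNT sequence above (illustration
// only; TZCNT sets CF when its source is zero, which is what JNC keys on):
//
//   static int tzcnt64_via_32(uint32_t lo, uint32_t hi) {
//     if (lo != 0) return __builtin_ctz(lo);       // TZCNT lo; CF == 0 -> done
//     if (hi != 0) return 32 + __builtin_ctz(hi);  // TZCNT hi; ADD 32
//     return 64;                                   // both halves zero: 32 + 32
//   }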
5166
5167 instruct countTrailingZerosL_bsf(rRegI dst, eRegL src, eFlagsReg cr) %{
5168 predicate(!UseCountTrailingZerosInstruction);
5681 match(Set dst (CountTrailingZerosL src)); 5169 match(Set dst (CountTrailingZerosL src));
5682 effect(TEMP dst, KILL cr); 5170 effect(TEMP dst, KILL cr);
5683 5171
5684 format %{ "BSF $dst, $src.lo\t# count trailing zeros (long)\n\t" 5172 format %{ "BSF $dst, $src.lo\t# count trailing zeros (long)\n\t"
5685 "JNZ done\n\t" 5173 "JNZ done\n\t"
7490 %} 6978 %}
7491 6979
7492 //----------Arithmetic Instructions-------------------------------------------- 6980 //----------Arithmetic Instructions--------------------------------------------
7493 //----------Addition Instructions---------------------------------------------- 6981 //----------Addition Instructions----------------------------------------------
7494 6982
7495 instruct addExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
7496 %{
7497 match(AddExactI dst src);
7498 effect(DEF cr);
7499
7500 format %{ "ADD $dst, $src\t# addExact int" %}
7501 ins_encode %{
7502 __ addl($dst$$Register, $src$$Register);
7503 %}
7504 ins_pipe(ialu_reg_reg);
7505 %}
7506
7507 instruct addExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr)
7508 %{
7509 match(AddExactI dst src);
7510 effect(DEF cr);
7511
7512 format %{ "ADD $dst, $src\t# addExact int" %}
7513 ins_encode %{
7514 __ addl($dst$$Register, $src$$constant);
7515 %}
7516 ins_pipe(ialu_reg_reg);
7517 %}
7518
7519 instruct addExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
7520 %{
7521 match(AddExactI dst (LoadI src));
7522 effect(DEF cr);
7523
7524 ins_cost(125);
7525 format %{ "ADD $dst,$src\t# addExact int" %}
7526 ins_encode %{
7527 __ addl($dst$$Register, $src$$Address);
7528 %}
7529 ins_pipe( ialu_reg_mem );
7530 %}
7531
7532
7533 // Integer Addition Instructions 6983 // Integer Addition Instructions
7534 instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ 6984 instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
7535 match(Set dst (AddI dst src)); 6985 match(Set dst (AddI dst src));
7536 effect(KILL cr); 6986 effect(KILL cr);
7537 6987
7837 ins_pipe( pipe_cmpxchg ); 7287 ins_pipe( pipe_cmpxchg );
7838 %} 7288 %}
7839 7289
7840 //----------Subtraction Instructions------------------------------------------- 7290 //----------Subtraction Instructions-------------------------------------------
7841 7291
7842 instruct subExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
7843 %{
7844 match(SubExactI dst src);
7845 effect(DEF cr);
7846
7847 format %{ "SUB $dst, $src\t# subExact int" %}
7848 ins_encode %{
7849 __ subl($dst$$Register, $src$$Register);
7850 %}
7851 ins_pipe(ialu_reg_reg);
7852 %}
7853
7854 instruct subExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr)
7855 %{
7856 match(SubExactI dst src);
7857 effect(DEF cr);
7858
7859 format %{ "SUB $dst, $src\t# subExact int" %}
7860 ins_encode %{
7861 __ subl($dst$$Register, $src$$constant);
7862 %}
7863 ins_pipe(ialu_reg_reg);
7864 %}
7865
7866 instruct subExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
7867 %{
7868 match(SubExactI dst (LoadI src));
7869 effect(DEF cr);
7870
7871 ins_cost(125);
7872 format %{ "SUB $dst,$src\t# subExact int" %}
7873 ins_encode %{
7874 __ subl($dst$$Register, $src$$Address);
7875 %}
7876 ins_pipe( ialu_reg_mem );
7877 %}
7878
7879 // Integer Subtraction Instructions 7292 // Integer Subtraction Instructions
7880 instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ 7293 instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
7881 match(Set dst (SubI dst src)); 7294 match(Set dst (SubI dst src));
7882 effect(KILL cr); 7295 effect(KILL cr);
7883 7296
7940 size(2); 7353 size(2);
7941 format %{ "NEG $dst" %} 7354 format %{ "NEG $dst" %}
7942 opcode(0xF7,0x03); // Opcode F7 /3 7355 opcode(0xF7,0x03); // Opcode F7 /3
7943 ins_encode( OpcP, RegOpc( dst ) ); 7356 ins_encode( OpcP, RegOpc( dst ) );
7944 ins_pipe( ialu_reg ); 7357 ins_pipe( ialu_reg );
7945 %}
7946
7947 instruct negExactI_eReg(eAXRegI dst, eFlagsReg cr) %{
7948 match(NegExactI dst);
7949 effect(DEF cr);
7950
7951 format %{ "NEG $dst\t# negExact int"%}
7952 ins_encode %{
7953 __ negl($dst$$Register);
7954 %}
7955 ins_pipe(ialu_reg);
7956 %} 7358 %}
7957 7359
7958 //----------Multiplication/Division Instructions------------------------------- 7360 //----------Multiplication/Division Instructions-------------------------------
7959 // Integer Multiplication Instructions 7361 // Integer Multiplication Instructions
7960 // Multiply Register 7362 // Multiply Register
8163 "MUL EDX\t# EDX*EAX -> EDX:EAX\n\t" 7565 "MUL EDX\t# EDX*EAX -> EDX:EAX\n\t"
8164 "ADD EDX,$tmp" %} 7566 "ADD EDX,$tmp" %}
8165 ins_encode( long_multiply_con( dst, src, tmp ) ); 7567 ins_encode( long_multiply_con( dst, src, tmp ) );
8166 ins_pipe( pipe_slow ); 7568 ins_pipe( pipe_slow );
8167 %} 7569 %}
8168
8169 instruct mulExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
8170 %{
8171 match(MulExactI dst src);
8172 effect(DEF cr);
8173
8174 ins_cost(300);
8175 format %{ "IMUL $dst, $src\t# mulExact int" %}
8176 ins_encode %{
8177 __ imull($dst$$Register, $src$$Register);
8178 %}
8179 ins_pipe(ialu_reg_reg_alu0);
8180 %}
8181
8182 instruct mulExactI_eReg_imm(eAXRegI dst, rRegI src, immI imm, eFlagsReg cr)
8183 %{
8184 match(MulExactI src imm);
8185 effect(DEF cr);
8186
8187 ins_cost(300);
8188 format %{ "IMUL $dst, $src, $imm\t# mulExact int" %}
8189 ins_encode %{
8190 __ imull($dst$$Register, $src$$Register, $imm$$constant);
8191 %}
8192 ins_pipe(ialu_reg_reg_alu0);
8193 %}
8194
8195 instruct mulExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
8196 %{
8197 match(MulExactI dst (LoadI src));
8198 effect(DEF cr);
8199
8200 ins_cost(350);
8201 format %{ "IMUL $dst, $src\t# mulExact int" %}
8202 ins_encode %{
8203 __ imull($dst$$Register, $src$$Address);
8204 %}
8205 ins_pipe(ialu_reg_mem_alu0);
8206 %}
8207
8208 7570
8209 // Integer DIV with Register 7571 // Integer DIV with Register
8210 instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{ 7572 instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
8211 match(Set rax (DivI rax div)); 7573 match(Set rax (DivI rax div));
8212 effect(KILL rdx, KILL cr); 7574 effect(KILL rdx, KILL cr);
8647 // ins_encode( MemImm( dst, src) ); 8009 // ins_encode( MemImm( dst, src) );
8648 ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) ); 8010 ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
8649 ins_pipe( ialu_mem_imm ); 8011 ins_pipe( ialu_mem_imm );
8650 %} 8012 %}
8651 8013
8014 // BMI1 instructions
8015 instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1, eFlagsReg cr) %{
8016 match(Set dst (AndI (XorI src1 minus_1) src2));
8017 predicate(UseBMI1Instructions);
8018 effect(KILL cr);
8019
8020 format %{ "ANDNL $dst, $src1, $src2" %}
8021
8022 ins_encode %{
8023 __ andnl($dst$$Register, $src1$$Register, $src2$$Register);
8024 %}
8025 ins_pipe(ialu_reg);
8026 %}
8027
8028 instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1, eFlagsReg cr) %{
8029 match(Set dst (AndI (XorI src1 minus_1) (LoadI src2) ));
8030 predicate(UseBMI1Instructions);
8031 effect(KILL cr);
8032
8033 ins_cost(125);
8034 format %{ "ANDNL $dst, $src1, $src2" %}
8035
8036 ins_encode %{
8037 __ andnl($dst$$Register, $src1$$Register, $src2$$Address);
8038 %}
8039 ins_pipe(ialu_reg_mem);
8040 %}
8041
8042 instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{
8043 match(Set dst (AndI (SubI imm_zero src) src));
8044 predicate(UseBMI1Instructions);
8045 effect(KILL cr);
8046
8047 format %{ "BLSIL $dst, $src" %}
8048
8049 ins_encode %{
8050 __ blsil($dst$$Register, $src$$Register);
8051 %}
8052 ins_pipe(ialu_reg);
8053 %}
8054
8055 instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, eFlagsReg cr) %{
8056 match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) ));
8057 predicate(UseBMI1Instructions);
8058 effect(KILL cr);
8059
8060 ins_cost(125);
8061 format %{ "BLSIL $dst, $src" %}
8062
8063 ins_encode %{
8064 __ blsil($dst$$Register, $src$$Address);
8065 %}
8066 ins_pipe(ialu_reg_mem);
8067 %}
8068
8069 instruct blsmskI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr)
8070 %{
8071 match(Set dst (XorI (AddI src minus_1) src));
8072 predicate(UseBMI1Instructions);
8073 effect(KILL cr);
8074
8075 format %{ "BLSMSKL $dst, $src" %}
8076
8077 ins_encode %{
8078 __ blsmskl($dst$$Register, $src$$Register);
8079 %}
8080
8081 ins_pipe(ialu_reg);
8082 %}
8083
8084 instruct blsmskI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr)
8085 %{
8086 match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) ));
8087 predicate(UseBMI1Instructions);
8088 effect(KILL cr);
8089
8090 ins_cost(125);
8091 format %{ "BLSMSKL $dst, $src" %}
8092
8093 ins_encode %{
8094 __ blsmskl($dst$$Register, $src$$Address);
8095 %}
8096
8097 ins_pipe(ialu_reg_mem);
8098 %}
8099
8100 instruct blsrI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr)
8101 %{
8102 match(Set dst (AndI (AddI src minus_1) src) );
8103 predicate(UseBMI1Instructions);
8104 effect(KILL cr);
8105
8106 format %{ "BLSRL $dst, $src" %}
8107
8108 ins_encode %{
8109 __ blsrl($dst$$Register, $src$$Register);
8110 %}
8111
8112 ins_pipe(ialu_reg);
8113 %}
8114
8115 instruct blsrI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr)
8116 %{
8117 match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) ));
8118 predicate(UseBMI1Instructions);
8119 effect(KILL cr);
8120
8121 ins_cost(125);
8122 format %{ "BLSRL $dst, $src" %}
8123
8124 ins_encode %{
8125 __ blsrl($dst$$Register, $src$$Address);
8126 %}
8127
8128 ins_pipe(ialu_reg_mem);
8129 %}
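// Hedged summary of the bit tricks these BMI1 patterns match, written as
// C expressions for illustration:
//
//   andnl(dst, x, y) : dst = ~x & y       // matched as (AndI (XorI x -1) y)
//   blsil(dst, x)    : dst = -x & x       // isolate lowest set bit
//   blsmskl(dst, x)  : dst = (x - 1) ^ x  // mask up to lowest set bit
//   blsrl(dst, x)    : dst = (x - 1) & x  // clear lowest set bit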
8130
8652 // Or Instructions 8131 // Or Instructions
8653 // Or Register with Register 8132 // Or Register with Register
8654 instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{ 8133 instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
8655 match(Set dst (OrI dst src)); 8134 match(Set dst (OrI dst src));
8656 effect(KILL cr); 8135 effect(KILL cr);
9069 8548
9070 /* If I enable this, I encourage spilling in the inner loop of compress. 8549 /* If I enable this, I encourage spilling in the inner loop of compress.
9071 instruct cadd_cmpLTMask_mem(ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr) %{ 8550 instruct cadd_cmpLTMask_mem(ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr) %{
9072 match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q))); 8551 match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q)));
9073 */ 8552 */
8553 //----------Overflow Math Instructions-----------------------------------------
8554
8555 instruct overflowAddI_eReg(eFlagsReg cr, eAXRegI op1, rRegI op2)
8556 %{
8557 match(Set cr (OverflowAddI op1 op2));
8558 effect(DEF cr, USE_KILL op1, USE op2);
8559
8560 format %{ "ADD $op1, $op2\t# overflow check int" %}
8561
8562 ins_encode %{
8563 __ addl($op1$$Register, $op2$$Register);
8564 %}
8565 ins_pipe(ialu_reg_reg);
8566 %}
8567
8568 instruct overflowAddI_rReg_imm(eFlagsReg cr, eAXRegI op1, immI op2)
8569 %{
8570 match(Set cr (OverflowAddI op1 op2));
8571 effect(DEF cr, USE_KILL op1, USE op2);
8572
8573 format %{ "ADD $op1, $op2\t# overflow check int" %}
8574
8575 ins_encode %{
8576 __ addl($op1$$Register, $op2$$constant);
8577 %}
8578 ins_pipe(ialu_reg_reg);
8579 %}
8580
8581 instruct overflowSubI_rReg(eFlagsReg cr, rRegI op1, rRegI op2)
8582 %{
8583 match(Set cr (OverflowSubI op1 op2));
8584
8585 format %{ "CMP $op1, $op2\t# overflow check int" %}
8586 ins_encode %{
8587 __ cmpl($op1$$Register, $op2$$Register);
8588 %}
8589 ins_pipe(ialu_reg_reg);
8590 %}
8591
8592 instruct overflowSubI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2)
8593 %{
8594 match(Set cr (OverflowSubI op1 op2));
8595
8596 format %{ "CMP $op1, $op2\t# overflow check int" %}
8597 ins_encode %{
8598 __ cmpl($op1$$Register, $op2$$constant);
8599 %}
8600 ins_pipe(ialu_reg_reg);
8601 %}
8602
8603 instruct overflowNegI_rReg(eFlagsReg cr, immI0 zero, eAXRegI op2)
8604 %{
8605 match(Set cr (OverflowSubI zero op2));
8606 effect(DEF cr, USE_KILL op2);
8607
8608 format %{ "NEG $op2\t# overflow check int" %}
8609 ins_encode %{
8610 __ negl($op2$$Register);
8611 %}
8612 ins_pipe(ialu_reg_reg);
8613 %}
8614
8615 instruct overflowMulI_rReg(eFlagsReg cr, eAXRegI op1, rRegI op2)
8616 %{
8617 match(Set cr (OverflowMulI op1 op2));
8618 effect(DEF cr, USE_KILL op1, USE op2);
8619
8620 format %{ "IMUL $op1, $op2\t# overflow check int" %}
8621 ins_encode %{
8622 __ imull($op1$$Register, $op2$$Register);
8623 %}
8624 ins_pipe(ialu_reg_reg_alu0);
8625 %}
8626
8627 instruct overflowMulI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2, rRegI tmp)
8628 %{
8629 match(Set cr (OverflowMulI op1 op2));
8630 effect(DEF cr, TEMP tmp, USE op1, USE op2);
8631
8632 format %{ "IMUL $tmp, $op1, $op2\t# overflow check int" %}
8633 ins_encode %{
8634 __ imull($tmp$$Register, $op1$$Register, $op2$$constant);
8635 %}
8636 ins_pipe(ialu_reg_reg_alu0);
8637 %}
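// Hedged note on the forms above: OverflowAddI (and OverflowMulI with a
// register operand) must actually perform the operation to make OF
// visible, so op1 is USE_KILLed, whereas OverflowSubI only needs the
// flags and can use a non-destructive CMP. As a C-level illustration of
// the condition each node produces:
//
//   static bool add_overflows(int32_t a, int32_t b) {
//     int64_t s = (int64_t)a + (int64_t)b;  // widen, like the hardware OF test
//     return s != (int32_t)s;               // true iff ADD would set OF
//   }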
9074 8638
9075 //----------Long Instructions------------------------------------------------ 8639 //----------Long Instructions------------------------------------------------
9076 // Add Long Register with Register 8640 // Add Long Register with Register
9077 instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ 8641 instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
9078 match(Set dst (AddL dst src)); 8642 match(Set dst (AddL dst src));
9182 format %{ "AND $dst.lo,$mem\n\t" 8746 format %{ "AND $dst.lo,$mem\n\t"
9183 "AND $dst.hi,$mem+4" %} 8747 "AND $dst.hi,$mem+4" %}
9184 opcode(0x23, 0x23); 8748 opcode(0x23, 0x23);
9185 ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) ); 8749 ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
9186 ins_pipe( ialu_reg_long_mem ); 8750 ins_pipe( ialu_reg_long_mem );
8751 %}
8752
8753 // BMI1 instructions
8754 instruct andnL_eReg_eReg_eReg(eRegL dst, eRegL src1, eRegL src2, immL_M1 minus_1, eFlagsReg cr) %{
8755 match(Set dst (AndL (XorL src1 minus_1) src2));
8756 predicate(UseBMI1Instructions);
8757 effect(KILL cr, TEMP dst);
8758
8759 format %{ "ANDNL $dst.lo, $src1.lo, $src2.lo\n\t"
8760 "ANDNL $dst.hi, $src1.hi, $src2.hi"
8761 %}
8762
8763 ins_encode %{
8764 Register Rdst = $dst$$Register;
8765 Register Rsrc1 = $src1$$Register;
8766 Register Rsrc2 = $src2$$Register;
8767 __ andnl(Rdst, Rsrc1, Rsrc2);
8768 __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), HIGH_FROM_LOW(Rsrc2));
8769 %}
8770 ins_pipe(ialu_reg_reg_long);
8771 %}
8772
8773 instruct andnL_eReg_eReg_mem(eRegL dst, eRegL src1, memory src2, immL_M1 minus_1, eFlagsReg cr) %{
8774 match(Set dst (AndL (XorL src1 minus_1) (LoadL src2) ));
8775 predicate(UseBMI1Instructions);
8776 effect(KILL cr, TEMP dst);
8777
8778 ins_cost(125);
8779 format %{ "ANDNL $dst.lo, $src1.lo, $src2\n\t"
8780 "ANDNL $dst.hi, $src1.hi, $src2+4"
8781 %}
8782
8783 ins_encode %{
8784 Register Rdst = $dst$$Register;
8785 Register Rsrc1 = $src1$$Register;
8786 Address src2_hi = Address::make_raw($src2$$base, $src2$$index, $src2$$scale, $src2$$disp + 4, relocInfo::none);
8787
8788 __ andnl(Rdst, Rsrc1, $src2$$Address);
8789 __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), src2_hi);
8790 %}
8791 ins_pipe(ialu_reg_mem);
8792 %}
8793
8794 instruct blsiL_eReg_eReg(eRegL dst, eRegL src, immL0 imm_zero, eFlagsReg cr) %{
8795 match(Set dst (AndL (SubL imm_zero src) src));
8796 predicate(UseBMI1Instructions);
8797 effect(KILL cr, TEMP dst);
8798
8799 format %{ "MOVL $dst.hi, 0\n\t"
8800 "BLSIL $dst.lo, $src.lo\n\t"
8801 "JNZ done\n\t"
8802 "BLSIL $dst.hi, $src.hi\n"
8803 "done:"
8804 %}
8805
8806 ins_encode %{
8807 Label done;
8808 Register Rdst = $dst$$Register;
8809 Register Rsrc = $src$$Register;
8810 __ movl(HIGH_FROM_LOW(Rdst), 0);
8811 __ blsil(Rdst, Rsrc);
8812 __ jccb(Assembler::notZero, done);
8813 __ blsil(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
8814 __ bind(done);
8815 %}
8816 ins_pipe(ialu_reg);
8817 %}
8818
8819 instruct blsiL_eReg_mem(eRegL dst, memory src, immL0 imm_zero, eFlagsReg cr) %{
8820 match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) ));
8821 predicate(UseBMI1Instructions);
8822 effect(KILL cr, TEMP dst);
8823
8824 ins_cost(125);
8825 format %{ "MOVL $dst.hi, 0\n\t"
8826 "BLSIL $dst.lo, $src\n\t"
8827 "JNZ done\n\t"
8828 "BLSIL $dst.hi, $src+4\n"
8829 "done:"
8830 %}
8831
8832 ins_encode %{
8833 Label done;
8834 Register Rdst = $dst$$Register;
8835 Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none);
8836
8837 __ movl(HIGH_FROM_LOW(Rdst), 0);
8838 __ blsil(Rdst, $src$$Address);
8839 __ jccb(Assembler::notZero, done);
8840 __ blsil(HIGH_FROM_LOW(Rdst), src_hi);
8841 __ bind(done);
8842 %}
8843 ins_pipe(ialu_reg_mem);
8844 %}
8845
8846 instruct blsmskL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr)
8847 %{
8848 match(Set dst (XorL (AddL src minus_1) src));
8849 predicate(UseBMI1Instructions);
8850 effect(KILL cr, TEMP dst);
8851
8852 format %{ "MOVL $dst.hi, 0\n\t"
8853 "BLSMSKL $dst.lo, $src.lo\n\t"
8854 "JNC done\n\t"
8855 "BLSMSKL $dst.hi, $src.hi\n"
8856 "done:"
8857 %}
8858
8859 ins_encode %{
8860 Label done;
8861 Register Rdst = $dst$$Register;
8862 Register Rsrc = $src$$Register;
8863 __ movl(HIGH_FROM_LOW(Rdst), 0);
8864 __ blsmskl(Rdst, Rsrc);
8865 __ jccb(Assembler::carryClear, done);
8866 __ blsmskl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
8867 __ bind(done);
8868 %}
8869
8870 ins_pipe(ialu_reg);
8871 %}
8872
8873 instruct blsmskL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr)
8874 %{
8875 match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) ));
8876 predicate(UseBMI1Instructions);
8877 effect(KILL cr, TEMP dst);
8878
8879 ins_cost(125);
8880 format %{ "MOVL $dst.hi, 0\n\t"
8881 "BLSMSKL $dst.lo, $src\n\t"
8882 "JNC done\n\t"
8883 "BLSMSKL $dst.hi, $src+4\n"
8884 "done:"
8885 %}
8886
8887 ins_encode %{
8888 Label done;
8889 Register Rdst = $dst$$Register;
8890 Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none);
8891
8892 __ movl(HIGH_FROM_LOW(Rdst), 0);
8893 __ blsmskl(Rdst, $src$$Address);
8894 __ jccb(Assembler::carryClear, done);
8895 __ blsmskl(HIGH_FROM_LOW(Rdst), src_hi);
8896 __ bind(done);
8897 %}
8898
8899 ins_pipe(ialu_reg_mem);
8900 %}
8901
8902 instruct blsrL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr)
8903 %{
8904 match(Set dst (AndL (AddL src minus_1) src) );
8905 predicate(UseBMI1Instructions);
8906 effect(KILL cr, TEMP dst);
8907
8908 format %{ "MOVL $dst.hi, $src.hi\n\t"
8909 "BLSRL $dst.lo, $src.lo\n\t"
8910 "JNC done\n\t"
8911 "BLSRL $dst.hi, $src.hi\n"
8912 "done:"
8913 %}
8914
8915 ins_encode %{
8916 Label done;
8917 Register Rdst = $dst$$Register;
8918 Register Rsrc = $src$$Register;
8919 __ movl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
8920 __ blsrl(Rdst, Rsrc);
8921 __ jccb(Assembler::carryClear, done);
8922 __ blsrl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc));
8923 __ bind(done);
8924 %}
8925
8926 ins_pipe(ialu_reg);
8927 %}
8928
8929 instruct blsrL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr)
8930 %{
8931 match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src) ));
8932 predicate(UseBMI1Instructions);
8933 effect(KILL cr, TEMP dst);
8934
8935 ins_cost(125);
8936 format %{ "MOVL $dst.hi, $src+4\n\t"
8937 "BLSRL $dst.lo, $src\n\t"
8938 "JNC done\n\t"
8939 "BLSRL $dst.hi, $src+4\n"
8940 "done:"
8941 %}
8942
8943 ins_encode %{
8944 Label done;
8945 Register Rdst = $dst$$Register;
8946 Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none);
8947 __ movl(HIGH_FROM_LOW(Rdst), src_hi);
8948 __ blsrl(Rdst, $src$$Address);
8949 __ jccb(Assembler::carryClear, done);
8950 __ blsrl(HIGH_FROM_LOW(Rdst), src_hi);
8951 __ bind(done);
8952 %}
8953
8954 ins_pipe(ialu_reg_mem);
9187 %} 8955 %}
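// A hedged C-level sketch of the 64-bit BLSI decomposition above
// (illustration only; BLSI sets ZF when its source -- and hence its
// result -- is zero, which is what JNZ keys on):
//
//   static uint64_t blsi64(uint32_t lo, uint32_t hi) {
//     if (lo != 0) return (uint64_t)(lo & (0u - lo));  // hi half stays 0
//     return (uint64_t)(hi & (0u - hi)) << 32;         // lowest set bit is in hi
//   }
//
// BLSMSK and BLSR set CF when their source is zero, so the long forms use
// JNC to skip the high-half fix-up whenever the low half had a set bit.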
9188 8956
9189 // Or Long Register with Register 8957 // Or Long Register with Register
9190 instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ 8958 instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
9191 match(Set dst (OrL dst src)); 8959 match(Set dst (OrL dst src));
13102 ins_pipe( pipe_jmp ); 12870 ins_pipe( pipe_jmp );
13103 %} 12871 %}
13104 12872
13105 // inlined locking and unlocking 12873 // inlined locking and unlocking
13106 12874
13107 12875 instruct cmpFastLockRTM(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eDXRegI scr, rRegI cx1, rRegI cx2) %{
13108 instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{ 12876 predicate(Compile::current()->use_rtm());
13109 match( Set cr (FastLock object box) ); 12877 match(Set cr (FastLock object box));
13110 effect( TEMP tmp, TEMP scr, USE_KILL box ); 12878 effect(TEMP tmp, TEMP scr, TEMP cx1, TEMP cx2, USE_KILL box);
12879 ins_cost(300);
12880 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr,$cx1,$cx2" %}
12881 ins_encode %{
12882 __ fast_lock($object$$Register, $box$$Register, $tmp$$Register,
12883 $scr$$Register, $cx1$$Register, $cx2$$Register,
12884 _counters, _rtm_counters, _stack_rtm_counters,
12885 ((Method*)(ra_->C->method()->constant_encoding()))->method_data(),
12886 true, ra_->C->profile_rtm());
12887 %}
12888 ins_pipe(pipe_slow);
12889 %}
12890
12891 instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
12892 predicate(!Compile::current()->use_rtm());
12893 match(Set cr (FastLock object box));
12894 effect(TEMP tmp, TEMP scr, USE_KILL box);
13111 ins_cost(300); 12895 ins_cost(300);
13112 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %} 12896 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
13113 ins_encode( Fast_Lock(object,box,tmp,scr) ); 12897 ins_encode %{
13114 ins_pipe( pipe_slow ); 12898 __ fast_lock($object$$Register, $box$$Register, $tmp$$Register,
13115 %} 12899 $scr$$Register, noreg, noreg, _counters, NULL, NULL, NULL, false, false);
13116 12900 %}
13117 instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{ 12901 ins_pipe(pipe_slow);
13118 match( Set cr (FastUnlock object box) ); 12902 %}
13119 effect( TEMP tmp, USE_KILL box ); 12903
12904 instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
12905 match(Set cr (FastUnlock object box));
12906 effect(TEMP tmp, USE_KILL box);
13120 ins_cost(300); 12907 ins_cost(300);
13121 format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %} 12908 format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
13122 ins_encode( Fast_Unlock(object,box,tmp) ); 12909 ins_encode %{
13123 ins_pipe( pipe_slow ); 12910 __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register, ra_->C->use_rtm());
12911 %}
12912 ins_pipe(pipe_slow);
13124 %} 12913 %}
13125 12914
13126 12915
13127 12916
13128 // ============================================================================ 12917 // ============================================================================