comparison src/cpu/x86/vm/x86_32.ad @ 14494:5292439ef895

8033805: Move Fast_Lock/Fast_Unlock code from .ad files to macroassembler
Summary: Consolidated C2 x86 locking code in one place in macroAssembler_x86.cpp.
Reviewed-by: roland
author kvn
date Mon, 24 Feb 2014 15:12:26 -0800
parents 45467c53f178
children cd5d10655495
comparing 14472:80b39937b791 with 14494:5292439ef895
2916 emit_opcode(cbuf,0x83); // SBB hi,0
2917 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
2918 emit_d8 (cbuf,0 );
2919 %}
2920
2921
2922 // Because the transitions from emitted code to the runtime
2923 // monitorenter/exit helper stubs are so slow it's critical that
2924 // we inline both the stack-locking fast-path and the inflated fast path.
2925 //
2926 // See also: cmpFastLock and cmpFastUnlock.
2927 //
2928 // What follows is a specialized inline transliteration of the code
2929 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat
2930 // another option would be to emit TrySlowEnter and TrySlowExit methods
2931 // at startup-time. These methods would accept arguments as
2932 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
2933 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
2934 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
2935 // In practice, however, the # of lock sites is bounded and is usually small.
2936 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
2937 // if the processor uses simple bimodal branch predictors keyed by EIP,
2938 // since the helper routines would be called from multiple synchronization
2939 // sites.
2940 //
2941 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
2942 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
2943 // to those specialized methods. That'd give us a mostly platform-independent
2944 // implementation that the JITs could optimize and inline at their pleasure.
2945 // Done correctly, the only time we'd need to cross to native code would be
2946 // to park() or unpark() threads. We'd also need a few more unsafe operators
2947 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
2948 // (b) emit explicit barriers or fence operations.
2949 //
2950 // TODO:
2951 //
2952 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
2953 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
2954 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
2955 // the lock operators would typically be faster than reifying Self.
2956 //
2957 // * Ideally I'd define the primitives as:
2958 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
2959 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
2960 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
2961 // Instead, we're stuck with the rather awkward and brittle register assignments below.
2962 // Furthermore the register assignments are overconstrained, possibly resulting in
2963 // sub-optimal code near the synchronization site.
2964 //
2965 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
2966 // Alternately, use a better sp-proximity test.
2967 //
2968 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
2969 // Either one is sufficient to uniquely identify a thread.
2970 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
2971 //
2972 // * Intrinsify notify() and notifyAll() for the common cases where the
2973 // object is locked by the calling thread but the waitlist is empty.
2974 // This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
2975 //
2976 // * use jccb and jmpb instead of jcc and jmp to improve code density.
2977 // But beware of excessive branch density on AMD Opterons.
2978 //
2979 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
2980 // or failure of the fast-path. If the fast-path fails then we pass
2981 // control to the slow-path, typically in C. In Fast_Lock and
2982 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
2983 // will emit a conditional branch immediately after the node.
2984 // So we have branches to branches and lots of ICC.ZF games.
2985 // Instead, it might be better to have C2 pass a "FailureLabel"
2986 // into Fast_Lock and Fast_Unlock. In the case of success, control
2987 // will drop through the node. ICC.ZF is undefined at exit.
2988 // In the case of failure, the node will branch directly to the
2989 // FailureLabel
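
The ZF convention described above is worth making concrete. Below is a minimal model in plain C++ (illustrative only, not HotSpot code and not part of this changeset): a boolean stands in for ICC.ZF, and the compiled lock site branches to the runtime helper immediately after the fast-path node, which is exactly the branch-to-branch shape the comment complains about. The function names are invented for the sketch.

    #include <cstdio>

    // Stand-ins for the emitted fast path and the runtime helper; the bool
    // models ICC.ZF (true == ZF=1 == fast-path success).
    static bool fast_lock_model()  { return true; }
    static void slow_enter_model() { std::puts("monitorenter slow path"); }

    int main() {
      if (!fast_lock_model()) {  // the conditional branch C2 emits right after the node
        slow_enter_model();      // on failure, control passes to the slow path, typically in C
      }
      // ... critical section, then the matching Fast_Unlock / slow_exit pair ...
      return 0;
    }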
2990
2991
2992 // obj: object to lock
2993 // box: on-stack box address (displaced header location) - KILLED
2994 // rax: tmp -- KILLED
2995 // scr: tmp -- KILLED
2996 enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
2997
2998 Register objReg = as_Register($obj$$reg);
2999 Register boxReg = as_Register($box$$reg);
3000 Register tmpReg = as_Register($tmp$$reg);
3001 Register scrReg = as_Register($scr$$reg);
3002
3003 // Ensure the register assignments are disjoint
3004 guarantee (objReg != boxReg, "") ;
3005 guarantee (objReg != tmpReg, "") ;
3006 guarantee (objReg != scrReg, "") ;
3007 guarantee (boxReg != tmpReg, "") ;
3008 guarantee (boxReg != scrReg, "") ;
3009 guarantee (tmpReg == as_Register(EAX_enc), "") ;
3010
3011 MacroAssembler masm(&cbuf);
3012
3013 if (_counters != NULL) {
3014 masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
3015 }
3016 if (EmitSync & 1) {
3017 // set box->dhw = unused_mark (3)
3018 // Force all sync thru slow-path: slow_enter() and slow_exit()
3019 masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
3020 masm.cmpptr (rsp, (int32_t)0) ;
3021 } else
3022 if (EmitSync & 2) {
3023 Label DONE_LABEL ;
3024 if (UseBiasedLocking) {
3025 // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
3026 masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
3027 }
3028
3029 masm.movptr(tmpReg, Address(objReg, 0)) ; // fetch markword
3030 masm.orptr (tmpReg, 0x1);
3031 masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
3032 if (os::is_MP()) { masm.lock(); }
3033 masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
3034 masm.jcc(Assembler::equal, DONE_LABEL);
3035 // Recursive locking
3036 masm.subptr(tmpReg, rsp);
3037 masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
3038 masm.movptr(Address(boxReg, 0), tmpReg);
3039 masm.bind(DONE_LABEL) ;
3040 } else {
3041 // Possible cases that we'll encounter in fast_lock
3042 // ------------------------------------------------
3043 // * Inflated
3044 // -- unlocked
3045 // -- Locked
3046 // = by self
3047 // = by other
3048 // * biased
3049 // -- by Self
3050 // -- by other
3051 // * neutral
3052 // * stack-locked
3053 // -- by self
3054 // = sp-proximity test hits
3055 // = sp-proximity test generates false-negative
3056 // -- by other
3057 //
3058
3059 Label IsInflated, DONE_LABEL, PopDone ;
3060
3061 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
3062 // order to reduce the number of conditional branches in the most common cases.
3063 // Beware -- there's a subtle invariant that fetch of the markword
3064 // at [FETCH], below, will never observe a biased encoding (*101b).
3065 // If this invariant is not held we risk exclusion (safety) failure.
3066 if (UseBiasedLocking && !UseOptoBiasInlining) {
3067 masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
3068 }
3069
3070 masm.movptr(tmpReg, Address(objReg, 0)) ; // [FETCH]
3071 masm.testptr(tmpReg, 0x02) ; // Inflated v (Stack-locked or neutral)
3072 masm.jccb (Assembler::notZero, IsInflated) ;
3073
3074 // Attempt stack-locking ...
3075 masm.orptr (tmpReg, 0x1);
3076 masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
3077 if (os::is_MP()) { masm.lock(); }
3078 masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
3079 if (_counters != NULL) {
3080 masm.cond_inc32(Assembler::equal,
3081 ExternalAddress((address)_counters->fast_path_entry_count_addr()));
3082 }
3083 masm.jccb (Assembler::equal, DONE_LABEL);
3084
3085 // Recursive locking
3086 masm.subptr(tmpReg, rsp);
3087 masm.andptr(tmpReg, 0xFFFFF003 );
3088 masm.movptr(Address(boxReg, 0), tmpReg);
3089 if (_counters != NULL) {
3090 masm.cond_inc32(Assembler::equal,
3091 ExternalAddress((address)_counters->fast_path_entry_count_addr()));
3092 }
3093 masm.jmp (DONE_LABEL) ;
3094
3095 masm.bind (IsInflated) ;
3096
3097 // The object is inflated.
3098 //
3099 // TODO-FIXME: eliminate the ugly use of manifest constants:
3100 // Use markOopDesc::monitor_value instead of "2".
3101 // use markOop::unused_mark() instead of "3".
3102 // The tmpReg value is an objectMonitor reference ORed with
3103 // markOopDesc::monitor_value (2). We can either convert tmpReg to an
3104 // objectmonitor pointer by masking off the "2" bit or we can just
3105 // use tmpReg as an objectmonitor pointer but bias the objectmonitor
3106 // field offsets with "-2" to compensate for and annul the low-order tag bit.
3107 //
3108 // I use the latter as it avoids AGI stalls.
3109 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3110 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3111 //
3112 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
3113
3114 // boxReg refers to the on-stack BasicLock in the current frame.
3115 // We'd like to write:
3116 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
3117 // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
3118 // additional latency as we have another ST in the store buffer that must drain.
3119
3120 if (EmitSync & 8192) {
3121 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
3122 masm.get_thread (scrReg) ;
3123 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3124 masm.movptr(tmpReg, NULL_WORD); // consider: xor vs mov
3125 if (os::is_MP()) { masm.lock(); }
3126 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3127 } else
3128 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
3129 masm.movptr(scrReg, boxReg) ;
3130 masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
3131
3132 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3133 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3134 // prefetchw [eax + Offset(_owner)-2]
3135 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3136 }
3137
3138 if ((EmitSync & 64) == 0) {
3139 // Optimistic form: consider XORL tmpReg,tmpReg
3140 masm.movptr(tmpReg, NULL_WORD) ;
3141 } else {
3142 // Can suffer RTS->RTO upgrades on shared or cold $ lines
3143 // Test-And-CAS instead of CAS
3144 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
3145 masm.testptr(tmpReg, tmpReg) ; // Locked ?
3146 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3147 }
3148
3149 // Appears unlocked - try to swing _owner from null to non-null.
3150 // Ideally, I'd manifest "Self" with get_thread and then attempt
3151 // to CAS the register containing Self into m->Owner.
3152 // But we don't have enough registers, so instead we can either try to CAS
3153 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
3154 // we later store "Self" into m->Owner. Transiently storing a stack address
3155 // (rsp or the address of the box) into m->owner is harmless.
3156 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
3157 if (os::is_MP()) { masm.lock(); }
3158 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3159 masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3
3160 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3161 masm.get_thread (scrReg) ; // beware: clobbers ICCs
3162 masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
3163 masm.xorptr(boxReg, boxReg) ; // set icc.ZFlag = 1 to indicate success
3164
3165 // If the CAS fails we can either retry or pass control to the slow-path.
3166 // We use the latter tactic.
3167 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
3168 // If the CAS was successful ...
3169 // Self has acquired the lock
3170 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
3171 // Intentional fall-through into DONE_LABEL ...
3172 } else {
3173 masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
3174 masm.movptr(boxReg, tmpReg) ;
3175
3176 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3177 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3178 // prefetchw [eax + Offset(_owner)-2]
3179 masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3180 }
3181
3182 if ((EmitSync & 64) == 0) {
3183 // Optimistic form
3184 masm.xorptr (tmpReg, tmpReg) ;
3185 } else {
3186 // Can suffer RTS->RTO upgrades on shared or cold $ lines
3187 masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
3188 masm.testptr(tmpReg, tmpReg) ; // Locked ?
3189 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3190 }
3191
3192 // Appears unlocked - try to swing _owner from null to non-null.
3193 // Use either "Self" (in scr) or rsp as thread identity in _owner.
3194 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
3195 masm.get_thread (scrReg) ;
3196 if (os::is_MP()) { masm.lock(); }
3197 masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3198
3199 // If the CAS fails we can either retry or pass control to the slow-path.
3200 // We use the latter tactic.
3201 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
3202 // If the CAS was successful ...
3203 // Self has acquired the lock
3204 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
3205 // Intentional fall-through into DONE_LABEL ...
3206 }
3207
3208 // DONE_LABEL is a hot target - we'd really like to place it at the
3209 // start of a cache line by padding with NOPs.
3210 // See the AMD and Intel software optimization manuals for the
3211 // most efficient "long" NOP encodings.
3212 // Unfortunately none of our alignment mechanisms suffice.
3213 masm.bind(DONE_LABEL);
3214
3215 // Avoid branch-to-branch on AMD processors
3216 // This appears to be superstition.
3217 if (EmitSync & 32) masm.nop() ;
3218
3219
3220 // At DONE_LABEL the icc ZFlag is set as follows ...
3221 // Fast_Unlock uses the same protocol.
3222 // ZFlag == 1 -> Success
3223 // ZFlag == 0 -> Failure - force control through the slow-path
3224 }
3225 %}
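
To make the control flow of the non-EmitSync branches above easier to follow, here is a small self-contained C++ model (illustrative only, not HotSpot code) of the three decisions Fast_Lock makes on the fetched markword: the inflation test on bit 1, the stack-locking CAS of the BasicLock address into the object header, and the sp-proximity test that recognizes a recursive stack-lock after a failed CAS. The 0x02 and 0xFFFFF003 constants come from the code above; the atomic type and function names are invented for the sketch.

    #include <atomic>
    #include <cstdint>

    // 32-bit markword model.  Low tag bits: ..01 = neutral/unlocked,
    // ..00 = stack-locked (mark is a pointer to an on-stack BasicLock),
    // ..10 = inflated (mark is ObjectMonitor* | monitor_value).
    typedef uint32_t markword_t;

    static bool is_inflated(markword_t mark) {
      return (mark & 0x02) != 0;                 // masm.testptr(tmpReg, 0x02)
    }

    // Attempt stack-locking: store the anticipated displaced header (mark | 1)
    // into the box, then CAS the box address into the object header.
    static bool try_stack_lock(std::atomic<markword_t>& obj_header,
                               markword_t mark, uint32_t box_addr,
                               markword_t* displaced /* box->dhw */) {
      *displaced = mark | 0x1;                   // anticipate a successful CAS
      markword_t expected = mark | 0x1;
      return obj_header.compare_exchange_strong(expected, box_addr);
    }

    // After a failed CAS, EAX holds the object's current header.  If
    // (header - rsp) & 0xFFFFF003 is zero, the displaced header points into
    // our own stack close to rsp and carries the stack-lock tag, so this is a
    // recursive enter and a zero displaced header is recorded in the box.
    static bool is_recursive_stack_lock(markword_t current_header, uint32_t rsp) {
      return ((current_header - rsp) & 0xFFFFF003u) == 0;
    }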
3226
3227 // obj: object to unlock
3228 // box: box address (displaced header location), killed. Must be EAX.
3229 // rbx: killed tmp; cannot be obj nor box.
3230 //
3231 // Some commentary on balanced locking:
3232 //
3233 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
3234 // Methods that don't have provably balanced locking are forced to run in the
3235 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
3236 // The interpreter provides two properties:
3237 // I1: At return-time the interpreter automatically and quietly unlocks any
3238 // objects acquired by the current activation (frame). Recall that the
3239 // interpreter maintains an on-stack list of locks currently held by
3240 // a frame.
3241 // I2: If a method attempts to unlock an object that is not held by the
3242 // frame, the interpreter throws IMSX.
3243 //
3244 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
3245 // B() doesn't have provably balanced locking so it runs in the interpreter.
3246 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
3247 // is still locked by A().
3248 //
3249 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
3250 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
3251 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
3252 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
3253
3254 enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
3255
3256 Register objReg = as_Register($obj$$reg);
3257 Register boxReg = as_Register($box$$reg);
3258 Register tmpReg = as_Register($tmp$$reg);
3259
3260 guarantee (objReg != boxReg, "") ;
3261 guarantee (objReg != tmpReg, "") ;
3262 guarantee (boxReg != tmpReg, "") ;
3263 guarantee (boxReg == as_Register(EAX_enc), "") ;
3264 MacroAssembler masm(&cbuf);
3265
3266 if (EmitSync & 4) {
3267 // Disable - inhibit all inlining. Force control through the slow-path
3268 masm.cmpptr (rsp, 0) ;
3269 } else
3270 if (EmitSync & 8) {
3271 Label DONE_LABEL ;
3272 if (UseBiasedLocking) {
3273 masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3274 }
3275 // classic stack-locking code ...
3276 masm.movptr(tmpReg, Address(boxReg, 0)) ;
3277 masm.testptr(tmpReg, tmpReg) ;
3278 masm.jcc (Assembler::zero, DONE_LABEL) ;
3279 if (os::is_MP()) { masm.lock(); }
3280 masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
3281 masm.bind(DONE_LABEL);
3282 } else {
3283 Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
3284
3285 // Critically, the biased locking test must have precedence over
3286 // and appear before the (box->dhw == 0) recursive stack-lock test.
3287 if (UseBiasedLocking && !UseOptoBiasInlining) {
3288 masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3289 }
3290
3291 masm.cmpptr(Address(boxReg, 0), 0) ; // Examine the displaced header
3292 masm.movptr(tmpReg, Address(objReg, 0)) ; // Examine the object's markword
3293 masm.jccb (Assembler::zero, DONE_LABEL) ; // 0 indicates recursive stack-lock
3294
3295 masm.testptr(tmpReg, 0x02) ; // Inflated?
3296 masm.jccb (Assembler::zero, Stacked) ;
3297
3298 masm.bind (Inflated) ;
3299 // It's inflated.
3300 // Despite our balanced locking property we still check that m->_owner == Self
3301 // as java routines or native JNI code called by this thread might
3302 // have released the lock.
3303 // Refer to the comments in synchronizer.cpp for how we might encode extra
3304 // state in _succ so we can avoid fetching EntryList|cxq.
3305 //
3306 // I'd like to add more cases in fast_lock() and fast_unlock() --
3307 // such as recursive enter and exit -- but we have to be wary of
3308 // I$ bloat, T$ effects and BP$ effects.
3309 //
3310 // If there's no contention try a 1-0 exit. That is, exit without
3311 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
3312 // we detect and recover from the race that the 1-0 exit admits.
3313 //
3314 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
3315 // before it STs null into _owner, releasing the lock. Updates
3316 // to data protected by the critical section must be visible before
3317 // we drop the lock (and thus before any other thread could acquire
3318 // the lock and observe the fields protected by the lock).
3319 // IA32's memory-model is SPO, so STs are ordered with respect to
3320 // each other and there's no need for an explicit barrier (fence).
3321 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3322
3323 masm.get_thread (boxReg) ;
3324 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3325 // prefetchw [ebx + Offset(_owner)-2]
3326 masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3327 }
3328
3329 // Note that we could employ various encoding schemes to reduce
3330 // the number of loads below (currently 4) to just 2 or 3.
3331 // Refer to the comments in synchronizer.cpp.
3332 // In practice the chain of fetches doesn't seem to impact performance, however.
3333 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3334 // Attempt to reduce branch density - AMD's branch predictor.
3335 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3336 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3337 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3338 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3339 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3340 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3341 masm.jmpb (DONE_LABEL) ;
3342 } else {
3343 masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3344 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3345 masm.jccb (Assembler::notZero, DONE_LABEL) ;
3346 masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
3347 masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
3348 masm.jccb (Assembler::notZero, CheckSucc) ;
3349 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3350 masm.jmpb (DONE_LABEL) ;
3351 }
3352
3353 // The following code fragment (EmitSync & 65536) improves the performance of
3354 // contended applications and contended synchronization microbenchmarks.
3355 // Unfortunately the emission of the code - even though not executed - causes regressions
3356 // in scimark and jetstream, evidently because of $ effects. Replacing the code
3357 // with an equal number of never-executed NOPs results in the same regression.
3358 // We leave it off by default.
3359
3360 if ((EmitSync & 65536) != 0) {
3361 Label LSuccess, LGoSlowPath ;
3362
3363 masm.bind (CheckSucc) ;
3364
3365 // Optional pre-test ... it's safe to elide this
3366 if ((EmitSync & 16) == 0) {
3367 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3368 masm.jccb (Assembler::zero, LGoSlowPath) ;
3369 }
3370
3371 // We have a classic Dekker-style idiom:
3372 // ST m->_owner = 0 ; MEMBAR; LD m->_succ
3373 // There are a number of ways to implement the barrier:
3374 // (1) lock:andl &m->_owner, 0
3375 // is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
3376 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
3377 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3378 // (2) If supported, an explicit MFENCE is appealing.
3379 // In older IA32 processors MFENCE is slower than lock:add or xchg
3380 // particularly if the write-buffer is full, as might be the case if
3381 // stores closely precede the fence or fence-equivalent instruction.
3382 // In more modern implementations MFENCE appears faster, however.
3383 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3384 // The $lines underlying the top-of-stack should be in M-state.
3385 // The locked add instruction is serializing, of course.
3386 // (4) Use xchg, which is serializing
3387 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3388 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3389 // The integer condition codes will tell us if succ was 0.
3390 // Since _succ and _owner should reside in the same $line and
3391 // we just stored into _owner, it's likely that the $line
3392 // remains in M-state for the lock:orl.
3393 //
3394 // We currently use (3), although it's likely that switching to (2)
3395 // is correct for the future.
3396
3397 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
3398 if (os::is_MP()) {
3399 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
3400 masm.mfence();
3401 } else {
3402 masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
3403 }
3404 }
3405 // Ratify _succ remains non-null
3406 masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
3407 masm.jccb (Assembler::notZero, LSuccess) ;
3408
3409 masm.xorptr(boxReg, boxReg) ; // box is really EAX
3410 if (os::is_MP()) { masm.lock(); }
3411 masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3412 masm.jccb (Assembler::notEqual, LSuccess) ;
3413 // Since we're low on registers we installed rsp as a placeholder in _owner.
3414 // Now install Self over rsp. This is safe as we're transitioning from
3415 // non-null to non-null
3416 masm.get_thread (boxReg) ;
3417 masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
3418 // Intentional fall-through into LGoSlowPath ...
3419
3420 masm.bind (LGoSlowPath) ;
3421 masm.orptr(boxReg, 1) ; // set ICC.ZF=0 to indicate failure
3422 masm.jmpb (DONE_LABEL) ;
3423
3424 masm.bind (LSuccess) ;
3425 masm.xorptr(boxReg, boxReg) ; // set ICC.ZF=1 to indicate success
3426 masm.jmpb (DONE_LABEL) ;
3427 }
3428
3429 masm.bind (Stacked) ;
3430 // It's not inflated and it's not recursively stack-locked and it's not biased.
3431 // It must be stack-locked.
3432 // Try to reset the header to displaced header.
3433 // The "box" value on the stack is stable, so we can reload
3434 // and be assured we observe the same value as above.
3435 masm.movptr(tmpReg, Address(boxReg, 0)) ;
3436 if (os::is_MP()) { masm.lock(); }
3437 masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
3438 // Intentional fall-through into DONE_LABEL
3439
3440
3441 // DONE_LABEL is a hot target - we'd really like to place it at the
3442 // start of a cache line by padding with NOPs.
3443 // See the AMD and Intel software optimization manuals for the
3444 // most efficient "long" NOP encodings.
3445 // Unfortunately none of our alignment mechanisms suffice.
3446 if ((EmitSync & 65536) == 0) {
3447 masm.bind (CheckSucc) ;
3448 }
3449 masm.bind(DONE_LABEL);
3450
3451 // Avoid branch to branch on AMD processors
3452 if (EmitSync & 32768) { masm.nop() ; }
3453 }
3454 %}
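
The 1-0 exit and the Dekker-style "ST _owner = 0; MEMBAR; LD _succ" race check in the (EmitSync & 65536) path above can be summarized with a small C++11 model (illustrative only, not HotSpot code). The structure and function names are invented, and the model CASes Self back in directly where the assembly first installs rsp as a placeholder and then overwrites it with Self.

    #include <atomic>

    struct MonitorModel {                  // stands in for ObjectMonitor
      std::atomic<void*> owner{nullptr};   // _owner
      std::atomic<void*> succ{nullptr};    // _succ (heir presumptive)
    };

    // Returns true if the fast path released the lock (ICC.ZF=1 in the assembly),
    // false if the caller must take the slow path to wake a successor.
    static bool fast_exit_model(MonitorModel& m, void* self) {
      m.owner.store(nullptr, std::memory_order_seq_cst);  // drop the lock (the ST)
      // The seq_cst store/load pair plays the role of the "lock; addl [esp],0"
      // fence emitted above, ordering the ST of _owner before the LD of _succ.
      if (m.succ.load(std::memory_order_seq_cst) != nullptr) {
        return true;                       // a successor exists and will re-check
      }
      // No apparent successor: try to re-acquire so we can wake someone on the
      // slow path.  If another thread grabbed the lock in the meantime, it now
      // owns the wakeup responsibility and we are done.
      void* expected = nullptr;
      bool reacquired = m.owner.compare_exchange_strong(expected, self);
      return !reacquired;                  // re-acquired -> LGoSlowPath (failure)
    }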
3455
3456
3457 enc_class enc_pop_rdx() %{
3458 emit_opcode(cbuf,0x5A);
3459 %}
3460
3461 enc_class enc_rethrow() %{
13155 ins_pipe( pipe_jmp );
13156 %}
13157
13158 // inlined locking and unlocking
13159
deleted (old lines 13160-13177):
13160
13161 instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
13162 match( Set cr (FastLock object box) );
13163 effect( TEMP tmp, TEMP scr, USE_KILL box );
13164 ins_cost(300);
13165 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
13166 ins_encode( Fast_Lock(object,box,tmp,scr) );
13167 ins_pipe( pipe_slow );
13168 %}
13169
13170 instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
13171 match( Set cr (FastUnlock object box) );
13172 effect( TEMP tmp, USE_KILL box );
13173 ins_cost(300);
13174 format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
13175 ins_encode( Fast_Unlock(object,box,tmp) );
13176 ins_pipe( pipe_slow );
13177 %}

inserted (new lines 12624-12644):
12624 instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
12625 match(Set cr (FastLock object box));
12626 effect(TEMP tmp, TEMP scr, USE_KILL box);
12627 ins_cost(300);
12628 format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
12629 ins_encode %{
12630 __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
12631 %}
12632 ins_pipe(pipe_slow);
12633 %}
12634
12635 instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
12636 match(Set cr (FastUnlock object box));
12637 effect(TEMP tmp, USE_KILL box);
12638 ins_cost(300);
12639 format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
12640 ins_encode %{
12641 __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
12642 %}
12643 ins_pipe(pipe_slow);
12644 %}
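
The new ins_encode bodies above delegate to the locking code this changeset consolidates in macroAssembler_x86.cpp (per the summary), with `__` being the generated-code shorthand for the MacroAssembler instance. Judging only from the call sites above, the new entry points look roughly like the sketch below; the stand-in class name and forward declarations exist just to keep the snippet self-contained, and the parameter names are assumptions rather than text from the real header.

    // Sketch inferred from the call sites above, not copied from macroAssembler_x86.hpp.
    class Register;                 // stand-in forward declarations
    class BiasedLockingCounters;

    class MacroAssemblerSketch {
     public:
      // obj/box/tmp/scr as in the old Fast_Lock enc_class, plus the counters
      // pointer previously read from _counters.  Sets ZF=1 on fast-path success.
      void fast_lock(Register obj, Register box, Register tmp,
                     Register scr, BiasedLockingCounters* counters);

      // obj/box/tmp as in the old Fast_Unlock enc_class.  Same ZF protocol.
      void fast_unlock(Register obj, Register box, Register tmp);
    };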
13178
13179
13180
13181 // ============================================================================