diff src/cpu/x86/vm/macroAssembler_x86.cpp @ 14494:5292439ef895

8033805: Move Fast_Lock/Fast_Unlock code from .ad files to macroassembler
Summary: Consolidated C2 x86 locking code in one place in macroAssembler_x86.cpp.
Reviewed-by: roland
author kvn
date Mon, 24 Feb 2014 15:12:26 -0800
parents 61746b5f0ed3
children d8041d695d19 606acabe7b5c
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Feb 21 08:09:15 2014 -0800
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Feb 24 15:12:26 2014 -0800
@@ -98,217 +98,6 @@
   return Address::make_array(adr);
 }
 
-int MacroAssembler::biased_locking_enter(Register lock_reg,
-                                         Register obj_reg,
-                                         Register swap_reg,
-                                         Register tmp_reg,
-                                         bool swap_reg_contains_mark,
-                                         Label& done,
-                                         Label* slow_case,
-                                         BiasedLockingCounters* counters) {
-  assert(UseBiasedLocking, "why call this otherwise?");
-  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
-  assert_different_registers(lock_reg, obj_reg, swap_reg);
-
-  if (PrintBiasedLockingStatistics && counters == NULL)
-    counters = BiasedLocking::counters();
-
-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    need_tmp_reg = true;
-    tmp_reg = lock_reg;
-  } else {
-    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
-  }
-  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
-  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
-  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
-  Address saved_mark_addr(lock_reg, 0);
-
-  // Biased locking
-  // See whether the lock is currently biased toward our thread and
-  // whether the epoch is still valid
-  // Note that the runtime guarantees sufficient alignment of JavaThread
-  // pointers to allow age to be placed into low bits
-  // First check to see whether biasing is even enabled for this object
-  Label cas_label;
-  int null_check_offset = -1;
-  if (!swap_reg_contains_mark) {
-    null_check_offset = offset();
-    movl(swap_reg, mark_addr);
-  }
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  movl(tmp_reg, swap_reg);
-  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  jcc(Assembler::notEqual, cas_label);
-  // The bias pattern is present in the object's header. Need to check
-  // whether the bias owner and the epoch are both still current.
-  // Note that because there is no current thread register on x86 we
-  // need to store off the mark word we read out of the object to
-  // avoid reloading it and needing to recheck invariants below. This
-  // store is unfortunate but it makes the overall code shorter and
-  // simpler.
-  movl(saved_mark_addr, swap_reg);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  xorl(swap_reg, tmp_reg);
-  if (swap_reg_contains_mark) {
-    null_check_offset = offset();
-  }
-  movl(tmp_reg, klass_addr);
-  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
-  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
-  }
-  jcc(Assembler::equal, done);
-
-  Label try_revoke_bias;
-  Label try_rebias;
-
-  // At this point we know that the header has the bias pattern and
-  // that we are not the bias owner in the current epoch. We need to
-  // figure out more details about the state of the header in order to
-  // know what operations can be legally performed on the object's
-  // header.
-
-  // If the low three bits in the xor result aren't clear, that means
-  // the prototype header is no longer biased and we have to revoke
-  // the bias on this object.
-  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
-  jcc(Assembler::notZero, try_revoke_bias);
-
-  // Biasing is still enabled for this data type. See whether the
-  // epoch of the current bias is still valid, meaning that the epoch
-  // bits of the mark word are equal to the epoch bits of the
-  // prototype header. (Note that the prototype header's epoch bits
-  // only change at a safepoint.) If not, attempt to rebias the object
-  // toward the current thread. Note that we must be absolutely sure
-  // that the current epoch is invalid in order to do this because
-  // otherwise the manipulations it performs on the mark word are
-  // illegal.
-  testl(swap_reg, markOopDesc::epoch_mask_in_place);
-  jcc(Assembler::notZero, try_rebias);
-
-  // The epoch of the current bias is still valid but we know nothing
-  // about the owner; it might be set or it might be clear. Try to
-  // acquire the bias of the object using an atomic operation. If this
-  // fails we will go in to the runtime to revoke the object's bias.
-  // Note that we first construct the presumed unbiased header so we
-  // don't accidentally blow away another thread's valid bias.
-  movl(swap_reg, saved_mark_addr);
-  andl(swap_reg,
-       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  orl(tmp_reg, swap_reg);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // If the biasing toward our thread failed, this means that
-  // another thread succeeded in biasing it toward itself and we
-  // need to revoke that bias. The revocation will occur in the
-  // interpreter runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_rebias);
-  // At this point we know the epoch has expired, meaning that the
-  // current "bias owner", if any, is actually invalid. Under these
-  // circumstances _only_, we are allowed to use the current header's
-  // value as the comparison value when doing the cas to acquire the
-  // bias in the current epoch. In other words, we allow transfer of
-  // the bias from one thread to another directly in this situation.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  get_thread(tmp_reg);
-  movl(swap_reg, klass_addr);
-  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
-  movl(swap_reg, saved_mark_addr);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // If the biasing toward our thread failed, then another thread
-  // succeeded in biasing it toward itself and we need to revoke that
-  // bias. The revocation will occur in the runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_revoke_bias);
-  // The prototype mark in the klass doesn't have the bias bit set any
-  // more, indicating that objects of this data type are not supposed
-  // to be biased any more. We are going to try to reset the mark of
-  // this object to the prototype value and fall through to the
-  // CAS-based locking scheme. Note that if our CAS fails, it means
-  // that another thread raced us for the privilege of revoking the
-  // bias of this particular object, so it's okay to continue in the
-  // normal locking code.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  movl(swap_reg, saved_mark_addr);
-  if (need_tmp_reg) {
-    push(tmp_reg);
-  }
-  movl(tmp_reg, klass_addr);
-  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
-  if (need_tmp_reg) {
-    pop(tmp_reg);
-  }
-  // Fall through to the normal CAS-based lock, because no matter what
-  // the result of the above CAS, some thread must have succeeded in
-  // removing the bias bit from the object's header.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
-  }
-
-  bind(cas_label);
-
-  return null_check_offset;
-}
 void MacroAssembler::call_VM_leaf_base(address entry_point,
                                        int number_of_arguments) {
   call(RuntimeAddress(entry_point));
@@ -726,165 +515,6 @@
   return array;
 }
 
-int MacroAssembler::biased_locking_enter(Register lock_reg,
-                                         Register obj_reg,
-                                         Register swap_reg,
-                                         Register tmp_reg,
-                                         bool swap_reg_contains_mark,
-                                         Label& done,
-                                         Label* slow_case,
-                                         BiasedLockingCounters* counters) {
-  assert(UseBiasedLocking, "why call this otherwise?");
-  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
-  assert(tmp_reg != noreg, "tmp_reg must be supplied");
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
-  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
-  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
-  Address saved_mark_addr(lock_reg, 0);
-
-  if (PrintBiasedLockingStatistics && counters == NULL)
-    counters = BiasedLocking::counters();
-
-  // Biased locking
-  // See whether the lock is currently biased toward our thread and
-  // whether the epoch is still valid
-  // Note that the runtime guarantees sufficient alignment of JavaThread
-  // pointers to allow age to be placed into low bits
-  // First check to see whether biasing is even enabled for this object
-  Label cas_label;
-  int null_check_offset = -1;
-  if (!swap_reg_contains_mark) {
-    null_check_offset = offset();
-    movq(swap_reg, mark_addr);
-  }
-  movq(tmp_reg, swap_reg);
-  andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
-  jcc(Assembler::notEqual, cas_label);
-  // The bias pattern is present in the object's header. Need to check
-  // whether the bias owner and the epoch are both still current.
-  load_prototype_header(tmp_reg, obj_reg);
-  orq(tmp_reg, r15_thread);
-  xorq(tmp_reg, swap_reg);
-  andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  jcc(Assembler::equal, done);
-
-  Label try_revoke_bias;
-  Label try_rebias;
-
-  // At this point we know that the header has the bias pattern and
-  // that we are not the bias owner in the current epoch. We need to
-  // figure out more details about the state of the header in order to
-  // know what operations can be legally performed on the object's
-  // header.
-
-  // If the low three bits in the xor result aren't clear, that means
-  // the prototype header is no longer biased and we have to revoke
-  // the bias on this object.
-  testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
-  jcc(Assembler::notZero, try_revoke_bias);
-
-  // Biasing is still enabled for this data type. See whether the
-  // epoch of the current bias is still valid, meaning that the epoch
-  // bits of the mark word are equal to the epoch bits of the
-  // prototype header. (Note that the prototype header's epoch bits
-  // only change at a safepoint.) If not, attempt to rebias the object
-  // toward the current thread. Note that we must be absolutely sure
-  // that the current epoch is invalid in order to do this because
-  // otherwise the manipulations it performs on the mark word are
-  // illegal.
-  testq(tmp_reg, markOopDesc::epoch_mask_in_place);
-  jcc(Assembler::notZero, try_rebias);
-
-  // The epoch of the current bias is still valid but we know nothing
-  // about the owner; it might be set or it might be clear. Try to
-  // acquire the bias of the object using an atomic operation. If this
-  // fails we will go in to the runtime to revoke the object's bias.
-  // Note that we first construct the presumed unbiased header so we
-  // don't accidentally blow away another thread's valid bias.
-  andq(swap_reg,
-       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
-  movq(tmp_reg, swap_reg);
-  orq(tmp_reg, r15_thread);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // If the biasing toward our thread failed, this means that
-  // another thread succeeded in biasing it toward itself and we
-  // need to revoke that bias. The revocation will occur in the
-  // interpreter runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_rebias);
-  // At this point we know the epoch has expired, meaning that the
-  // current "bias owner", if any, is actually invalid. Under these
-  // circumstances _only_, we are allowed to use the current header's
-  // value as the comparison value when doing the cas to acquire the
-  // bias in the current epoch. In other words, we allow transfer of
-  // the bias from one thread to another directly in this situation.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  load_prototype_header(tmp_reg, obj_reg);
-  orq(tmp_reg, r15_thread);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // If the biasing toward our thread failed, then another thread
-  // succeeded in biasing it toward itself and we need to revoke that
-  // bias. The revocation will occur in the runtime in the slow case.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
-  }
-  if (slow_case != NULL) {
-    jcc(Assembler::notZero, *slow_case);
-  }
-  jmp(done);
-
-  bind(try_revoke_bias);
-  // The prototype mark in the klass doesn't have the bias bit set any
-  // more, indicating that objects of this data type are not supposed
-  // to be biased any more. We are going to try to reset the mark of
-  // this object to the prototype value and fall through to the
-  // CAS-based locking scheme. Note that if our CAS fails, it means
-  // that another thread raced us for the privilege of revoking the
-  // bias of this particular object, so it's okay to continue in the
-  // normal locking code.
-  //
-  // FIXME: due to a lack of registers we currently blow away the age
-  // bits in this situation. Should attempt to preserve them.
-  load_prototype_header(tmp_reg, obj_reg);
-  if (os::is_MP()) {
-    lock();
-  }
-  cmpxchgq(tmp_reg, Address(obj_reg, 0));
-  // Fall through to the normal CAS-based lock, because no matter what
-  // the result of the above CAS, some thread must have succeeded in
-  // removing the bias bit from the object's header.
-  if (counters != NULL) {
-    cond_inc32(Assembler::zero,
-               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
-  }
-
-  bind(cas_label);
-
-  return null_check_offset;
-}
-
 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
   Label L, E;
 
@@ -1360,9 +990,16 @@
 
 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
   pushf();
-  if (os::is_MP())
-    lock();
-  incrementl(counter_addr);
+  if (reachable(counter_addr)) {
+    if (os::is_MP())
+      lock();
+    incrementl(as_Address(counter_addr));
+  } else {
+    lea(rscratch1, counter_addr);
+    if (os::is_MP())
+      lock();
+    incrementl(Address(rscratch1, 0));
+  }
   popf();
 }
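
The reachability guard added to atomic_incl above exists because, on x86_64, an absolute counter address can be used as a RIP-relative operand only when it lies within a signed 32-bit displacement of the emitting code; a far address has to be materialized into rscratch1 first and then addressed as [rscratch1 + 0]. The standalone sketch below models that test. It is an illustration only, with a hypothetical helper name, not the HotSpot reachable() implementation.

#include <cstdint>
#include <cstdio>

// Hypothetical model of the check behind reachable(counter_addr): a target is
// RIP-addressable only if its displacement from the code position fits in a
// signed 32-bit immediate; otherwise take the lea(rscratch1, ...) path above.
static bool fits_in_rip_disp32(uint64_t code_pos, uint64_t target) {
  int64_t disp = static_cast<int64_t>(target - code_pos);
  return disp == static_cast<int32_t>(disp);
}

int main() {
  uint64_t code = 0x00007f0000001000ULL;                          // example code position
  printf("%d\n", fits_in_rip_disp32(code, code + 0x1000));        // 1: within +/- 2GB
  printf("%d\n", fits_in_rip_disp32(code, code + (3ULL << 31)));  // 0: more than 2GB away
  return 0;
}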
 
@@ -1393,6 +1030,234 @@
   }
 }
 
+int MacroAssembler::biased_locking_enter(Register lock_reg,
+                                         Register obj_reg,
+                                         Register swap_reg,
+                                         Register tmp_reg,
+                                         bool swap_reg_contains_mark,
+                                         Label& done,
+                                         Label* slow_case,
+                                         BiasedLockingCounters* counters) {
+  assert(UseBiasedLocking, "why call this otherwise?");
+  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
+  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
+  bool need_tmp_reg = false;
+  if (tmp_reg == noreg) {
+    need_tmp_reg = true;
+    tmp_reg = lock_reg;
+    assert_different_registers(lock_reg, obj_reg, swap_reg);
+  } else {
+    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
+  }
+  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
+  Address saved_mark_addr(lock_reg, 0);
+
+  if (PrintBiasedLockingStatistics && counters == NULL) {
+    counters = BiasedLocking::counters();
+  }
+  // Biased locking
+  // See whether the lock is currently biased toward our thread and
+  // whether the epoch is still valid
+  // Note that the runtime guarantees sufficient alignment of JavaThread
+  // pointers to allow age to be placed into low bits
+  // First check to see whether biasing is even enabled for this object
+  Label cas_label;
+  int null_check_offset = -1;
+  if (!swap_reg_contains_mark) {
+    null_check_offset = offset();
+    movptr(swap_reg, mark_addr);
+  }
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  movptr(tmp_reg, swap_reg);
+  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
+  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  jcc(Assembler::notEqual, cas_label);
+  // The bias pattern is present in the object's header. Need to check
+  // whether the bias owner and the epoch are both still current.
+#ifndef _LP64
+  // Note that because there is no current thread register on x86_32 we
+  // need to store off the mark word we read out of the object to
+  // avoid reloading it and needing to recheck invariants below. This
+  // store is unfortunate but it makes the overall code shorter and
+  // simpler.
+  movptr(saved_mark_addr, swap_reg);
+#endif
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  if (swap_reg_contains_mark) {
+    null_check_offset = offset();
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+  orptr(tmp_reg, r15_thread);
+  xorptr(tmp_reg, swap_reg);
+  Register header_reg = tmp_reg;
+#else
+  xorptr(tmp_reg, swap_reg);
+  get_thread(swap_reg);
+  xorptr(swap_reg, tmp_reg);
+  Register header_reg = swap_reg;
+#endif
+  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
+  }
+  jcc(Assembler::equal, done);
+
+  Label try_revoke_bias;
+  Label try_rebias;
+
+  // At this point we know that the header has the bias pattern and
+  // that we are not the bias owner in the current epoch. We need to
+  // figure out more details about the state of the header in order to
+  // know what operations can be legally performed on the object's
+  // header.
+
+  // If the low three bits in the xor result aren't clear, that means
+  // the prototype header is no longer biased and we have to revoke
+  // the bias on this object.
+  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
+  jccb(Assembler::notZero, try_revoke_bias);
+
+  // Biasing is still enabled for this data type. See whether the
+  // epoch of the current bias is still valid, meaning that the epoch
+  // bits of the mark word are equal to the epoch bits of the
+  // prototype header. (Note that the prototype header's epoch bits
+  // only change at a safepoint.) If not, attempt to rebias the object
+  // toward the current thread. Note that we must be absolutely sure
+  // that the current epoch is invalid in order to do this because
+  // otherwise the manipulations it performs on the mark word are
+  // illegal.
+  testptr(header_reg, markOopDesc::epoch_mask_in_place);
+  jccb(Assembler::notZero, try_rebias);
+
+  // The epoch of the current bias is still valid but we know nothing
+  // about the owner; it might be set or it might be clear. Try to
+  // acquire the bias of the object using an atomic operation. If this
+  // fails we will go in to the runtime to revoke the object's bias.
+  // Note that we first construct the presumed unbiased header so we
+  // don't accidentally blow away another thread's valid bias.
+  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+  andptr(swap_reg,
+         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+#ifdef _LP64
+  movptr(tmp_reg, swap_reg);
+  orptr(tmp_reg, r15_thread);
+#else
+  get_thread(tmp_reg);
+  orptr(tmp_reg, swap_reg);
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, this means that
+  // another thread succeeded in biasing it toward itself and we
+  // need to revoke that bias. The revocation will occur in the
+  // interpreter runtime in the slow case.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
+  }
+  if (slow_case != NULL) {
+    jcc(Assembler::notZero, *slow_case);
+  }
+  jmp(done);
+
+  bind(try_rebias);
+  // At this point we know the epoch has expired, meaning that the
+  // current "bias owner", if any, is actually invalid. Under these
+  // circumstances _only_, we are allowed to use the current header's
+  // value as the comparison value when doing the cas to acquire the
+  // bias in the current epoch. In other words, we allow transfer of
+  // the bias from one thread to another directly in this situation.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+  orptr(tmp_reg, r15_thread);
+#else
+  get_thread(swap_reg);
+  orptr(tmp_reg, swap_reg);
+  movptr(swap_reg, saved_mark_addr);
+#endif
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // If the biasing toward our thread failed, then another thread
+  // succeeded in biasing it toward itself and we need to revoke that
+  // bias. The revocation will occur in the runtime in the slow case.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
+  }
+  if (slow_case != NULL) {
+    jcc(Assembler::notZero, *slow_case);
+  }
+  jmp(done);
+
+  bind(try_revoke_bias);
+  // The prototype mark in the klass doesn't have the bias bit set any
+  // more, indicating that objects of this data type are not supposed
+  // to be biased any more. We are going to try to reset the mark of
+  // this object to the prototype value and fall through to the
+  // CAS-based locking scheme. Note that if our CAS fails, it means
+  // that another thread raced us for the privilege of revoking the
+  // bias of this particular object, so it's okay to continue in the
+  // normal locking code.
+  //
+  // FIXME: due to a lack of registers we currently blow away the age
+  // bits in this situation. Should attempt to preserve them.
+  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+  if (need_tmp_reg) {
+    push(tmp_reg);
+  }
+  load_prototype_header(tmp_reg, obj_reg);
+  if (os::is_MP()) {
+    lock();
+  }
+  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+  if (need_tmp_reg) {
+    pop(tmp_reg);
+  }
+  // Fall through to the normal CAS-based lock, because no matter what
+  // the result of the above CAS, some thread must have succeeded in
+  // removing the bias bit from the object's header.
+  if (counters != NULL) {
+    cond_inc32(Assembler::zero,
+               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
+  }
+
+  bind(cas_label);
+
+  return null_check_offset;
+}
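
A note on the bit triage in biased_locking_enter above: the mark word is xor'ed with (prototype header | current thread), the age bits are masked off, and the residual bits select among the done, try_revoke_bias, try_rebias and CAS-acquire paths. The following self-contained sketch models that decision with simplified, illustrative constants and names (following the 2 lock bits / 1 biased-lock bit / 4 age bits / 2 epoch bits layout the comments describe), not the HotSpot markOopDesc types.

#include <cstdint>

// Simplified model of the mark-word triage done by biased_locking_enter.
// The constants mirror the bit layout assumed by the code above; treat them
// as illustrative rather than authoritative.
using mark_t = uintptr_t;

constexpr mark_t kBiasedLockMask    = 0x7;        // low three bits (lock + biased_lock)
constexpr mark_t kBiasedLockPattern = 0x5;        // 101b: header is in biased form
constexpr mark_t kAgeMask           = 0xF << 3;   // 4 age bits
constexpr mark_t kEpochMask         = 0x3 << 7;   // 2 epoch bits

enum class BiasAction { NotBiasable, OwnedByUs, RevokeBias, TryRebias, AcquireBias };

// mark: object header; thread_bits: current thread pointer; proto: klass prototype header.
BiasAction classify(mark_t mark, mark_t thread_bits, mark_t proto) {
  if ((mark & kBiasedLockMask) != kBiasedLockPattern)
    return BiasAction::NotBiasable;               // fall through to CAS-based locking
  mark_t diff = (mark ^ (proto | thread_bits)) & ~kAgeMask;
  if (diff == 0)              return BiasAction::OwnedByUs;   // biased to us, epoch current
  if (diff & kBiasedLockMask) return BiasAction::RevokeBias;  // klass no longer biasable
  if (diff & kEpochMask)      return BiasAction::TryRebias;   // epoch expired
  return BiasAction::AcquireBias;                 // anonymously biased: try the CAS
}

In the emitted code the same residue lives in header_reg and is consumed by the testptr/jcc chain that follows it.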
+
 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
   assert(UseBiasedLocking, "why call this otherwise?");
 
@@ -1408,6 +1273,620 @@
   jcc(Assembler::equal, done);
 }
 
+#ifdef COMPILER2
+// Fast_Lock and Fast_Unlock used by C2
+
+// Because the transitions from emitted code to the runtime
+// monitorenter/exit helper stubs are so slow it's critical that
+// we inline both the stack-locking fast-path and the inflated fast path.
+//
+// See also: cmpFastLock and cmpFastUnlock.
+//
+// What follows is a specialized inline transliteration of the code
+// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
+// another option would be to emit TrySlowEnter and TrySlowExit methods
+// at startup-time.  These methods would accept arguments as
+// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
+// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
+// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
+// In practice, however, the # of lock sites is bounded and is usually small.
+// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
+// if the processor uses simple bimodal branch predictors keyed by EIP,
+// since the helper routines would be called from multiple synchronization
+// sites.
+//
+// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
+// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
+// to those specialized methods.  That'd give us a mostly platform-independent
+// implementation that the JITs could optimize and inline at their pleasure.
+// Done correctly, the only time we'd need to cross to native code would be
+// to park() or unpark() threads.  We'd also need a few more unsafe operators
+// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
+// (b) explicit barriers or fence operations.
+//
+// TODO:
+//
+// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
+//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
+//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
+//    the lock operators would typically be faster than reifying Self.
+//
+// *  Ideally I'd define the primitives as:
+//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
+//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
+//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
+//    Instead, we're stuck with the rather awkward and brittle register assignments below.
+//    Furthermore the register assignments are overconstrained, possibly resulting in
+//    sub-optimal code near the synchronization site.
+//
+// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
+//    Alternately, use a better sp-proximity test.
+//
+// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
+//    Either one is sufficient to uniquely identify a thread.
+//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
+//
+// *  Intrinsify notify() and notifyAll() for the common cases where the
+//    object is locked by the calling thread but the waitlist is empty,
+//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
+//
+// *  use jccb and jmpb instead of jcc and jmp to improve code density.
+//    But beware of excessive branch density on AMD Opterons.
+//
+// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
+//    or failure of the fast-path.  If the fast-path fails then we pass
+//    control to the slow-path, typically in C.  In Fast_Lock and
+//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
+//    will emit a conditional branch immediately after the node.
+//    So we have branches to branches and lots of ICC.ZF games.
+//    Instead, it might be better to have C2 pass a "FailureLabel"
+//    into Fast_Lock and Fast_Unlock.  In the case of success, control
+//    will drop through the node.  ICC.ZF is undefined at exit.
+//    In the case of failure, the node will branch directly to the
+//    FailureLabel
+
+
+// obj: object to lock
+// box: on-stack box address (displaced header location) - KILLED
+// rax,: tmp -- KILLED
+// scr: tmp -- KILLED
+void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) {
+  // Ensure the register assignments are disjoint
+  guarantee (objReg != boxReg, "");
+  guarantee (objReg != tmpReg, "");
+  guarantee (objReg != scrReg, "");
+  guarantee (boxReg != tmpReg, "");
+  guarantee (boxReg != scrReg, "");
+  guarantee (tmpReg == rax, "");
+
+  if (counters != NULL) {
+    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()));
+  }
+  if (EmitSync & 1) {
+      // set box->dhw = unused_mark (3)
+      // Force all sync thru slow-path: slow_enter() and slow_exit()
+      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+      cmpptr (rsp, (int32_t)NULL_WORD);
+  } else
+  if (EmitSync & 2) {
+      Label DONE_LABEL ;
+      if (UseBiasedLocking) {
+         // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
+         biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
+      }
+
+      movptr(tmpReg, Address(objReg, 0));           // fetch markword
+      orptr (tmpReg, 0x1);
+      movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
+      if (os::is_MP()) {
+        lock();
+      }
+      cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
+      jccb(Assembler::equal, DONE_LABEL);
+      // Recursive locking
+      subptr(tmpReg, rsp);
+      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+      movptr(Address(boxReg, 0), tmpReg);
+      bind(DONE_LABEL);
+  } else {
+    // Possible cases that we'll encounter in fast_lock
+    // ------------------------------------------------
+    // * Inflated
+    //    -- unlocked
+    //    -- Locked
+    //       = by self
+    //       = by other
+    // * biased
+    //    -- by Self
+    //    -- by other
+    // * neutral
+    // * stack-locked
+    //    -- by self
+    //       = sp-proximity test hits
+    //       = sp-proximity test generates false-negative
+    //    -- by other
+    //
+
+    Label IsInflated, DONE_LABEL;
+
+    // it's stack-locked, biased or neutral
+    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
+    // order to reduce the number of conditional branches in the most common cases.
+    // Beware -- there's a subtle invariant that fetch of the markword
+    // at [FETCH], below, will never observe a biased encoding (*101b).
+    // If this invariant is not held we risk exclusion (safety) failure.
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
+    }
+
+    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
+    testl (tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
+    jccb  (Assembler::notZero, IsInflated);
+
+    // Attempt stack-locking ...
+    orptr (tmpReg, 0x1);
+    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
+    if (counters != NULL) {
+      cond_inc32(Assembler::equal,
+                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
+    }
+    jccb(Assembler::equal, DONE_LABEL);
+
+    // Recursive locking
+    subptr(tmpReg, rsp);
+    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+    movptr(Address(boxReg, 0), tmpReg);
+    if (counters != NULL) {
+      cond_inc32(Assembler::equal,
+                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
+    }
+    jmpb(DONE_LABEL);
+
+    bind(IsInflated);
+#ifndef _LP64
+    // The object is inflated.
+    //
+    // TODO-FIXME: eliminate the ugly use of manifest constants:
+    //   Use markOopDesc::monitor_value instead of "2".
+    //   use markOop::unused_mark() instead of "3".
+    // The tmpReg value is an objectMonitor reference ORed with
+    // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
+    // objectmonitor pointer by masking off the "2" bit or we can just
+    // use tmpReg as an objectmonitor pointer but bias the objectmonitor
+    // field offsets with "-2" to compensate for and annul the low-order tag bit.
+    //
+    // I use the latter as it avoids AGI stalls.
+    // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
+    // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
+    //
+    #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
+
+    // boxReg refers to the on-stack BasicLock in the current frame.
+    // We'd like to write:
+    //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
+    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
+    // additional latency as we have another ST in the store buffer that must drain.
+
+    if (EmitSync & 8192) {
+       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
+       get_thread (scrReg);
+       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
+       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    } else
+    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
+       movptr(scrReg, boxReg);
+       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
+
+       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+          // prefetchw [eax + Offset(_owner)-2]
+          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       }
+
+       if ((EmitSync & 64) == 0) {
+         // Optimistic form: consider XORL tmpReg,tmpReg
+         movptr(tmpReg, NULL_WORD);
+       } else {
+         // Can suffer RTS->RTO upgrades on shared or cold $ lines
+         // Test-And-CAS instead of CAS
+         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
+         testptr(tmpReg, tmpReg);                   // Locked ?
+         jccb  (Assembler::notZero, DONE_LABEL);
+       }
+
+       // Appears unlocked - try to swing _owner from null to non-null.
+       // Ideally, I'd manifest "Self" with get_thread and then attempt
+       // to CAS the register containing Self into m->Owner.
+       // But we don't have enough registers, so instead we can either try to CAS
+       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
+       // we later store "Self" into m->Owner.  Transiently storing a stack address
+       // (rsp or the address of the box) into  m->owner is harmless.
+       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
+       jccb  (Assembler::notZero, DONE_LABEL);
+       get_thread (scrReg);                    // beware: clobbers ICCs
+       movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
+       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
+
+       // If the CAS fails we can either retry or pass control to the slow-path.
+       // We use the latter tactic.
+       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+       // If the CAS was successful ...
+       //   Self has acquired the lock
+       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+       // Intentional fall-through into DONE_LABEL ...
+    } else {
+       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
+       movptr(boxReg, tmpReg);
+
+       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+          // prefetchw [eax + Offset(_owner)-2]
+          prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       }
+
+       if ((EmitSync & 64) == 0) {
+         // Optimistic form
+         xorptr  (tmpReg, tmpReg);
+       } else {
+         // Can suffer RTS->RTO upgrades on shared or cold $ lines
+         movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax, = m->_owner
+         testptr(tmpReg, tmpReg);                   // Locked ?
+         jccb  (Assembler::notZero, DONE_LABEL);
+       }
+
+       // Appears unlocked - try to swing _owner from null to non-null.
+       // Use either "Self" (in scr) or rsp as thread identity in _owner.
+       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
+       get_thread (scrReg);
+       if (os::is_MP()) {
+         lock();
+       }
+       cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+
+       // If the CAS fails we can either retry or pass control to the slow-path.
+       // We use the latter tactic.
+       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+       // If the CAS was successful ...
+       //   Self has acquired the lock
+       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+       // Intentional fall-through into DONE_LABEL ...
+    }
+#else // _LP64
+    // It's inflated
+
+    // TODO: someday avoid the ST-before-CAS penalty by
+    // relocating (deferring) the following ST.
+    // We should also think about trying a CAS without having
+    // fetched _owner.  If the CAS is successful we may
+    // avoid an RTO->RTS upgrade on the $line.
+
+    // Without the cast to int32_t a movptr will destroy r10, which is typically obj
+    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+
+    mov    (boxReg, tmpReg);
+    movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    testptr(tmpReg, tmpReg);
+    jccb   (Assembler::notZero, DONE_LABEL);
+
+    // It's inflated and appears unlocked
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    // Intentional fall-through into DONE_LABEL ...
+
+#endif
+
+    // DONE_LABEL is a hot target - we'd really like to place it at the
+    // start of a cache line by padding with NOPs.
+    // See the AMD and Intel software optimization manuals for the
+    // most efficient "long" NOP encodings.
+    // Unfortunately none of our alignment mechanisms suffice.
+    bind(DONE_LABEL);
+
+    // At DONE_LABEL the icc ZFlag is set as follows ...
+    // Fast_Unlock uses the same protocol.
+    // ZFlag == 1 -> Success
+    // ZFlag == 0 -> Failure - force control through the slow-path
+  }
+}
+
+// obj: object to unlock
+// box: box address (displaced header location), killed.  Must be EAX.
+// tmp: killed, cannot be obj nor box.
+//
+// Some commentary on balanced locking:
+//
+// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
+// Methods that don't have provably balanced locking are forced to run in the
+// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
+// The interpreter provides two properties:
+// I1:  At return-time the interpreter automatically and quietly unlocks any
+//      objects acquired in the current activation (frame).  Recall that the
+//      interpreter maintains an on-stack list of locks currently held by
+//      a frame.
+// I2:  If a method attempts to unlock an object that is not held by
+//      the frame, the interpreter throws IMSX.
+//
+// Let's say A(), which has provably balanced locking, acquires O and then calls B().
+// B() doesn't have provably balanced locking so it runs in the interpreter.
+// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
+// is still locked by A().
+//
+// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
+// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
+// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
+// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+
+void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
+  guarantee (objReg != boxReg, "");
+  guarantee (objReg != tmpReg, "");
+  guarantee (boxReg != tmpReg, "");
+  guarantee (boxReg == rax, "");
+
+  if (EmitSync & 4) {
+    // Disable - inhibit all inlining.  Force control through the slow-path
+    cmpptr (rsp, 0);
+  } else
+  if (EmitSync & 8) {
+    Label DONE_LABEL;
+    if (UseBiasedLocking) {
+       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+    }
+    // Classic stack-locking code ...
+    // Check whether the displaced header is 0
+    //(=> recursive unlock)
+    movptr(tmpReg, Address(boxReg, 0));
+    testptr(tmpReg, tmpReg);
+    jccb(Assembler::zero, DONE_LABEL);
+    // If not recursive lock, reset the header to displaced header
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
+    bind(DONE_LABEL);
+  } else {
+    Label DONE_LABEL, Stacked, CheckSucc;
+
+    // Critically, the biased locking test must have precedence over
+    // and appear before the (box->dhw == 0) recursive stack-lock test.
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+    }
+
+    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
+    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
+    jccb  (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
+
+    testptr(tmpReg, 0x02);                          // Inflated?
+    jccb  (Assembler::zero, Stacked);
+
+    // It's inflated.
+    // Despite our balanced locking property we still check that m->_owner == Self
+    // as java routines or native JNI code called by this thread might
+    // have released the lock.
+    // Refer to the comments in synchronizer.cpp for how we might encode extra
+    // state in _succ so we can avoid fetching EntryList|cxq.
+    //
+    // I'd like to add more cases in fast_lock() and fast_unlock() --
+    // such as recursive enter and exit -- but we have to be wary of
+    // I$ bloat, T$ effects and BP$ effects.
+    //
+    // If there's no contention try a 1-0 exit.  That is, exit without
+    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
+    // we detect and recover from the race that the 1-0 exit admits.
+    //
+    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
+    // before it STs null into _owner, releasing the lock.  Updates
+    // to data protected by the critical section must be visible before
+    // we drop the lock (and thus before any other thread could acquire
+    // the lock and observe the fields protected by the lock).
+    // IA32's memory-model is SPO, so STs are ordered with respect to
+    // each other and there's no need for an explicit barrier (fence).
+    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
+#ifndef _LP64
+    get_thread (boxReg);
+    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+      // prefetchw [ebx + Offset(_owner)-2]
+      prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    }
+
+    // Note that we could employ various encoding schemes to reduce
+    // the number of loads below (currently 4) to just 2 or 3.
+    // Refer to the comments in synchronizer.cpp.
+    // In practice the chain of fetches doesn't seem to impact performance, however.
+    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
+       // Attempt to reduce branch density - AMD's branch predictor.
+       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, DONE_LABEL);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       jmpb  (DONE_LABEL);
+    } else {
+       xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, DONE_LABEL);
+       movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+       orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+       jccb  (Assembler::notZero, CheckSucc);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       jmpb  (DONE_LABEL);
+    }
+
+    // The following code fragment (EmitSync & 65536) improves the performance of
+    // contended applications and contended synchronization microbenchmarks.
+    // Unfortunately the emission of the code - even though not executed - causes regressions
+    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
+    // with an equal number of never-executed NOPs results in the same regression.
+    // We leave it off by default.
+
+    if ((EmitSync & 65536) != 0) {
+       Label LSuccess, LGoSlowPath ;
+
+       bind  (CheckSucc);
+
+       // Optional pre-test ... it's safe to elide this
+       if ((EmitSync & 16) == 0) {
+          cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+          jccb  (Assembler::zero, LGoSlowPath);
+       }
+
+       // We have a classic Dekker-style idiom:
+       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
+       // There are a number of ways to implement the barrier:
+       // (1) lock:andl &m->_owner, 0
+       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
+       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
+       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
+       // (2) If supported, an explicit MFENCE is appealing.
+       //     In older IA32 processors MFENCE is slower than lock:add or xchg
+       //     particularly if the write-buffer is full, as might be the case
+       //     if stores closely precede the fence or fence-equivalent instruction.
+       //     In more modern implementations MFENCE appears faster, however.
+       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
+       //     The $lines underlying the top-of-stack should be in M-state.
+       //     The locked add instruction is serializing, of course.
+       // (4) Use xchg, which is serializing
+       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
+       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
+       //     The integer condition codes will tell us if succ was 0.
+       //     Since _succ and _owner should reside in the same $line and
+       //     we just stored into _owner, it's likely that the $line
+       //     remains in M-state for the lock:orl.
+       //
+       // We currently use (3), although it's likely that switching to (2)
+       // is correct for the future.
+
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+       if (os::is_MP()) {
+          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
+            mfence();
+          } else {
+            lock (); addptr(Address(rsp, 0), 0);
+          }
+       }
+       // Ratify _succ remains non-null
+       cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
+       jccb  (Assembler::notZero, LSuccess);
+
+       xorptr(boxReg, boxReg);                  // box is really EAX
+       if (os::is_MP()) { lock(); }
+       cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+       jccb  (Assembler::notEqual, LSuccess);
+       // Since we're low on registers we installed rsp as a placeholder in _owner.
+       // Now install Self over rsp.  This is safe as we're transitioning from
+       // non-null to non-null.
+       get_thread (boxReg);
+       movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
+       // Intentional fall-through into LGoSlowPath ...
+
+       bind  (LGoSlowPath);
+       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
+       jmpb  (DONE_LABEL);
+
+       bind  (LSuccess);
+       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
+       jmpb  (DONE_LABEL);
+    }
+
+    bind (Stacked);
+    // It's not inflated and it's not recursively stack-locked and it's not biased.
+    // It must be stack-locked.
+    // Try to reset the header to displaced header.
+    // The "box" value on the stack is stable, so we can reload
+    // and be assured we observe the same value as above.
+    movptr(tmpReg, Address(boxReg, 0));
+    if (os::is_MP()) {
+      lock();
+    }
+    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+    // Intentional fall-through into DONE_LABEL
+
+    // DONE_LABEL is a hot target - we'd really like to place it at the
+    // start of a cache line by padding with NOPs.
+    // See the AMD and Intel software optimization manuals for the
+    // most efficient "long" NOP encodings.
+    // Unfortunately none of our alignment mechanisms suffice.
+    if ((EmitSync & 65536) == 0) {
+       bind (CheckSucc);
+    }
+#else // _LP64
+    // It's inflated
+    movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+    xorptr(boxReg, r15_thread);
+    orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+    jccb  (Assembler::notZero, DONE_LABEL);
+    movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+    orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+    jccb  (Assembler::notZero, CheckSucc);
+    movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+    jmpb  (DONE_LABEL);
+
+    if ((EmitSync & 65536) == 0) {
+      Label LSuccess, LGoSlowPath ;
+      bind  (CheckSucc);
+      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+      jccb  (Assembler::zero, LGoSlowPath);
+
+      // I'd much rather use lock:andl m->_owner, 0 as it's faster than
+      // the explicit ST;MEMBAR combination, but masm doesn't currently support
+      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
+      // are all faster when the write buffer is populated.
+      movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+      if (os::is_MP()) {
+         lock (); addl (Address(rsp, 0), 0);
+      }
+      cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+      jccb  (Assembler::notZero, LSuccess);
+
+      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
+      if (os::is_MP()) { lock(); }
+      cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+      jccb  (Assembler::notEqual, LSuccess);
+      // Intentional fall-through into slow-path
+
+      bind  (LGoSlowPath);
+      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
+      jmpb  (DONE_LABEL);
+
+      bind  (LSuccess);
+      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
+      jmpb  (DONE_LABEL);
+    }
+
+    bind  (Stacked);
+    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
+    if (os::is_MP()) { lock(); }
+    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+
+    if (EmitSync & 65536) {
+       bind (CheckSucc);
+    }
+#endif
+    bind(DONE_LABEL);
+    // Avoid branch to branch on AMD processors
+    if (EmitSync & 32768) {
+       nop();
+    }
+  }
+}
+#endif // COMPILER2
+
 void MacroAssembler::c2bool(Register x) {
   // implements x == 0 ? 0 : 1
   // note: must only look at least-significant byte of x
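
For the stack-locking fast path that fast_lock and fast_unlock inline above, the core protocol is: copy the object's unlocked mark word into the on-stack BasicLock ("box"), CAS a pointer to the box into the object header to lock, and CAS the displaced mark word back to unlock; after a failed locking CAS the code additionally checks whether the observed header is an address near rsp, which identifies a recursive stack-lock. The sketch below models only the non-recursive path with std::atomic. It is a conceptual illustration under those simplifying assumptions (no biased locking, no inflation, no recursion handling), not the HotSpot code.

#include <atomic>
#include <cstdint>

// Conceptual model of the stack-locking fast path.
// header: the object's mark word; displaced: the on-stack box slot.
struct ObjModel { std::atomic<uintptr_t> header; };
struct BoxModel { uintptr_t displaced; };

// Returns true on fast-path success, mirroring the ZF == 1 convention of Fast_Lock;
// on failure the caller would take the slow path (or test for the recursive case).
bool fast_lock_model(ObjModel* obj, BoxModel* box) {
  uintptr_t mark = obj->header.load(std::memory_order_relaxed) | 0x1;  // unlocked pattern
  box->displaced = mark;                               // anticipate a successful CAS
  return obj->header.compare_exchange_strong(
      mark, reinterpret_cast<uintptr_t>(box), std::memory_order_acquire);
}

bool fast_unlock_model(ObjModel* obj, BoxModel* box) {
  uintptr_t expected = reinterpret_cast<uintptr_t>(box);
  // Swing the header back from "pointer to box" to the displaced mark word.
  return obj->header.compare_exchange_strong(
      expected, box->displaced, std::memory_order_release);
}

The acquire/release orderings stand in for the implicit fencing of the lock-prefixed cmpxchg used by the real code.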
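
The CheckSucc blocks above implement the 1-0 monitor exit discussed in the comments: release the lock with a plain store of NULL into _owner, execute a store-load barrier (the lock:addl-to-top-of-stack or mfence options weighed above), then re-read _succ to decide whether the fast exit suffices. A minimal model of that ordering requirement, with hypothetical field and function names and a std::atomic fence standing in for the barrier choices, is:

#include <atomic>
#include <cstdint>

// Minimal model of the 1-0 exit: the store releasing the lock must become
// visible before _succ is sampled, otherwise the exiting thread and a newly
// parked contender can each conclude the other will make progress (the race
// the 1-0 exit admits, per the synchronizer.cpp comments referenced above).
struct MonitorModel {
  std::atomic<uintptr_t> owner{0};
  std::atomic<uintptr_t> succ{0};
};

// Returns true if the fast exit suffices (a successor exists and will make
// progress); false means the caller must fall back to the slow path.
bool one_zero_exit_model(MonitorModel* m) {
  m->owner.store(0, std::memory_order_release);         // ST m->_owner = NULL
  std::atomic_thread_fence(std::memory_order_seq_cst);  // store-load barrier
  return m->succ.load(std::memory_order_relaxed) != 0;  // ratify _succ remains non-null
}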