# HG changeset patch
# User trims
# Date 1261565851 28800
# Node ID 9749fbc4859bd2afa2d435c532ac21d8143d38d3
# Parent 920875ae1277b3cf3703bc7dc61e7c2e15d9bcf5
# Parent 7ac7d558e8958df2f908432ec72b5e9e247e3871
Merge

diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/assembler_x86.cpp
--- a/src/cpu/x86/vm/assembler_x86.cpp Tue Dec 22 16:35:08 2009 -0800
+++ b/src/cpu/x86/vm/assembler_x86.cpp Wed Dec 23 02:57:31 2009 -0800
@@ -7666,7 +7666,7 @@
 #ifdef ASSERT
     Label L;
-    testl(tmp, tmp);
+    testptr(tmp, tmp);
     jccb(Assembler::notZero, L);
     hlt();
     bind(L);
diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/interp_masm_x86_32.cpp
--- a/src/cpu/x86/vm/interp_masm_x86_32.cpp Tue Dec 22 16:35:08 2009 -0800
+++ b/src/cpu/x86/vm/interp_masm_x86_32.cpp Wed Dec 23 02:57:31 2009 -0800
@@ -196,6 +196,9 @@
   } else {
     assert(EnableInvokeDynamic, "giant index used only for EnableInvokeDynamic");
     movl(reg, Address(rsi, bcp_offset));
+    // Check if the secondary index definition is still ~x, otherwise
+    // we have to change the following assembler code to calculate the
+    // plain index.
     assert(constantPoolCacheOopDesc::decode_secondary_index(~123) == 123, "else change next line");
     notl(reg);  // convert to plain index
   }
diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/interp_masm_x86_64.cpp
--- a/src/cpu/x86/vm/interp_masm_x86_64.cpp Tue Dec 22 16:35:08 2009 -0800
+++ b/src/cpu/x86/vm/interp_masm_x86_64.cpp Wed Dec 23 02:57:31 2009 -0800
@@ -185,12 +185,30 @@
 }
+void InterpreterMacroAssembler::get_cache_index_at_bcp(Register index,
+                                                       int bcp_offset,
+                                                       bool giant_index) {
+  assert(bcp_offset > 0, "bcp is still pointing to start of bytecode");
+  if (!giant_index) {
+    load_unsigned_short(index, Address(r13, bcp_offset));
+  } else {
+    assert(EnableInvokeDynamic, "giant index used only for EnableInvokeDynamic");
+    movl(index, Address(r13, bcp_offset));
+    // Check if the secondary index definition is still ~x, otherwise
+    // we have to change the following assembler code to calculate the
+    // plain index.
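The decode_secondary_index(~123) == 123 guard above pins down the encoding that the notl(reg) trick relies on: an invokedynamic call site stores its constant-pool-cache index in "secondary", bitwise-complemented form, so decoding is a single NOT. A minimal standalone sketch of that invariant, in plain C++ with illustrative stand-ins rather than the real HotSpot declarations:

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-ins for the encode/decode helpers the assert refers to.
    static inline int32_t encode_secondary_index(int32_t plain) { return ~plain; }
    static inline int32_t decode_secondary_index(int32_t coded) { return ~coded; }

    int main() {
      // Mirrors the guard in the patch: decode_secondary_index(~123) == 123,
      // which is what lets the interpreter recover the plain index with notl(reg).
      assert(decode_secondary_index(~123) == 123);
      assert(decode_secondary_index(encode_secondary_index(456)) == 456);
      return 0;
    }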
+ assert(constantPoolCacheOopDesc::decode_secondary_index(~123) == 123, "else change next line"); + notl(index); // convert to plain index + } +} + + void InterpreterMacroAssembler::get_cache_and_index_at_bcp(Register cache, Register index, - int bcp_offset) { - assert(bcp_offset > 0, "bcp is still pointing to start of bytecode"); + int bcp_offset, + bool giant_index) { assert(cache != index, "must use different registers"); - load_unsigned_short(index, Address(r13, bcp_offset)); + get_cache_index_at_bcp(index, bcp_offset, giant_index); movptr(cache, Address(rbp, frame::interpreter_frame_cache_offset * wordSize)); assert(sizeof(ConstantPoolCacheEntry) == 4 * wordSize, "adjust code below"); // convert from field index to ConstantPoolCacheEntry index @@ -200,10 +218,10 @@ void InterpreterMacroAssembler::get_cache_entry_pointer_at_bcp(Register cache, Register tmp, - int bcp_offset) { - assert(bcp_offset > 0, "bcp is still pointing to start of bytecode"); + int bcp_offset, + bool giant_index) { assert(cache != tmp, "must use different register"); - load_unsigned_short(tmp, Address(r13, bcp_offset)); + get_cache_index_at_bcp(tmp, bcp_offset, giant_index); assert(sizeof(ConstantPoolCacheEntry) == 4 * wordSize, "adjust code below"); // convert from field index to ConstantPoolCacheEntry index // and from word offset to byte offset @@ -1236,7 +1254,8 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver, Register mdp, - Register reg2) { + Register reg2, + bool receiver_can_be_null) { if (ProfileInterpreter) { Label profile_continue; @@ -1246,8 +1265,15 @@ // We are making a call. Increment the count. increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); + Label skip_receiver_profile; + if (receiver_can_be_null) { + testptr(receiver, receiver); + jcc(Assembler::zero, skip_receiver_profile); + } + // Record the receiver type. record_klass_in_profile(receiver, mdp, reg2); + bind(skip_receiver_profile); // The method data pointer needs to be updated to reflect the new target. update_mdp_by_constant(mdp, diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/interp_masm_x86_64.hpp --- a/src/cpu/x86/vm/interp_masm_x86_64.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/interp_masm_x86_64.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -95,9 +95,10 @@ void get_unsigned_2_byte_index_at_bcp(Register reg, int bcp_offset); void get_cache_and_index_at_bcp(Register cache, Register index, - int bcp_offset); + int bcp_offset, bool giant_index = false); void get_cache_entry_pointer_at_bcp(Register cache, Register tmp, - int bcp_offset); + int bcp_offset, bool giant_index = false); + void get_cache_index_at_bcp(Register index, int bcp_offset, bool giant_index = false); void pop_ptr(Register r = rax); @@ -236,7 +237,8 @@ void profile_call(Register mdp); void profile_final_call(Register mdp); void profile_virtual_call(Register receiver, Register mdp, - Register scratch2); + Register scratch2, + bool receiver_can_be_null = false); void profile_ret(Register return_bci, Register mdp); void profile_null_seen(Register mdp); void profile_typecheck(Register mdp, Register klass, Register scratch); diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/interpreter_x86_64.cpp --- a/src/cpu/x86/vm/interpreter_x86_64.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/interpreter_x86_64.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -277,12 +277,11 @@ address entry_point = __ pc(); // abstract method entry - // remove return address. 
Not really needed, since exception - // handling throws away expression stack - __ pop(rbx); - // adjust stack to what a normal return would do - __ mov(rsp, r13); + // pop return address, reset last_sp to NULL + __ empty_expression_stack(); + __ restore_bcp(); // rsi must be correct for exception handler (was destroyed) + __ restore_locals(); // make sure locals pointer is correct as well (was destroyed) // throw exception __ call_VM(noreg, CAST_FROM_FN_PTR(address, @@ -300,7 +299,10 @@ if (!EnableMethodHandles) { return generate_abstract_entry(); } - return generate_abstract_entry(); //6815692// + + address entry_point = MethodHandles::generate_method_handle_interpreter_entry(_masm); + + return entry_point; } diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/methodHandles_x86.cpp --- a/src/cpu/x86/vm/methodHandles_x86.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/methodHandles_x86.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -448,7 +448,7 @@ rbx_index, Address::times_ptr, base + vtableEntry::method_offset_in_bytes()); Register rbx_method = rbx_temp; - __ movl(rbx_method, vtable_entry_addr); + __ movptr(rbx_method, vtable_entry_addr); __ verify_oop(rbx_method); __ jmp(rbx_method_fie); diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -2935,6 +2935,16 @@ // arraycopy stubs used by compilers generate_arraycopy_stubs(); + + // generic method handle stubs + if (EnableMethodHandles && SystemDictionary::MethodHandle_klass() != NULL) { + for (MethodHandles::EntryKind ek = MethodHandles::_EK_FIRST; + ek < MethodHandles::_EK_LIMIT; + ek = MethodHandles::EntryKind(1 + (int)ek)) { + StubCodeMark mark(this, "MethodHandle", MethodHandles::entry_name(ek)); + MethodHandles::generate_method_handle_stub(_masm, ek); + } + } } public: diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/templateInterpreter_x86_64.cpp --- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -100,21 +100,26 @@ return entry; } -// Arguments are: required type in rarg1, failing object (or NULL) in rarg2 +// Arguments are: required type at TOS+8, failing object (or NULL) at TOS+4. address TemplateInterpreterGenerator::generate_WrongMethodType_handler() { address entry = __ pc(); __ pop(c_rarg2); // failing object is at TOS __ pop(c_rarg1); // required type is at TOS+8 - // expression stack must be empty before entering the VM if an - // exception happened + __ verify_oop(c_rarg1); + __ verify_oop(c_rarg2); + + // Various method handle types use interpreter registers as temps. + __ restore_bcp(); + __ restore_locals(); + + // Expression stack must be empty before entering the VM for an exception. 
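The movl -> movptr correction in methodHandles_x86.cpp above (like the testl -> testptr change at the top of this patch) matters because a vtable slot holds an 8-byte pointer on x86_64, and a 32-bit load silently keeps only the low half. A hedged, self-contained illustration of that failure mode; uint64_t stands in for the 64-bit register and the example assumes a little-endian target:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // Pretend this is a 64-bit method pointer stored in a vtable slot.
      uint64_t slot = 0x00007f1234567890ULL;

      // movptr-style load: full pointer width.
      uint64_t full;
      std::memcpy(&full, &slot, sizeof(full));

      // movl-style load: only the low 32 bits of the slot (little-endian), zero-extended.
      uint32_t low32;
      std::memcpy(&low32, &slot, sizeof(low32));
      uint64_t truncated = low32;

      assert(full == slot);
      assert(truncated != slot);   // the high bits of the address are lost
      return 0;
    }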
__ empty_expression_stack(); __ call_VM(noreg, CAST_FROM_FN_PTR(address, - InterpreterRuntime:: - throw_WrongMethodTypeException), + InterpreterRuntime::throw_WrongMethodTypeException), // pass required type, failing object (or NULL) c_rarg1, c_rarg2); return entry; @@ -182,15 +187,29 @@ __ restore_bcp(); __ restore_locals(); - __ get_cache_and_index_at_bcp(rbx, rcx, 1); + Label L_got_cache, L_giant_index; + if (EnableInvokeDynamic) { + __ cmpb(Address(r13, 0), Bytecodes::_invokedynamic); + __ jcc(Assembler::equal, L_giant_index); + } + __ get_cache_and_index_at_bcp(rbx, rcx, 1, false); + __ bind(L_got_cache); __ movl(rbx, Address(rbx, rcx, - Address::times_8, + Address::times_ptr, in_bytes(constantPoolCacheOopDesc::base_offset()) + 3 * wordSize)); __ andl(rbx, 0xFF); if (TaggedStackInterpreter) __ shll(rbx, 1); // 2 slots per parameter. __ lea(rsp, Address(rsp, rbx, Address::times_8)); __ dispatch_next(state, step); + + // out of the main line of code... + if (EnableInvokeDynamic) { + __ bind(L_giant_index); + __ get_cache_and_index_at_bcp(rbx, rcx, 1, true); + __ jmp(L_got_cache); + } + return entry; } diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/templateTable_x86_32.cpp --- a/src/cpu/x86/vm/templateTable_x86_32.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/templateTable_x86_32.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -3146,7 +3146,6 @@ __ profile_call(rsi); } - Label handle_unlinked_site; __ movptr(rcx, Address(rax, __ delayed_value(java_dyn_CallSite::target_offset_in_bytes, rcx))); __ null_check(rcx); __ prepare_to_jump_from_interpreted(); diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/templateTable_x86_64.cpp --- a/src/cpu/x86/vm/templateTable_x86_64.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/templateTable_x86_64.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -203,18 +203,15 @@ __ jcc(Assembler::notEqual, fast_patch); __ get_method(scratch); // Let breakpoint table handling rewrite to quicker bytecode - __ call_VM(noreg, - CAST_FROM_FN_PTR(address, - InterpreterRuntime::set_original_bytecode_at), - scratch, r13, bc); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::set_original_bytecode_at), scratch, r13, bc); #ifndef ASSERT __ jmpb(patch_done); +#else + __ jmp(patch_done); +#endif __ bind(fast_patch); } -#else - __ jmp(patch_done); - __ bind(fast_patch); - } +#ifdef ASSERT Label okay; __ load_unsigned_byte(scratch, at_bcp(0)); __ cmpl(scratch, (int) Bytecodes::java_code(bytecode)); @@ -2054,26 +2051,28 @@ } } -void TemplateTable::resolve_cache_and_index(int byte_no, - Register Rcache, - Register index) { +void TemplateTable::resolve_cache_and_index(int byte_no, Register Rcache, Register index) { assert(byte_no == 1 || byte_no == 2, "byte_no out of range"); + bool is_invokedynamic = (bytecode() == Bytecodes::_invokedynamic); const Register temp = rbx; assert_different_registers(Rcache, index, temp); const int shift_count = (1 + byte_no) * BitsPerByte; Label resolved; - __ get_cache_and_index_at_bcp(Rcache, index, 1); - __ movl(temp, Address(Rcache, - index, Address::times_8, - constantPoolCacheOopDesc::base_offset() + - ConstantPoolCacheEntry::indices_offset())); - __ shrl(temp, shift_count); - // have we resolved this bytecode? 
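The "have we resolved this bytecode?" test, which the rewritten resolve_cache_and_index keeps for the non-invokedynamic case, works because a ConstantPoolCacheEntry packs the resolving bytecodes into the same word as the constant pool index, one byte per byte_no above the low 16 bits; hence the shift by (1 + byte_no) * BitsPerByte and the 0xFF mask. A rough standalone sketch of a layout consistent with that shift and mask (the exact field layout here is an assumption for illustration):

    #include <cassert>
    #include <cstdint>

    const int BitsPerByte = 8;

    // Assumed packing: low 16 bits = constant pool index,
    // byte 2 = bytecode for byte_no 1, byte 3 = bytecode for byte_no 2.
    static uint32_t pack(uint16_t cp_index, uint8_t bytecode1, uint8_t bytecode2) {
      return (uint32_t)cp_index | ((uint32_t)bytecode1 << 16) | ((uint32_t)bytecode2 << 24);
    }

    static uint8_t bytecode_for(uint32_t indices, int byte_no) {
      int shift_count = (1 + byte_no) * BitsPerByte;        // same expression as the patch
      return (uint8_t)((indices >> shift_count) & 0xFF);    // shrl + andl(0xFF)
    }

    int main() {
      uint32_t indices = pack(/*cp_index*/ 42, /*bytecode1*/ 0xB6 /* invokevirtual */, /*bytecode2*/ 0);
      assert(bytecode_for(indices, 1) == 0xB6);  // resolved for byte_no 1
      assert(bytecode_for(indices, 2) == 0);     // not yet resolved for byte_no 2
      return 0;
    }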
- __ andl(temp, 0xFF); - __ cmpl(temp, (int) bytecode()); - __ jcc(Assembler::equal, resolved); + __ get_cache_and_index_at_bcp(Rcache, index, 1, is_invokedynamic); + if (is_invokedynamic) { + // we are resolved if the f1 field contains a non-null CallSite object + __ cmpptr(Address(Rcache, index, Address::times_ptr, constantPoolCacheOopDesc::base_offset() + ConstantPoolCacheEntry::f1_offset()), (int32_t) NULL_WORD); + __ jcc(Assembler::notEqual, resolved); + } else { + __ movl(temp, Address(Rcache, index, Address::times_ptr, constantPoolCacheOopDesc::base_offset() + ConstantPoolCacheEntry::indices_offset())); + __ shrl(temp, shift_count); + // have we resolved this bytecode? + __ andl(temp, 0xFF); + __ cmpl(temp, (int) bytecode()); + __ jcc(Assembler::equal, resolved); + } // resolve first time through address entry; @@ -2090,6 +2089,9 @@ case Bytecodes::_invokeinterface: entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_invoke); break; + case Bytecodes::_invokedynamic: + entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_invokedynamic); + break; default: ShouldNotReachHere(); break; @@ -2098,7 +2100,7 @@ __ call_VM(noreg, entry, temp); // Update registers with resolved info - __ get_cache_and_index_at_bcp(Rcache, index, 1); + __ get_cache_and_index_at_bcp(Rcache, index, 1, is_invokedynamic); __ bind(resolved); } @@ -2832,15 +2834,14 @@ ShouldNotReachHere(); } -void TemplateTable::prepare_invoke(Register method, - Register index, - int byte_no, - Bytecodes::Code code) { +void TemplateTable::prepare_invoke(Register method, Register index, int byte_no) { // determine flags + Bytecodes::Code code = bytecode(); const bool is_invokeinterface = code == Bytecodes::_invokeinterface; + const bool is_invokedynamic = code == Bytecodes::_invokedynamic; const bool is_invokevirtual = code == Bytecodes::_invokevirtual; const bool is_invokespecial = code == Bytecodes::_invokespecial; - const bool load_receiver = code != Bytecodes::_invokestatic; + const bool load_receiver = (code != Bytecodes::_invokestatic && code != Bytecodes::_invokedynamic); const bool receiver_null_check = is_invokespecial; const bool save_flags = is_invokeinterface || is_invokevirtual; // setup registers & access constant pool cache @@ -2858,9 +2859,13 @@ __ movl(recv, flags); __ andl(recv, 0xFF); if (TaggedStackInterpreter) __ shll(recv, 1); // index*2 - __ movptr(recv, Address(rsp, recv, Address::times_8, - -Interpreter::expr_offset_in_bytes(1))); - __ verify_oop(recv); + Address recv_addr(rsp, recv, Address::times_8, -Interpreter::expr_offset_in_bytes(1)); + if (is_invokedynamic) { + __ lea(recv, recv_addr); + } else { + __ movptr(recv, recv_addr); + __ verify_oop(recv); + } } // do null check if needed @@ -2878,10 +2883,14 @@ ConstantPoolCacheEntry::verify_tosBits(); // load return address { - ExternalAddress return_5((address)Interpreter::return_5_addrs_by_index_table()); - ExternalAddress return_3((address)Interpreter::return_3_addrs_by_index_table()); - __ lea(rscratch1, (is_invokeinterface ? 
return_5 : return_3)); - __ movptr(flags, Address(rscratch1, flags, Address::times_8)); + address table_addr; + if (is_invokeinterface || is_invokedynamic) + table_addr = (address)Interpreter::return_5_addrs_by_index_table(); + else + table_addr = (address)Interpreter::return_3_addrs_by_index_table(); + ExternalAddress table(table_addr); + __ lea(rscratch1, table); + __ movptr(flags, Address(rscratch1, flags, Address::times_ptr)); } // push return address @@ -2947,7 +2956,7 @@ void TemplateTable::invokevirtual(int byte_no) { transition(vtos, vtos); - prepare_invoke(rbx, noreg, byte_no, bytecode()); + prepare_invoke(rbx, noreg, byte_no); // rbx: index // rcx: receiver @@ -2959,7 +2968,7 @@ void TemplateTable::invokespecial(int byte_no) { transition(vtos, vtos); - prepare_invoke(rbx, noreg, byte_no, bytecode()); + prepare_invoke(rbx, noreg, byte_no); // do the call __ verify_oop(rbx); __ profile_call(rax); @@ -2969,7 +2978,7 @@ void TemplateTable::invokestatic(int byte_no) { transition(vtos, vtos); - prepare_invoke(rbx, noreg, byte_no, bytecode()); + prepare_invoke(rbx, noreg, byte_no); // do the call __ verify_oop(rbx); __ profile_call(rax); @@ -2983,7 +2992,7 @@ void TemplateTable::invokeinterface(int byte_no) { transition(vtos, vtos); - prepare_invoke(rax, rbx, byte_no, bytecode()); + prepare_invoke(rax, rbx, byte_no); // rax: Interface // rbx: index @@ -3072,7 +3081,24 @@ return; } - __ stop("invokedynamic NYI");//6815692// + prepare_invoke(rax, rbx, byte_no); + + // rax: CallSite object (f1) + // rbx: unused (f2) + // rcx: receiver address + // rdx: flags (unused) + + if (ProfileInterpreter) { + Label L; + // %%% should make a type profile for any invokedynamic that takes a ref argument + // profile this call + __ profile_call(r13); + } + + __ movptr(rcx, Address(rax, __ delayed_value(java_dyn_CallSite::target_offset_in_bytes, rcx))); + __ null_check(rcx); + __ prepare_to_jump_from_interpreted(); + __ jump_to_method_handle_entry(rcx, rdx); } diff -r 920875ae1277 -r 9749fbc4859b src/cpu/x86/vm/templateTable_x86_64.hpp --- a/src/cpu/x86/vm/templateTable_x86_64.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/cpu/x86/vm/templateTable_x86_64.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -22,8 +22,7 @@ * */ - static void prepare_invoke(Register method, Register index, int byte_no, - Bytecodes::Code code); + static void prepare_invoke(Register method, Register index, int byte_no); static void invokevirtual_helper(Register index, Register recv, Register flags); static void volatile_barrier(Assembler::Membar_mask_bits order_constraint); diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/classfile/classFileParser.cpp --- a/src/share/vm/classfile/classFileParser.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/classfile/classFileParser.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -2511,23 +2511,12 @@ fac_ptr->nonstatic_byte_count -= 1; (*fields_ptr)->ushort_at_put(i + instanceKlass::signature_index_offset, word_sig_index); - if (wordSize == jintSize) { - fac_ptr->nonstatic_word_count += 1; - } else { - fac_ptr->nonstatic_double_count += 1; - } - - FieldAllocationType atype = (FieldAllocationType) (*fields_ptr)->ushort_at(i+4); + fac_ptr->nonstatic_word_count += 1; + + FieldAllocationType atype = (FieldAllocationType) (*fields_ptr)->ushort_at(i + instanceKlass::low_offset); assert(atype == NONSTATIC_BYTE, ""); FieldAllocationType new_atype = NONSTATIC_WORD; - if (wordSize > jintSize) { - if (Universe::field_type_should_be_aligned(T_LONG)) { - atype = NONSTATIC_ALIGNED_DOUBLE; - } else { - atype = 
NONSTATIC_DOUBLE; - } - } - (*fields_ptr)->ushort_at_put(i+4, new_atype); + (*fields_ptr)->ushort_at_put(i + instanceKlass::low_offset, new_atype); found_vmentry = true; break; @@ -3085,7 +3074,7 @@ int len = fields->length(); for (int i = 0; i < len; i += instanceKlass::next_offset) { int real_offset; - FieldAllocationType atype = (FieldAllocationType) fields->ushort_at(i+4); + FieldAllocationType atype = (FieldAllocationType) fields->ushort_at(i + instanceKlass::low_offset); switch (atype) { case STATIC_OOP: real_offset = next_static_oop_offset; @@ -3173,8 +3162,8 @@ default: ShouldNotReachHere(); } - fields->short_at_put(i+4, extract_low_short_from_int(real_offset) ); - fields->short_at_put(i+5, extract_high_short_from_int(real_offset) ); + fields->short_at_put(i + instanceKlass::low_offset, extract_low_short_from_int(real_offset)); + fields->short_at_put(i + instanceKlass::high_offset, extract_high_short_from_int(real_offset)); } // Size of instances diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/code/nmethod.cpp --- a/src/share/vm/code/nmethod.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/code/nmethod.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -414,9 +414,8 @@ } const char* nmethod::compile_kind() const { - if (method() == NULL) return "unloaded"; - if (is_native_method()) return "c2n"; if (is_osr_method()) return "osr"; + if (method() != NULL && is_native_method()) return "c2n"; return NULL; } @@ -1127,6 +1126,9 @@ } flags.state = unloaded; + // Log the unloading. + log_state_change(); + // The methodOop is gone at this point assert(_method == NULL, "Tautology"); @@ -1137,8 +1139,6 @@ void nmethod::invalidate_osr_method() { assert(_entry_bci != InvocationEntryBci, "wrong kind of nmethod"); - if (_entry_bci != InvalidOSREntryBci) - inc_decompile_count(); // Remove from list of active nmethods if (method() != NULL) instanceKlass::cast(method()->method_holder())->remove_osr_nmethod(this); @@ -1146,59 +1146,63 @@ _entry_bci = InvalidOSREntryBci; } -void nmethod::log_state_change(int state) const { +void nmethod::log_state_change() const { if (LogCompilation) { if (xtty != NULL) { ttyLocker ttyl; // keep the following output all in one block - xtty->begin_elem("make_not_entrant %sthread='" UINTX_FORMAT "'", - (state == zombie ? "zombie='1' " : ""), - os::current_thread_id()); + if (flags.state == unloaded) { + xtty->begin_elem("make_unloaded thread='" UINTX_FORMAT "'", + os::current_thread_id()); + } else { + xtty->begin_elem("make_not_entrant thread='" UINTX_FORMAT "'%s", + os::current_thread_id(), + (flags.state == zombie ? " zombie='1'" : "")); + } log_identity(xtty); xtty->stamp(); xtty->end_elem(); } } - if (PrintCompilation) { - print_on(tty, state == zombie ? "made zombie " : "made not entrant "); + if (PrintCompilation && flags.state != unloaded) { + print_on(tty, flags.state == zombie ? "made zombie " : "made not entrant "); tty->cr(); } } // Common functionality for both make_not_entrant and make_zombie -void nmethod::make_not_entrant_or_zombie(int state) { +bool nmethod::make_not_entrant_or_zombie(int state) { assert(state == zombie || state == not_entrant, "must be zombie or not_entrant"); - // Code for an on-stack-replacement nmethod is removed when a class gets unloaded. - // They never become zombie/non-entrant, so the nmethod sweeper will never remove - // them. Instead the entry_bci is set to InvalidOSREntryBci, so the osr nmethod - // will never be used anymore. 
That the nmethods only gets removed when class unloading - // happens, make life much simpler, since the nmethods are not just going to disappear - // out of the blue. - if (is_osr_method()) { - if (osr_entry_bci() != InvalidOSREntryBci) { - // only log this once - log_state_change(state); - } - invalidate_osr_method(); - return; + // If the method is already zombie there is nothing to do + if (is_zombie()) { + return false; } - // If the method is already zombie or set to the state we want, nothing to do - if (is_zombie() || (state == not_entrant && is_not_entrant())) { - return; - } - - log_state_change(state); - // Make sure the nmethod is not flushed in case of a safepoint in code below. nmethodLocker nml(this); { + // invalidate osr nmethod before acquiring the patching lock since + // they both acquire leaf locks and we don't want a deadlock. + // This logic is equivalent to the logic below for patching the + // verified entry point of regular methods. + if (is_osr_method()) { + // this effectively makes the osr nmethod not entrant + invalidate_osr_method(); + } + // Enter critical section. Does not block for safepoint. MutexLockerEx pl(Patching_lock, Mutex::_no_safepoint_check_flag); + + if (flags.state == state) { + // another thread already performed this transition so nothing + // to do, but return false to indicate this. + return false; + } + // The caller can be calling the method statically or through an inline // cache call. - if (!is_not_entrant()) { + if (!is_osr_method() && !is_not_entrant()) { NativeJump::patch_verified_entry(entry_point(), verified_entry_point(), SharedRuntime::get_handle_wrong_method_stub()); assert (NativeJump::instruction_size == nmethod::_zombie_instruction_size, ""); @@ -1217,6 +1221,10 @@ // Change state flags.state = state; + + // Log the transition once + log_state_change(); + } // leave critical region under Patching_lock if (state == not_entrant) { @@ -1240,7 +1248,6 @@ // It's a true state change, so mark the method as decompiled. inc_decompile_count(); - // zombie only - if a JVMTI agent has enabled the CompiledMethodUnload event // and it hasn't already been reported for this nmethod then report it now. // (the event may have been reported earilier if the GC marked it for unloading). @@ -1268,7 +1275,7 @@ // Check whether method got unloaded at a safepoint before this, // if so we can skip the flushing steps below - if (method() == NULL) return; + if (method() == NULL) return true; // Remove nmethod from method. // We need to check if both the _code and _from_compiled_code_entry_point @@ -1282,6 +1289,8 @@ HandleMark hm; method()->clear_code(); } + + return true; } diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/code/nmethod.hpp --- a/src/share/vm/code/nmethod.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/code/nmethod.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -252,7 +252,9 @@ void* operator new(size_t size, int nmethod_size); const char* reloc_string_for(u_char* begin, u_char* end); - void make_not_entrant_or_zombie(int state); + // Returns true if this thread changed the state of the nmethod or + // false if another thread performed the transition. + bool make_not_entrant_or_zombie(int state); void inc_decompile_count(); // used to check that writes to nmFlags are done consistently. @@ -375,10 +377,12 @@ bool is_zombie() const { return flags.state == zombie; } bool is_unloaded() const { return flags.state == unloaded; } - // Make the nmethod non entrant. The nmethod will continue to be alive. 
- // It is used when an uncommon trap happens. - void make_not_entrant() { make_not_entrant_or_zombie(not_entrant); } - void make_zombie() { make_not_entrant_or_zombie(zombie); } + // Make the nmethod non entrant. The nmethod will continue to be + // alive. It is used when an uncommon trap happens. Returns true + // if this thread changed the state of the nmethod or false if + // another thread performed the transition. + bool make_not_entrant() { return make_not_entrant_or_zombie(not_entrant); } + bool make_zombie() { return make_not_entrant_or_zombie(zombie); } // used by jvmti to track if the unload event has been reported bool unload_reported() { return _unload_reported; } @@ -563,7 +567,7 @@ // Logging void log_identity(xmlStream* log) const; void log_new_nmethod() const; - void log_state_change(int state) const; + void log_state_change() const; // Prints a comment for one native instruction (reloc info, pc desc) void print_code_comment_on(outputStream* st, int column, address begin, address end); diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp --- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -42,28 +42,49 @@ _n_periods(0), _threads(NULL), _n_threads(0) { - if (G1ConcRefine) { - _n_threads = (int)thread_num(); - if (_n_threads > 0) { - _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads); - int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids(); - ConcurrentG1RefineThread *next = NULL; - for (int i = _n_threads - 1; i >= 0; i--) { - ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i); - assert(t != NULL, "Conc refine should have been created"); - assert(t->cg1r() == this, "Conc refine thread should refer to this"); - _threads[i] = t; - next = t; - } - } + + // Ergomonically select initial concurrent refinement parameters + if (FLAG_IS_DEFAULT(G1ConcRefineGreenZone)) { + FLAG_SET_DEFAULT(G1ConcRefineGreenZone, MAX2(ParallelGCThreads, 1)); + } + set_green_zone(G1ConcRefineGreenZone); + + if (FLAG_IS_DEFAULT(G1ConcRefineYellowZone)) { + FLAG_SET_DEFAULT(G1ConcRefineYellowZone, green_zone() * 3); + } + set_yellow_zone(MAX2(G1ConcRefineYellowZone, green_zone())); + + if (FLAG_IS_DEFAULT(G1ConcRefineRedZone)) { + FLAG_SET_DEFAULT(G1ConcRefineRedZone, yellow_zone() * 2); + } + set_red_zone(MAX2(G1ConcRefineRedZone, yellow_zone())); + _n_worker_threads = thread_num(); + // We need one extra thread to do the young gen rset size sampling. + _n_threads = _n_worker_threads + 1; + reset_threshold_step(); + + _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads); + int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids(); + ConcurrentG1RefineThread *next = NULL; + for (int i = _n_threads - 1; i >= 0; i--) { + ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i); + assert(t != NULL, "Conc refine should have been created"); + assert(t->cg1r() == this, "Conc refine thread should refer to this"); + _threads[i] = t; + next = t; } } -size_t ConcurrentG1Refine::thread_num() { - if (G1ConcRefine) { - return (G1ParallelRSetThreads > 0) ? 
G1ParallelRSetThreads : ParallelGCThreads; +void ConcurrentG1Refine::reset_threshold_step() { + if (FLAG_IS_DEFAULT(G1ConcRefineThresholdStep)) { + _thread_threshold_step = (yellow_zone() - green_zone()) / (worker_thread_num() + 1); + } else { + _thread_threshold_step = G1ConcRefineThresholdStep; } - return 0; +} + +int ConcurrentG1Refine::thread_num() { + return MAX2((G1ParallelRSetThreads > 0) ? G1ParallelRSetThreads : ParallelGCThreads, 1); } void ConcurrentG1Refine::init() { @@ -123,6 +144,15 @@ } } +void ConcurrentG1Refine::reinitialize_threads() { + reset_threshold_step(); + if (_threads != NULL) { + for (int i = 0; i < _n_threads; i++) { + _threads[i]->initialize(); + } + } +} + ConcurrentG1Refine::~ConcurrentG1Refine() { if (G1ConcRSLogCacheSize > 0) { assert(_card_counts != NULL, "Logic"); @@ -384,4 +414,3 @@ st->cr(); } } - diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp --- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -29,6 +29,31 @@ class ConcurrentG1Refine: public CHeapObj { ConcurrentG1RefineThread** _threads; int _n_threads; + int _n_worker_threads; + /* + * The value of the update buffer queue length falls into one of 3 zones: + * green, yellow, red. If the value is in [0, green) nothing is + * done, the buffers are left unprocessed to enable the caching effect of the + * dirtied cards. In the yellow zone [green, yellow) the concurrent refinement + * threads are gradually activated. In [yellow, red) all threads are + * running. If the length becomes red (max queue length) the mutators start + * processing the buffers. + * + * There are some interesting cases (with G1AdaptiveConcRefine turned off): + * 1) green = yellow = red = 0. In this case the mutator will process all + * buffers. Except for those that are created by the deferred updates + * machinery during a collection. + * 2) green = 0. Means no caching. Can be a good way to minimize the + * amount of time spent updating rsets during a collection. + */ + int _green_zone; + int _yellow_zone; + int _red_zone; + + int _thread_threshold_step; + + // Reset the threshold step value based of the current zone boundaries. + void reset_threshold_step(); // The cache for card refinement. bool _use_cache; @@ -147,6 +172,8 @@ void init(); // Accomplish some initialization that has to wait. 
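Putting the ergonomics above together: by default the green zone tracks the number of GC threads, yellow is three times green, red is twice yellow, and the per-thread activation step divides the yellow-green gap among the worker threads plus one. A standalone recap of those formulas; the helper and struct names are illustrative, and the real code takes the values from the G1ConcRefine* flags when they are set explicitly:

    #include <algorithm>
    #include <cassert>

    struct RefinementZones {
      int green, yellow, red, threshold_step;
    };

    // Mirrors the default selection in the ConcurrentG1Refine constructor
    // and reset_threshold_step(); both arguments are inputs here.
    RefinementZones select_zones(int parallel_gc_threads, int worker_threads) {
      RefinementZones z;
      z.green  = std::max(parallel_gc_threads, 1);   // G1ConcRefineGreenZone default
      z.yellow = std::max(z.green * 3, z.green);     // G1ConcRefineYellowZone default
      z.red    = std::max(z.yellow * 2, z.yellow);   // G1ConcRefineRedZone default
      z.threshold_step = (z.yellow - z.green) / (worker_threads + 1);
      return z;
    }

    int main() {
      RefinementZones z = select_zones(/*ParallelGCThreads*/ 4, /*worker threads*/ 4);
      assert(z.green == 4 && z.yellow == 12 && z.red == 24);
      assert(z.threshold_step == (12 - 4) / 5);   // buffers between successive thread activations
      return 0;
    }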
void stop(); + void reinitialize_threads(); + // Iterate over the conc refine threads void threads_do(ThreadClosure *tc); @@ -178,7 +205,20 @@ void clear_and_record_card_counts(); - static size_t thread_num(); + static int thread_num(); void print_worker_threads_on(outputStream* st) const; + + void set_green_zone(int x) { _green_zone = x; } + void set_yellow_zone(int x) { _yellow_zone = x; } + void set_red_zone(int x) { _red_zone = x; } + + int green_zone() const { return _green_zone; } + int yellow_zone() const { return _yellow_zone; } + int red_zone() const { return _red_zone; } + + int total_thread_num() const { return _n_threads; } + int worker_thread_num() const { return _n_worker_threads; } + + int thread_threshold_step() const { return _thread_threshold_step; } }; diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp --- a/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -25,10 +25,6 @@ #include "incls/_precompiled.incl" #include "incls/_concurrentG1RefineThread.cpp.incl" -// ======= Concurrent Mark Thread ======== - -// The CM thread is created when the G1 garbage collector is used - ConcurrentG1RefineThread:: ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next, int worker_id_offset, int worker_id) : @@ -37,19 +33,42 @@ _worker_id(worker_id), _active(false), _next(next), + _monitor(NULL), _cg1r(cg1r), - _vtime_accum(0.0), - _interval_ms(5.0) + _vtime_accum(0.0) { + + // Each thread has its own monitor. The i-th thread is responsible for signalling + // to thread i+1 if the number of buffers in the queue exceeds a threashold for this + // thread. Monitors are also used to wake up the threads during termination. + // The 0th worker in notified by mutator threads and has a special monitor. + // The last worker is used for young gen rset size sampling. + if (worker_id > 0) { + _monitor = new Monitor(Mutex::nonleaf, "Refinement monitor", true); + } else { + _monitor = DirtyCardQ_CBL_mon; + } + initialize(); create_and_start(); } +void ConcurrentG1RefineThread::initialize() { + if (_worker_id < cg1r()->worker_thread_num()) { + // Current thread activation threshold + _threshold = MIN2(cg1r()->thread_threshold_step() * (_worker_id + 1) + cg1r()->green_zone(), + cg1r()->yellow_zone()); + // A thread deactivates once the number of buffer reached a deactivation threshold + _deactivation_threshold = MAX2(_threshold - cg1r()->thread_threshold_step(), cg1r()->green_zone()); + } else { + set_active(true); + } +} + void ConcurrentG1RefineThread::sample_young_list_rs_lengths() { G1CollectedHeap* g1h = G1CollectedHeap::heap(); G1CollectorPolicy* g1p = g1h->g1_policy(); if (g1p->adaptive_young_list_length()) { int regions_visited = 0; - g1h->young_list_rs_length_sampling_init(); while (g1h->young_list_rs_length_sampling_more()) { g1h->young_list_rs_length_sampling_next(); @@ -70,99 +89,121 @@ } } -void ConcurrentG1RefineThread::run() { - initialize_in_thread(); +void ConcurrentG1RefineThread::run_young_rs_sampling() { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); _vtime_start = os::elapsedVTime(); - wait_for_universe_init(); + while(!_should_terminate) { + _sts.join(); + sample_young_list_rs_lengths(); + _sts.leave(); - while (!_should_terminate) { - DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); - // Wait for completed log buffers to exist. 
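From initialize() above, worker i (counting from 0, excluding the young-gen sampling thread) activates once the completed-buffer count exceeds min(green + step * (i + 1), yellow) and deactivates again at max(threshold - step, green). A quick sketch of those two formulas with the zone values passed in explicitly (illustrative helper, not the HotSpot class):

    #include <algorithm>
    #include <cassert>

    // Same arithmetic as ConcurrentG1RefineThread::initialize().
    void worker_thresholds(int worker_id, int green, int yellow, int step,
                           int* threshold, int* deactivation_threshold) {
      *threshold = std::min(step * (worker_id + 1) + green, yellow);
      *deactivation_threshold = std::max(*threshold - step, green);
    }

    int main() {
      int t, d;
      // green = 4, yellow = 12, step = 2: workers activate at 6, 8, 10, 12, ...
      worker_thresholds(/*worker_id*/ 0, 4, 12, 2, &t, &d);
      assert(t == 6 && d == 4);
      worker_thresholds(/*worker_id*/ 3, 4, 12, 2, &t, &d);
      assert(t == 12 && d == 10);   // capped at the yellow zone
      return 0;
    }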
- { - MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag); - while (((_worker_id == 0 && !dcqs.process_completed_buffers()) || - (_worker_id > 0 && !is_active())) && - !_should_terminate) { - DirtyCardQ_CBL_mon->wait(Mutex::_no_safepoint_check_flag); - } - } - - if (_should_terminate) { - return; + if (os::supports_vtime()) { + _vtime_accum = (os::elapsedVTime() - _vtime_start); + } else { + _vtime_accum = 0.0; } - // Now we take them off (this doesn't hold locks while it applies - // closures.) (If we did a full collection, then we'll do a full - // traversal. - _sts.join(); - int n_logs = 0; - int lower_limit = 0; - double start_vtime_sec; // only used when G1SmoothConcRefine is on - int prev_buffer_num; // only used when G1SmoothConcRefine is on - // This thread activation threshold - int threshold = G1UpdateBufferQueueProcessingThreshold * _worker_id; - // Next thread activation threshold - int next_threshold = threshold + G1UpdateBufferQueueProcessingThreshold; - int deactivation_threshold = MAX2(threshold - G1UpdateBufferQueueProcessingThreshold / 2, 0); + MutexLockerEx x(_monitor, Mutex::_no_safepoint_check_flag); + if (_should_terminate) { + break; + } + _monitor->wait(Mutex::_no_safepoint_check_flag, G1ConcRefineServiceInterval); + } +} - if (G1SmoothConcRefine) { - lower_limit = 0; - start_vtime_sec = os::elapsedVTime(); - prev_buffer_num = (int) dcqs.completed_buffers_num(); - } else { - lower_limit = G1UpdateBufferQueueProcessingThreshold / 4; // For now. +void ConcurrentG1RefineThread::wait_for_completed_buffers() { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + MutexLockerEx x(_monitor, Mutex::_no_safepoint_check_flag); + while (!_should_terminate && !is_active()) { + _monitor->wait(Mutex::_no_safepoint_check_flag); + } +} + +bool ConcurrentG1RefineThread::is_active() { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + return _worker_id > 0 ? 
_active : dcqs.process_completed_buffers(); +} + +void ConcurrentG1RefineThread::activate() { + MutexLockerEx x(_monitor, Mutex::_no_safepoint_check_flag); + if (_worker_id > 0) { + if (G1TraceConcurrentRefinement) { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + gclog_or_tty->print_cr("G1-Refine-activated worker %d, on threshold %d, current %d", + _worker_id, _threshold, (int)dcqs.completed_buffers_num()); } - while (dcqs.apply_closure_to_completed_buffer(_worker_id + _worker_id_offset, lower_limit)) { - double end_vtime_sec; - double elapsed_vtime_sec; - int elapsed_vtime_ms; - int curr_buffer_num = (int) dcqs.completed_buffers_num(); + set_active(true); + } else { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + dcqs.set_process_completed(true); + } + _monitor->notify(); +} - if (G1SmoothConcRefine) { - end_vtime_sec = os::elapsedVTime(); - elapsed_vtime_sec = end_vtime_sec - start_vtime_sec; - elapsed_vtime_ms = (int) (elapsed_vtime_sec * 1000.0); +void ConcurrentG1RefineThread::deactivate() { + MutexLockerEx x(_monitor, Mutex::_no_safepoint_check_flag); + if (_worker_id > 0) { + if (G1TraceConcurrentRefinement) { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + gclog_or_tty->print_cr("G1-Refine-deactivated worker %d, off threshold %d, current %d", + _worker_id, _deactivation_threshold, (int)dcqs.completed_buffers_num()); + } + set_active(false); + } else { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + dcqs.set_process_completed(false); + } +} + +void ConcurrentG1RefineThread::run() { + initialize_in_thread(); + wait_for_universe_init(); - if (curr_buffer_num > prev_buffer_num || - curr_buffer_num > next_threshold) { - decreaseInterval(elapsed_vtime_ms); - } else if (curr_buffer_num < prev_buffer_num) { - increaseInterval(elapsed_vtime_ms); - } + if (_worker_id >= cg1r()->worker_thread_num()) { + run_young_rs_sampling(); + terminate(); + } + + _vtime_start = os::elapsedVTime(); + while (!_should_terminate) { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + + // Wait for work + wait_for_completed_buffers(); + + if (_should_terminate) { + break; + } + + _sts.join(); + + do { + int curr_buffer_num = (int)dcqs.completed_buffers_num(); + // If the number of the buffers falls down into the yellow zone, + // that means that the transition period after the evacuation pause has ended. + if (dcqs.completed_queue_padding() > 0 && curr_buffer_num <= cg1r()->yellow_zone()) { + dcqs.set_completed_queue_padding(0); } - if (_worker_id == 0) { - sample_young_list_rs_lengths(); - } else if (curr_buffer_num < deactivation_threshold) { + + if (_worker_id > 0 && curr_buffer_num <= _deactivation_threshold) { // If the number of the buffer has fallen below our threshold // we should deactivate. The predecessor will reactivate this // thread should the number of the buffers cross the threshold again. - MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag); deactivate(); - if (G1TraceConcurrentRefinement) { - gclog_or_tty->print_cr("G1-Refine-deactivated worker %d", _worker_id); - } break; } // Check if we need to activate the next thread. 
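The run loop around this point has each worker self-deactivate once the queue length drops to its own deactivation threshold, and (in the lines that follow) wake the next worker in the chain when the length exceeds that worker's activation threshold. A compact single-threaded simulation of that chaining; the real threads block on per-thread Monitors, which are omitted here:

    #include <cassert>
    #include <vector>

    struct Worker { int threshold; int deactivation_threshold; bool active; };

    // One "poll" of the chain for a given completed-buffer count: thread i wakes
    // thread i+1 when the count exceeds i+1's threshold, and parks itself once
    // the count falls to its own deactivation threshold (cf. the do/while loop).
    void poll_chain(std::vector<Worker>& w, int buffers) {
      for (size_t i = 0; i + 1 < w.size(); ++i) {
        if (w[i].active && !w[i + 1].active && buffers > w[i + 1].threshold)
          w[i + 1].active = true;
      }
      for (Worker& wk : w)
        if (wk.active && buffers <= wk.deactivation_threshold)
          wk.active = false;
    }

    int main() {
      std::vector<Worker> w = { {6, 4, false}, {8, 6, false}, {10, 8, false} };
      w[0].active = true;              // worker 0 is driven by the mutators' monitor
      poll_chain(w, 9);                // queue grew past worker 1's threshold
      assert(w[1].active && !w[2].active);
      poll_chain(w, 5);                // queue drained below worker 1's deactivation threshold
      assert(!w[1].active);
      return 0;
    }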
- if (curr_buffer_num > next_threshold && _next != NULL && !_next->is_active()) { - MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag); + if (_next != NULL && !_next->is_active() && curr_buffer_num > _next->_threshold) { _next->activate(); - DirtyCardQ_CBL_mon->notify_all(); - if (G1TraceConcurrentRefinement) { - gclog_or_tty->print_cr("G1-Refine-activated worker %d", _next->_worker_id); - } } + } while (dcqs.apply_closure_to_completed_buffer(_worker_id + _worker_id_offset, cg1r()->green_zone())); - if (G1SmoothConcRefine) { - prev_buffer_num = curr_buffer_num; - _sts.leave(); - os::sleep(Thread::current(), (jlong) _interval_ms, false); - _sts.join(); - start_vtime_sec = os::elapsedVTime(); - } - n_logs++; + // We can exit the loop above while being active if there was a yield request. + if (is_active()) { + deactivate(); } + _sts.leave(); if (os::supports_vtime()) { @@ -172,7 +213,6 @@ } } assert(_should_terminate, "just checking"); - terminate(); } @@ -191,8 +231,8 @@ } { - MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag); - DirtyCardQ_CBL_mon->notify_all(); + MutexLockerEx x(_monitor, Mutex::_no_safepoint_check_flag); + _monitor->notify(); } { diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp --- a/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -40,42 +40,36 @@ // when the number of the rset update buffer crosses a certain threshold. A successor // would self-deactivate when the number of the buffers falls below the threshold. bool _active; - ConcurrentG1RefineThread * _next; - public: - virtual void run(); - - bool is_active() { return _active; } - void activate() { _active = true; } - void deactivate() { _active = false; } - - private: - ConcurrentG1Refine* _cg1r; - - double _interval_ms; + ConcurrentG1RefineThread* _next; + Monitor* _monitor; + ConcurrentG1Refine* _cg1r; - void decreaseInterval(int processing_time_ms) { - double min_interval_ms = (double) processing_time_ms; - _interval_ms = 0.8 * _interval_ms; - if (_interval_ms < min_interval_ms) - _interval_ms = min_interval_ms; - } - void increaseInterval(int processing_time_ms) { - double max_interval_ms = 9.0 * (double) processing_time_ms; - _interval_ms = 1.1 * _interval_ms; - if (max_interval_ms > 0 && _interval_ms > max_interval_ms) - _interval_ms = max_interval_ms; - } + int _thread_threshold_step; + // This thread activation threshold + int _threshold; + // This thread deactivation threshold + int _deactivation_threshold; - void sleepBeforeNextCycle(); + void sample_young_list_rs_lengths(); + void run_young_rs_sampling(); + void wait_for_completed_buffers(); + + void set_active(bool x) { _active = x; } + bool is_active(); + void activate(); + void deactivate(); // For use by G1CollectedHeap, which is a friend. static SuspendibleThreadSet* sts() { return &_sts; } - public: +public: + virtual void run(); // Constructor ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next, int worker_id_offset, int worker_id); + void initialize(); + // Printing void print() const; void print_on(outputStream* st) const; @@ -83,13 +77,10 @@ // Total virtual time so far. 
double vtime_accum() { return _vtime_accum; } - ConcurrentG1Refine* cg1r() { return _cg1r; } - - void sample_young_list_rs_lengths(); + ConcurrentG1Refine* cg1r() { return _cg1r; } // Yield for GC - void yield(); - + void yield(); // shutdown void stop(); }; diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/concurrentMark.cpp --- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -760,7 +760,6 @@ rp->setup_policy(false); // snapshot the soft ref policy to be used in this cycle SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set(); - satb_mq_set.set_process_completed_threshold(G1SATBProcessCompletedThreshold); satb_mq_set.set_active_all_threads(true); // update_g1_committed() will be called at the end of an evac pause diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp --- a/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -61,8 +61,8 @@ #pragma warning( disable:4355 ) // 'this' : used in base member initializer list #endif // _MSC_VER -DirtyCardQueueSet::DirtyCardQueueSet() : - PtrQueueSet(true /*notify_when_complete*/), +DirtyCardQueueSet::DirtyCardQueueSet(bool notify_when_complete) : + PtrQueueSet(notify_when_complete), _closure(NULL), _shared_dirty_card_queue(this, true /*perm*/), _free_ids(NULL), @@ -77,12 +77,12 @@ } void DirtyCardQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock, + int process_completed_threshold, int max_completed_queue, Mutex* lock, PtrQueueSet* fl_owner) { - PtrQueueSet::initialize(cbl_mon, fl_lock, max_completed_queue, fl_owner); + PtrQueueSet::initialize(cbl_mon, fl_lock, process_completed_threshold, + max_completed_queue, fl_owner); set_buffer_size(G1UpdateBufferSize); - set_process_completed_threshold(G1UpdateBufferQueueProcessingThreshold); - _shared_dirty_card_queue.set_lock(lock); _free_ids = new FreeIdSet((int) num_par_ids(), _cbl_mon); } @@ -154,9 +154,10 @@ return b; } -DirtyCardQueueSet::CompletedBufferNode* + +BufferNode* DirtyCardQueueSet::get_completed_buffer(int stop_at) { - CompletedBufferNode* nd = NULL; + BufferNode* nd = NULL; MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag); if ((int)_n_completed_buffers <= stop_at) { @@ -166,10 +167,11 @@ if (_completed_buffers_head != NULL) { nd = _completed_buffers_head; - _completed_buffers_head = nd->next; + _completed_buffers_head = nd->next(); if (_completed_buffers_head == NULL) _completed_buffers_tail = NULL; _n_completed_buffers--; + assert(_n_completed_buffers >= 0, "Invariant"); } debug_only(assert_completed_buffer_list_len_correct_locked()); return nd; @@ -177,20 +179,19 @@ bool DirtyCardQueueSet:: apply_closure_to_completed_buffer_helper(int worker_i, - CompletedBufferNode* nd) { + BufferNode* nd) { if (nd != NULL) { + void **buf = BufferNode::make_buffer_from_node(nd); + size_t index = nd->index(); bool b = - DirtyCardQueue::apply_closure_to_buffer(_closure, nd->buf, - nd->index, _sz, + DirtyCardQueue::apply_closure_to_buffer(_closure, buf, + index, _sz, true, worker_i); - void** buf = nd->buf; - size_t index = nd->index; - delete nd; if (b) { deallocate_buffer(buf); return true; // In normal case, go on to next buffer. 
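get_completed_buffer(stop_at) above is what lets a refinement thread leave a green zone's worth of buffers unprocessed: the head of the completed list is popped only while the count stays above stop_at. A small sketch of that bounded pop with simplified types (std::mutex in place of _cbl_mon):

    #include <cassert>
    #include <mutex>

    struct BufferNode { BufferNode* next; };

    struct CompletedQueue {
      std::mutex mon;                 // stand-in for _cbl_mon
      BufferNode* head = nullptr;
      int n = 0;

      // Pop one completed buffer, but only if more than stop_at remain
      // (cf. DirtyCardQueueSet::get_completed_buffer(stop_at)).
      BufferNode* get_completed_buffer(int stop_at) {
        std::lock_guard<std::mutex> x(mon);
        if (n <= stop_at || head == nullptr) return nullptr;
        BufferNode* nd = head;
        head = nd->next;
        --n;
        return nd;
      }
    };

    int main() {
      BufferNode b2{nullptr}, b1{&b2};
      CompletedQueue q;
      q.head = &b1; q.n = 2;
      assert(q.get_completed_buffer(/*stop_at: green zone*/ 1) == &b1);  // 2 > 1: pop
      assert(q.get_completed_buffer(1) == nullptr);                      // 1 <= 1: leave it cached
      return 0;
    }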
} else { - enqueue_complete_buffer(buf, index, true); + enqueue_complete_buffer(buf, index); return false; } } else { @@ -203,32 +204,33 @@ bool during_pause) { assert(!during_pause || stop_at == 0, "Should not leave any completed buffers during a pause"); - CompletedBufferNode* nd = get_completed_buffer(stop_at); + BufferNode* nd = get_completed_buffer(stop_at); bool res = apply_closure_to_completed_buffer_helper(worker_i, nd); if (res) Atomic::inc(&_processed_buffers_rs_thread); return res; } void DirtyCardQueueSet::apply_closure_to_all_completed_buffers() { - CompletedBufferNode* nd = _completed_buffers_head; + BufferNode* nd = _completed_buffers_head; while (nd != NULL) { bool b = - DirtyCardQueue::apply_closure_to_buffer(_closure, nd->buf, 0, _sz, - false); + DirtyCardQueue::apply_closure_to_buffer(_closure, + BufferNode::make_buffer_from_node(nd), + 0, _sz, false); guarantee(b, "Should not stop early."); - nd = nd->next; + nd = nd->next(); } } void DirtyCardQueueSet::abandon_logs() { assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); - CompletedBufferNode* buffers_to_delete = NULL; + BufferNode* buffers_to_delete = NULL; { MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag); while (_completed_buffers_head != NULL) { - CompletedBufferNode* nd = _completed_buffers_head; - _completed_buffers_head = nd->next; - nd->next = buffers_to_delete; + BufferNode* nd = _completed_buffers_head; + _completed_buffers_head = nd->next(); + nd->set_next(buffers_to_delete); buffers_to_delete = nd; } _n_completed_buffers = 0; @@ -236,10 +238,9 @@ debug_only(assert_completed_buffer_list_len_correct_locked()); } while (buffers_to_delete != NULL) { - CompletedBufferNode* nd = buffers_to_delete; - buffers_to_delete = nd->next; - deallocate_buffer(nd->buf); - delete nd; + BufferNode* nd = buffers_to_delete; + buffers_to_delete = nd->next(); + deallocate_buffer(BufferNode::make_buffer_from_node(nd)); } // Since abandon is done only at safepoints, we can safely manipulate // these queues. diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/dirtyCardQueue.hpp --- a/src/share/vm/gc_implementation/g1/dirtyCardQueue.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/dirtyCardQueue.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -84,11 +84,12 @@ jint _processed_buffers_rs_thread; public: - DirtyCardQueueSet(); + DirtyCardQueueSet(bool notify_when_complete = true); void initialize(Monitor* cbl_mon, Mutex* fl_lock, - int max_completed_queue = 0, - Mutex* lock = NULL, PtrQueueSet* fl_owner = NULL); + int process_completed_threshold, + int max_completed_queue, + Mutex* lock, PtrQueueSet* fl_owner = NULL); // The number of parallel ids that can be claimed to allow collector or // mutator threads to do card-processing work. @@ -123,9 +124,9 @@ bool during_pause = false); bool apply_closure_to_completed_buffer_helper(int worker_i, - CompletedBufferNode* nd); + BufferNode* nd); - CompletedBufferNode* get_completed_buffer(int stop_at); + BufferNode* get_completed_buffer(int stop_at); // Applies the current closure to all completed buffers, // non-consumptively. 
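abandon_logs above detaches the entire completed-buffer list while holding _cbl_mon and only afterwards walks and frees it, so no deallocation happens inside the critical section. The same detach-then-free pattern in a self-contained form (std::mutex and a plain node struct stand in for the HotSpot monitor and BufferNode):

    #include <cassert>
    #include <mutex>

    struct Node { Node* next; };

    struct Queue {
      std::mutex mon;          // stand-in for _cbl_mon
      Node* head = nullptr;
      int   n = 0;

      int abandon() {
        Node* to_delete = nullptr;
        {
          std::lock_guard<std::mutex> x(mon);   // unlink everything under the lock
          while (head != nullptr) {
            Node* nd = head;
            head = nd->next;
            nd->next = to_delete;
            to_delete = nd;
          }
          n = 0;
        }
        int freed = 0;                          // free outside the critical section
        while (to_delete != nullptr) {
          Node* nd = to_delete;
          to_delete = nd->next;
          delete nd;
          ++freed;
        }
        return freed;
      }
    };

    int main() {
      Queue q;
      for (int i = 0; i < 3; ++i) { q.head = new Node{q.head}; ++q.n; }
      assert(q.abandon() == 3 && q.head == nullptr && q.n == 0);
      return 0;
    }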
diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -1375,6 +1375,7 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) : SharedHeap(policy_), _g1_policy(policy_), + _dirty_card_queue_set(false), _ref_processor(NULL), _process_strong_tasks(new SubTasksDone(G1H_PS_NumElements)), _bot_shared(NULL), @@ -1460,8 +1461,6 @@ Universe::check_alignment(init_byte_size, HeapRegion::GrainBytes, "g1 heap"); Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap"); - // We allocate this in any case, but only do no work if the command line - // param is off. _cg1r = new ConcurrentG1Refine(); // Reserve the maximum. @@ -1594,18 +1593,20 @@ JavaThread::satb_mark_queue_set().initialize(SATB_Q_CBL_mon, SATB_Q_FL_lock, - 0, + G1SATBProcessCompletedThreshold, Shared_SATB_Q_lock); JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, - G1UpdateBufferQueueMaxLength, + concurrent_g1_refine()->yellow_zone(), + concurrent_g1_refine()->red_zone(), Shared_DirtyCardQ_lock); if (G1DeferredRSUpdate) { dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, - 0, + -1, // never trigger processing + -1, // no limit on length Shared_DirtyCardQ_lock, &JavaThread::dirty_card_queue_set()); } @@ -4239,10 +4240,11 @@ RedirtyLoggedCardTableEntryFastClosure redirty; dirty_card_queue_set().set_closure(&redirty); dirty_card_queue_set().apply_closure_to_all_completed_buffers(); - JavaThread::dirty_card_queue_set().merge_bufferlists(&dirty_card_queue_set()); + + DirtyCardQueueSet& dcq = JavaThread::dirty_card_queue_set(); + dcq.merge_bufferlists(&dirty_card_queue_set()); assert(dirty_card_queue_set().completed_buffers_num() == 0, "All should be consumed"); } - COMPILER2_PRESENT(DerivedPointerTable::update_pointers()); } diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -1914,6 +1914,10 @@ calculate_young_list_min_length(); calculate_young_list_target_config(); + // Note that _mmu_tracker->max_gc_time() returns the time in seconds. + double update_rs_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSUpdatePauseFractionPercent / 100.0; + adjust_concurrent_refinement(update_rs_time, update_rs_processed_buffers, update_rs_time_goal_ms); + // _target_pause_time_ms = -1.0; @@ -1921,6 +1925,47 @@ // +void G1CollectorPolicy::adjust_concurrent_refinement(double update_rs_time, + double update_rs_processed_buffers, + double goal_ms) { + DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); + ConcurrentG1Refine *cg1r = G1CollectedHeap::heap()->concurrent_g1_refine(); + + if (G1AdaptiveConcRefine) { + const int k_gy = 3, k_gr = 6; + const double inc_k = 1.1, dec_k = 0.9; + + int g = cg1r->green_zone(); + if (update_rs_time > goal_ms) { + g = (int)(g * dec_k); // Can become 0, that's OK. That would mean a mutator-only processing. 
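adjust_concurrent_refinement, introduced here and continuing just below, nudges the green zone by roughly ten percent each pause: shrink it when updating remembered sets overshot its time goal, grow it when there was slack and more than a green zone's worth of buffers was processed, then rederive yellow = 3 * green and red = 6 * green. The goal itself is G1RSUpdatePauseFractionPercent of the MMU pause time. A standalone sketch of one such control step; the constants are copied from the hunk, the surrounding names are illustrative:

    #include <algorithm>
    #include <cassert>

    struct Zones { int green, yellow, red; };

    // One adaptive step, mirroring G1CollectorPolicy::adjust_concurrent_refinement.
    Zones adjust(Zones z, double update_rs_time_ms, double processed_buffers, double goal_ms) {
      const int k_gy = 3, k_gr = 6;
      const double inc_k = 1.1, dec_k = 0.9;

      int g = z.green;
      if (update_rs_time_ms > goal_ms) {
        g = (int)(g * dec_k);                       // too slow: cache fewer buffers (may reach 0)
      } else if (update_rs_time_ms < goal_ms && processed_buffers > g) {
        g = (int)std::max(g * inc_k, g + 1.0);      // headroom: let more buffers accumulate
      }
      return Zones{ g, g * k_gy, g * k_gr };
    }

    int main() {
      // Goal: 10% of a 200 ms pause target, i.e. 20 ms for RS update buffers.
      double goal_ms = 200.0 * 10 / 100.0;
      Zones z = adjust(Zones{10, 30, 60}, /*update_rs_time*/ 35.0, /*buffers*/ 40, goal_ms);
      assert(z.green == 9 && z.yellow == 27 && z.red == 54);   // overshot the goal: shrink
      z = adjust(Zones{10, 30, 60}, /*update_rs_time*/ 5.0, /*buffers*/ 40, goal_ms);
      assert(z.green == 11);                                   // under the goal with a backlog: grow
      return 0;
    }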
+ } else { + if (update_rs_time < goal_ms && update_rs_processed_buffers > g) { + g = (int)MAX2(g * inc_k, g + 1.0); + } + } + // Change the refinement threads params + cg1r->set_green_zone(g); + cg1r->set_yellow_zone(g * k_gy); + cg1r->set_red_zone(g * k_gr); + cg1r->reinitialize_threads(); + + int processing_threshold_delta = MAX2((int)(cg1r->green_zone() * sigma()), 1); + int processing_threshold = MIN2(cg1r->green_zone() + processing_threshold_delta, + cg1r->yellow_zone()); + // Change the barrier params + dcqs.set_process_completed_threshold(processing_threshold); + dcqs.set_max_completed_queue(cg1r->red_zone()); + } + + int curr_queue_size = dcqs.completed_buffers_num(); + if (curr_queue_size >= cg1r->yellow_zone()) { + dcqs.set_completed_queue_padding(curr_queue_size); + } else { + dcqs.set_completed_queue_padding(0); + } + dcqs.notify_if_necessary(); +} + double G1CollectorPolicy:: predict_young_collection_elapsed_time_ms(size_t adjustment) { diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -316,6 +316,10 @@ bool verify_young_ages(HeapRegion* head, SurvRateGroup *surv_rate_group); #endif // PRODUCT + void adjust_concurrent_refinement(double update_rs_time, + double update_rs_processed_buffers, + double goal_ms); + protected: double _pause_time_target_ms; double _recorded_young_cset_choice_time_ms; diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/g1_globals.hpp --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -85,7 +85,7 @@ diagnostic(bool, G1SummarizeZFStats, false, \ "Summarize zero-filling info") \ \ - develop(bool, G1TraceConcurrentRefinement, false, \ + diagnostic(bool, G1TraceConcurrentRefinement, false, \ "Trace G1 concurrent refinement") \ \ product(intx, G1MarkStackSize, 2 * 1024 * 1024, \ @@ -94,19 +94,6 @@ product(intx, G1MarkRegionStackSize, 1024 * 1024, \ "Size of the region stack for concurrent marking.") \ \ - develop(bool, G1ConcRefine, true, \ - "If true, run concurrent rem set refinement for G1") \ - \ - develop(intx, G1ConcRefineTargTraversals, 4, \ - "Number of concurrent refinement we try to achieve") \ - \ - develop(intx, G1ConcRefineInitialDelta, 4, \ - "Number of heap regions of alloc ahead of starting collection " \ - "pause to start concurrent refinement (initially)") \ - \ - develop(bool, G1SmoothConcRefine, true, \ - "Attempts to smooth out the overhead of concurrent refinement") \ - \ develop(bool, G1ConcZeroFill, true, \ "If true, run concurrent zero-filling thread") \ \ @@ -178,13 +165,38 @@ product(intx, G1UpdateBufferSize, 256, \ "Size of an update buffer") \ \ - product(intx, G1UpdateBufferQueueProcessingThreshold, 5, \ + product(intx, G1ConcRefineYellowZone, 0, \ "Number of enqueued update buffers that will " \ - "trigger concurrent processing") \ + "trigger concurrent processing. Will be selected ergonomically " \ + "by default.") \ + \ + product(intx, G1ConcRefineRedZone, 0, \ + "Maximum number of enqueued update buffers before mutator " \ + "threads start processing new ones instead of enqueueing them. " \ + "Will be selected ergonomically by default. 
Zero will disable " \ + "concurrent processing.") \ + \ + product(intx, G1ConcRefineGreenZone, 0, \ + "The number of update buffers that are left in the queue by the " \ + "concurrent processing threads. Will be selected ergonomically " \ + "by default.") \ \ - product(intx, G1UpdateBufferQueueMaxLength, 30, \ - "Maximum number of enqueued update buffers before mutator " \ - "threads start processing new ones instead of enqueueing them") \ + product(intx, G1ConcRefineServiceInterval, 300, \ + "The last concurrent refinement thread wakes up every " \ + "specified number of milliseconds to do miscellaneous work.") \ + \ + product(intx, G1ConcRefineThresholdStep, 0, \ + "Each time the rset update queue increases by this amount " \ + "activate the next refinement thread if available. " \ + "Will be selected ergonomically by default.") \ + \ + product(intx, G1RSUpdatePauseFractionPercent, 10, \ + "A target percentage of time that is allowed to be spend on " \ + "process RS update buffers during the collection pause.") \ + \ + product(bool, G1AdaptiveConcRefine, true, \ + "Select green, yellow and red zones adaptively to meet the " \ + "the pause requirements.") \ \ develop(intx, G1ConcRSLogCacheSize, 10, \ "Log base 2 of the length of conc RS hot-card cache.") \ diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/ptrQueue.cpp --- a/src/share/vm/gc_implementation/g1/ptrQueue.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/ptrQueue.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -64,8 +64,8 @@ while (_index == 0) { handle_zero_index(); } + assert(_index > 0, "postcondition"); - _index -= oopSize; _buf[byte_index_to_index((int)_index)] = ptr; assert(0 <= _index && _index <= _sz, "Invariant."); @@ -99,95 +99,110 @@ assert(_sz > 0, "Didn't set a buffer size."); MutexLockerEx x(_fl_owner->_fl_lock, Mutex::_no_safepoint_check_flag); if (_fl_owner->_buf_free_list != NULL) { - void** res = _fl_owner->_buf_free_list; - _fl_owner->_buf_free_list = (void**)_fl_owner->_buf_free_list[0]; + void** res = BufferNode::make_buffer_from_node(_fl_owner->_buf_free_list); + _fl_owner->_buf_free_list = _fl_owner->_buf_free_list->next(); _fl_owner->_buf_free_list_sz--; - // Just override the next pointer with NULL, just in case we scan this part - // of the buffer. - res[0] = NULL; return res; } else { - return (void**) NEW_C_HEAP_ARRAY(char, _sz); + // Allocate space for the BufferNode in front of the buffer. + char *b = NEW_C_HEAP_ARRAY(char, _sz + BufferNode::aligned_size()); + return BufferNode::make_buffer_from_block(b); } } void PtrQueueSet::deallocate_buffer(void** buf) { assert(_sz > 0, "Didn't set a buffer size."); MutexLockerEx x(_fl_owner->_fl_lock, Mutex::_no_safepoint_check_flag); - buf[0] = (void*)_fl_owner->_buf_free_list; - _fl_owner->_buf_free_list = buf; + BufferNode *node = BufferNode::make_node_from_buffer(buf); + node->set_next(_fl_owner->_buf_free_list); + _fl_owner->_buf_free_list = node; _fl_owner->_buf_free_list_sz++; } void PtrQueueSet::reduce_free_list() { + assert(_fl_owner == this, "Free list reduction is allowed only for the owner"); // For now we'll adopt the strategy of deleting half. 
MutexLockerEx x(_fl_lock, Mutex::_no_safepoint_check_flag); size_t n = _buf_free_list_sz / 2; while (n > 0) { assert(_buf_free_list != NULL, "_buf_free_list_sz must be wrong."); - void** head = _buf_free_list; - _buf_free_list = (void**)_buf_free_list[0]; - FREE_C_HEAP_ARRAY(char, head); + void* b = BufferNode::make_block_from_node(_buf_free_list); + _buf_free_list = _buf_free_list->next(); + FREE_C_HEAP_ARRAY(char, b); _buf_free_list_sz --; n--; } } -void PtrQueueSet::enqueue_complete_buffer(void** buf, size_t index, bool ignore_max_completed) { - // I use explicit locking here because there's a bailout in the middle. - _cbl_mon->lock_without_safepoint_check(); - - Thread* thread = Thread::current(); - assert( ignore_max_completed || - thread->is_Java_thread() || - SafepointSynchronize::is_at_safepoint(), - "invariant" ); - ignore_max_completed = ignore_max_completed || !thread->is_Java_thread(); +void PtrQueue::handle_zero_index() { + assert(0 == _index, "Precondition."); + // This thread records the full buffer and allocates a new one (while + // holding the lock if there is one). + if (_buf != NULL) { + if (_lock) { + locking_enqueue_completed_buffer(_buf); + } else { + if (qset()->process_or_enqueue_complete_buffer(_buf)) { + // Recycle the buffer. No allocation. + _sz = qset()->buffer_size(); + _index = _sz; + return; + } + } + } + // Reallocate the buffer + _buf = qset()->allocate_buffer(); + _sz = qset()->buffer_size(); + _index = _sz; + assert(0 <= _index && _index <= _sz, "Invariant."); +} - if (!ignore_max_completed && _max_completed_queue > 0 && - _n_completed_buffers >= (size_t) _max_completed_queue) { - _cbl_mon->unlock(); - bool b = mut_process_buffer(buf); - if (b) { - deallocate_buffer(buf); - return; +bool PtrQueueSet::process_or_enqueue_complete_buffer(void** buf) { + if (Thread::current()->is_Java_thread()) { + // We don't lock. It is fine to be epsilon-precise here. + if (_max_completed_queue == 0 || _max_completed_queue > 0 && + _n_completed_buffers >= _max_completed_queue + _completed_queue_padding) { + bool b = mut_process_buffer(buf); + if (b) { + // True here means that the buffer hasn't been deallocated and the caller may reuse it. + return true; + } } + } + // The buffer will be enqueued. The caller will have to get a new one. + enqueue_complete_buffer(buf); + return false; +} - // Otherwise, go ahead and enqueue the buffer. Must reaquire the lock. - _cbl_mon->lock_without_safepoint_check(); - } - - // Here we still hold the _cbl_mon. 
- CompletedBufferNode* cbn = new CompletedBufferNode; - cbn->buf = buf; - cbn->next = NULL; - cbn->index = index; +void PtrQueueSet::enqueue_complete_buffer(void** buf, size_t index) { + MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag); + BufferNode* cbn = BufferNode::new_from_buffer(buf); + cbn->set_index(index); if (_completed_buffers_tail == NULL) { assert(_completed_buffers_head == NULL, "Well-formedness"); _completed_buffers_head = cbn; _completed_buffers_tail = cbn; } else { - _completed_buffers_tail->next = cbn; + _completed_buffers_tail->set_next(cbn); _completed_buffers_tail = cbn; } _n_completed_buffers++; - if (!_process_completed && + if (!_process_completed && _process_completed_threshold >= 0 && _n_completed_buffers >= _process_completed_threshold) { _process_completed = true; if (_notify_when_complete) - _cbl_mon->notify_all(); + _cbl_mon->notify(); } debug_only(assert_completed_buffer_list_len_correct_locked()); - _cbl_mon->unlock(); } int PtrQueueSet::completed_buffers_list_length() { int n = 0; - CompletedBufferNode* cbn = _completed_buffers_head; + BufferNode* cbn = _completed_buffers_head; while (cbn != NULL) { n++; - cbn = cbn->next; + cbn = cbn->next(); } return n; } @@ -198,7 +213,7 @@ } void PtrQueueSet::assert_completed_buffer_list_len_correct_locked() { - guarantee((size_t)completed_buffers_list_length() == _n_completed_buffers, + guarantee(completed_buffers_list_length() == _n_completed_buffers, "Completed buffer length is wrong."); } @@ -207,12 +222,8 @@ _sz = sz * oopSize; } -void PtrQueueSet::set_process_completed_threshold(size_t sz) { - _process_completed_threshold = sz; -} - -// Merge lists of buffers. Notify waiting threads if the length of the list -// exceeds threshold. The source queue is emptied as a result. The queues +// Merge lists of buffers. Notify the processing threads. +// The source queue is emptied as a result. The queues // must share the monitor. void PtrQueueSet::merge_bufferlists(PtrQueueSet *src) { assert(_cbl_mon == src->_cbl_mon, "Should share the same lock"); @@ -224,7 +235,7 @@ } else { assert(_completed_buffers_head != NULL, "Well formedness"); if (src->_completed_buffers_head != NULL) { - _completed_buffers_tail->next = src->_completed_buffers_head; + _completed_buffers_tail->set_next(src->_completed_buffers_head); _completed_buffers_tail = src->_completed_buffers_tail; } } @@ -237,31 +248,13 @@ assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL || _completed_buffers_head != NULL && _completed_buffers_tail != NULL, "Sanity"); - - if (!_process_completed && - _n_completed_buffers >= _process_completed_threshold) { - _process_completed = true; - if (_notify_when_complete) - _cbl_mon->notify_all(); - } } -// Merge free lists of the two queues. The free list of the source -// queue is emptied as a result. The queues must share the same -// mutex that guards free lists. 
-void PtrQueueSet::merge_freelists(PtrQueueSet* src) { - assert(_fl_lock == src->_fl_lock, "Should share the same lock"); - MutexLockerEx x(_fl_lock, Mutex::_no_safepoint_check_flag); - if (_buf_free_list != NULL) { - void **p = _buf_free_list; - while (*p != NULL) { - p = (void**)*p; - } - *p = src->_buf_free_list; - } else { - _buf_free_list = src->_buf_free_list; +void PtrQueueSet::notify_if_necessary() { + MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag); + if (_n_completed_buffers >= _process_completed_threshold || _max_completed_queue == 0) { + _process_completed = true; + if (_notify_when_complete) + _cbl_mon->notify(); } - _buf_free_list_sz += src->_buf_free_list_sz; - src->_buf_free_list = NULL; - src->_buf_free_list_sz = 0; } diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/ptrQueue.hpp --- a/src/share/vm/gc_implementation/g1/ptrQueue.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/ptrQueue.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -27,8 +27,10 @@ // the addresses of modified old-generation objects. This type supports // this operation. +// The definition of placement operator new(size_t, void*) in the <new> header. +#include <new> + class PtrQueueSet; - class PtrQueue VALUE_OBJ_CLASS_SPEC { protected: @@ -77,7 +79,7 @@ else enqueue_known_active(ptr); } - inline void handle_zero_index(); + void handle_zero_index(); void locking_enqueue_completed_buffer(void** buf); void enqueue_known_active(void* ptr); @@ -126,34 +128,65 @@ }; +class BufferNode { + size_t _index; + BufferNode* _next; +public: + BufferNode() : _index(0), _next(NULL) { } + BufferNode* next() const { return _next; } + void set_next(BufferNode* n) { _next = n; } + size_t index() const { return _index; } + void set_index(size_t i) { _index = i; } + + // Align the size of the structure to the size of the pointer + static size_t aligned_size() { + static const size_t alignment = round_to(sizeof(BufferNode), sizeof(void*)); + return alignment; + } + + // BufferNode is allocated before the buffer. + // The chunk of memory that holds both of them is a block. + + // Produce a new BufferNode given a buffer. + static BufferNode* new_from_buffer(void** buf) { + return new (make_block_from_buffer(buf)) BufferNode; + } + + // The following are the required conversion routines: + static BufferNode* make_node_from_buffer(void** buf) { + return (BufferNode*)make_block_from_buffer(buf); + } + static void** make_buffer_from_node(BufferNode *node) { + return make_buffer_from_block(node); + } + static void* make_block_from_node(BufferNode *node) { + return (void*)node; + } + static void** make_buffer_from_block(void* p) { + return (void**)((char*)p + aligned_size()); + } + static void* make_block_from_buffer(void** p) { + return (void*)((char*)p - aligned_size()); + } +}; + // A PtrQueueSet represents resources common to a set of pointer queues. // In particular, the individual queues allocate buffers from this shared // set, and return completed buffers to the set. // All these variables are are protected by the TLOQ_CBL_mon. XXX ??? class PtrQueueSet VALUE_OBJ_CLASS_SPEC { - protected: - - class CompletedBufferNode: public CHeapObj { - public: - void** buf; - size_t index; - CompletedBufferNode* next; - CompletedBufferNode() : buf(NULL), - index(0), next(NULL){ } - }; - Monitor* _cbl_mon; // Protects the fields below. 
- CompletedBufferNode* _completed_buffers_head; - CompletedBufferNode* _completed_buffers_tail; - size_t _n_completed_buffers; - size_t _process_completed_threshold; + BufferNode* _completed_buffers_head; + BufferNode* _completed_buffers_tail; + int _n_completed_buffers; + int _process_completed_threshold; volatile bool _process_completed; // This (and the interpretation of the first element as a "next" // pointer) are protected by the TLOQ_FL_lock. Mutex* _fl_lock; - void** _buf_free_list; + BufferNode* _buf_free_list; size_t _buf_free_list_sz; // Queue set can share a freelist. The _fl_owner variable // specifies the owner. It is set to "this" by default. @@ -170,6 +203,7 @@ // Maximum number of elements allowed on completed queue: after that, // enqueuer does the work itself. Zero indicates no maximum. int _max_completed_queue; + int _completed_queue_padding; int completed_buffers_list_length(); void assert_completed_buffer_list_len_correct_locked(); @@ -191,9 +225,12 @@ // Because of init-order concerns, we can't pass these as constructor // arguments. void initialize(Monitor* cbl_mon, Mutex* fl_lock, - int max_completed_queue = 0, + int process_completed_threshold, + int max_completed_queue, PtrQueueSet *fl_owner = NULL) { _max_completed_queue = max_completed_queue; + _process_completed_threshold = process_completed_threshold; + _completed_queue_padding = 0; assert(cbl_mon != NULL && fl_lock != NULL, "Init order issue?"); _cbl_mon = cbl_mon; _fl_lock = fl_lock; @@ -208,14 +245,17 @@ void deallocate_buffer(void** buf); // Declares that "buf" is a complete buffer. - void enqueue_complete_buffer(void** buf, size_t index = 0, - bool ignore_max_completed = false); + void enqueue_complete_buffer(void** buf, size_t index = 0); + + // To be invoked by the mutator. + bool process_or_enqueue_complete_buffer(void** buf); bool completed_buffers_exist_dirty() { return _n_completed_buffers > 0; } bool process_completed_buffers() { return _process_completed; } + void set_process_completed(bool x) { _process_completed = x; } bool active() { return _all_active; } @@ -226,15 +266,24 @@ // Get the buffer size. size_t buffer_size() { return _sz; } - // Set the number of completed buffers that triggers log processing. - void set_process_completed_threshold(size_t sz); + // Get/Set the number of completed buffers that triggers log processing. + void set_process_completed_threshold(int sz) { _process_completed_threshold = sz; } + int process_completed_threshold() const { return _process_completed_threshold; } // Must only be called at a safe point. Indicates that the buffer free // list size may be reduced, if that is deemed desirable. 
void reduce_free_list(); - size_t completed_buffers_num() { return _n_completed_buffers; } + int completed_buffers_num() { return _n_completed_buffers; } void merge_bufferlists(PtrQueueSet* src); - void merge_freelists(PtrQueueSet* src); + + void set_max_completed_queue(int m) { _max_completed_queue = m; } + int max_completed_queue() { return _max_completed_queue; } + + void set_completed_queue_padding(int padding) { _completed_queue_padding = padding; } + int completed_queue_padding() { return _completed_queue_padding; } + + // Notify the consumer if the number of buffers crossed the threshold + void notify_if_necessary(); }; diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/satbQueue.cpp --- a/src/share/vm/gc_implementation/g1/satbQueue.cpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/satbQueue.cpp Wed Dec 23 02:57:31 2009 -0800 @@ -67,9 +67,9 @@ {} void SATBMarkQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock, - int max_completed_queue, + int process_completed_threshold, Mutex* lock) { - PtrQueueSet::initialize(cbl_mon, fl_lock, max_completed_queue); + PtrQueueSet::initialize(cbl_mon, fl_lock, process_completed_threshold, -1); _shared_satb_queue.set_lock(lock); if (ParallelGCThreads > 0) { _par_closures = NEW_C_HEAP_ARRAY(ObjectClosure*, ParallelGCThreads); @@ -122,12 +122,12 @@ bool SATBMarkQueueSet::apply_closure_to_completed_buffer_work(bool par, int worker) { - CompletedBufferNode* nd = NULL; + BufferNode* nd = NULL; { MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag); if (_completed_buffers_head != NULL) { nd = _completed_buffers_head; - _completed_buffers_head = nd->next; + _completed_buffers_head = nd->next(); if (_completed_buffers_head == NULL) _completed_buffers_tail = NULL; _n_completed_buffers--; if (_n_completed_buffers == 0) _process_completed = false; @@ -135,9 +135,9 @@ } ObjectClosure* cl = (par ? _par_closures[worker] : _closure); if (nd != NULL) { - ObjPtrQueue::apply_closure_to_buffer(cl, nd->buf, 0, _sz); - deallocate_buffer(nd->buf); - delete nd; + void **buf = BufferNode::make_buffer_from_node(nd); + ObjPtrQueue::apply_closure_to_buffer(cl, buf, 0, _sz); + deallocate_buffer(buf); return true; } else { return false; @@ -145,13 +145,13 @@ } void SATBMarkQueueSet::abandon_partial_marking() { - CompletedBufferNode* buffers_to_delete = NULL; + BufferNode* buffers_to_delete = NULL; { MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag); while (_completed_buffers_head != NULL) { - CompletedBufferNode* nd = _completed_buffers_head; - _completed_buffers_head = nd->next; - nd->next = buffers_to_delete; + BufferNode* nd = _completed_buffers_head; + _completed_buffers_head = nd->next(); + nd->set_next(buffers_to_delete); buffers_to_delete = nd; } _completed_buffers_tail = NULL; @@ -159,10 +159,9 @@ DEBUG_ONLY(assert_completed_buffer_list_len_correct_locked()); } while (buffers_to_delete != NULL) { - CompletedBufferNode* nd = buffers_to_delete; - buffers_to_delete = nd->next; - deallocate_buffer(nd->buf); - delete nd; + BufferNode* nd = buffers_to_delete; + buffers_to_delete = nd->next(); + deallocate_buffer(BufferNode::make_buffer_from_node(nd)); } assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); // So we can safely manipulate these queues. 
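The SATB changes above, like the dirty-card ones, now reach the raw buffer through BufferNode::make_buffer_from_node: the patched ptrQueue.hpp places a small header immediately in front of every buffer, so one heap block carries both the list link and the payload, and the node/buffer conversions are pure pointer arithmetic across that header. The standalone sketch below illustrates that layout; it is not HotSpot source, and DemoBufferNode, the malloc-based allocation, and the hand-rolled alignment round-up are illustrative stand-ins for the patch's BufferNode, NEW_C_HEAP_ARRAY, and round_to.

// Standalone illustration (not HotSpot source) of the node-in-front-of-buffer
// layout introduced by the BufferNode class in ptrQueue.hpp above.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>      // placement operator new, as in the patched ptrQueue.hpp

class DemoBufferNode {               // stand-in for the patch's BufferNode
  size_t _index;
  DemoBufferNode* _next;
public:
  DemoBufferNode() : _index(0), _next(NULL) { }
  DemoBufferNode* next() const { return _next; }
  void set_next(DemoBufferNode* n) { _next = n; }

  // Header size rounded up to pointer alignment so the buffer that follows
  // the header starts on a pointer boundary (the patch uses round_to()).
  static size_t aligned_size() {
    const size_t a = sizeof(void*);
    return (sizeof(DemoBufferNode) + a - 1) / a * a;
  }

  // A "block" is [header][buffer of void* slots]; conversions are arithmetic.
  static DemoBufferNode* make_node_from_buffer(void** buf) {
    return reinterpret_cast<DemoBufferNode*>(reinterpret_cast<char*>(buf) - aligned_size());
  }
  static void** make_buffer_from_node(DemoBufferNode* node) {
    return reinterpret_cast<void**>(reinterpret_cast<char*>(node) + aligned_size());
  }
};

int main() {
  // Mirror allocate_buffer(): one allocation for the header plus a 256-slot buffer.
  const size_t buffer_bytes = 256 * sizeof(void*);
  char* block = static_cast<char*>(malloc(DemoBufferNode::aligned_size() + buffer_bytes));
  DemoBufferNode* node = new (block) DemoBufferNode;            // placement new
  void** buf = DemoBufferNode::make_buffer_from_node(node);

  // Round-tripping the buffer pointer recovers the same header, which is what
  // lets deallocate_buffer() and the SATB code find the node from a bare buffer.
  printf("round trip ok: %s\n",
         DemoBufferNode::make_node_from_buffer(buf) == node ? "yes" : "no");
  free(block);
  return 0;
}

The same arithmetic is what lets deallocate_buffer() rebuild a free-list node from a bare void** without a separate heap allocation, which is the point of retiring the old CompletedBufferNode objects.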
diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/g1/satbQueue.hpp --- a/src/share/vm/gc_implementation/g1/satbQueue.hpp Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/g1/satbQueue.hpp Wed Dec 23 02:57:31 2009 -0800 @@ -60,8 +60,8 @@ SATBMarkQueueSet(); void initialize(Monitor* cbl_mon, Mutex* fl_lock, - int max_completed_queue = 0, - Mutex* lock = NULL); + int process_completed_threshold, + Mutex* lock); static void handle_zero_index_for_thread(JavaThread* t); diff -r 920875ae1277 -r 9749fbc4859b src/share/vm/gc_implementation/includeDB_gc_g1 --- a/src/share/vm/gc_implementation/includeDB_gc_g1 Tue Dec 22 16:35:08 2009 -0800 +++ b/src/share/vm/gc_implementation/includeDB_gc_g1 Wed Dec 23 02:57:31 2009 -0800 @@ -109,7 +109,6 @@ dirtyCardQueue.cpp dirtyCardQueue.hpp dirtyCardQueue.cpp heapRegionRemSet.hpp dirtyCardQueue.cpp mutexLocker.hpp -dirtyCardQueue.cpp ptrQueue.inline.hpp dirtyCardQueue.cpp safepoint.hpp dirtyCardQueue.cpp thread.hpp dirtyCardQueue.cpp thread_<os_family>.inline.hpp @@ -319,7 +318,6 @@ ptrQueue.cpp mutex.hpp ptrQueue.cpp mutexLocker.hpp ptrQueue.cpp ptrQueue.hpp -ptrQueue.cpp ptrQueue.inline.hpp ptrQueue.cpp thread_<os_family>.inline.hpp ptrQueue.hpp allocation.hpp @@ -329,7 +327,6 @@ satbQueue.cpp allocation.inline.hpp satbQueue.cpp mutexLocker.hpp -satbQueue.cpp ptrQueue.inline.hpp satbQueue.cpp satbQueue.hpp satbQueue.cpp sharedHeap.hpp satbQueue.cpp thread.hpp
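A closing note on the policy side of this changeset: adjust_concurrent_refinement(), at the top of this hunk, sizes the whole refinement setup from a single green value. Yellow and red are fixed multiples of green, the dirty-card queue's processing threshold is placed a little above green but capped at yellow, and green itself grows when the previous pause stayed under its update-RS goal while draining more buffers than green. The toy model below shows how those quantities relate; it is not the G1 code, and the constants K_GY, K_GR, INC_K, DEC_K and SIGMA, along with the shrink branch for the over-budget case (which lies outside this excerpt), are assumptions made for illustration.

// Toy model (not HotSpot source) of the zone sizing done by
// G1CollectorPolicy::adjust_concurrent_refinement() in this changeset.
// All constants below are assumed, illustrative values.
#include <algorithm>
#include <cstdio>

struct Zones {
  int green;      // buffers the refinement threads deliberately leave queued
  int yellow;     // queue length at which all refinement threads are running
  int red;        // queue length at which mutators process buffers themselves
  int threshold;  // completed-buffer count that wakes the refinement threads
};

static Zones adjust(int green, double update_rs_ms,
                    double processed_buffers, double goal_ms) {
  const double K_GY = 2.0, K_GR = 10.0;   // assumed yellow/red multipliers
  const double INC_K = 1.1, DEC_K = 0.6;  // assumed grow/shrink factors
  const double SIGMA = 0.5;               // assumed threshold slack fraction

  if (update_rs_ms > goal_ms) {
    // Over budget: shrink the allowed backlog (this branch is assumed; it
    // falls outside the excerpt shown above).
    green = (int)(green * DEC_K);
  } else if (processed_buffers > green) {
    // Under budget and busier than the target: allow a larger backlog,
    // growing by at least one buffer (mirrors MAX2(g * inc_k, g + 1.0)).
    green = std::max((int)(green * INC_K), green + 1);
  }

  Zones z;
  z.green  = green;
  z.yellow = (int)(green * K_GY);
  z.red    = (int)(green * K_GR);
  // Processing threshold sits a little above green but never beyond yellow,
  // as in the MIN2/MAX2 computation visible in the hunk above.
  int delta = std::max((int)(green * SIGMA), 1);
  z.threshold = std::min(green + delta, z.yellow);
  return z;
}

int main() {
  // Example: the last pause spent 8 ms of a 10 ms update-RS budget and drained
  // 20 buffers while green was 16, so every zone grows a little.
  Zones z = adjust(16, 8.0, 20.0, 10.0);
  printf("green=%d yellow=%d red=%d threshold=%d\n",
         z.green, z.yellow, z.red, z.threshold);
  return 0;
}

This matches the flag descriptions earlier in the hunk: the three G1ConcRefine*Zone flags default to 0 and are "selected ergonomically", i.e. recomputed after each pause while G1AdaptiveConcRefine is true.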