changeset 3910:d968f546734e

Merge
author iveresov
date Wed, 07 Sep 2011 11:52:00 -0700
parents 4668545121b8 (diff) c2d3caa64b3e (current diff)
children 5596e125fe4f
files agent/src/share/classes/sun/jvm/hotspot/runtime/amd64/AMD64Frame.java agent/src/share/classes/sun/jvm/hotspot/runtime/amd64/AMD64RegisterMap.java make/solaris/makefiles/mapfile-vers-nonproduct src/cpu/sparc/vm/assembler_sparc.cpp src/cpu/sparc/vm/assembler_sparc.hpp src/cpu/sparc/vm/c1_Runtime1_sparc.cpp src/share/vm/gc_interface/collectedHeap.cpp src/share/vm/runtime/globals.hpp src/share/vm/runtime/reflectionCompat.hpp
diffstat 24 files changed, 712 insertions(+), 694 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Wed Sep 07 09:35:52 2011 +0200
+++ b/.hgtags	Wed Sep 07 11:52:00 2011 -0700
@@ -174,3 +174,10 @@
 9ad1548c6b63d596c411afc35147ffd5254426d9 hs21-b12
 c149193c768b8b7233da4c3a3fdc0756b975848e hs21-b13
 c149193c768b8b7233da4c3a3fdc0756b975848e jdk7-b143
+0cc8a70952c368e06de2adab1f2649a408f5e577 jdk8-b01
+31e253c1da429124bb87570ab095d9bc89850d0a jdk8-b02
+3a2fb61165dfc72e398179a2796d740c8da5b8c0 jdk8-b03
+0cc8a70952c368e06de2adab1f2649a408f5e577 hs22-b01
+7c29742c41b44fb0cd5a13c7ac8834f3f2ca649e hs22-b02
+3a2fb61165dfc72e398179a2796d740c8da5b8c0 hs22-b03
+ce9bde819dcba4a5d2822229d9183e69c74326ca hs22-b04
--- a/make/hotspot_version	Wed Sep 07 09:35:52 2011 +0200
+++ b/make/hotspot_version	Wed Sep 07 11:52:00 2011 -0700
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=22
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=02
+HS_BUILD_NUMBER=04
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
--- a/src/cpu/sparc/vm/assembler_sparc.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -2165,29 +2165,6 @@
 #endif
 }
 
-void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
-                                     Register s1, address d,
-                                     relocInfo::relocType rt ) {
-  assert_not_delayed();
-  if (VM_Version::v9_instructions_work()) {
-    bpr(rc, a, p, s1, d, rt);
-  } else {
-    tst(s1);
-    br(reg_cond_to_cc_cond(rc), a, p, d, rt);
-  }
-}
-
-void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
-                                     Register s1, Label& L ) {
-  assert_not_delayed();
-  if (VM_Version::v9_instructions_work()) {
-    bpr(rc, a, p, s1, L);
-  } else {
-    tst(s1);
-    br(reg_cond_to_cc_cond(rc), a, p, L);
-  }
-}
-
 // Compare registers and branch with nop in delay slot or cbcond without delay slot.
 
 // Compare integer (32 bit) values (icc only).
@@ -4344,22 +4321,29 @@
   } else {
     pre_val = O0;
   }
+
   int satb_q_index_byte_offset =
     in_bytes(JavaThread::satb_mark_queue_offset() +
              PtrQueue::byte_offset_of_index());
+
   int satb_q_buf_byte_offset =
     in_bytes(JavaThread::satb_mark_queue_offset() +
              PtrQueue::byte_offset_of_buf());
+
   assert(in_bytes(PtrQueue::byte_width_of_index()) == sizeof(intptr_t) &&
          in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t),
          "check sizes in assembly below");
 
   __ bind(restart);
+
+  // Load the index into the SATB buffer. PtrQueue::_index is a size_t
+  // so ld_ptr is appropriate.
   __ ld_ptr(G2_thread, satb_q_index_byte_offset, L0);
 
-  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill);
-  // If the branch is taken, no harm in executing this in the delay slot.
-  __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
+  // index == 0?
+  __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill);
+
+  __ ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
   __ sub(L0, oopSize, L0);
 
   __ st_ptr(pre_val, L1, L0);  // [_buf + index] := I0
@@ -4470,9 +4454,8 @@
          tmp);
   }
 
-  // Check on whether to annul.
-  br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
-  delayed()->nop();
+  // Is marking active?
+  cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
 
   // Do we need to load the previous value?
   if (obj != noreg) {
@@ -4494,9 +4477,7 @@
   assert(pre_val != noreg, "must have a real register");
 
   // Is the previous value null?
-  // Check on whether to annul.
-  br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered);
-  delayed()->nop();
+  cmp_and_brx_short(pre_val, G0, Assembler::equal, Assembler::pt, filtered);
 
   // OK, it's not filtered, so we'll need to call enqueue.  In the normal
   // case, pre_val will be a scratch G-reg, but there are some cases in
@@ -4523,39 +4504,6 @@
   bind(filtered);
 }
 
-static jint num_ct_writes = 0;
-static jint num_ct_writes_filtered_in_hr = 0;
-static jint num_ct_writes_filtered_null = 0;
-static G1CollectedHeap* g1 = NULL;
-
-static Thread* count_ct_writes(void* filter_val, void* new_val) {
-  Atomic::inc(&num_ct_writes);
-  if (filter_val == NULL) {
-    Atomic::inc(&num_ct_writes_filtered_in_hr);
-  } else if (new_val == NULL) {
-    Atomic::inc(&num_ct_writes_filtered_null);
-  } else {
-    if (g1 == NULL) {
-      g1 = G1CollectedHeap::heap();
-    }
-  }
-  if ((num_ct_writes % 1000000) == 0) {
-    jint num_ct_writes_filtered =
-      num_ct_writes_filtered_in_hr +
-      num_ct_writes_filtered_null;
-
-    tty->print_cr("%d potential CT writes: %5.2f%% filtered\n"
-                  "   (%5.2f%% intra-HR, %5.2f%% null).",
-                  num_ct_writes,
-                  100.0*(float)num_ct_writes_filtered/(float)num_ct_writes,
-                  100.0*(float)num_ct_writes_filtered_in_hr/
-                  (float)num_ct_writes,
-                  100.0*(float)num_ct_writes_filtered_null/
-                  (float)num_ct_writes);
-  }
-  return Thread::current();
-}
-
 static address dirty_card_log_enqueue = 0;
 static u_char* dirty_card_log_enqueue_end = 0;
 
@@ -4578,11 +4526,8 @@
   __ set(addrlit, O1); // O1 := <card table base>
   __ ldub(O0, O1, O2); // O2 := [O0 + O1]
 
-  __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
-                      O2, not_already_dirty);
-  // Get O1 + O2 into a reg by itself -- useful in the take-the-branch
-  // case, harmless if not.
-  __ delayed()->add(O0, O1, O3);
+  assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code");
+  __ cmp_and_br_short(O2, G0, Assembler::notEqual, Assembler::pt, not_already_dirty);
 
   // We didn't take the branch, so we're already dirty: return.
   // Use return-from-leaf
@@ -4591,8 +4536,13 @@
 
   // Not dirty.
   __ bind(not_already_dirty);
+
+  // Get O0 + O1 into a reg by itself
+  __ add(O0, O1, O3);
+
   // First, dirty it.
   __ stb(G0, O3, G0);  // [cardPtr] := 0  (i.e., dirty).
+
   int dirty_card_q_index_byte_offset =
     in_bytes(JavaThread::dirty_card_queue_offset() +
              PtrQueue::byte_offset_of_index());
@@ -4600,12 +4550,15 @@
     in_bytes(JavaThread::dirty_card_queue_offset() +
              PtrQueue::byte_offset_of_buf());
   __ bind(restart);
+
+  // Load the index into the update buffer. PtrQueue::_index is
+  // a size_t so ld_ptr is appropriate here.
   __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0);
 
-  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
-                      L0, refill);
-  // If the branch is taken, no harm in executing this in the delay slot.
-  __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
+  // index == 0?
+  __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill);
+
+  __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
   __ sub(L0, oopSize, L0);
 
   __ st_ptr(O3, L1, L0);  // [_buf + index] := I0
@@ -4668,6 +4621,7 @@
   G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set();
   assert(bs->kind() == BarrierSet::G1SATBCT ||
          bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier");
+
   if (G1RSBarrierRegionFilter) {
     xor3(store_addr, new_val, tmp);
 #ifdef _LP64
@@ -4676,33 +4630,8 @@
     srl(tmp, HeapRegion::LogOfHRGrainBytes, tmp);
 #endif
 
-    if (G1PrintCTFilterStats) {
-      guarantee(tmp->is_global(), "Or stats won't work...");
-      // This is a sleazy hack: I'm temporarily hijacking G2, which I
-      // promise to restore.
-      mov(new_val, G2);
-      save_frame(0);
-      mov(tmp, O0);
-      mov(G2, O1);
-      // Save G-regs that target may use.
-      mov(G1, L1);
-      mov(G2, L2);
-      mov(G3, L3);
-      mov(G4, L4);
-      mov(G5, L5);
-      call(CAST_FROM_FN_PTR(address, &count_ct_writes));
-      delayed()->nop();
-      mov(O0, G2);
-      // Restore G-regs that target may have used.
-      mov(L1, G1);
-      mov(L3, G3);
-      mov(L4, G4);
-      mov(L5, G5);
-      restore(G0, G0, G0);
-    }
-    // XXX Should I predict this taken or not?  Does it mattern?
-    br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
-    delayed()->nop();
+    // XXX Should I predict this taken or not?  Does it matter?
+    cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
   }
 
   // If the "store_addr" register is an "in" or "local" register, move it to
@@ -4727,7 +4656,6 @@
   restore();
 
   bind(filtered);
-
 }
 
 #endif  // SERIALGC
--- a/src/cpu/sparc/vm/assembler_sparc.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -1944,12 +1944,6 @@
   void br_null   ( Register s1, bool a, Predict p, Label& L );
   void br_notnull( Register s1, bool a, Predict p, Label& L );
 
-  // These versions will do the most efficient thing on v8 and v9.  Perhaps
-  // this is what the routine above was meant to do, but it didn't (and
-  // didn't cover both target address kinds.)
-  void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none );
-  void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, Label& L);
-
   //
   // Compare registers and branch with nop in delay slot or cbcond without delay slot.
   //
--- a/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -421,8 +421,7 @@
   }
 
   if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      pre_val_reg, _continuation);
+    __ br_null(pre_val_reg, /*annul*/false, Assembler::pt, _continuation);
   } else {
     __ cmp(pre_val_reg, G0);
     __ brx(Assembler::equal, false, Assembler::pn, _continuation);
@@ -458,8 +457,7 @@
     // The original src operand was not a constant.
     // Generate src == null?
     if (__ is_in_wdisp16_range(_continuation)) {
-      __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                        src_reg, _continuation);
+      __ br_null(src_reg, /*annul*/false, Assembler::pt, _continuation);
     } else {
       __ cmp(src_reg, G0);
       __ brx(Assembler::equal, false, Assembler::pt, _continuation);
@@ -476,13 +474,9 @@
   Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
   __ ld(ref_type_adr, tmp_reg);
 
-  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      tmp_reg, _continuation);
-  } else {
-    __ cmp(tmp_reg, G0);
-    __ brx(Assembler::equal, false, Assembler::pt, _continuation);
-  }
+  // _reference_type field is of type ReferenceType (enum)
+  assert(REF_NONE == 0, "check this code");
+  __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
   __ delayed()->nop();
 
   // Is marking active?
@@ -498,13 +492,8 @@
     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
     __ ldsb(in_progress, tmp_reg);
   }
-  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      tmp_reg, _continuation);
-  } else {
-    __ cmp(tmp_reg, G0);
-    __ brx(Assembler::equal, false, Assembler::pt, _continuation);
-  }
+
+  __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
   __ delayed()->nop();
 
   // val == null?
@@ -512,8 +501,7 @@
   Register val_reg = val()->as_register();
 
   if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      val_reg, _continuation);
+    __ br_null(val_reg, /*annul*/false, Assembler::pt, _continuation);
   } else {
     __ cmp(val_reg, G0);
     __ brx(Assembler::equal, false, Assembler::pt, _continuation);
@@ -542,9 +530,9 @@
   assert(new_val()->is_register(), "Precondition.");
   Register addr_reg = addr()->as_pointer_register();
   Register new_val_reg = new_val()->as_register();
+
   if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      new_val_reg, _continuation);
+    __ br_null(new_val_reg, /*annul*/false, Assembler::pt, _continuation);
   } else {
     __ cmp(new_val_reg, G0);
     __ brx(Assembler::equal, false, Assembler::pn, _continuation);
--- a/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -827,14 +827,16 @@
         int satb_q_buf_byte_offset =
           in_bytes(JavaThread::satb_mark_queue_offset() +
                    PtrQueue::byte_offset_of_buf());
+
         __ bind(restart);
+        // Load the index into the SATB buffer. PtrQueue::_index is a
+        // size_t so ld_ptr is appropriate
         __ ld_ptr(G2_thread, satb_q_index_byte_offset, tmp);
 
-        __ br_on_reg_cond(Assembler::rc_z, /*annul*/false,
-                          Assembler::pn, tmp, refill);
+        // index == 0?
+        __ cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pn, refill);
 
-        // If the branch is taken, no harm in executing this in the delay slot.
-        __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
+        __ ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
         __ sub(tmp, oopSize, tmp);
 
         __ st_ptr(pre_val, tmp2, tmp);  // [_buf + index] := <address_of_card>
@@ -894,11 +896,8 @@
         __ set(rs, cardtable);         // cardtable := <card table base>
         __ ldub(addr, cardtable, tmp); // tmp := [addr + cardtable]
 
-        __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
-                          tmp, not_already_dirty);
-        // Get cardtable + tmp into a reg by itself -- useful in the take-the-branch
-        // case, harmless if not.
-        __ delayed()->add(addr, cardtable, tmp2);
+        assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code");
+        __ cmp_and_br_short(tmp, G0, Assembler::notEqual, Assembler::pt, not_already_dirty);
 
         // We didn't take the branch, so we're already dirty: return.
         // Use return-from-leaf
@@ -907,6 +906,10 @@
 
         // Not dirty.
         __ bind(not_already_dirty);
+
+        // Get addr + cardtable into a reg by itself
+        __ add(addr, cardtable, tmp2);
+
         // First, dirty it.
         __ stb(G0, tmp2, 0);  // [cardPtr] := 0  (i.e., dirty).
 
@@ -922,13 +925,17 @@
         int dirty_card_q_buf_byte_offset =
           in_bytes(JavaThread::dirty_card_queue_offset() +
                    PtrQueue::byte_offset_of_buf());
+
         __ bind(restart);
+
+        // Get the index into the update buffer. PtrQueue::_index is
+        // a size_t so ld_ptr is appropriate here.
         __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, tmp3);
 
-        __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
-                          tmp3, refill);
-        // If the branch is taken, no harm in executing this in the delay slot.
-        __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
+        // index == 0?
+        __ cmp_and_brx_short(tmp3, G0, Assembler::equal,  Assembler::pn, refill);
+
+        __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
         __ sub(tmp3, oopSize, tmp3);
 
         __ st_ptr(tmp2, tmp4, tmp3);  // [_buf + index] := <address_of_card>
--- a/src/os/linux/vm/os_linux.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/os/linux/vm/os_linux.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -125,10 +125,6 @@
 # include <inttypes.h>
 # include <sys/ioctl.h>
 
-#ifdef AMD64
-#include <asm/vsyscall.h>
-#endif
-
 #define MAX_PATH    (2 * K)
 
 // for timer info max values which include all bits
@@ -2502,7 +2498,13 @@
   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
   uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
                                    MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
-  return res != (uintptr_t) MAP_FAILED;
+  if (res != (uintptr_t) MAP_FAILED) {
+    if (UseNUMAInterleaving) {
+      numa_make_global(addr, size);
+    }
+    return true;
+  }
+  return false;
 }
 
 // Define MAP_HUGETLB here so we can build HotSpot on old systems.
@@ -2523,7 +2525,13 @@
       (uintptr_t) ::mmap(addr, size, prot,
                          MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
                          -1, 0);
-    return res != (uintptr_t) MAP_FAILED;
+    if (res != (uintptr_t) MAP_FAILED) {
+      if (UseNUMAInterleaving) {
+        numa_make_global(addr, size);
+      }
+      return true;
+    }
+    return false;
   }
 
   return commit_memory(addr, size, exec);
@@ -2588,8 +2596,17 @@
   int retval = -1;
 
 #if defined(IA32)
+# ifndef SYS_getcpu
+# define SYS_getcpu 318
+# endif
   retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
 #elif defined(AMD64)
+// Unfortunately we have to bring all these macros here from vsyscall.h
+// to be able to compile on old linuxes.
+# define __NR_vgetcpu 2
+# define VSYSCALL_START (-10UL << 20)
+# define VSYSCALL_SIZE 1024
+# define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
   typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
   vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
   retval = vgetcpu(&cpu, NULL, NULL);
@@ -3115,6 +3132,10 @@
      return NULL;
   }
 
+  if ((addr != NULL) && UseNUMAInterleaving) {
+    numa_make_global(addr, bytes);
+  }
+
   return addr;
 }
 
--- a/src/os/solaris/vm/os_solaris.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/os/solaris/vm/os_solaris.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -2777,8 +2777,14 @@
 bool os::commit_memory(char* addr, size_t bytes, bool exec) {
   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
   size_t size = bytes;
-  return
-     NULL != Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+  // Map the chunk at addr; on success pass it to numa_make_global if interleaving.
+  if (Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot) == NULL) {
+    return false;
+  }
+  if (UseNUMAInterleaving) {
+    numa_make_global(addr, bytes);
+  }
+  return true;
 }
 
 bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint,
@@ -3389,12 +3395,11 @@
   return true;
 }
 
-char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
+char* os::reserve_memory_special(size_t size, char* addr, bool exec) {
   // "exec" is passed in but not used.  Creating the shared image for
   // the code cache doesn't have an SHM_X executable permission to check.
   assert(UseLargePages && UseISM, "only for ISM large pages");
 
-  size_t size = bytes;
   char* retAddr = NULL;
   int shmid;
   key_t ismKey;
@@ -3436,7 +3441,9 @@
     }
     return NULL;
   }
-
+  if ((retAddr != NULL) && UseNUMAInterleaving) {
+    numa_make_global(retAddr, size);
+  }
   return retAddr;
 }
 
--- a/src/os/windows/vm/os_windows.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/os/windows/vm/os_windows.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -2614,6 +2614,57 @@
 static HANDLE    _hProcess;
 static HANDLE    _hToken;
 
+// Holds the NUMA nodes whose processors intersect this process's affinity mask.
+class NUMANodeListHolder {
+private:
+  int *_numa_used_node_list;  // C-heap array filled in by build(); NULL until then
+  int _numa_used_node_count;  // number of valid entries in _numa_used_node_list
+
+  // Frees the list and resets to the empty state so build() can be re-run safely.
+  void free_node_list() {
+    if (_numa_used_node_list != NULL) {
+      FREE_C_HEAP_ARRAY(int, _numa_used_node_list);
+    }
+    _numa_used_node_list = NULL;
+    _numa_used_node_count = 0;
+  }
+
+public:
+  // do rest of initialization in build routine (after function pointers are set up)
+  NUMANodeListHolder() : _numa_used_node_list(NULL), _numa_used_node_count(0) {}
+  ~NUMANodeListHolder() { free_node_list(); }
+
+  // Rebuilds the list from the process affinity mask and the per-node processor
+  // masks. Returns true only if the process spans more than one NUMA node.
+  bool build() {
+    DWORD_PTR proc_aff_mask;
+    DWORD_PTR sys_aff_mask;
+    if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false;
+    ULONG highest_node_number;
+    if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false;
+    free_node_list();
+    // Nodes 0..highest_node_number inclusive are visited below: need highest + 1 slots.
+    _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number + 1);
+    for (unsigned int i = 0; i <= highest_node_number; i++) {
+      ULONGLONG proc_mask_numa_node;
+      if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false;
+      if ((proc_aff_mask & proc_mask_numa_node)!=0) {
+        _numa_used_node_list[_numa_used_node_count++] = i;
+      }
+    }
+    return (_numa_used_node_count > 1);
+  }
+
+  int get_count() {return _numa_used_node_count;}
+
+  // Returns the n-th used node number, or -1 for indexes out of range.
+  int get_node_list_entry(int n) {
+    return (n >= 0 && n < _numa_used_node_count) ? _numa_used_node_list[n] : -1;
+  }
+} numa_node_list_holder;
+
+
+
 static size_t _large_page_size = 0;
 
 static bool resolve_functions_for_large_page_init() {
@@ -2653,6 +2704,154 @@
   _hToken = NULL;
 }
 
+static bool numa_interleaving_init() {
+  const bool numa_flag_given = !FLAG_IS_DEFAULT(UseNUMA);
+  const bool interleave_flag_given = !FLAG_IS_DEFAULT(UseNUMAInterleaving);
+
+  // Only complain about failures when UseNUMA or UseNUMAInterleaving was
+  // given explicitly on the command line.
+  const bool warn_on_failure = numa_flag_given || interleave_flag_given;
+# define WARN(msg) if (warn_on_failure) { warning(msg); }
+
+  // Align NUMAInterleaveGranularity up to the smallest usable unit:
+  // _large_page_size with large pages, otherwise vm_allocation_granularity.
+  const size_t granule = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, granule);
+
+  bool ok = false;
+  if (!os::Kernel32Dll::NumaCallsAvailable()) {
+    WARN("NUMA Interleaving is not supported by the operating system.");
+  } else if (!numa_node_list_holder.build()) {
+    WARN("Process does not cover multiple NUMA nodes.");
+  } else {
+    ok = true;
+    if (PrintMiscellaneous && Verbose) {
+      tty->print("NUMA UsedNodeCount=%d, namely ", os::numa_get_groups_num());
+      for (int idx = 0; idx < numa_node_list_holder.get_count(); idx++) {
+        tty->print("%d ", numa_node_list_holder.get_node_list_entry(idx));
+      }
+      tty->print("\n");
+    }
+  }
+  if (!ok) {
+    if (numa_flag_given) WARN("...Ignoring UseNUMA flag.");
+    if (interleave_flag_given) WARN("...Ignoring UseNUMAInterleaving flag.");
+  }
+  return ok;
+#undef WARN
+}
+
+// Allocates (reserves and/or commits, per 'flags') a contiguous VA range of 'bytes'
+// bytes using a separate VirtualAlloc / VirtualAllocExNuma call for each piece.
+// Returns the start of the range, or NULL on failure. Reasons for doing this:
+//  * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise)
+//  * UseNUMAInterleaving requires a separate node for each piece
+static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot,
+                                         bool should_inject_error=false) {  // only consulted in ASSERT builds
+  char * p_buf;
+  // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size
+  size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size;
+
+  // First reserve the whole range (plus one spare chunk) as a single ordinary
+  // reservation: this finds a legal contiguous address range even though WS2003
+  // does not allow reserving large page space. The reservation is released
+  // again right below and the same addresses are then re-allocated chunk by
+  // chunk with the caller's flags. NOTE(review): nothing visible here stops
+  // another allocation from taking the range between release and re-allocate.
+  const size_t size_of_reserve = bytes + chunk_size;
+  if (bytes > size_of_reserve) {
+    // Overflowed.
+    return NULL;
+  }
+  p_buf = (char *) VirtualAlloc(addr,
+                                size_of_reserve,  // size of Reserve
+                                MEM_RESERVE,
+                                PAGE_READWRITE);
+  // If reservation failed, return NULL
+  if (p_buf == NULL) return NULL;
+
+  os::release_memory(p_buf, bytes + chunk_size);
+
+  // we still need to round up to a page boundary (in case we are using large pages)
+  // but not to a chunk boundary (in case NUMAInterleaveGranularity doesn't align with page size)
+  // instead we handle this in the bytes_to_rq computation below
+  p_buf = (char *) align_size_up((size_t)p_buf, page_size);
+
+  // now go through and allocate one chunk at a time until all bytes are
+  // allocated
+  size_t  bytes_remaining = bytes;
+  // Start at the page-aligned base; the spare chunk_size in size_of_reserve
+  // (checked for overflow above) is meant to leave room for that round-up.
+  char * next_alloc_addr = p_buf;
+  HANDLE hProc = GetCurrentProcess();
+
+#ifdef ASSERT
+  // Variables for the failure injection: fail once bytes_remaining <= fail_after
+  long ran_num = os::random();
+  size_t fail_after = ran_num % bytes;
+#endif
+
+  int count=0;  // chunks allocated so far; also selects the NUMA node below
+  while (bytes_remaining) {
+    // select bytes_to_rq to get to the next chunk_size boundary
+    // (or just the remaining bytes, whichever is smaller)
+    size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size));
+    // Note: whether this reserves and/or commits is decided by the caller's 'flags'
+    char * p_new;
+
+#ifdef ASSERT
+    bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after);
+#else
+    const bool inject_error_now = false;
+#endif
+
+    if (inject_error_now) {
+      p_new = NULL;
+    } else {
+      if (!UseNUMAInterleaving) {
+        p_new = (char *) VirtualAlloc(next_alloc_addr,
+                                      bytes_to_rq,
+                                      flags,
+                                      prot);
+      } else {
+        // cycle through the used node list, one node per chunk
+        DWORD node = numa_node_list_holder.get_node_list_entry(count % os::numa_get_groups_num());
+        p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc,
+                                                            next_alloc_addr,
+                                                            bytes_to_rq,
+                                                            flags,
+                                                            prot,
+                                                            node);
+      }
+    }
+
+    if (p_new == NULL) {
+      // Undo the chunks allocated so far
+      if (next_alloc_addr > p_buf) {
+        // NOTE(review): one release_memory call over separately allocated chunks - confirm it frees them all
+        size_t bytes_to_release = bytes - bytes_remaining;
+        os::release_memory(p_buf, bytes_to_release);
+      }
+#ifdef ASSERT
+      if (should_inject_error) {
+        if (TracePageSizes && Verbose) {
+          tty->print_cr("Reserving pages individually failed.");
+        }
+      }
+#endif
+      return NULL;
+    }
+    bytes_remaining -= bytes_to_rq;
+    next_alloc_addr += bytes_to_rq;
+    count++;
+  }
+  // made it this far, success
+  return p_buf;
+}
+
+
+
 void os::large_page_init() {
   if (!UseLargePages) return;
 
@@ -2722,9 +2921,30 @@
   assert((size_t)addr % os::vm_allocation_granularity() == 0,
          "reserve alignment");
   assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size");
-  char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+  char* res;
+  // note that if UseLargePages is on, all the areas that require interleaving
+  // will go thru reserve_memory_special rather than thru here.
+  bool use_individual = (UseNUMAInterleaving && !UseLargePages);
+  if (!use_individual) {
+    res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+  } else {
+    elapsedTimer reserveTimer;
+    if( Verbose && PrintMiscellaneous ) reserveTimer.start();
+    // in numa interleaving, we have to allocate pages individually
+    // (well really chunks of NUMAInterleaveGranularity size)
+    res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE);
+    if (res == NULL) {
+      warning("NUMA page allocation failed");
+    }
+    if( Verbose && PrintMiscellaneous ) {
+      reserveTimer.stop();
+      tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes,
+                    reserveTimer.milliseconds(), reserveTimer.ticks());
+    }
+  }
   assert(res == NULL || addr == NULL || addr == res,
          "Unexpected address from reserve.");
+
   return res;
 }
 
@@ -2754,92 +2974,27 @@
 char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
 
   const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-
-  if (UseLargePagesIndividualAllocation) {
+  const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+
+  // with large pages, there are two cases where we need to use Individual Allocation
+  // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003)
+  // 2) NUMA Interleaving is enabled, in which case we use a different node for each page
+  if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) {
     if (TracePageSizes && Verbose) {
        tty->print_cr("Reserving large pages individually.");
     }
-    char * p_buf;
-    // first reserve enough address space in advance since we want to be
-    // able to break a single contiguous virtual address range into multiple
-    // large page commits but WS2003 does not allow reserving large page space
-    // so we just use 4K pages for reserve, this gives us a legal contiguous
-    // address space. then we will deallocate that reservation, and re alloc
-    // using large pages
-    const size_t size_of_reserve = bytes + _large_page_size;
-    if (bytes > size_of_reserve) {
-      // Overflowed.
-      warning("Individually allocated large pages failed, "
-        "use -XX:-UseLargePagesIndividualAllocation to turn off");
+    char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError);
+    if (p_buf == NULL) {
+      // give an appropriate warning message
+      if (UseNUMAInterleaving) {
+        warning("NUMA large page allocation failed, UseLargePages flag ignored");
+      }
+      if (UseLargePagesIndividualAllocation) {
+        warning("Individually allocated large pages failed, "
+                "use -XX:-UseLargePagesIndividualAllocation to turn off");
+      }
       return NULL;
     }
-    p_buf = (char *) VirtualAlloc(addr,
-                                 size_of_reserve,  // size of Reserve
-                                 MEM_RESERVE,
-                                 PAGE_READWRITE);
-    // If reservation failed, return NULL
-    if (p_buf == NULL) return NULL;
-
-    release_memory(p_buf, bytes + _large_page_size);
-    // round up to page boundary.  If the size_of_reserve did not
-    // overflow and the reservation did not fail, this align up
-    // should not overflow.
-    p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size);
-
-    // now go through and allocate one page at a time until all bytes are
-    // allocated
-    size_t  bytes_remaining = align_size_up(bytes, _large_page_size);
-    // An overflow of align_size_up() would have been caught above
-    // in the calculation of size_of_reserve.
-    char * next_alloc_addr = p_buf;
-
-#ifdef ASSERT
-    // Variable for the failure injection
-    long ran_num = os::random();
-    size_t fail_after = ran_num % bytes;
-#endif
-
-    while (bytes_remaining) {
-      size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size);
-      // Note allocate and commit
-      char * p_new;
-
-#ifdef ASSERT
-      bool inject_error = LargePagesIndividualAllocationInjectError &&
-          (bytes_remaining <= fail_after);
-#else
-      const bool inject_error = false;
-#endif
-
-      if (inject_error) {
-        p_new = NULL;
-      } else {
-        p_new = (char *) VirtualAlloc(next_alloc_addr,
-                                    bytes_to_rq,
-                                    MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
-                                    prot);
-      }
-
-      if (p_new == NULL) {
-        // Free any allocated pages
-        if (next_alloc_addr > p_buf) {
-          // Some memory was committed so release it.
-          size_t bytes_to_release = bytes - bytes_remaining;
-          release_memory(p_buf, bytes_to_release);
-        }
-#ifdef ASSERT
-        if (UseLargePagesIndividualAllocation &&
-            LargePagesIndividualAllocationInjectError) {
-          if (TracePageSizes && Verbose) {
-             tty->print_cr("Reserving large pages individually failed.");
-          }
-        }
-#endif
-        return NULL;
-      }
-      bytes_remaining -= bytes_to_rq;
-      next_alloc_addr += bytes_to_rq;
-    }
 
     return p_buf;
 
@@ -2867,14 +3022,43 @@
   assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks");
   // Don't attempt to print anything if the OS call fails. We're
   // probably low on resources, so the print itself may cause crashes.
-  bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0;
-  if (result != NULL && exec) {
-    DWORD oldprot;
-    // Windows doc says to use VirtualProtect to get execute permissions
-    return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0;
+
+  // Unless NUMAInterleaving is enabled, the range of a commit always lies
+  // within a reserve covered by a single VirtualAlloc, so in that case we
+  // can just do a single commit for the requested size.
+  if (!UseNUMAInterleaving) {
+    if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false;
+    if (exec) {
+      DWORD oldprot;
+      // Windows doc says to use VirtualProtect to get execute permissions
+      if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false;
+    }
+    return true;
   } else {
-    return result;
-  }
+
+    // When NUMAInterleaving is enabled, the commit might cover a range that
+    // came from multiple VirtualAlloc reserves (using allocate_pages_individually).
+    // VirtualQuery tells us, via RegionSize, how many bytes can be committed in
+    // one step. A failed query leaves alloc_info unset, so it fails the commit.
+    size_t bytes_remaining = bytes;
+    char * next_alloc_addr = addr;
+    while (bytes_remaining > 0) {
+      MEMORY_BASIC_INFORMATION alloc_info;
+      if (VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info)) == 0) return false;
+      size_t bytes_to_rq = MIN2(bytes_remaining, (size_t)alloc_info.RegionSize);
+      if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL)
+        return false;
+      if (exec) {
+        DWORD oldprot;
+        if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot))
+          return false;
+      }
+      bytes_remaining -= bytes_to_rq;
+      next_alloc_addr += bytes_to_rq;
+    }
+  }
+  // if we made it this far, return true
+  return true;
 }
 
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
@@ -2948,14 +3132,15 @@
 void os::numa_make_global(char *addr, size_t bytes)    { }
 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint)    { }
 bool os::numa_topology_changed()                       { return false; }
-size_t os::numa_get_groups_num()                       { return 1; }
+size_t os::numa_get_groups_num()                       { return numa_node_list_holder.get_count(); }
 int os::numa_get_group_id()                            { return 0; }
 size_t os::numa_get_leaf_groups(int *ids, size_t size) {
-  if (size > 0) {
-    ids[0] = 0;
-    return 1;
-  }
-  return 0;
+  // Never report more ids than there are NUMA groups.
+  const size_t count = MIN2(size, numa_get_groups_num());
+  for (int idx = 0; idx < (int)count; idx++) {
+    ids[idx] = numa_node_list_holder.get_node_list_entry(idx);
+  }
+  return count;
 }
 
 bool os::get_page_info(char *start, page_info* info) {
@@ -3480,7 +3665,7 @@
     if(Verbose && PrintMiscellaneous)
       tty->print("[Memory Serialize  Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page);
 #endif
-}
+  }
 
   os::large_page_init();
 
@@ -3583,8 +3768,10 @@
   // initialize thread priority policy
   prio_init();
 
-  if (UseNUMA && !ForceNUMA) {
-    UseNUMA = false; // Currently unsupported.
+  if (UseNUMAInterleaving) {
+    // first check whether this Windows OS supports VirtualAllocExNuma, if not ignore this flag
+    bool success = numa_interleaving_init();
+    if (!success) UseNUMAInterleaving = false;
   }
 
   return JNI_OK;
@@ -4758,7 +4945,14 @@
 
 // Kernel32 API
 typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void);
+typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG);
+typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG);
+
 GetLargePageMinimum_Fn      os::Kernel32Dll::_GetLargePageMinimum = NULL;
+VirtualAllocExNuma_Fn       os::Kernel32Dll::_VirtualAllocExNuma = NULL;
+GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL;
+GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL;
 BOOL                        os::Kernel32Dll::initialized = FALSE;
 SIZE_T os::Kernel32Dll::GetLargePageMinimum() {
   assert(initialized && _GetLargePageMinimum != NULL,
@@ -4773,16 +4967,53 @@
   return _GetLargePageMinimum != NULL;
 }
 
+BOOL os::Kernel32Dll::NumaCallsAvailable() {
+  // Resolve lazily, then require every NUMA entry point used by the wrappers,
+  // since the wrappers themselves only guard their pointers with asserts.
+  if (!initialized) initialize();
+  return _VirtualAllocExNuma != NULL && _GetNumaHighestNodeNumber != NULL && _GetNumaNodeProcessorMask != NULL;
+}
+
+LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) {
+  // Forwards to the Kernel32 entry point resolved via GetProcAddress.
+  assert(initialized && _VirtualAllocExNuma != NULL,
+    "NumaCallsAvailable() not yet called");
+  return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node);
+}
+
+BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) {
+  // Forwards to the Kernel32 entry point resolved via GetProcAddress.
+  assert(initialized && _GetNumaHighestNodeNumber != NULL,
+    "NumaCallsAvailable() not yet called");
+  return _GetNumaHighestNodeNumber(ptr_highest_node_number);
+}
+
+BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) {
+  // Forwards to the Kernel32 entry point resolved via GetProcAddress.
+  assert(initialized && _GetNumaNodeProcessorMask != NULL,
+    "NumaCallsAvailable() not yet called");
+  return _GetNumaNodeProcessorMask(node, proc_mask);
+}
+
+
+void os::Kernel32Dll::initializeCommon() {
+  // Look up the Kernel32 entry points shared by all JDK variants, once only.
+  if (initialized) return;
+  HMODULE kernel32 = ::GetModuleHandle("Kernel32.dll");
+  assert(kernel32 != NULL, "Just check");
+  _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(kernel32, "GetLargePageMinimum");
+  _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(kernel32, "VirtualAllocExNuma");
+  _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(kernel32, "GetNumaHighestNodeNumber");
+  _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(kernel32, "GetNumaNodeProcessorMask");
+  initialized = TRUE;
+}
+
+
 
 #ifndef JDK6_OR_EARLIER
 
 void os::Kernel32Dll::initialize() {
-  if (!initialized) {
-    HMODULE handle = ::GetModuleHandle("Kernel32.dll");
-    assert(handle != NULL, "Just check");
-    _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
-    initialized = TRUE;
-  }
+  initializeCommon();
 }
 
 
@@ -4887,18 +5118,19 @@
 Module32Next_Fn             os::Kernel32Dll::_Module32Next = NULL;
 GetNativeSystemInfo_Fn      os::Kernel32Dll::_GetNativeSystemInfo = NULL;
 
+
 void os::Kernel32Dll::initialize() {
   if (!initialized) {
     HMODULE handle = ::GetModuleHandle("Kernel32.dll");
     assert(handle != NULL, "Just check");
 
     _SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread");
-    _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
     _CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn)
       ::GetProcAddress(handle, "CreateToolhelp32Snapshot");
     _Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First");
     _Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next");
     _GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo");
+    initializeCommon();  // resolve the functions that always need resolving
 
     initialized = TRUE;
   }
@@ -4964,6 +5196,8 @@
   _GetNativeSystemInfo(lpSystemInfo);
 }
 
+
+
 // PSAPI API
 
 
--- a/src/os/windows/vm/os_windows.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/os/windows/vm/os_windows.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -173,13 +173,25 @@
   static BOOL GetNativeSystemInfoAvailable();
   static void GetNativeSystemInfo(LPSYSTEM_INFO);
 
+  // NUMA calls
+  static BOOL NumaCallsAvailable();
+  static LPVOID VirtualAllocExNuma(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+  static BOOL GetNumaHighestNodeNumber(PULONG);
+  static BOOL GetNumaNodeProcessorMask(UCHAR, PULONGLONG);
+
 private:
   // GetLargePageMinimum available on Windows Vista/Windows Server 2003
   // and later
+  // NUMA calls available on Windows Vista/WS2008 and later
+
   static SIZE_T (WINAPI *_GetLargePageMinimum)(void);
+  static LPVOID (WINAPI *_VirtualAllocExNuma) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+  static BOOL (WINAPI *_GetNumaHighestNodeNumber) (PULONG);
+  static BOOL (WINAPI *_GetNumaNodeProcessorMask) (UCHAR, PULONGLONG);
   static BOOL initialized;
 
   static void initialize();
+  static void initializeCommon();
 
 #ifdef JDK6_OR_EARLIER
 private:
--- a/src/share/tools/ProjectCreator/WinGammaPlatformVC10.java	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/tools/ProjectCreator/WinGammaPlatformVC10.java	Wed Sep 07 11:52:00 2011 -0700
@@ -482,7 +482,7 @@
                 "/export:JVM_GetThreadStateNames "+
                 "/export:JVM_GetThreadStateValues "+
                 "/export:JVM_InitAgentProperties");
-        addAttr(rv, "AdditionalDependencies", "kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;Wsock32.lib;winmm.lib");
+        addAttr(rv, "AdditionalDependencies", "kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;Wsock32.lib;winmm.lib;psapi.lib");
         addAttr(rv, "OutputFile", outDll);
         addAttr(rv, "SuppressStartupBanner", "true");
         addAttr(rv, "ModuleDefinitionFile", outDir+Util.sep+"vm.def");
--- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -801,39 +801,6 @@
   reset();
 }
 
-class CMMarkRootsClosure: public OopsInGenClosure {
-private:
-  ConcurrentMark*  _cm;
-  G1CollectedHeap* _g1h;
-  bool             _do_barrier;
-
-public:
-  CMMarkRootsClosure(ConcurrentMark* cm,
-                     G1CollectedHeap* g1h,
-                     bool do_barrier) : _cm(cm), _g1h(g1h),
-                                        _do_barrier(do_barrier) { }
-
-  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
-  virtual void do_oop(      oop* p) { do_oop_work(p); }
-
-  template <class T> void do_oop_work(T* p) {
-    T heap_oop = oopDesc::load_heap_oop(p);
-    if (!oopDesc::is_null(heap_oop)) {
-      oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
-      assert(obj->is_oop() || obj->mark() == NULL,
-             "expected an oop, possibly with mark word displaced");
-      HeapWord* addr = (HeapWord*)obj;
-      if (_g1h->is_in_g1_reserved(addr)) {
-        _cm->grayRoot(obj);
-      }
-    }
-    if (_do_barrier) {
-      assert(!_g1h->is_in_g1_reserved(p),
-             "Should be called on external roots");
-      do_barrier(p);
-    }
-  }
-};
 
 void ConcurrentMark::checkpointRootsInitialPost() {
   G1CollectedHeap*   g1h = G1CollectedHeap::heap();
@@ -868,50 +835,6 @@
   // during it. No need to call it here.
 }
 
-// Checkpoint the roots into this generation from outside
-// this generation. [Note this initial checkpoint need only
-// be approximate -- we'll do a catch up phase subsequently.]
-void ConcurrentMark::checkpointRootsInitial() {
-  assert(SafepointSynchronize::is_at_safepoint(), "world should be stopped");
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-
-  double start = os::elapsedTime();
-
-  G1CollectorPolicy* g1p = G1CollectedHeap::heap()->g1_policy();
-  g1p->record_concurrent_mark_init_start();
-  checkpointRootsInitialPre();
-
-  // YSR: when concurrent precleaning is in place, we'll
-  // need to clear the cached card table here
-
-  ResourceMark rm;
-  HandleMark  hm;
-
-  g1h->ensure_parsability(false);
-  g1h->perm_gen()->save_marks();
-
-  CMMarkRootsClosure notOlder(this, g1h, false);
-  CMMarkRootsClosure older(this, g1h, true);
-
-  g1h->set_marking_started();
-  g1h->rem_set()->prepare_for_younger_refs_iterate(false);
-
-  g1h->process_strong_roots(true,    // activate StrongRootsScope
-                            false,   // fake perm gen collection
-                            SharedHeap::SO_AllClasses,
-                            &notOlder, // Regular roots
-                            NULL,     // do not visit active blobs
-                            &older    // Perm Gen Roots
-                            );
-  checkpointRootsInitialPost();
-
-  // Statistics.
-  double end = os::elapsedTime();
-  _init_times.add((end - start) * 1000.0);
-
-  g1p->record_concurrent_mark_init_end();
-}
-
 /*
  * Notice that in the next two methods, we actually leave the STS
  * during the barrier sync and join it immediately afterwards. If we
--- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -756,9 +756,6 @@
   // Clear the next marking bitmap (will be called concurrently).
   void clearNextBitmap();
 
-  // main CMS steps and related support
-  void checkpointRootsInitial();
-
   // These two do the work that needs to be done before and after the
   // initial root checkpoint. Since this checkpoint can be done at two
   // different points (i.e. an explicit pause or piggy-backed on a
--- a/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -50,19 +50,6 @@
   create_and_start();
 }
 
-class CMCheckpointRootsInitialClosure: public VoidClosure {
-
-  ConcurrentMark* _cm;
-public:
-
-  CMCheckpointRootsInitialClosure(ConcurrentMark* cm) :
-    _cm(cm) {}
-
-  void do_void(){
-    _cm->checkpointRootsInitial();
-  }
-};
-
 class CMCheckpointRootsFinalClosure: public VoidClosure {
 
   ConcurrentMark* _cm;
@@ -116,27 +103,6 @@
         gclog_or_tty->print_cr("[GC concurrent-mark-start]");
       }
 
-      if (!g1_policy->in_young_gc_mode()) {
-        // this ensures the flag is not set if we bail out of the marking
-        // cycle; normally the flag is cleared immediately after cleanup
-        g1h->set_marking_complete();
-
-        if (g1_policy->adaptive_young_list_length()) {
-          double now = os::elapsedTime();
-          double init_prediction_ms = g1_policy->predict_init_time_ms();
-          jlong sleep_time_ms = mmu_tracker->when_ms(now, init_prediction_ms);
-          os::sleep(current_thread, sleep_time_ms, false);
-        }
-
-        // We don't have to skip here if we've been asked to restart, because
-        // in the worst case we just enqueue a new VM operation to start a
-        // marking.  Note that the init operation resets has_aborted()
-        CMCheckpointRootsInitialClosure init_cl(_cm);
-        strcpy(verbose_str, "GC initial-mark");
-        VM_CGC_Operation op(&init_cl, verbose_str);
-        VMThread::execute(&op);
-      }
-
       int iter = 0;
       do {
         iter++;
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -1227,6 +1227,7 @@
                        /* option      */ VerifyOption_G1UsePrevMarking);
 
     }
+    pre_full_gc_dump();
 
     COMPILER2_PRESENT(DerivedPointerTable::clear());
 
@@ -1263,10 +1264,8 @@
     g1_policy()->clear_incremental_cset();
     g1_policy()->stop_incremental_cset_building();
 
-    if (g1_policy()->in_young_gc_mode()) {
-      empty_young_list();
-      g1_policy()->set_full_young_gcs(true);
-    }
+    empty_young_list();
+    g1_policy()->set_full_young_gcs(true);
 
     // See the comment in G1CollectedHeap::ref_processing_init() about
     // how reference processing currently works in G1.
@@ -1387,13 +1386,11 @@
            || (G1DeferredRSUpdate && (dirty_card_queue_set().completed_buffers_num() == 0)), "Should not be any");
   }
 
-  if (g1_policy()->in_young_gc_mode()) {
-    _young_list->reset_sampled_info();
-    // At this point there should be no regions in the
-    // entire heap tagged as young.
-    assert( check_young_list_empty(true /* check_heap */),
-            "young list should be empty at this point");
-  }
+  _young_list->reset_sampled_info();
+  // At this point there should be no regions in the
+  // entire heap tagged as young.
+  assert( check_young_list_empty(true /* check_heap */),
+    "young list should be empty at this point");
 
   // Update the number of full collections that have been completed.
   increment_full_collections_completed(false /* concurrent */);
@@ -1405,6 +1402,7 @@
     Universe::print_heap_after_gc();
   }
   g1mm()->update_counters();
+  post_full_gc_dump();
 
   return true;
 }
@@ -3161,12 +3159,6 @@
   }
 }
 
-void G1CollectedHeap::do_sync_mark() {
-  _cm->checkpointRootsInitial();
-  _cm->markFromRoots();
-  _cm->checkpointRootsFinal(false);
-}
-
 // <NEW PREDICTION>
 
 double G1CollectedHeap::predict_region_elapsed_time_ms(HeapRegion *hr,
@@ -3317,11 +3309,10 @@
 
     char verbose_str[128];
     sprintf(verbose_str, "GC pause ");
-    if (g1_policy()->in_young_gc_mode()) {
-      if (g1_policy()->full_young_gcs())
-        strcat(verbose_str, "(young)");
-      else
-        strcat(verbose_str, "(partial)");
+    if (g1_policy()->full_young_gcs()) {
+      strcat(verbose_str, "(young)");
+    } else {
+      strcat(verbose_str, "(partial)");
     }
     if (g1_policy()->during_initial_mark_pause()) {
       strcat(verbose_str, " (initial-mark)");
@@ -3350,10 +3341,8 @@
       append_secondary_free_list_if_not_empty_with_lock();
     }
 
-    if (g1_policy()->in_young_gc_mode()) {
-      assert(check_young_list_well_formed(),
-             "young list should be well formed");
-    }
+    assert(check_young_list_well_formed(),
+      "young list should be well formed");
 
     { // Call to jvmpi::post_class_unload_events must occur outside of active GC
       IsGCActiveMark x;
@@ -3494,27 +3483,25 @@
       // evacuation pause.
       clear_cset_fast_test();
 
-      if (g1_policy()->in_young_gc_mode()) {
-        _young_list->reset_sampled_info();
-
-        // Don't check the whole heap at this point as the
-        // GC alloc regions from this pause have been tagged
-        // as survivors and moved on to the survivor list.
-        // Survivor regions will fail the !is_young() check.
-        assert(check_young_list_empty(false /* check_heap */),
-               "young list should be empty");
+      _young_list->reset_sampled_info();
+
+      // Don't check the whole heap at this point as the
+      // GC alloc regions from this pause have been tagged
+      // as survivors and moved on to the survivor list.
+      // Survivor regions will fail the !is_young() check.
+      assert(check_young_list_empty(false /* check_heap */),
+        "young list should be empty");
 
 #if YOUNG_LIST_VERBOSE
-        gclog_or_tty->print_cr("Before recording survivors.\nYoung List:");
-        _young_list->print();
+      gclog_or_tty->print_cr("Before recording survivors.\nYoung List:");
+      _young_list->print();
 #endif // YOUNG_LIST_VERBOSE
 
-        g1_policy()->record_survivor_regions(_young_list->survivor_length(),
-                                          _young_list->first_survivor_region(),
-                                          _young_list->last_survivor_region());
-
-        _young_list->reset_auxilary_lists();
-      }
+      g1_policy()->record_survivor_regions(_young_list->survivor_length(),
+        _young_list->first_survivor_region(),
+        _young_list->last_survivor_region());
+
+      _young_list->reset_auxilary_lists();
 
       if (evacuation_failed()) {
         _summary_bytes_used = recalculate_used();
@@ -3524,8 +3511,7 @@
         _summary_bytes_used += g1_policy()->bytes_copied_during_gc();
       }
 
-      if (g1_policy()->in_young_gc_mode() &&
-          g1_policy()->during_initial_mark_pause()) {
+      if (g1_policy()->during_initial_mark_pause()) {
         concurrent_mark()->checkpointRootsInitialPost();
         set_marking_started();
         // CAUTION: after the doConcurrentMark() call below,
@@ -4083,6 +4069,23 @@
 }
 #endif // PRODUCT
 
+G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) :
+  ParGCAllocBuffer(gclab_word_size),
+  _should_mark_objects(false),
+  _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size),
+  _retired(false)
+{
+  // _should_mark_objects is true when G1ParCopyHelper needs to mark the
+  // forwarded location of an evacuated object. That is the case in two
+  // situations: while marking is active, i.e. when a mark has to be
+  // propagated to the copy, and during an initial mark pause, i.e. when
+  // objects immediately reachable by the roots have to be marked.
+  // Otherwise the flag keeps the false it was initialized with above.
+  G1CollectedHeap* const g1h = G1CollectedHeap::heap();
+  _should_mark_objects = g1h->mark_in_progress() ||
+                         g1h->g1_policy()->during_initial_mark_pause();
+}
+
 G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num)
   : _g1h(g1h),
     _refs(g1h->task_queue(queue_num)),
@@ -4198,12 +4201,14 @@
 
 G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
   _g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()),
-  _par_scan_state(par_scan_state) { }
-
-template <class T> void G1ParCopyHelper::mark_forwardee(T* p) {
-  // This is called _after_ do_oop_work has been called, hence after
-  // the object has been relocated to its new location and *p points
-  // to its new location.
+  _par_scan_state(par_scan_state),
+  _during_initial_mark(_g1->g1_policy()->during_initial_mark_pause()),
+  _mark_in_progress(_g1->mark_in_progress()) { }
+
+template <class T> void G1ParCopyHelper::mark_object(T* p) {
+  // This is called from do_oop_work for objects that are not
+  // in the collection set. Objects in the collection set
+  // are marked after they have been evacuated.
 
   T heap_oop = oopDesc::load_heap_oop(p);
   if (!oopDesc::is_null(heap_oop)) {
@@ -4215,7 +4220,7 @@
   }
 }
 
-oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
+oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_copy) {
   size_t    word_sz = old->size();
   HeapRegion* from_region = _g1->heap_region_containing_raw(old);
   // +1 to make the -1 indexes valid...
@@ -4271,8 +4276,8 @@
       obj->set_mark(m);
     }
 
-    // preserve "next" mark bit
-    if (_g1->mark_in_progress() && !_g1->is_obj_ill(old)) {
+    // Mark the evacuated object or propagate "next" mark bit
+    if (should_mark_copy) {
       if (!use_local_bitmaps ||
           !_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr)) {
         // if we couldn't mark it on the local bitmap (this happens when
@@ -4280,11 +4285,12 @@
         // the bullet and do the standard parallel mark
         _cm->markAndGrayObjectIfNecessary(obj);
       }
-#if 1
+
       if (_g1->isMarkedNext(old)) {
+        // Unmark the object's old location so that marking
+        // doesn't think the old object is alive.
         _cm->nextMarkBitMap()->parClear((HeapWord*)old);
       }
-#endif
     }
 
     size_t* surv_young_words = _par_scan_state->surviving_young_words();
@@ -4307,26 +4313,62 @@
   return obj;
 }
 
-template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_forwardee>
+template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_object>
 template <class T>
-void G1ParCopyClosure <do_gen_barrier, barrier, do_mark_forwardee>
+void G1ParCopyClosure<do_gen_barrier, barrier, do_mark_object>
 ::do_oop_work(T* p) {
   oop obj = oopDesc::load_decode_heap_oop(p);
   assert(barrier != G1BarrierRS || obj != NULL,
          "Precondition: G1BarrierRS implies obj is nonNull");
 
+  // Marking:
+  // If the object is in the collection set, then the thread
+  // that copies the object should mark, or propagate the
+  // mark to, the evacuated object.
+  // If the object is not in the collection set then we
+  // should call the mark_object() method depending on the
+  // value of the template parameter do_mark_object (which will
+  // be true for root scanning closures during an initial mark
+  // pause).
+  // The mark_object() method first checks whether the object
+  // is marked and, if not, attempts to mark the object.
+
   // here the null check is implicit in the cset_fast_test() test
   if (_g1->in_cset_fast_test(obj)) {
     if (obj->is_forwarded()) {
       oopDesc::encode_store_heap_oop(p, obj->forwardee());
+      // If we are a root scanning closure during an initial
+      // mark pause (i.e. do_mark_object will be true) then
+      // we also need to handle marking of roots in the
+      // event of an evacuation failure. In the event of an
+      // evacuation failure, the object is forwarded to itself
+      // and not copied so let's mark it here.
+      if (do_mark_object && obj->forwardee() == obj) {
+        mark_object(p);
+      }
     } else {
-      oop copy_oop = copy_to_survivor_space(obj);
+      // We need to mark the copied object if we're a root scanning
+      // closure during an initial mark pause (i.e. do_mark_object
+      // will be true), or the object is already marked and we need
+      // to propagate the mark to the evacuated copy.
+      bool should_mark_copy = do_mark_object ||
+                              _during_initial_mark ||
+                              (_mark_in_progress && !_g1->is_obj_ill(obj));
+
+      oop copy_oop = copy_to_survivor_space(obj, should_mark_copy);
       oopDesc::encode_store_heap_oop(p, copy_oop);
     }
     // When scanning the RS, we only care about objs in CS.
     if (barrier == G1BarrierRS) {
       _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
     }
+  } else {
+    // The object is not in collection set. If we're a root scanning
+    // closure during an initial mark pause (i.e. do_mark_object will
+    // be true) then attempt to mark the object.
+    if (do_mark_object) {
+      mark_object(p);
+    }
   }
 
   if (barrier == G1BarrierEvac && obj != NULL) {
@@ -5091,7 +5133,6 @@
 void G1CollectedHeap::empty_young_list() {
   assert(heap_lock_held_for_gc(),
               "the heap lock should already be held by or for this thread");
-  assert(g1_policy()->in_young_gc_mode(), "should be in young GC mode");
 
   _young_list->empty_list();
 }
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -1263,16 +1263,10 @@
   // in the young gen: for the SATB pre-barrier, there is no
   // pre-value that needs to be remembered; for the remembered-set
   // update logging post-barrier, we don't maintain remembered set
-  // information for young gen objects. Note that non-generational
-  // G1 does not have any "young" objects, should not elide
-  // the rs logging barrier and so should always answer false below.
-  // However, non-generational G1 (-XX:-G1Gen) appears to have
-  // bit-rotted so was not tested below.
+  // information for young gen objects.
   virtual bool can_elide_initializing_store_barrier(oop new_obj) {
     // Re 6920090, 6920109 above.
     assert(ReduceInitialCardMarksForG1, "Else cannot be here");
-    assert(G1Gen || !is_in_young(new_obj),
-           "Non-generational G1 should never return true below");
     return is_in_young(new_obj);
   }
 
@@ -1389,9 +1383,6 @@
   // bitmap off to the side.
   void doConcurrentMark();
 
-  // Do a full concurrent marking, synchronously.
-  void do_sync_mark();
-
   bool isMarkedPrev(oop obj) const;
   bool isMarkedNext(oop obj) const;
 
@@ -1724,26 +1715,22 @@
 class G1ParGCAllocBuffer: public ParGCAllocBuffer {
 private:
   bool        _retired;
-  bool        _during_marking;
+  bool        _should_mark_objects;
   GCLabBitMap _bitmap;
 
 public:
-  G1ParGCAllocBuffer(size_t gclab_word_size) :
-    ParGCAllocBuffer(gclab_word_size),
-    _during_marking(G1CollectedHeap::heap()->mark_in_progress()),
-    _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size),
-    _retired(false)
-  { }
+  G1ParGCAllocBuffer(size_t gclab_word_size);
 
   inline bool mark(HeapWord* addr) {
     guarantee(use_local_bitmaps, "invariant");
-    assert(_during_marking, "invariant");
+    assert(_should_mark_objects, "invariant");
     return _bitmap.mark(addr);
   }
 
   inline void set_buf(HeapWord* buf) {
-    if (use_local_bitmaps && _during_marking)
+    if (use_local_bitmaps && _should_mark_objects) {
       _bitmap.set_buffer(buf);
+    }
     ParGCAllocBuffer::set_buf(buf);
     _retired = false;
   }
@@ -1751,7 +1738,7 @@
   inline void retire(bool end_of_gc, bool retain) {
     if (_retired)
       return;
-    if (use_local_bitmaps && _during_marking) {
+    if (use_local_bitmaps && _should_mark_objects) {
       _bitmap.retire();
     }
     ParGCAllocBuffer::retire(end_of_gc, retain);
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -170,7 +170,6 @@
   _cur_aux_times_ms(new double[_aux_num]),
   _cur_aux_times_set(new bool[_aux_num]),
 
-  _concurrent_mark_init_times_ms(new TruncatedSeq(NumPrevPausesForHeuristics)),
   _concurrent_mark_remark_times_ms(new TruncatedSeq(NumPrevPausesForHeuristics)),
   _concurrent_mark_cleanup_times_ms(new TruncatedSeq(NumPrevPausesForHeuristics)),
 
@@ -201,7 +200,6 @@
 
   // </NEW PREDICTION>
 
-  _in_young_gc_mode(false),
   _full_young_gcs(true),
   _full_young_pause_num(0),
   _partial_young_pause_num(0),
@@ -400,15 +398,12 @@
   _sigma = (double) G1ConfidencePercent / 100.0;
 
   // start conservatively (around 50ms is about right)
-  _concurrent_mark_init_times_ms->add(0.05);
   _concurrent_mark_remark_times_ms->add(0.05);
   _concurrent_mark_cleanup_times_ms->add(0.20);
   _tenuring_threshold = MaxTenuringThreshold;
-
-  // if G1FixedSurvivorSpaceSize is 0 which means the size is not
-  // fixed, then _max_survivor_regions will be calculated at
-  // calculate_young_list_target_length during initialization
-  _max_survivor_regions = G1FixedSurvivorSpaceSize / HeapRegion::GrainBytes;
+  // _max_survivor_regions will be calculated by
+  // calculate_young_list_target_length() during initialization.
+  _max_survivor_regions = 0;
 
   assert(GCTimeRatio > 0,
          "we should have set it to a default value set_g1_gc_flags() "
@@ -468,27 +463,20 @@
 
   initialize_gc_policy_counters();
 
-  if (G1Gen) {
-    _in_young_gc_mode = true;
-
-    G1YoungGenSizer sizer;
-    size_t initial_region_num = sizer.initial_young_region_num();
-
-    if (UseAdaptiveSizePolicy) {
-      set_adaptive_young_list_length(true);
-      _young_list_fixed_length = 0;
-    } else {
-      set_adaptive_young_list_length(false);
-      _young_list_fixed_length = initial_region_num;
-    }
-    _free_regions_at_end_of_collection = _g1->free_regions();
-    calculate_young_list_min_length();
-    guarantee( _young_list_min_length == 0, "invariant, not enough info" );
-    calculate_young_list_target_length();
+  G1YoungGenSizer sizer;
+  size_t initial_region_num = sizer.initial_young_region_num();
+
+  if (UseAdaptiveSizePolicy) {
+    set_adaptive_young_list_length(true);
+    _young_list_fixed_length = 0;
   } else {
-     _young_list_fixed_length = 0;
-    _in_young_gc_mode = false;
+    set_adaptive_young_list_length(false);
+    _young_list_fixed_length = initial_region_num;
   }
+  _free_regions_at_end_of_collection = _g1->free_regions();
+  calculate_young_list_min_length();
+  guarantee( _young_list_min_length == 0, "invariant, not enough info" );
+  calculate_young_list_target_length();
 
   // We may immediately start allocating regions and placing them on the
   // collection set list. Initialize the per-collection set info
@@ -498,7 +486,7 @@
 // Create the jstat counters for the policy.
 void G1CollectorPolicy::initialize_gc_policy_counters()
 {
-  _gc_policy_counters = new GCPolicyCounters("GarbageFirst", 1, 2 + G1Gen);
+  _gc_policy_counters = new GCPolicyCounters("GarbageFirst", 1, 3);
 }
 
 void G1CollectorPolicy::calculate_young_list_min_length() {
@@ -868,8 +856,7 @@
   if (PrintGCDetails) {
     gclog_or_tty->stamp(PrintGCTimeStamps);
     gclog_or_tty->print("[GC pause");
-    if (in_young_gc_mode())
-      gclog_or_tty->print(" (%s)", full_young_gcs() ? "young" : "partial");
+    gclog_or_tty->print(" (%s)", full_young_gcs() ? "young" : "partial");
   }
 
   assert(_g1->used() == _g1->recalculate_used(),
@@ -921,8 +908,7 @@
   _satb_drain_time_set = false;
   _last_satb_drain_processed_buffers = -1;
 
-  if (in_young_gc_mode())
-    _last_young_gc_full = false;
+  _last_young_gc_full = false;
 
   // do that for any other surv rate groups
   _short_lived_surv_rate_group->stop_adding_regions();
@@ -935,12 +921,7 @@
   _mark_closure_time_ms = mark_closure_time_ms;
 }
 
-void G1CollectorPolicy::record_concurrent_mark_init_start() {
-  _mark_init_start_sec = os::elapsedTime();
-  guarantee(!in_young_gc_mode(), "should not do be here in young GC mode");
-}
-
-void G1CollectorPolicy::record_concurrent_mark_init_end_pre(double
+void G1CollectorPolicy::record_concurrent_mark_init_end(double
                                                    mark_init_elapsed_time_ms) {
   _during_marking = true;
   assert(!initiate_conc_mark_if_possible(), "we should have cleared it by now");
@@ -948,15 +929,6 @@
   _cur_mark_stop_world_time_ms = mark_init_elapsed_time_ms;
 }
 
-void G1CollectorPolicy::record_concurrent_mark_init_end() {
-  double end_time_sec = os::elapsedTime();
-  double elapsed_time_ms = (end_time_sec - _mark_init_start_sec) * 1000.0;
-  _concurrent_mark_init_times_ms->add(elapsed_time_ms);
-  record_concurrent_mark_init_end_pre(elapsed_time_ms);
-
-  _mmu_tracker->add_pause(_mark_init_start_sec, end_time_sec, true);
-}
-
 void G1CollectorPolicy::record_concurrent_mark_remark_start() {
   _mark_remark_start_sec = os::elapsedTime();
   _during_marking = false;
@@ -1019,13 +991,11 @@
 
 void
 G1CollectorPolicy::record_concurrent_mark_cleanup_completed() {
-  if (in_young_gc_mode()) {
-    _should_revert_to_full_young_gcs = false;
-    _last_full_young_gc = true;
-    _in_marking_window = false;
-    if (adaptive_young_list_length())
-      calculate_young_list_target_length();
-  }
+  _should_revert_to_full_young_gcs = false;
+  _last_full_young_gc = true;
+  _in_marking_window = false;
+  if (adaptive_young_list_length())
+    calculate_young_list_target_length();
 }
 
 void G1CollectorPolicy::record_concurrent_pause() {
@@ -1174,31 +1144,29 @@
   }
 #endif // PRODUCT
 
-  if (in_young_gc_mode()) {
-    last_pause_included_initial_mark = during_initial_mark_pause();
-    if (last_pause_included_initial_mark)
-      record_concurrent_mark_init_end_pre(0.0);
-
-    size_t min_used_targ =
-      (_g1->capacity() / 100) * InitiatingHeapOccupancyPercent;
-
-
-    if (!_g1->mark_in_progress() && !_last_full_young_gc) {
-      assert(!last_pause_included_initial_mark, "invariant");
-      if (cur_used_bytes > min_used_targ &&
-          cur_used_bytes > _prev_collection_pause_used_at_end_bytes) {
+  last_pause_included_initial_mark = during_initial_mark_pause();
+  if (last_pause_included_initial_mark)
+    record_concurrent_mark_init_end(0.0);
+
+  size_t min_used_targ =
+    (_g1->capacity() / 100) * InitiatingHeapOccupancyPercent;
+
+
+  if (!_g1->mark_in_progress() && !_last_full_young_gc) {
+    assert(!last_pause_included_initial_mark, "invariant");
+    if (cur_used_bytes > min_used_targ &&
+      cur_used_bytes > _prev_collection_pause_used_at_end_bytes) {
         assert(!during_initial_mark_pause(), "we should not see this here");
 
         // Note: this might have already been set, if during the last
         // pause we decided to start a cycle but at the beginning of
         // this pause we decided to postpone it. That's OK.
         set_initiate_conc_mark_if_possible();
-      }
     }
-
-    _prev_collection_pause_used_at_end_bytes = cur_used_bytes;
   }
 
+  _prev_collection_pause_used_at_end_bytes = cur_used_bytes;
+
   _mmu_tracker->add_pause(end_time_sec - elapsed_ms/1000.0,
                           end_time_sec, false);
 
@@ -1468,24 +1436,23 @@
     new_in_marking_window_im = true;
   }
 
-  if (in_young_gc_mode()) {
-    if (_last_full_young_gc) {
-      set_full_young_gcs(false);
-      _last_full_young_gc = false;
+  if (_last_full_young_gc) {
+    set_full_young_gcs(false);
+    _last_full_young_gc = false;
+  }
+
+  if ( !_last_young_gc_full ) {
+    if ( _should_revert_to_full_young_gcs ||
+      _known_garbage_ratio < 0.05 ||
+      (adaptive_young_list_length() &&
+      (get_gc_eff_factor() * cur_efficiency < predict_young_gc_eff())) ) {
+        set_full_young_gcs(true);
     }
-
-    if ( !_last_young_gc_full ) {
-      if ( _should_revert_to_full_young_gcs ||
-           _known_garbage_ratio < 0.05 ||
-           (adaptive_young_list_length() &&
-           (get_gc_eff_factor() * cur_efficiency < predict_young_gc_eff())) ) {
-        set_full_young_gcs(true);
-      }
-    }
-    _should_revert_to_full_young_gcs = false;
-
-    if (_last_young_gc_full && !_during_marking)
-      _young_gc_eff_seq->add(cur_efficiency);
+  }
+  _should_revert_to_full_young_gcs = false;
+
+  if (_last_young_gc_full && !_during_marking) {
+    _young_gc_eff_seq->add(cur_efficiency);
   }
 
   _short_lived_surv_rate_group->start_adding_regions();
@@ -1910,18 +1877,8 @@
   // I don't think we need to do this when in young GC mode since
   // marking will be initiated next time we hit the soft limit anyway...
   if (predicted_time_ms > _expensive_region_limit_ms) {
-    if (!in_young_gc_mode()) {
-        set_full_young_gcs(true);
-        // We might want to do something different here. However,
-        // right now we don't support the non-generational G1 mode
-        // (and in fact we are planning to remove the associated code,
-        // see CR 6814390). So, let's leave it as is and this will be
-        // removed some time in the future
-        ShouldNotReachHere();
-        set_during_initial_mark_pause();
-    } else
-      // no point in doing another partial one
-      _should_revert_to_full_young_gcs = true;
+    // no point in doing another partial one
+    _should_revert_to_full_young_gcs = true;
   }
 }
 
@@ -2331,18 +2288,9 @@
 // Calculates survivor space parameters.
 void G1CollectorPolicy::calculate_survivors_policy()
 {
-  if (G1FixedSurvivorSpaceSize == 0) {
-    _max_survivor_regions = _young_list_target_length / SurvivorRatio;
-  } else {
-    _max_survivor_regions = G1FixedSurvivorSpaceSize / HeapRegion::GrainBytes;
-  }
-
-  if (G1FixedTenuringThreshold) {
-    _tenuring_threshold = MaxTenuringThreshold;
-  } else {
-    _tenuring_threshold = _survivors_age_table.compute_tenuring_threshold(
+  _max_survivor_regions = _young_list_target_length / SurvivorRatio;
+  _tenuring_threshold = _survivors_age_table.compute_tenuring_threshold(
         HeapRegion::GrainWords * _max_survivor_regions);
-  }
 }
 
 #ifndef PRODUCT
@@ -2617,9 +2565,7 @@
   _inc_cset_size = 0;
   _inc_cset_bytes_used_before = 0;
 
-  if (in_young_gc_mode()) {
-    _inc_cset_young_index = 0;
-  }
+  _inc_cset_young_index = 0;
 
   _inc_cset_max_finger = 0;
   _inc_cset_recorded_young_bytes = 0;
@@ -2848,86 +2794,77 @@
   max_live_bytes = max_live_bytes + expansion_bytes;
 
   HeapRegion* hr;
-  if (in_young_gc_mode()) {
-    double young_start_time_sec = os::elapsedTime();
-
-    if (G1PolicyVerbose > 0) {
-      gclog_or_tty->print_cr("Adding %d young regions to the CSet",
-                    _g1->young_list()->length());
-    }
-
-    _young_cset_length  = 0;
-    _last_young_gc_full = full_young_gcs() ? true : false;
-
-    if (_last_young_gc_full)
-      ++_full_young_pause_num;
-    else
-      ++_partial_young_pause_num;
-
-    // The young list is laid with the survivor regions from the previous
-    // pause are appended to the RHS of the young list, i.e.
-    //   [Newly Young Regions ++ Survivors from last pause].
-
-    hr = _g1->young_list()->first_survivor_region();
-    while (hr != NULL) {
-      assert(hr->is_survivor(), "badly formed young list");
-      hr->set_young();
-      hr = hr->get_next_young_region();
-    }
-
-    // Clear the fields that point to the survivor list - they are
-    // all young now.
-    _g1->young_list()->clear_survivors();
-
-    if (_g1->mark_in_progress())
-      _g1->concurrent_mark()->register_collection_set_finger(_inc_cset_max_finger);
-
-    _young_cset_length = _inc_cset_young_index;
-    _collection_set = _inc_cset_head;
-    _collection_set_size = _inc_cset_size;
-    _collection_set_bytes_used_before = _inc_cset_bytes_used_before;
-
-    // For young regions in the collection set, we assume the worst
-    // case of complete survival
-    max_live_bytes -= _inc_cset_size * HeapRegion::GrainBytes;
-
-    time_remaining_ms -= _inc_cset_predicted_elapsed_time_ms;
-    predicted_pause_time_ms += _inc_cset_predicted_elapsed_time_ms;
-
-    // The number of recorded young regions is the incremental
-    // collection set's current size
-    set_recorded_young_regions(_inc_cset_size);
-    set_recorded_rs_lengths(_inc_cset_recorded_rs_lengths);
-    set_recorded_young_bytes(_inc_cset_recorded_young_bytes);
+  double young_start_time_sec = os::elapsedTime();
+
+  if (G1PolicyVerbose > 0) {
+    gclog_or_tty->print_cr("Adding %d young regions to the CSet",
+      _g1->young_list()->length());
+  }
+
+  _young_cset_length  = 0;
+  _last_young_gc_full = full_young_gcs() ? true : false;
+
+  if (_last_young_gc_full)
+    ++_full_young_pause_num;
+  else
+    ++_partial_young_pause_num;
+
+  // The young list is laid with the survivor regions from the previous
+  // pause are appended to the RHS of the young list, i.e.
+  //   [Newly Young Regions ++ Survivors from last pause].
+
+  hr = _g1->young_list()->first_survivor_region();
+  while (hr != NULL) {
+    assert(hr->is_survivor(), "badly formed young list");
+    hr->set_young();
+    hr = hr->get_next_young_region();
+  }
+
+  // Clear the fields that point to the survivor list - they are
+  // all young now.
+  _g1->young_list()->clear_survivors();
+
+  if (_g1->mark_in_progress())
+    _g1->concurrent_mark()->register_collection_set_finger(_inc_cset_max_finger);
+
+  _young_cset_length = _inc_cset_young_index;
+  _collection_set = _inc_cset_head;
+  _collection_set_size = _inc_cset_size;
+  _collection_set_bytes_used_before = _inc_cset_bytes_used_before;
+
+  // For young regions in the collection set, we assume the worst
+  // case of complete survival
+  max_live_bytes -= _inc_cset_size * HeapRegion::GrainBytes;
+
+  time_remaining_ms -= _inc_cset_predicted_elapsed_time_ms;
+  predicted_pause_time_ms += _inc_cset_predicted_elapsed_time_ms;
+
+  // The number of recorded young regions is the incremental
+  // collection set's current size
+  set_recorded_young_regions(_inc_cset_size);
+  set_recorded_rs_lengths(_inc_cset_recorded_rs_lengths);
+  set_recorded_young_bytes(_inc_cset_recorded_young_bytes);
 #if PREDICTIONS_VERBOSE
-    set_predicted_bytes_to_copy(_inc_cset_predicted_bytes_to_copy);
+  set_predicted_bytes_to_copy(_inc_cset_predicted_bytes_to_copy);
 #endif // PREDICTIONS_VERBOSE
 
-    if (G1PolicyVerbose > 0) {
-      gclog_or_tty->print_cr("  Added " PTR_FORMAT " Young Regions to CS.",
-                             _inc_cset_size);
-      gclog_or_tty->print_cr("    (" SIZE_FORMAT " KB left in heap.)",
-                            max_live_bytes/K);
-    }
-
-    assert(_inc_cset_size == _g1->young_list()->length(), "Invariant");
-
-    double young_end_time_sec = os::elapsedTime();
-    _recorded_young_cset_choice_time_ms =
-      (young_end_time_sec - young_start_time_sec) * 1000.0;
-
-    // We are doing young collections so reset this.
-    non_young_start_time_sec = young_end_time_sec;
-
-    // Note we can use either _collection_set_size or
-    // _young_cset_length here
-    if (_collection_set_size > 0 && _last_young_gc_full) {
-      // don't bother adding more regions...
-      goto choose_collection_set_end;
-    }
+  if (G1PolicyVerbose > 0) {
+    gclog_or_tty->print_cr("  Added " PTR_FORMAT " Young Regions to CS.",
+      _inc_cset_size);
+    gclog_or_tty->print_cr("    (" SIZE_FORMAT " KB left in heap.)",
+      max_live_bytes/K);
   }
 
-  if (!in_young_gc_mode() || !full_young_gcs()) {
+  assert(_inc_cset_size == _g1->young_list()->length(), "Invariant");
+
+  double young_end_time_sec = os::elapsedTime();
+  _recorded_young_cset_choice_time_ms =
+    (young_end_time_sec - young_start_time_sec) * 1000.0;
+
+  // We are doing young collections so reset this.
+  non_young_start_time_sec = young_end_time_sec;
+
+  if (!full_young_gcs()) {
     bool should_continue = true;
     NumberSeq seq;
     double avg_prediction = 100000000000000000.0; // something very large
@@ -2960,7 +2897,6 @@
       _should_revert_to_full_young_gcs  = true;
   }
 
-choose_collection_set_end:
   stop_incremental_cset_building();
 
   count_CS_bytes_used();
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -141,7 +141,6 @@
 
   TruncatedSeq* _recent_rs_sizes;
 
-  TruncatedSeq* _concurrent_mark_init_times_ms;
   TruncatedSeq* _concurrent_mark_remark_times_ms;
   TruncatedSeq* _concurrent_mark_cleanup_times_ms;
 
@@ -178,9 +177,6 @@
   double* _par_last_gc_worker_end_times_ms;
   double* _par_last_gc_worker_times_ms;
 
-  // indicates that we are in young GC mode
-  bool _in_young_gc_mode;
-
   // indicates whether we are in full young or partially young GC mode
   bool _full_young_gcs;
 
@@ -527,10 +523,6 @@
     return _mmu_tracker->max_gc_time() * 1000.0;
   }
 
-  double predict_init_time_ms() {
-    return get_new_prediction(_concurrent_mark_init_times_ms);
-  }
-
   double predict_remark_time_ms() {
     return get_new_prediction(_concurrent_mark_remark_times_ms);
   }
@@ -776,7 +768,6 @@
   // This set of variables tracks the collector efficiency, in order to
   // determine whether we should initiate a new marking.
   double _cur_mark_stop_world_time_ms;
-  double _mark_init_start_sec;
   double _mark_remark_start_sec;
   double _mark_cleanup_start_sec;
   double _mark_closure_time_ms;
@@ -849,9 +840,7 @@
                                              size_t start_used);
 
   // Must currently be called while the world is stopped.
-  virtual void record_concurrent_mark_init_start();
-  virtual void record_concurrent_mark_init_end();
-  void record_concurrent_mark_init_end_pre(double
+  void record_concurrent_mark_init_end(double
                                            mark_init_elapsed_time_ms);
 
   void record_mark_closure_time(double mark_closure_time_ms);
@@ -1101,30 +1090,17 @@
   bool is_young_list_full() {
     size_t young_list_length = _g1->young_list()->length();
     size_t young_list_target_length = _young_list_target_length;
-    if (G1FixedEdenSize) {
-      young_list_target_length -= _max_survivor_regions;
-    }
     return young_list_length >= young_list_target_length;
   }
 
   bool can_expand_young_list() {
     size_t young_list_length = _g1->young_list()->length();
     size_t young_list_max_length = _young_list_max_length;
-    if (G1FixedEdenSize) {
-      young_list_max_length -= _max_survivor_regions;
-    }
     return young_list_length < young_list_max_length;
   }
 
   void update_region_num(bool young);
 
-  bool in_young_gc_mode() {
-    return _in_young_gc_mode;
-  }
-  void set_in_young_gc_mode(bool in_young_gc_mode) {
-    _in_young_gc_mode = in_young_gc_mode;
-  }
-
   bool full_young_gcs() {
     return _full_young_gcs;
   }
--- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -50,6 +50,8 @@
   G1RemSet* _g1_rem;
   ConcurrentMark* _cm;
   G1ParScanThreadState* _par_scan_state;
+  bool _during_initial_mark;
+  bool _mark_in_progress;
 public:
   G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state);
   bool apply_to_weak_ref_discovered_field() { return true; }
@@ -102,8 +104,8 @@
 class G1ParCopyHelper : public G1ParClosureSuper {
   G1ParScanClosure *_scanner;
 protected:
-  template <class T> void mark_forwardee(T* p);
-  oop copy_to_survivor_space(oop obj);
+  template <class T> void mark_object(T* p);
+  oop copy_to_survivor_space(oop obj, bool should_mark_copy);
 public:
   G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state,
                   G1ParScanClosure *scanner) :
@@ -111,7 +113,7 @@
 };
 
 template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_forwardee>
+         bool do_mark_object>
 class G1ParCopyClosure : public G1ParCopyHelper {
   G1ParScanClosure _scanner;
   template <class T> void do_oop_work(T* p);
@@ -120,8 +122,6 @@
     _scanner(g1, par_scan_state), G1ParCopyHelper(g1, par_scan_state, &_scanner) { }
   template <class T> void do_oop_nv(T* p) {
     do_oop_work(p);
-    if (do_mark_forwardee)
-      mark_forwardee(p);
   }
   virtual void do_oop(oop* p)       { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -39,8 +39,6 @@
   develop(intx, G1MarkingOverheadPercent, 0,                                \
           "Overhead of concurrent marking")                                 \
                                                                             \
-  develop(bool, G1Gen, true,                                                \
-          "If true, it will enable the generational G1")                    \
                                                                             \
   develop(intx, G1PolicyVerbose, 0,                                         \
           "The verbosity level on G1 policy decisions")                     \
@@ -126,9 +124,6 @@
   develop(bool, G1RSBarrierNullFilter, true,                                \
           "If true, generate null-pointer filtering code in RS barrier")    \
                                                                             \
-  develop(bool, G1PrintCTFilterStats, false,                                \
-          "If true, print stats on RS filtering effectiveness")             \
-                                                                            \
   develop(bool, G1DeferredRSUpdate, true,                                   \
           "If true, use deferred RS updates")                               \
                                                                             \
@@ -251,16 +246,6 @@
           "When set, G1 will fail when it encounters an FP 'error', "       \
           "so as to allow debugging")                                       \
                                                                             \
-  develop(bool, G1FixedTenuringThreshold, false,                            \
-          "When set, G1 will not adjust the tenuring threshold")            \
-                                                                            \
-  develop(bool, G1FixedEdenSize, false,                                     \
-          "When set, G1 will not allocate unused survivor space regions")   \
-                                                                            \
-  develop(uintx, G1FixedSurvivorSpaceSize, 0,                               \
-          "If non-0 is the size of the G1 survivor space, "                 \
-          "otherwise SurvivorRatio is used to determine the size")          \
-                                                                            \
   product(uintx, G1HeapRegionSize, 0,                                       \
           "Size of the G1 regions.")                                        \
                                                                             \
--- a/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -36,7 +36,7 @@
 };
 
 template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_forwardee>
+         bool do_mark_object>
 class G1ParCopyClosure;
 class G1ParScanClosure;
 class G1ParPushHeapRSClosure;
--- a/src/share/vm/gc_interface/collectedHeap.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/gc_interface/collectedHeap.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -410,13 +410,13 @@
 
 void CollectedHeap::pre_full_gc_dump() {
   if (HeapDumpBeforeFullGC) {
-    TraceTime tt("Heap Dump: ", PrintGCDetails, false, gclog_or_tty);
+    TraceTime tt("Heap Dump (before full gc): ", PrintGCDetails, false, gclog_or_tty);
     // We are doing a "major" collection and a heap dump before
     // major collection has been requested.
     HeapDumper::dump_heap();
   }
   if (PrintClassHistogramBeforeFullGC) {
-    TraceTime tt("Class Histogram: ", PrintGCDetails, true, gclog_or_tty);
+    TraceTime tt("Class Histogram (before full gc): ", PrintGCDetails, true, gclog_or_tty);
     VM_GC_HeapInspection inspector(gclog_or_tty, false /* ! full gc */, false /* ! prologue */);
     inspector.doit();
   }
@@ -424,11 +424,11 @@
 
 void CollectedHeap::post_full_gc_dump() {
   if (HeapDumpAfterFullGC) {
-    TraceTime tt("Heap Dump", PrintGCDetails, false, gclog_or_tty);
+    TraceTime tt("Heap Dump (after full gc): ", PrintGCDetails, false, gclog_or_tty);
     HeapDumper::dump_heap();
   }
   if (PrintClassHistogramAfterFullGC) {
-    TraceTime tt("Class Histogram", PrintGCDetails, true, gclog_or_tty);
+    TraceTime tt("Class Histogram (after full gc): ", PrintGCDetails, true, gclog_or_tty);
     VM_GC_HeapInspection inspector(gclog_or_tty, false /* ! full gc */, false /* ! prologue */);
     inspector.doit();
   }
--- a/src/share/vm/runtime/arguments.cpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/runtime/arguments.cpp	Wed Sep 07 11:52:00 2011 -0700
@@ -1423,6 +1423,9 @@
     if (FLAG_IS_DEFAULT(MinHeapDeltaBytes)) {
       FLAG_SET_DEFAULT(MinHeapDeltaBytes, 64*M);
     }
+    // For those collectors or operating systems (eg, Windows) that do
+    // not support full UseNUMA, we will map to UseNUMAInterleaving for now
+    UseNUMAInterleaving = true;
   }
 }
 
--- a/src/share/vm/runtime/globals.hpp	Wed Sep 07 09:35:52 2011 +0200
+++ b/src/share/vm/runtime/globals.hpp	Wed Sep 07 11:52:00 2011 -0700
@@ -475,6 +475,12 @@
   product(bool, UseNUMA, false,                                             \
           "Use NUMA if available")                                          \
                                                                             \
+  product(bool, UseNUMAInterleaving, false,                                 \
+          "Interleave memory across NUMA nodes if available")               \
+                                                                            \
+  product(uintx, NUMAInterleaveGranularity, 2*M,                            \
+          "Granularity to use for NUMA interleaving on Windows OS")         \
+                                                                            \
   product(bool, ForceNUMA, false,                                           \
           "Force NUMA optimizations on single-node/UMA systems")            \
                                                                             \