# HG changeset patch # User jcoomes # Date 1315024437 25200 # Node ID 4668545121b89e34e501f22bcfcc0d541034d392 # Parent 5c123cbeebbef54be9d8a2650750925bb75692af# Parent ae1b1788f63fb364549a50af9d3f83fbd434a67c Merge diff -r 5c123cbeebbe -r 4668545121b8 src/cpu/sparc/vm/assembler_sparc.cpp --- a/src/cpu/sparc/vm/assembler_sparc.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/cpu/sparc/vm/assembler_sparc.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -2161,29 +2161,6 @@ #endif } -void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p, - Register s1, address d, - relocInfo::relocType rt ) { - assert_not_delayed(); - if (VM_Version::v9_instructions_work()) { - bpr(rc, a, p, s1, d, rt); - } else { - tst(s1); - br(reg_cond_to_cc_cond(rc), a, p, d, rt); - } -} - -void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p, - Register s1, Label& L ) { - assert_not_delayed(); - if (VM_Version::v9_instructions_work()) { - bpr(rc, a, p, s1, L); - } else { - tst(s1); - br(reg_cond_to_cc_cond(rc), a, p, L); - } -} - // Compare registers and branch with nop in delay slot or cbcond without delay slot. // Compare integer (32 bit) values (icc only). @@ -4340,22 +4317,29 @@ } else { pre_val = O0; } + int satb_q_index_byte_offset = in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()); + int satb_q_buf_byte_offset = in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()); + assert(in_bytes(PtrQueue::byte_width_of_index()) == sizeof(intptr_t) && in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t), "check sizes in assembly below"); __ bind(restart); + + // Load the index into the SATB buffer. PtrQueue::_index is a size_t + // so ld_ptr is appropriate. __ ld_ptr(G2_thread, satb_q_index_byte_offset, L0); - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill); - // If the branch is taken, no harm in executing this in the delay slot. - __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1); + // index == 0? + __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill); + + __ ld_ptr(G2_thread, satb_q_buf_byte_offset, L1); __ sub(L0, oopSize, L0); __ st_ptr(pre_val, L1, L0); // [_buf + index] := I0 @@ -4466,9 +4450,8 @@ tmp); } - // Check on whether to annul. - br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered); - delayed()->nop(); + // Is marking active? + cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered); // Do we need to load the previous value? if (obj != noreg) { @@ -4490,9 +4473,7 @@ assert(pre_val != noreg, "must have a real register"); // Is the previous value null? - // Check on whether to annul. - br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered); - delayed()->nop(); + cmp_and_brx_short(pre_val, G0, Assembler::equal, Assembler::pt, filtered); // OK, it's not filtered, so we'll need to call enqueue. In the normal // case, pre_val will be a scratch G-reg, but there are some cases in @@ -4519,39 +4500,6 @@ bind(filtered); } -static jint num_ct_writes = 0; -static jint num_ct_writes_filtered_in_hr = 0; -static jint num_ct_writes_filtered_null = 0; -static G1CollectedHeap* g1 = NULL; - -static Thread* count_ct_writes(void* filter_val, void* new_val) { - Atomic::inc(&num_ct_writes); - if (filter_val == NULL) { - Atomic::inc(&num_ct_writes_filtered_in_hr); - } else if (new_val == NULL) { - Atomic::inc(&num_ct_writes_filtered_null); - } else { - if (g1 == NULL) { - g1 = G1CollectedHeap::heap(); - } - } - if ((num_ct_writes % 1000000) == 0) { - jint num_ct_writes_filtered = - num_ct_writes_filtered_in_hr + - num_ct_writes_filtered_null; - - tty->print_cr("%d potential CT writes: %5.2f%% filtered\n" - " (%5.2f%% intra-HR, %5.2f%% null).", - num_ct_writes, - 100.0*(float)num_ct_writes_filtered/(float)num_ct_writes, - 100.0*(float)num_ct_writes_filtered_in_hr/ - (float)num_ct_writes, - 100.0*(float)num_ct_writes_filtered_null/ - (float)num_ct_writes); - } - return Thread::current(); -} - static address dirty_card_log_enqueue = 0; static u_char* dirty_card_log_enqueue_end = 0; @@ -4574,11 +4522,8 @@ __ set(addrlit, O1); // O1 := __ ldub(O0, O1, O2); // O2 := [O0 + O1] - __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt, - O2, not_already_dirty); - // Get O1 + O2 into a reg by itself -- useful in the take-the-branch - // case, harmless if not. - __ delayed()->add(O0, O1, O3); + assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code"); + __ cmp_and_br_short(O2, G0, Assembler::notEqual, Assembler::pt, not_already_dirty); // We didn't take the branch, so we're already dirty: return. // Use return-from-leaf @@ -4587,8 +4532,13 @@ // Not dirty. __ bind(not_already_dirty); + + // Get O0 + O1 into a reg by itself + __ add(O0, O1, O3); + // First, dirty it. __ stb(G0, O3, G0); // [cardPtr] := 0 (i.e., dirty). + int dirty_card_q_index_byte_offset = in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()); @@ -4596,12 +4546,15 @@ in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()); __ bind(restart); + + // Load the index into the update buffer. PtrQueue::_index is + // a size_t so ld_ptr is appropriate here. __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0); - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, - L0, refill); - // If the branch is taken, no harm in executing this in the delay slot. - __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1); + // index == 0? + __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill); + + __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1); __ sub(L0, oopSize, L0); __ st_ptr(O3, L1, L0); // [_buf + index] := I0 @@ -4664,6 +4617,7 @@ G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set(); assert(bs->kind() == BarrierSet::G1SATBCT || bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier"); + if (G1RSBarrierRegionFilter) { xor3(store_addr, new_val, tmp); #ifdef _LP64 @@ -4672,33 +4626,8 @@ srl(tmp, HeapRegion::LogOfHRGrainBytes, tmp); #endif - if (G1PrintCTFilterStats) { - guarantee(tmp->is_global(), "Or stats won't work..."); - // This is a sleazy hack: I'm temporarily hijacking G2, which I - // promise to restore. - mov(new_val, G2); - save_frame(0); - mov(tmp, O0); - mov(G2, O1); - // Save G-regs that target may use. - mov(G1, L1); - mov(G2, L2); - mov(G3, L3); - mov(G4, L4); - mov(G5, L5); - call(CAST_FROM_FN_PTR(address, &count_ct_writes)); - delayed()->nop(); - mov(O0, G2); - // Restore G-regs that target may have used. - mov(L1, G1); - mov(L3, G3); - mov(L4, G4); - mov(L5, G5); - restore(G0, G0, G0); - } - // XXX Should I predict this taken or not? Does it mattern? - br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered); - delayed()->nop(); + // XXX Should I predict this taken or not? Does it matter? + cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pt, filtered); } // If the "store_addr" register is an "in" or "local" register, move it to @@ -4723,7 +4652,6 @@ restore(); bind(filtered); - } #endif // SERIALGC diff -r 5c123cbeebbe -r 4668545121b8 src/cpu/sparc/vm/assembler_sparc.hpp --- a/src/cpu/sparc/vm/assembler_sparc.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -1940,12 +1940,6 @@ void br_null ( Register s1, bool a, Predict p, Label& L ); void br_notnull( Register s1, bool a, Predict p, Label& L ); - // These versions will do the most efficient thing on v8 and v9. Perhaps - // this is what the routine above was meant to do, but it didn't (and - // didn't cover both target address kinds.) - void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none ); - void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, Label& L); - // // Compare registers and branch with nop in delay slot or cbcond without delay slot. // diff -r 5c123cbeebbe -r 4668545121b8 src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp --- a/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -421,8 +421,7 @@ } if (__ is_in_wdisp16_range(_continuation)) { - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt, - pre_val_reg, _continuation); + __ br_null(pre_val_reg, /*annul*/false, Assembler::pt, _continuation); } else { __ cmp(pre_val_reg, G0); __ brx(Assembler::equal, false, Assembler::pn, _continuation); @@ -458,8 +457,7 @@ // The original src operand was not a constant. // Generate src == null? if (__ is_in_wdisp16_range(_continuation)) { - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt, - src_reg, _continuation); + __ br_null(src_reg, /*annul*/false, Assembler::pt, _continuation); } else { __ cmp(src_reg, G0); __ brx(Assembler::equal, false, Assembler::pt, _continuation); @@ -476,13 +474,9 @@ Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc)); __ ld(ref_type_adr, tmp_reg); - if (__ is_in_wdisp16_range(_continuation)) { - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt, - tmp_reg, _continuation); - } else { - __ cmp(tmp_reg, G0); - __ brx(Assembler::equal, false, Assembler::pt, _continuation); - } + // _reference_type field is of type ReferenceType (enum) + assert(REF_NONE == 0, "check this code"); + __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt); __ delayed()->nop(); // Is marking active? @@ -498,13 +492,8 @@ assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption"); __ ldsb(in_progress, tmp_reg); } - if (__ is_in_wdisp16_range(_continuation)) { - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt, - tmp_reg, _continuation); - } else { - __ cmp(tmp_reg, G0); - __ brx(Assembler::equal, false, Assembler::pt, _continuation); - } + + __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt); __ delayed()->nop(); // val == null? @@ -512,8 +501,7 @@ Register val_reg = val()->as_register(); if (__ is_in_wdisp16_range(_continuation)) { - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt, - val_reg, _continuation); + __ br_null(val_reg, /*annul*/false, Assembler::pt, _continuation); } else { __ cmp(val_reg, G0); __ brx(Assembler::equal, false, Assembler::pt, _continuation); @@ -542,9 +530,9 @@ assert(new_val()->is_register(), "Precondition."); Register addr_reg = addr()->as_pointer_register(); Register new_val_reg = new_val()->as_register(); + if (__ is_in_wdisp16_range(_continuation)) { - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt, - new_val_reg, _continuation); + __ br_null(new_val_reg, /*annul*/false, Assembler::pt, _continuation); } else { __ cmp(new_val_reg, G0); __ brx(Assembler::equal, false, Assembler::pn, _continuation); diff -r 5c123cbeebbe -r 4668545121b8 src/cpu/sparc/vm/c1_Runtime1_sparc.cpp --- a/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -834,14 +834,16 @@ int satb_q_buf_byte_offset = in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()); + __ bind(restart); + // Load the index into the SATB buffer. PtrQueue::_index is a + // size_t so ld_ptr is appropriate __ ld_ptr(G2_thread, satb_q_index_byte_offset, tmp); - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, - Assembler::pn, tmp, refill); + // index == 0? + __ cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pn, refill); - // If the branch is taken, no harm in executing this in the delay slot. - __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2); + __ ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2); __ sub(tmp, oopSize, tmp); __ st_ptr(pre_val, tmp2, tmp); // [_buf + index] := @@ -901,11 +903,8 @@ __ set(rs, cardtable); // cardtable := __ ldub(addr, cardtable, tmp); // tmp := [addr + cardtable] - __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt, - tmp, not_already_dirty); - // Get cardtable + tmp into a reg by itself -- useful in the take-the-branch - // case, harmless if not. - __ delayed()->add(addr, cardtable, tmp2); + assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code"); + __ cmp_and_br_short(tmp, G0, Assembler::notEqual, Assembler::pt, not_already_dirty); // We didn't take the branch, so we're already dirty: return. // Use return-from-leaf @@ -914,6 +913,10 @@ // Not dirty. __ bind(not_already_dirty); + + // Get cardtable + tmp into a reg by itself + __ add(addr, cardtable, tmp2); + // First, dirty it. __ stb(G0, tmp2, 0); // [cardPtr] := 0 (i.e., dirty). @@ -929,13 +932,17 @@ int dirty_card_q_buf_byte_offset = in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()); + __ bind(restart); + + // Get the index into the update buffer. PtrQueue::_index is + // a size_t so ld_ptr is appropriate here. __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, tmp3); - __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, - tmp3, refill); - // If the branch is taken, no harm in executing this in the delay slot. - __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4); + // index == 0? + __ cmp_and_brx_short(tmp3, G0, Assembler::equal, Assembler::pn, refill); + + __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4); __ sub(tmp3, oopSize, tmp3); __ st_ptr(tmp2, tmp4, tmp3); // [_buf + index] := diff -r 5c123cbeebbe -r 4668545121b8 src/os/linux/vm/os_linux.cpp --- a/src/os/linux/vm/os_linux.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/os/linux/vm/os_linux.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -125,10 +125,6 @@ # include # include -#ifdef AMD64 -#include -#endif - #define MAX_PATH (2 * K) // for timer info max values which include all bits @@ -2502,7 +2498,13 @@ int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE; uintptr_t res = (uintptr_t) ::mmap(addr, size, prot, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0); - return res != (uintptr_t) MAP_FAILED; + if (res != (uintptr_t) MAP_FAILED) { + if (UseNUMAInterleaving) { + numa_make_global(addr, size); + } + return true; + } + return false; } // Define MAP_HUGETLB here so we can build HotSpot on old systems. @@ -2523,7 +2525,13 @@ (uintptr_t) ::mmap(addr, size, prot, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0); - return res != (uintptr_t) MAP_FAILED; + if (res != (uintptr_t) MAP_FAILED) { + if (UseNUMAInterleaving) { + numa_make_global(addr, size); + } + return true; + } + return false; } return commit_memory(addr, size, exec); @@ -2588,8 +2596,17 @@ int retval = -1; #if defined(IA32) +# ifndef SYS_getcpu +# define SYS_getcpu 318 +# endif retval = syscall(SYS_getcpu, &cpu, NULL, NULL); #elif defined(AMD64) +// Unfortunately we have to bring all these macros here from vsyscall.h +// to be able to compile on old linuxes. +# define __NR_vgetcpu 2 +# define VSYSCALL_START (-10UL << 20) +# define VSYSCALL_SIZE 1024 +# define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache); vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu); retval = vgetcpu(&cpu, NULL, NULL); @@ -3115,6 +3132,10 @@ return NULL; } + if ((addr != NULL) && UseNUMAInterleaving) { + numa_make_global(addr, bytes); + } + return addr; } diff -r 5c123cbeebbe -r 4668545121b8 src/os/solaris/vm/os_solaris.cpp --- a/src/os/solaris/vm/os_solaris.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/os/solaris/vm/os_solaris.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -2777,8 +2777,14 @@ bool os::commit_memory(char* addr, size_t bytes, bool exec) { int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE; size_t size = bytes; - return - NULL != Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot); + char *res = Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot); + if (res != NULL) { + if (UseNUMAInterleaving) { + numa_make_global(addr, bytes); + } + return true; + } + return false; } bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint, @@ -3389,12 +3395,11 @@ return true; } -char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) { +char* os::reserve_memory_special(size_t size, char* addr, bool exec) { // "exec" is passed in but not used. Creating the shared image for // the code cache doesn't have an SHM_X executable permission to check. assert(UseLargePages && UseISM, "only for ISM large pages"); - size_t size = bytes; char* retAddr = NULL; int shmid; key_t ismKey; @@ -3436,7 +3441,9 @@ } return NULL; } - + if ((retAddr != NULL) && UseNUMAInterleaving) { + numa_make_global(retAddr, size); + } return retAddr; } diff -r 5c123cbeebbe -r 4668545121b8 src/os/windows/vm/os_windows.cpp --- a/src/os/windows/vm/os_windows.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/os/windows/vm/os_windows.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -2614,6 +2614,57 @@ static HANDLE _hProcess; static HANDLE _hToken; +// Container for NUMA node list info +class NUMANodeListHolder { +private: + int *_numa_used_node_list; // allocated below + int _numa_used_node_count; + + void free_node_list() { + if (_numa_used_node_list != NULL) { + FREE_C_HEAP_ARRAY(int, _numa_used_node_list); + } + } + +public: + NUMANodeListHolder() { + _numa_used_node_count = 0; + _numa_used_node_list = NULL; + // do rest of initialization in build routine (after function pointers are set up) + } + + ~NUMANodeListHolder() { + free_node_list(); + } + + bool build() { + DWORD_PTR proc_aff_mask; + DWORD_PTR sys_aff_mask; + if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false; + ULONG highest_node_number; + if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false; + free_node_list(); + _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number); + for (unsigned int i = 0; i <= highest_node_number; i++) { + ULONGLONG proc_mask_numa_node; + if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false; + if ((proc_aff_mask & proc_mask_numa_node)!=0) { + _numa_used_node_list[_numa_used_node_count++] = i; + } + } + return (_numa_used_node_count > 1); + } + + int get_count() {return _numa_used_node_count;} + int get_node_list_entry(int n) { + // for indexes out of range, returns -1 + return (n < _numa_used_node_count ? _numa_used_node_list[n] : -1); + } + +} numa_node_list_holder; + + + static size_t _large_page_size = 0; static bool resolve_functions_for_large_page_init() { @@ -2653,6 +2704,154 @@ _hToken = NULL; } +static bool numa_interleaving_init() { + bool success = false; + bool use_numa_specified = !FLAG_IS_DEFAULT(UseNUMA); + bool use_numa_interleaving_specified = !FLAG_IS_DEFAULT(UseNUMAInterleaving); + + // print a warning if UseNUMA or UseNUMAInterleaving flag is specified on command line + bool warn_on_failure = use_numa_specified || use_numa_interleaving_specified; +# define WARN(msg) if (warn_on_failure) { warning(msg); } + + // NUMAInterleaveGranularity cannot be less than vm_allocation_granularity (or _large_page_size if using large pages) + size_t min_interleave_granularity = UseLargePages ? _large_page_size : os::vm_allocation_granularity(); + NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, min_interleave_granularity); + + if (os::Kernel32Dll::NumaCallsAvailable()) { + if (numa_node_list_holder.build()) { + if (PrintMiscellaneous && Verbose) { + tty->print("NUMA UsedNodeCount=%d, namely ", os::numa_get_groups_num()); + for (int i = 0; i < numa_node_list_holder.get_count(); i++) { + tty->print("%d ", numa_node_list_holder.get_node_list_entry(i)); + } + tty->print("\n"); + } + success = true; + } else { + WARN("Process does not cover multiple NUMA nodes."); + } + } else { + WARN("NUMA Interleaving is not supported by the operating system."); + } + if (!success) { + if (use_numa_specified) WARN("...Ignoring UseNUMA flag."); + if (use_numa_interleaving_specified) WARN("...Ignoring UseNUMAInterleaving flag."); + } + return success; +#undef WARN +} + +// this routine is used whenever we need to reserve a contiguous VA range +// but we need to make separate VirtualAlloc calls for each piece of the range +// Reasons for doing this: +// * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise) +// * UseNUMAInterleaving requires a separate node for each piece +static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot, + bool should_inject_error=false) { + char * p_buf; + // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size + size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity(); + size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size; + + // first reserve enough address space in advance since we want to be + // able to break a single contiguous virtual address range into multiple + // large page commits but WS2003 does not allow reserving large page space + // so we just use 4K pages for reserve, this gives us a legal contiguous + // address space. then we will deallocate that reservation, and re alloc + // using large pages + const size_t size_of_reserve = bytes + chunk_size; + if (bytes > size_of_reserve) { + // Overflowed. + return NULL; + } + p_buf = (char *) VirtualAlloc(addr, + size_of_reserve, // size of Reserve + MEM_RESERVE, + PAGE_READWRITE); + // If reservation failed, return NULL + if (p_buf == NULL) return NULL; + + os::release_memory(p_buf, bytes + chunk_size); + + // we still need to round up to a page boundary (in case we are using large pages) + // but not to a chunk boundary (in case InterleavingGranularity doesn't align with page size) + // instead we handle this in the bytes_to_rq computation below + p_buf = (char *) align_size_up((size_t)p_buf, page_size); + + // now go through and allocate one chunk at a time until all bytes are + // allocated + size_t bytes_remaining = bytes; + // An overflow of align_size_up() would have been caught above + // in the calculation of size_of_reserve. + char * next_alloc_addr = p_buf; + HANDLE hProc = GetCurrentProcess(); + +#ifdef ASSERT + // Variable for the failure injection + long ran_num = os::random(); + size_t fail_after = ran_num % bytes; +#endif + + int count=0; + while (bytes_remaining) { + // select bytes_to_rq to get to the next chunk_size boundary + + size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size)); + // Note allocate and commit + char * p_new; + +#ifdef ASSERT + bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after); +#else + const bool inject_error_now = false; +#endif + + if (inject_error_now) { + p_new = NULL; + } else { + if (!UseNUMAInterleaving) { + p_new = (char *) VirtualAlloc(next_alloc_addr, + bytes_to_rq, + flags, + prot); + } else { + // get the next node to use from the used_node_list + DWORD node = numa_node_list_holder.get_node_list_entry(count % os::numa_get_groups_num()); + p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc, + next_alloc_addr, + bytes_to_rq, + flags, + prot, + node); + } + } + + if (p_new == NULL) { + // Free any allocated pages + if (next_alloc_addr > p_buf) { + // Some memory was committed so release it. + size_t bytes_to_release = bytes - bytes_remaining; + os::release_memory(p_buf, bytes_to_release); + } +#ifdef ASSERT + if (should_inject_error) { + if (TracePageSizes && Verbose) { + tty->print_cr("Reserving pages individually failed."); + } + } +#endif + return NULL; + } + bytes_remaining -= bytes_to_rq; + next_alloc_addr += bytes_to_rq; + count++; + } + // made it this far, success + return p_buf; +} + + + void os::large_page_init() { if (!UseLargePages) return; @@ -2722,9 +2921,30 @@ assert((size_t)addr % os::vm_allocation_granularity() == 0, "reserve alignment"); assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size"); - char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE); + char* res; + // note that if UseLargePages is on, all the areas that require interleaving + // will go thru reserve_memory_special rather than thru here. + bool use_individual = (UseNUMAInterleaving && !UseLargePages); + if (!use_individual) { + res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE); + } else { + elapsedTimer reserveTimer; + if( Verbose && PrintMiscellaneous ) reserveTimer.start(); + // in numa interleaving, we have to allocate pages individually + // (well really chunks of NUMAInterleaveGranularity size) + res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE); + if (res == NULL) { + warning("NUMA page allocation failed"); + } + if( Verbose && PrintMiscellaneous ) { + reserveTimer.stop(); + tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes, + reserveTimer.milliseconds(), reserveTimer.ticks()); + } + } assert(res == NULL || addr == NULL || addr == res, "Unexpected address from reserve."); + return res; } @@ -2754,92 +2974,27 @@ char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) { const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE; - - if (UseLargePagesIndividualAllocation) { + const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES; + + // with large pages, there are two cases where we need to use Individual Allocation + // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003) + // 2) NUMA Interleaving is enabled, in which case we use a different node for each page + if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) { if (TracePageSizes && Verbose) { tty->print_cr("Reserving large pages individually."); } - char * p_buf; - // first reserve enough address space in advance since we want to be - // able to break a single contiguous virtual address range into multiple - // large page commits but WS2003 does not allow reserving large page space - // so we just use 4K pages for reserve, this gives us a legal contiguous - // address space. then we will deallocate that reservation, and re alloc - // using large pages - const size_t size_of_reserve = bytes + _large_page_size; - if (bytes > size_of_reserve) { - // Overflowed. - warning("Individually allocated large pages failed, " - "use -XX:-UseLargePagesIndividualAllocation to turn off"); + char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError); + if (p_buf == NULL) { + // give an appropriate warning message + if (UseNUMAInterleaving) { + warning("NUMA large page allocation failed, UseLargePages flag ignored"); + } + if (UseLargePagesIndividualAllocation) { + warning("Individually allocated large pages failed, " + "use -XX:-UseLargePagesIndividualAllocation to turn off"); + } return NULL; } - p_buf = (char *) VirtualAlloc(addr, - size_of_reserve, // size of Reserve - MEM_RESERVE, - PAGE_READWRITE); - // If reservation failed, return NULL - if (p_buf == NULL) return NULL; - - release_memory(p_buf, bytes + _large_page_size); - // round up to page boundary. If the size_of_reserve did not - // overflow and the reservation did not fail, this align up - // should not overflow. - p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size); - - // now go through and allocate one page at a time until all bytes are - // allocated - size_t bytes_remaining = align_size_up(bytes, _large_page_size); - // An overflow of align_size_up() would have been caught above - // in the calculation of size_of_reserve. - char * next_alloc_addr = p_buf; - -#ifdef ASSERT - // Variable for the failure injection - long ran_num = os::random(); - size_t fail_after = ran_num % bytes; -#endif - - while (bytes_remaining) { - size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size); - // Note allocate and commit - char * p_new; - -#ifdef ASSERT - bool inject_error = LargePagesIndividualAllocationInjectError && - (bytes_remaining <= fail_after); -#else - const bool inject_error = false; -#endif - - if (inject_error) { - p_new = NULL; - } else { - p_new = (char *) VirtualAlloc(next_alloc_addr, - bytes_to_rq, - MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, - prot); - } - - if (p_new == NULL) { - // Free any allocated pages - if (next_alloc_addr > p_buf) { - // Some memory was committed so release it. - size_t bytes_to_release = bytes - bytes_remaining; - release_memory(p_buf, bytes_to_release); - } -#ifdef ASSERT - if (UseLargePagesIndividualAllocation && - LargePagesIndividualAllocationInjectError) { - if (TracePageSizes && Verbose) { - tty->print_cr("Reserving large pages individually failed."); - } - } -#endif - return NULL; - } - bytes_remaining -= bytes_to_rq; - next_alloc_addr += bytes_to_rq; - } return p_buf; @@ -2867,14 +3022,43 @@ assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks"); // Don't attempt to print anything if the OS call fails. We're // probably low on resources, so the print itself may cause crashes. - bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0; - if (result != NULL && exec) { - DWORD oldprot; - // Windows doc says to use VirtualProtect to get execute permissions - return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0; + + // unless we have NUMAInterleaving enabled, the range of a commit + // is always within a reserve covered by a single VirtualAlloc + // in that case we can just do a single commit for the requested size + if (!UseNUMAInterleaving) { + if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false; + if (exec) { + DWORD oldprot; + // Windows doc says to use VirtualProtect to get execute permissions + if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false; + } + return true; } else { - return result; - } + + // when NUMAInterleaving is enabled, the commit might cover a range that + // came from multiple VirtualAlloc reserves (using allocate_pages_individually). + // VirtualQuery can help us determine that. The RegionSize that VirtualQuery + // returns represents the number of bytes that can be committed in one step. + size_t bytes_remaining = bytes; + char * next_alloc_addr = addr; + while (bytes_remaining > 0) { + MEMORY_BASIC_INFORMATION alloc_info; + VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info)); + size_t bytes_to_rq = MIN2(bytes_remaining, (size_t)alloc_info.RegionSize); + if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL) + return false; + if (exec) { + DWORD oldprot; + if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot)) + return false; + } + bytes_remaining -= bytes_to_rq; + next_alloc_addr += bytes_to_rq; + } + } + // if we made it this far, return true + return true; } bool os::commit_memory(char* addr, size_t size, size_t alignment_hint, @@ -2948,14 +3132,15 @@ void os::numa_make_global(char *addr, size_t bytes) { } void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { } bool os::numa_topology_changed() { return false; } -size_t os::numa_get_groups_num() { return 1; } +size_t os::numa_get_groups_num() { return numa_node_list_holder.get_count(); } int os::numa_get_group_id() { return 0; } size_t os::numa_get_leaf_groups(int *ids, size_t size) { - if (size > 0) { - ids[0] = 0; - return 1; - } - return 0; + // check for size bigger than actual groups_num + size = MIN2(size, numa_get_groups_num()); + for (int i = 0; i < (int)size; i++) { + ids[i] = numa_node_list_holder.get_node_list_entry(i); + } + return size; } bool os::get_page_info(char *start, page_info* info) { @@ -3480,7 +3665,7 @@ if(Verbose && PrintMiscellaneous) tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page); #endif -} + } os::large_page_init(); @@ -3583,8 +3768,10 @@ // initialize thread priority policy prio_init(); - if (UseNUMA && !ForceNUMA) { - UseNUMA = false; // Currently unsupported. + if (UseNUMAInterleaving) { + // first check whether this Windows OS supports VirtualAllocExNuma, if not ignore this flag + bool success = numa_interleaving_init(); + if (!success) UseNUMAInterleaving = false; } return JNI_OK; @@ -4758,7 +4945,14 @@ // Kernel32 API typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void); +typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD); +typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG); +typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG); + GetLargePageMinimum_Fn os::Kernel32Dll::_GetLargePageMinimum = NULL; +VirtualAllocExNuma_Fn os::Kernel32Dll::_VirtualAllocExNuma = NULL; +GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL; +GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL; BOOL os::Kernel32Dll::initialized = FALSE; SIZE_T os::Kernel32Dll::GetLargePageMinimum() { assert(initialized && _GetLargePageMinimum != NULL, @@ -4773,16 +4967,53 @@ return _GetLargePageMinimum != NULL; } +BOOL os::Kernel32Dll::NumaCallsAvailable() { + if (!initialized) { + initialize(); + } + return _VirtualAllocExNuma != NULL; +} + +LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) { + assert(initialized && _VirtualAllocExNuma != NULL, + "NUMACallsAvailable() not yet called"); + + return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node); +} + +BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) { + assert(initialized && _GetNumaHighestNodeNumber != NULL, + "NUMACallsAvailable() not yet called"); + + return _GetNumaHighestNodeNumber(ptr_highest_node_number); +} + +BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) { + assert(initialized && _GetNumaNodeProcessorMask != NULL, + "NUMACallsAvailable() not yet called"); + + return _GetNumaNodeProcessorMask(node, proc_mask); +} + + +void os::Kernel32Dll::initializeCommon() { + if (!initialized) { + HMODULE handle = ::GetModuleHandle("Kernel32.dll"); + assert(handle != NULL, "Just check"); + _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum"); + _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(handle, "VirtualAllocExNuma"); + _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(handle, "GetNumaHighestNodeNumber"); + _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(handle, "GetNumaNodeProcessorMask"); + initialized = TRUE; + } +} + + #ifndef JDK6_OR_EARLIER void os::Kernel32Dll::initialize() { - if (!initialized) { - HMODULE handle = ::GetModuleHandle("Kernel32.dll"); - assert(handle != NULL, "Just check"); - _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum"); - initialized = TRUE; - } + initializeCommon(); } @@ -4887,18 +5118,19 @@ Module32Next_Fn os::Kernel32Dll::_Module32Next = NULL; GetNativeSystemInfo_Fn os::Kernel32Dll::_GetNativeSystemInfo = NULL; + void os::Kernel32Dll::initialize() { if (!initialized) { HMODULE handle = ::GetModuleHandle("Kernel32.dll"); assert(handle != NULL, "Just check"); _SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread"); - _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum"); _CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn) ::GetProcAddress(handle, "CreateToolhelp32Snapshot"); _Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First"); _Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next"); _GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo"); + initializeCommon(); // resolve the functions that always need resolving initialized = TRUE; } @@ -4964,6 +5196,8 @@ _GetNativeSystemInfo(lpSystemInfo); } + + // PSAPI API diff -r 5c123cbeebbe -r 4668545121b8 src/os/windows/vm/os_windows.hpp --- a/src/os/windows/vm/os_windows.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/os/windows/vm/os_windows.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -173,13 +173,25 @@ static BOOL GetNativeSystemInfoAvailable(); static void GetNativeSystemInfo(LPSYSTEM_INFO); + // NUMA calls + static BOOL NumaCallsAvailable(); + static LPVOID VirtualAllocExNuma(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD); + static BOOL GetNumaHighestNodeNumber(PULONG); + static BOOL GetNumaNodeProcessorMask(UCHAR, PULONGLONG); + private: // GetLargePageMinimum available on Windows Vista/Windows Server 2003 // and later + // NUMA calls available Windows Vista/WS2008 and later + static SIZE_T (WINAPI *_GetLargePageMinimum)(void); + static LPVOID (WINAPI *_VirtualAllocExNuma) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD); + static BOOL (WINAPI *_GetNumaHighestNodeNumber) (PULONG); + static BOOL (WINAPI *_GetNumaNodeProcessorMask) (UCHAR, PULONGLONG); static BOOL initialized; static void initialize(); + static void initializeCommon(); #ifdef JDK6_OR_EARLIER private: diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -4069,6 +4069,23 @@ } #endif // PRODUCT +G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) : + ParGCAllocBuffer(gclab_word_size), + _should_mark_objects(false), + _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size), + _retired(false) +{ + //_should_mark_objects is set to true when G1ParCopyHelper needs to + // mark the forwarded location of an evacuated object. + // We set _should_mark_objects to true if marking is active, i.e. when we + // need to propagate a mark, or during an initial mark pause, i.e. when we + // need to mark objects immediately reachable by the roots. + if (G1CollectedHeap::heap()->mark_in_progress() || + G1CollectedHeap::heap()->g1_policy()->during_initial_mark_pause()) { + _should_mark_objects = true; + } +} + G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num) : _g1h(g1h), _refs(g1h->task_queue(queue_num)), @@ -4184,12 +4201,14 @@ G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : _g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()), - _par_scan_state(par_scan_state) { } - -template void G1ParCopyHelper::mark_forwardee(T* p) { - // This is called _after_ do_oop_work has been called, hence after - // the object has been relocated to its new location and *p points - // to its new location. + _par_scan_state(par_scan_state), + _during_initial_mark(_g1->g1_policy()->during_initial_mark_pause()), + _mark_in_progress(_g1->mark_in_progress()) { } + +template void G1ParCopyHelper::mark_object(T* p) { + // This is called from do_oop_work for objects that are not + // in the collection set. Objects in the collection set + // are marked after they have been evacuated. T heap_oop = oopDesc::load_heap_oop(p); if (!oopDesc::is_null(heap_oop)) { @@ -4201,7 +4220,7 @@ } } -oop G1ParCopyHelper::copy_to_survivor_space(oop old) { +oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_copy) { size_t word_sz = old->size(); HeapRegion* from_region = _g1->heap_region_containing_raw(old); // +1 to make the -1 indexes valid... @@ -4257,8 +4276,8 @@ obj->set_mark(m); } - // preserve "next" mark bit - if (_g1->mark_in_progress() && !_g1->is_obj_ill(old)) { + // Mark the evacuated object or propagate "next" mark bit + if (should_mark_copy) { if (!use_local_bitmaps || !_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr)) { // if we couldn't mark it on the local bitmap (this happens when @@ -4266,11 +4285,12 @@ // the bullet and do the standard parallel mark _cm->markAndGrayObjectIfNecessary(obj); } -#if 1 + if (_g1->isMarkedNext(old)) { + // Unmark the object's old location so that marking + // doesn't think the old object is alive. _cm->nextMarkBitMap()->parClear((HeapWord*)old); } -#endif } size_t* surv_young_words = _par_scan_state->surviving_young_words(); @@ -4293,26 +4313,62 @@ return obj; } -template +template template -void G1ParCopyClosure +void G1ParCopyClosure ::do_oop_work(T* p) { oop obj = oopDesc::load_decode_heap_oop(p); assert(barrier != G1BarrierRS || obj != NULL, "Precondition: G1BarrierRS implies obj is nonNull"); + // Marking: + // If the object is in the collection set, then the thread + // that copies the object should mark, or propagate the + // mark to, the evacuated object. + // If the object is not in the collection set then we + // should call the mark_object() method depending on the + // value of the template parameter do_mark_object (which will + // be true for root scanning closures during an initial mark + // pause). + // The mark_object() method first checks whether the object + // is marked and, if not, attempts to mark the object. + // here the null check is implicit in the cset_fast_test() test if (_g1->in_cset_fast_test(obj)) { if (obj->is_forwarded()) { oopDesc::encode_store_heap_oop(p, obj->forwardee()); + // If we are a root scanning closure during an initial + // mark pause (i.e. do_mark_object will be true) then + // we also need to handle marking of roots in the + // event of an evacuation failure. In the event of an + // evacuation failure, the object is forwarded to itself + // and not copied so let's mark it here. + if (do_mark_object && obj->forwardee() == obj) { + mark_object(p); + } } else { - oop copy_oop = copy_to_survivor_space(obj); + // We need to mark the copied object if we're a root scanning + // closure during an initial mark pause (i.e. do_mark_object + // will be true), or the object is already marked and we need + // to propagate the mark to the evacuated copy. + bool should_mark_copy = do_mark_object || + _during_initial_mark || + (_mark_in_progress && !_g1->is_obj_ill(obj)); + + oop copy_oop = copy_to_survivor_space(obj, should_mark_copy); oopDesc::encode_store_heap_oop(p, copy_oop); } // When scanning the RS, we only care about objs in CS. if (barrier == G1BarrierRS) { _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num()); } + } else { + // The object is not in collection set. If we're a root scanning + // closure during an initial mark pause (i.e. do_mark_object will + // be true) then attempt to mark the object. + if (do_mark_object) { + mark_object(p); + } } if (barrier == G1BarrierEvac && obj != NULL) { diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -1715,26 +1715,22 @@ class G1ParGCAllocBuffer: public ParGCAllocBuffer { private: bool _retired; - bool _during_marking; + bool _should_mark_objects; GCLabBitMap _bitmap; public: - G1ParGCAllocBuffer(size_t gclab_word_size) : - ParGCAllocBuffer(gclab_word_size), - _during_marking(G1CollectedHeap::heap()->mark_in_progress()), - _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size), - _retired(false) - { } + G1ParGCAllocBuffer(size_t gclab_word_size); inline bool mark(HeapWord* addr) { guarantee(use_local_bitmaps, "invariant"); - assert(_during_marking, "invariant"); + assert(_should_mark_objects, "invariant"); return _bitmap.mark(addr); } inline void set_buf(HeapWord* buf) { - if (use_local_bitmaps && _during_marking) + if (use_local_bitmaps && _should_mark_objects) { _bitmap.set_buffer(buf); + } ParGCAllocBuffer::set_buf(buf); _retired = false; } @@ -1742,7 +1738,7 @@ inline void retire(bool end_of_gc, bool retain) { if (_retired) return; - if (use_local_bitmaps && _during_marking) { + if (use_local_bitmaps && _should_mark_objects) { _bitmap.retire(); } ParGCAllocBuffer::retire(end_of_gc, retain); diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/gc_implementation/g1/g1OopClosures.hpp --- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -50,6 +50,8 @@ G1RemSet* _g1_rem; ConcurrentMark* _cm; G1ParScanThreadState* _par_scan_state; + bool _during_initial_mark; + bool _mark_in_progress; public: G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state); bool apply_to_weak_ref_discovered_field() { return true; } @@ -102,8 +104,8 @@ class G1ParCopyHelper : public G1ParClosureSuper { G1ParScanClosure *_scanner; protected: - template void mark_forwardee(T* p); - oop copy_to_survivor_space(oop obj); + template void mark_object(T* p); + oop copy_to_survivor_space(oop obj, bool should_mark_copy); public: G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, G1ParScanClosure *scanner) : @@ -111,7 +113,7 @@ }; template + bool do_mark_object> class G1ParCopyClosure : public G1ParCopyHelper { G1ParScanClosure _scanner; template void do_oop_work(T* p); @@ -120,8 +122,6 @@ _scanner(g1, par_scan_state), G1ParCopyHelper(g1, par_scan_state, &_scanner) { } template void do_oop_nv(T* p) { do_oop_work(p); - if (do_mark_forwardee) - mark_forwardee(p); } virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/gc_implementation/g1/g1_globals.hpp --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -124,9 +124,6 @@ develop(bool, G1RSBarrierNullFilter, true, \ "If true, generate null-pointer filtering code in RS barrier") \ \ - develop(bool, G1PrintCTFilterStats, false, \ - "If true, print stats on RS filtering effectiveness") \ - \ develop(bool, G1DeferredRSUpdate, true, \ "If true, use deferred RS updates") \ \ diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp --- a/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -36,7 +36,7 @@ }; template + bool do_mark_object> class G1ParCopyClosure; class G1ParScanClosure; class G1ParPushHeapRSClosure; diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/runtime/arguments.cpp --- a/src/share/vm/runtime/arguments.cpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/runtime/arguments.cpp Fri Sep 02 21:33:57 2011 -0700 @@ -1423,6 +1423,9 @@ if (FLAG_IS_DEFAULT(MinHeapDeltaBytes)) { FLAG_SET_DEFAULT(MinHeapDeltaBytes, 64*M); } + // For those collectors or operating systems (eg, Windows) that do + // not support full UseNUMA, we will map to UseNUMAInterleaving for now + UseNUMAInterleaving = true; } } diff -r 5c123cbeebbe -r 4668545121b8 src/share/vm/runtime/globals.hpp --- a/src/share/vm/runtime/globals.hpp Fri Sep 02 15:52:03 2011 -0700 +++ b/src/share/vm/runtime/globals.hpp Fri Sep 02 21:33:57 2011 -0700 @@ -475,6 +475,12 @@ product(bool, UseNUMA, false, \ "Use NUMA if available") \ \ + product(bool, UseNUMAInterleaving, false, \ + "Interleave memory across NUMA nodes if available") \ + \ + product(uintx, NUMAInterleaveGranularity, 2*M, \ + "Granularity to use for NUMA interleaving on Windows OS") \ + \ product(bool, ForceNUMA, false, \ "Force NUMA optimizations on single-node/UMA systems") \ \