# HG changeset patch # User ysr # Date 1305012801 25200 # Node ID fc2b798ab316df025526f208aeeef19609ad51b3 # Parent 54a56bbaf95b1a2c63aea5b45c0219f03bed4822 6883834: ParNew: assert(!_g->to()->is_in_reserved(obj),"Scanning field twice?") with LargeObjects tests Summary: Fixed process_chunk_boundaries(), used for parallel card scanning when using ParNew/CMS, so as to prevent double-scanning, or worse, non-scanning of imprecisely marked objects exceeding parallel chunk size. Made some sizing parameters for parallel card scanning diagnostic, disabled ParallelGCRetainPLAB, and elaborated and clarified some comments. Reviewed-by: stefank, johnc diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp --- a/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp Tue May 10 00:33:21 2011 -0700 @@ -29,13 +29,14 @@ #include "memory/sharedHeap.hpp" #include "memory/space.inline.hpp" #include "memory/universe.hpp" +#include "oops/oop.inline.hpp" #include "runtime/java.hpp" #include "runtime/mutexLocker.hpp" #include "runtime/virtualspace.hpp" void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr, - DirtyCardToOopClosure* dcto_cl, - ClearNoncleanCardWrapper* cl, + OopsInGenClosure* cl, + CardTableRS* ct, int n_threads) { assert(n_threads > 0, "Error: expected n_threads > 0"); assert((n_threads == 1 && ParallelGCThreads == 0) || @@ -49,14 +50,14 @@ lowest_non_clean_base_chunk_index, lowest_non_clean_chunk_size); - int n_strides = n_threads * StridesPerThread; + int n_strides = n_threads * ParGCStridesPerThread; SequentialSubTasksDone* pst = sp->par_seq_tasks(); pst->set_n_threads(n_threads); pst->set_n_tasks(n_strides); int stride = 0; while (!pst->is_task_claimed(/* reference */ stride)) { - process_stride(sp, mr, stride, n_strides, dcto_cl, cl, + process_stride(sp, mr, stride, n_strides, cl, ct, lowest_non_clean, lowest_non_clean_base_chunk_index, lowest_non_clean_chunk_size); @@ -79,13 +80,13 @@ process_stride(Space* sp, MemRegion used, jint stride, int n_strides, - DirtyCardToOopClosure* dcto_cl, - ClearNoncleanCardWrapper* cl, + OopsInGenClosure* cl, + CardTableRS* ct, jbyte** lowest_non_clean, uintptr_t lowest_non_clean_base_chunk_index, size_t lowest_non_clean_chunk_size) { - // We don't have to go downwards here; it wouldn't help anyway, - // because of parallelism. + // We go from higher to lower addresses here; it wouldn't help that much + // because of the strided parallelism pattern used here. // Find the first card address of the first chunk in the stride that is // at least "bottom" of the used region. @@ -98,25 +99,35 @@ if ((uintptr_t)stride >= start_chunk_stride_num) { chunk_card_start = (jbyte*)(start_card + (stride - start_chunk_stride_num) * - CardsPerStrideChunk); + ParGCCardsPerStrideChunk); } else { // Go ahead to the next chunk group boundary, then to the requested stride. chunk_card_start = (jbyte*)(start_card + (n_strides - start_chunk_stride_num + stride) * - CardsPerStrideChunk); + ParGCCardsPerStrideChunk); } while (chunk_card_start < end_card) { - // We don't have to go downwards here; it wouldn't help anyway, - // because of parallelism. (We take care with "min_done"; see below.) + // Even though we go from lower to higher addresses below, the + // strided parallelism can interleave the actual processing of the + // dirty pages in various ways. For a specific chunk within this + // stride, we take care to avoid double scanning or missing a card + // by suitably initializing the "min_done" field in process_chunk_boundaries() + // below, together with the dirty region extension accomplished in + // DirtyCardToOopClosure::do_MemRegion(). + jbyte* chunk_card_end = chunk_card_start + ParGCCardsPerStrideChunk; // Invariant: chunk_mr should be fully contained within the "used" region. - jbyte* chunk_card_end = chunk_card_start + CardsPerStrideChunk; MemRegion chunk_mr = MemRegion(addr_for(chunk_card_start), chunk_card_end >= end_card ? used.end() : addr_for(chunk_card_end)); assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)"); assert(used.contains(chunk_mr), "chunk_mr should be subset of used"); + DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(), + cl->gen_boundary()); + ClearNoncleanCardWrapper clear_cl(dcto_cl, ct); + + // Process the chunk. process_chunk_boundaries(sp, dcto_cl, @@ -126,17 +137,30 @@ lowest_non_clean_base_chunk_index, lowest_non_clean_chunk_size); + // We want the LNC array updates above in process_chunk_boundaries + // to be visible before any of the card table value changes as a + // result of the dirty card iteration below. + OrderAccess::storestore(); + // We do not call the non_clean_card_iterate_serial() version because - // we want to clear the cards, and the ClearNoncleanCardWrapper closure - // itself does the work of finding contiguous dirty ranges of cards to - // process (and clear). - cl->do_MemRegion(chunk_mr); + // we want to clear the cards: clear_cl here does the work of finding + // contiguous dirty ranges of cards to process and clear. + clear_cl.do_MemRegion(chunk_mr); // Find the next chunk of the stride. - chunk_card_start += CardsPerStrideChunk * n_strides; + chunk_card_start += ParGCCardsPerStrideChunk * n_strides; } } + +// If you want a talkative process_chunk_boundaries, +// then #define NOISY(x) x +#ifdef NOISY +#error "Encountered a global preprocessor flag, NOISY, which might clash with local definition to follow" +#else +#define NOISY(x) +#endif + void CardTableModRefBS:: process_chunk_boundaries(Space* sp, @@ -147,126 +171,232 @@ uintptr_t lowest_non_clean_base_chunk_index, size_t lowest_non_clean_chunk_size) { - // We must worry about the chunk boundaries. + // We must worry about non-array objects that cross chunk boundaries, + // because such objects are both precisely and imprecisely marked: + // .. if the head of such an object is dirty, the entire object + // needs to be scanned, under the interpretation that this + // was an imprecise mark + // .. if the head of such an object is not dirty, we can assume + // precise marking and it's efficient to scan just the dirty + // cards. + // In either case, each scanned reference must be scanned precisely + // once so as to avoid cloning of a young referent. For efficiency, + // our closures depend on this property and do not protect against + // double scans. - // First, set our max_to_do: - HeapWord* max_to_do = NULL; uintptr_t cur_chunk_index = addr_to_chunk_index(chunk_mr.start()); cur_chunk_index = cur_chunk_index - lowest_non_clean_base_chunk_index; + NOISY(tty->print_cr("===========================================================================");) + NOISY(tty->print_cr(" process_chunk_boundary: Called with [" PTR_FORMAT "," PTR_FORMAT ")", + chunk_mr.start(), chunk_mr.end());) + + // First, set "our" lowest_non_clean entry, which would be + // used by the thread scanning an adjoining left chunk with + // a non-array object straddling the mutual boundary. + // Find the object that spans our boundary, if one exists. + // first_block is the block possibly straddling our left boundary. + HeapWord* first_block = sp->block_start(chunk_mr.start()); + assert((chunk_mr.start() != used.start()) || (first_block == chunk_mr.start()), + "First chunk should always have a co-initial block"); + // Does the block straddle the chunk's left boundary, and is it + // a non-array object? + if (first_block < chunk_mr.start() // first block straddles left bdry + && sp->block_is_obj(first_block) // first block is an object + && !(oop(first_block)->is_objArray() // first block is not an array (arrays are precisely dirtied) + || oop(first_block)->is_typeArray())) { + // Find our least non-clean card, so that a left neighbour + // does not scan an object straddling the mutual boundary + // too far to the right, and attempt to scan a portion of + // that object twice. + jbyte* first_dirty_card = NULL; + jbyte* last_card_of_first_obj = + byte_for(first_block + sp->block_size(first_block) - 1); + jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start()); + jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last()); + jbyte* last_card_to_check = + (jbyte*) MIN2((intptr_t) last_card_of_cur_chunk, + (intptr_t) last_card_of_first_obj); + // Note that this does not need to go beyond our last card + // if our first object completely straddles this chunk. + for (jbyte* cur = first_card_of_cur_chunk; + cur <= last_card_to_check; cur++) { + jbyte val = *cur; + if (card_will_be_scanned(val)) { + first_dirty_card = cur; break; + } else { + assert(!card_may_have_been_dirty(val), "Error"); + } + } + if (first_dirty_card != NULL) { + NOISY(tty->print_cr(" LNC: Found a dirty card at " PTR_FORMAT " in current chunk", + first_dirty_card);) + assert(0 <= cur_chunk_index && cur_chunk_index < lowest_non_clean_chunk_size, + "Bounds error."); + assert(lowest_non_clean[cur_chunk_index] == NULL, + "Write exactly once : value should be stable hereafter for this round"); + lowest_non_clean[cur_chunk_index] = first_dirty_card; + } NOISY(else { + tty->print_cr(" LNC: Found no dirty card in current chunk; leaving LNC entry NULL"); + // In the future, we could have this thread look for a non-NULL value to copy from its + // right neighbour (up to the end of the first object). + if (last_card_of_cur_chunk < last_card_of_first_obj) { + tty->print_cr(" LNC: BEWARE!!! first obj straddles past right end of chunk:\n" + " might be efficient to get value from right neighbour?"); + } + }) + } else { + // In this case we can help our neighbour by just asking them + // to stop at our first card (even though it may not be dirty). + NOISY(tty->print_cr(" LNC: first block is not a non-array object; setting LNC to first card of current chunk");) + assert(lowest_non_clean[cur_chunk_index] == NULL, "Write once : value should be stable hereafter"); + jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start()); + lowest_non_clean[cur_chunk_index] = first_card_of_cur_chunk; + } + NOISY(tty->print_cr(" process_chunk_boundary: lowest_non_clean[" INTPTR_FORMAT "] = " PTR_FORMAT + " which corresponds to the heap address " PTR_FORMAT, + cur_chunk_index, lowest_non_clean[cur_chunk_index], + (lowest_non_clean[cur_chunk_index] != NULL) + ? addr_for(lowest_non_clean[cur_chunk_index]) + : NULL);) + NOISY(tty->print_cr("---------------------------------------------------------------------------");) + + // Next, set our own max_to_do, which will strictly/exclusively bound + // the highest address that we will scan past the right end of our chunk. + HeapWord* max_to_do = NULL; if (chunk_mr.end() < used.end()) { - // This is not the last chunk in the used region. What is the last - // object? - HeapWord* last_block = sp->block_start(chunk_mr.end()); + // This is not the last chunk in the used region. + // What is our last block? We check the first block of + // the next (right) chunk rather than strictly check our last block + // because it's potentially more efficient to do so. + HeapWord* const last_block = sp->block_start(chunk_mr.end()); assert(last_block <= chunk_mr.end(), "In case this property changes."); - if (last_block == chunk_mr.end() - || !sp->block_is_obj(last_block)) { + if ((last_block == chunk_mr.end()) // our last block does not straddle boundary + || !sp->block_is_obj(last_block) // last_block isn't an object + || oop(last_block)->is_objArray() // last_block is an array (precisely marked) + || oop(last_block)->is_typeArray()) { max_to_do = chunk_mr.end(); - + NOISY(tty->print_cr(" process_chunk_boundary: Last block on this card is not a non-array object;\n" + " max_to_do left at " PTR_FORMAT, max_to_do);) } else { - // It is an object and starts before the end of the current chunk. + assert(last_block < chunk_mr.end(), "Tautology"); + // It is a non-array object that straddles the right boundary of this chunk. // last_obj_card is the card corresponding to the start of the last object // in the chunk. Note that the last object may not start in // the chunk. - jbyte* last_obj_card = byte_for(last_block); - if (!card_may_have_been_dirty(*last_obj_card)) { - // The card containing the head is not dirty. Any marks in + jbyte* const last_obj_card = byte_for(last_block); + const jbyte val = *last_obj_card; + if (!card_will_be_scanned(val)) { + assert(!card_may_have_been_dirty(val), "Error"); + // The card containing the head is not dirty. Any marks on // subsequent cards still in this chunk must have been made - // precisely; we can cap processing at the end. + // precisely; we can cap processing at the end of our chunk. max_to_do = chunk_mr.end(); + NOISY(tty->print_cr(" process_chunk_boundary: Head of last object on this card is not dirty;\n" + " max_to_do left at " PTR_FORMAT, + max_to_do);) } else { // The last object must be considered dirty, and extends onto the // following chunk. Look for a dirty card in that chunk that will // bound our processing. jbyte* limit_card = NULL; - size_t last_block_size = sp->block_size(last_block); - jbyte* last_card_of_last_obj = + const size_t last_block_size = sp->block_size(last_block); + jbyte* const last_card_of_last_obj = byte_for(last_block + last_block_size - 1); - jbyte* first_card_of_next_chunk = byte_for(chunk_mr.end()); + jbyte* const first_card_of_next_chunk = byte_for(chunk_mr.end()); // This search potentially goes a long distance looking - // for the next card that will be scanned. For example, - // an object that is an array of primitives will not - // have any cards covering regions interior to the array - // that will need to be scanned. The scan can be terminated - // at the last card of the next chunk. That would leave - // limit_card as NULL and would result in "max_to_do" - // being set with the LNC value or with the end - // of the last block. - jbyte* last_card_of_next_chunk = first_card_of_next_chunk + - CardsPerStrideChunk; - assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start()) - == CardsPerStrideChunk, "last card of next chunk may be wrong"); - jbyte* last_card_to_check = (jbyte*) MIN2(last_card_of_last_obj, - last_card_of_next_chunk); + // for the next card that will be scanned, terminating + // at the end of the last_block, if no earlier dirty card + // is found. + assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start()) == ParGCCardsPerStrideChunk, + "last card of next chunk may be wrong"); for (jbyte* cur = first_card_of_next_chunk; - cur <= last_card_to_check; cur++) { - if (card_will_be_scanned(*cur)) { + cur <= last_card_of_last_obj; cur++) { + const jbyte val = *cur; + if (card_will_be_scanned(val)) { + NOISY(tty->print_cr(" Found a non-clean card " PTR_FORMAT " with value 0x%x", + cur, (int)val);) limit_card = cur; break; + } else { + assert(!card_may_have_been_dirty(val), "Error: card can't be skipped"); } } - assert(0 <= cur_chunk_index+1 && - cur_chunk_index+1 < lowest_non_clean_chunk_size, + if (limit_card != NULL) { + max_to_do = addr_for(limit_card); + assert(limit_card != NULL && max_to_do != NULL, "Error"); + NOISY(tty->print_cr(" process_chunk_boundary: Found a dirty card at " PTR_FORMAT + " max_to_do set at " PTR_FORMAT " which is before end of last block in chunk: " + PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT, + limit_card, max_to_do, last_block, last_block_size, (last_block+last_block_size));) + } else { + // The following is a pessimistic value, because it's possible + // that a dirty card on a subsequent chunk has been cleared by + // the time we get to look at it; we'll correct for that further below, + // using the LNC array which records the least non-clean card + // before cards were cleared in a particular chunk. + limit_card = last_card_of_last_obj; + max_to_do = last_block + last_block_size; + assert(limit_card != NULL && max_to_do != NULL, "Error"); + NOISY(tty->print_cr(" process_chunk_boundary: Found no dirty card before end of last block in chunk\n" + " Setting limit_card to " PTR_FORMAT + " and max_to_do " PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT, + limit_card, last_block, last_block_size, max_to_do);) + } + assert(0 < cur_chunk_index+1 && cur_chunk_index+1 < lowest_non_clean_chunk_size, "Bounds error."); - // LNC for the next chunk - jbyte* lnc_card = lowest_non_clean[cur_chunk_index+1]; - if (limit_card == NULL) { - limit_card = lnc_card; - } - if (limit_card != NULL) { + // It is possible that a dirty card for the last object may have been + // cleared before we had a chance to examine it. In that case, the value + // will have been logged in the LNC for that chunk. + // We need to examine as many chunks to the right as this object + // covers. + const uintptr_t last_chunk_index_to_check = addr_to_chunk_index(last_block + last_block_size - 1) + - lowest_non_clean_base_chunk_index; + DEBUG_ONLY(const uintptr_t last_chunk_index = addr_to_chunk_index(used.end()) + - lowest_non_clean_base_chunk_index;) + assert(last_chunk_index_to_check <= last_chunk_index, + err_msg("Out of bounds: last_chunk_index_to_check " INTPTR_FORMAT + " exceeds last_chunk_index " INTPTR_FORMAT, + last_chunk_index_to_check, last_chunk_index)); + for (uintptr_t lnc_index = cur_chunk_index + 1; + lnc_index <= last_chunk_index_to_check; + lnc_index++) { + jbyte* lnc_card = lowest_non_clean[lnc_index]; if (lnc_card != NULL) { - limit_card = (jbyte*)MIN2((intptr_t)limit_card, - (intptr_t)lnc_card); - } - max_to_do = addr_for(limit_card); - } else { - max_to_do = last_block + last_block_size; + // we can stop at the first non-NULL entry we find + if (lnc_card <= limit_card) { + NOISY(tty->print_cr(" process_chunk_boundary: LNC card " PTR_FORMAT " is lower than limit_card " PTR_FORMAT, + " max_to_do will be lowered to " PTR_FORMAT " from " PTR_FORMAT, + lnc_card, limit_card, addr_for(lnc_card), max_to_do);) + limit_card = lnc_card; + max_to_do = addr_for(limit_card); + assert(limit_card != NULL && max_to_do != NULL, "Error"); + } + // In any case, we break now + break; + } // else continue to look for a non-NULL entry if any } + assert(limit_card != NULL && max_to_do != NULL, "Error"); } + assert(max_to_do != NULL, "OOPS 1 !"); } - assert(max_to_do != NULL, "OOPS!"); + assert(max_to_do != NULL, "OOPS 2!"); } else { max_to_do = used.end(); + NOISY(tty->print_cr(" process_chunk_boundary: Last chunk of this space;\n" + " max_to_do left at " PTR_FORMAT, + max_to_do);) } + assert(max_to_do != NULL, "OOPS 3!"); // Now we can set the closure we're using so it doesn't to beyond // max_to_do. dcto_cl->set_min_done(max_to_do); #ifndef PRODUCT dcto_cl->set_last_bottom(max_to_do); #endif + NOISY(tty->print_cr("===========================================================================\n");) +} - // Now we set *our" lowest_non_clean entry. - // Find the object that spans our boundary, if one exists. - // Nothing to do on the first chunk. - if (chunk_mr.start() > used.start()) { - // first_block is the block possibly spanning the chunk start - HeapWord* first_block = sp->block_start(chunk_mr.start()); - // Does the block span the start of the chunk and is it - // an object? - if (first_block < chunk_mr.start() && - sp->block_is_obj(first_block)) { - jbyte* first_dirty_card = NULL; - jbyte* last_card_of_first_obj = - byte_for(first_block + sp->block_size(first_block) - 1); - jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start()); - jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last()); - jbyte* last_card_to_check = - (jbyte*) MIN2((intptr_t) last_card_of_cur_chunk, - (intptr_t) last_card_of_first_obj); - for (jbyte* cur = first_card_of_cur_chunk; - cur <= last_card_to_check; cur++) { - if (card_will_be_scanned(*cur)) { - first_dirty_card = cur; break; - } - } - if (first_dirty_card != NULL) { - assert(0 <= cur_chunk_index && - cur_chunk_index < lowest_non_clean_chunk_size, - "Bounds error."); - lowest_non_clean[cur_chunk_index] = first_dirty_card; - } - } - } -} +#undef NOISY void CardTableModRefBS:: @@ -283,8 +413,8 @@ // LNC array for the covered region. Any later expansion can't affect // the used_at_save_marks region. // (I observed a bug in which the first thread to execute this would - // resize, and then it would cause "expand_and_allocates" that would - // Increase the number of chunks in the covered region. Then a second + // resize, and then it would cause "expand_and_allocate" that would + // increase the number of chunks in the covered region. Then a second // thread would come and execute this, see that the size didn't match, // and free and allocate again. So the first thread would be using a // freed "_lowest_non_clean" array.) diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/gc_implementation/parNew/parOopClosures.inline.hpp --- a/src/share/vm/gc_implementation/parNew/parOopClosures.inline.hpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/gc_implementation/parNew/parOopClosures.inline.hpp Tue May 10 00:33:21 2011 -0700 @@ -77,7 +77,23 @@ if (!oopDesc::is_null(heap_oop)) { oop obj = oopDesc::decode_heap_oop_not_null(heap_oop); if ((HeapWord*)obj < _boundary) { - assert(!_g->to()->is_in_reserved(obj), "Scanning field twice?"); +#ifndef PRODUCT + if (_g->to()->is_in_reserved(obj)) { + tty->print_cr("Scanning field (" PTR_FORMAT ") twice?", p); + GenCollectedHeap* gch = (GenCollectedHeap*)Universe::heap(); + Space* sp = gch->space_containing(p); + oop obj = oop(sp->block_start(p)); + assert((HeapWord*)obj < (HeapWord*)p, "Error"); + tty->print_cr("Object: " PTR_FORMAT, obj); + tty->print_cr("-------"); + obj->print(); + tty->print_cr("-----"); + tty->print_cr("Heap:"); + tty->print_cr("-----"); + gch->print(); + ShouldNotReachHere(); + } +#endif // OK, we need to ensure that it is copied. // We read the klass and mark in this order, so that we can reliably // get the size of the object: if the mark we read is not a diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/memory/cardTableModRefBS.cpp --- a/src/share/vm/memory/cardTableModRefBS.cpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/memory/cardTableModRefBS.cpp Tue May 10 00:33:21 2011 -0700 @@ -455,25 +455,29 @@ return true; } - void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space* sp, MemRegion mr, - DirtyCardToOopClosure* dcto_cl, - ClearNoncleanCardWrapper* cl) { + OopsInGenClosure* cl, + CardTableRS* ct) { if (!mr.is_empty()) { int n_threads = SharedHeap::heap()->n_par_threads(); if (n_threads > 0) { #ifndef SERIALGC - non_clean_card_iterate_parallel_work(sp, mr, dcto_cl, cl, n_threads); + non_clean_card_iterate_parallel_work(sp, mr, cl, ct, n_threads); #else // SERIALGC fatal("Parallel gc not supported here."); #endif // SERIALGC } else { // We do not call the non_clean_card_iterate_serial() version below because // we want to clear the cards (which non_clean_card_iterate_serial() does not - // do for us), and the ClearNoncleanCardWrapper closure itself does the work - // of finding contiguous dirty ranges of cards to process (and clear). - cl->do_MemRegion(mr); + // do for us): clear_cl here does the work of finding contiguous dirty ranges + // of cards to process and clear. + + DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(), + cl->gen_boundary()); + ClearNoncleanCardWrapper clear_cl(dcto_cl, ct); + + clear_cl.do_MemRegion(mr); } } } diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/memory/cardTableModRefBS.hpp --- a/src/share/vm/memory/cardTableModRefBS.hpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/memory/cardTableModRefBS.hpp Tue May 10 00:33:21 2011 -0700 @@ -173,18 +173,17 @@ // A variant of the above that will operate in a parallel mode if // worker threads are available, and clear the dirty cards as it // processes them. - // ClearNoncleanCardWrapper cl must wrap the DirtyCardToOopClosure dcto_cl, - // which may itself be modified by the method. + // XXX ??? MemRegionClosure above vs OopsInGenClosure below XXX + // XXX some new_dcto_cl's take OopClosure's, plus as above there are + // some MemRegionClosures. Clean this up everywhere. XXX void non_clean_card_iterate_possibly_parallel(Space* sp, MemRegion mr, - DirtyCardToOopClosure* dcto_cl, - ClearNoncleanCardWrapper* cl); + OopsInGenClosure* cl, CardTableRS* ct); private: // Work method used to implement non_clean_card_iterate_possibly_parallel() // above in the parallel case. void non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr, - DirtyCardToOopClosure* dcto_cl, - ClearNoncleanCardWrapper* cl, + OopsInGenClosure* cl, CardTableRS* ct, int n_threads); protected: @@ -198,11 +197,6 @@ // *** Support for parallel card scanning. - enum SomeConstantsForParallelism { - StridesPerThread = 2, - CardsPerStrideChunk = 256 - }; - // This is an array, one element per covered region of the card table. // Each entry is itself an array, with one element per chunk in the // covered region. Each entry of these arrays is the lowest non-clean @@ -235,7 +229,7 @@ // covers the given address. uintptr_t addr_to_chunk_index(const void* addr) { uintptr_t card = (uintptr_t) byte_for(addr); - return card / CardsPerStrideChunk; + return card / ParGCCardsPerStrideChunk; } // Apply cl, which must either itself apply dcto_cl or be dcto_cl, @@ -243,8 +237,8 @@ void process_stride(Space* sp, MemRegion used, jint stride, int n_strides, - DirtyCardToOopClosure* dcto_cl, - ClearNoncleanCardWrapper* cl, + OopsInGenClosure* cl, + CardTableRS* ct, jbyte** lowest_non_clean, uintptr_t lowest_non_clean_base_chunk_index, size_t lowest_non_clean_chunk_size); @@ -482,7 +476,7 @@ void verify_dirty_region(MemRegion mr) PRODUCT_RETURN; static size_t par_chunk_heapword_alignment() { - return CardsPerStrideChunk * card_size_in_words; + return ParGCCardsPerStrideChunk * card_size_in_words; } }; diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/memory/cardTableRS.cpp --- a/src/share/vm/memory/cardTableRS.cpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/memory/cardTableRS.cpp Tue May 10 00:33:21 2011 -0700 @@ -162,7 +162,7 @@ } ClearNoncleanCardWrapper::ClearNoncleanCardWrapper( - MemRegionClosure* dirty_card_closure, CardTableRS* ct) : + DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct) : _dirty_card_closure(dirty_card_closure), _ct(ct) { _is_par = (SharedHeap::heap()->n_par_threads() > 0); } @@ -246,10 +246,6 @@ void CardTableRS::younger_refs_in_space_iterate(Space* sp, OopsInGenClosure* cl) { - DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, _ct_bs->precision(), - cl->gen_boundary()); - ClearNoncleanCardWrapper clear_cl(dcto_cl, this); - const MemRegion urasm = sp->used_region_at_save_marks(); #ifdef ASSERT // Convert the assertion check to a warning if we are running @@ -275,10 +271,10 @@ if (!urasm.equals(urasm2)) { warning("CMS+ParNew: Flickering used_region_at_save_marks()!!"); } + ShouldNotReachHere(); } #endif - _ct_bs->non_clean_card_iterate_possibly_parallel(sp, urasm, - dcto_cl, &clear_cl); + _ct_bs->non_clean_card_iterate_possibly_parallel(sp, urasm, cl, this); } void CardTableRS::clear_into_younger(Generation* gen, bool clear_perm) { diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/memory/cardTableRS.hpp --- a/src/share/vm/memory/cardTableRS.hpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/memory/cardTableRS.hpp Tue May 10 00:33:21 2011 -0700 @@ -31,7 +31,6 @@ class Space; class OopsInGenClosure; -class DirtyCardToOopClosure; // This kind of "GenRemSet" uses a card table both as shared data structure // for a mod ref barrier set and for the rem set information. @@ -167,7 +166,7 @@ }; class ClearNoncleanCardWrapper: public MemRegionClosure { - MemRegionClosure* _dirty_card_closure; + DirtyCardToOopClosure* _dirty_card_closure; CardTableRS* _ct; bool _is_par; private: @@ -179,7 +178,7 @@ inline bool clear_card_parallel(jbyte* entry); public: - ClearNoncleanCardWrapper(MemRegionClosure* dirty_card_closure, CardTableRS* ct); + ClearNoncleanCardWrapper(DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct); void do_MemRegion(MemRegion mr); }; diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/memory/space.cpp --- a/src/share/vm/memory/space.cpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/memory/space.cpp Tue May 10 00:33:21 2011 -0700 @@ -97,6 +97,14 @@ } } +// We get called with "mr" representing the dirty region +// that we want to process. Because of imprecise marking, +// we may need to extend the incoming "mr" to the right, +// and scan more. However, because we may already have +// scanned some of that extended region, we may need to +// trim its right-end back some so we do not scan what +// we (or another worker thread) may already have scanned +// or planning to scan. void DirtyCardToOopClosure::do_MemRegion(MemRegion mr) { // Some collectors need to do special things whenever their dirty @@ -148,7 +156,7 @@ // e.g. the dirty card region is entirely in a now free object // -- something that could happen with a concurrent sweeper. bottom = MIN2(bottom, top); - mr = MemRegion(bottom, top); + MemRegion extended_mr = MemRegion(bottom, top); assert(bottom <= top && (_precision != CardTableModRefBS::ObjHeadPreciseArray || _min_done == NULL || @@ -156,8 +164,8 @@ "overlap!"); // Walk the region if it is not empty; otherwise there is nothing to do. - if (!mr.is_empty()) { - walk_mem_region(mr, bottom_obj, top); + if (!extended_mr.is_empty()) { + walk_mem_region(extended_mr, bottom_obj, top); } // An idempotent closure might be applied in any order, so we don't diff -r 54a56bbaf95b -r fc2b798ab316 src/share/vm/runtime/globals.hpp --- a/src/share/vm/runtime/globals.hpp Fri May 06 09:45:18 2011 +0200 +++ b/src/share/vm/runtime/globals.hpp Tue May 10 00:33:21 2011 -0700 @@ -1460,8 +1460,10 @@ product(intx, ParallelGCBufferWastePct, 10, \ "wasted fraction of parallel allocation buffer.") \ \ - product(bool, ParallelGCRetainPLAB, true, \ - "Retain parallel allocation buffers across scavenges.") \ + diagnostic(bool, ParallelGCRetainPLAB, false, \ + "Retain parallel allocation buffers across scavenges; " \ + " -- disabled because this currently conflicts with " \ + " parallel card scanning under certain conditions ") \ \ product(intx, TargetPLABWastePct, 10, \ "target wasted space in last buffer as pct of overall allocation")\ @@ -1495,7 +1497,15 @@ product(uintx, ParGCDesiredObjsFromOverflowList, 20, \ "The desired number of objects to claim from the overflow list") \ \ - product(uintx, CMSParPromoteBlocksToClaim, 16, \ + diagnostic(intx, ParGCStridesPerThread, 2, \ + "The number of strides per worker thread that we divide up the " \ + "card table scanning work into") \ + \ + diagnostic(intx, ParGCCardsPerStrideChunk, 256, \ + "The number of cards in each chunk of the parallel chunks used " \ + "during card table scanning") \ + \ + product(uintx, CMSParPromoteBlocksToClaim, 16, \ "Number of blocks to attempt to claim when refilling CMS LAB for "\ "parallel GC.") \ \