truffle: src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp comparison

comparison src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp @ 3357:fc2b798ab316

6883834: ParNew: assert(!_g->to()->is_in_reserved(obj),"Scanning field twice?") with LargeObjects tests Summary: Fixed process_chunk_boundaries(), used for parallel card scanning when using ParNew/CMS, so as to prevent double-scanning, or worse, non-scanning of imprecisely marked objects exceeding parallel chunk size. Made some sizing parameters for parallel card scanning diagnostic, disabled ParallelGCRetainPLAB, and elaborated and clarified some comments. Reviewed-by: stefank, johnc

author	ysr
date	Tue, 10 May 2011 00:33:21 -0700
parents	c48ad6ab8bdf
children	7d64aa23eb96

comparison

equal deleted inserted replaced

-:54a56bbaf95b
+:fc2b798ab316
 #include "memory/cardTableModRefBS.hpp"
 #include "memory/cardTableRS.hpp"
 #include "memory/sharedHeap.hpp"
 #include "memory/space.inline.hpp"
 #include "memory/universe.hpp"
+#include "oops/oop.inline.hpp"
 #include "runtime/java.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/virtualspace.hpp"
 void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
-DirtyCardToOopClosure* dcto_cl,
+OopsInGenClosure* cl,
-ClearNoncleanCardWrapper* cl,
+CardTableRS* ct,
 int n_threads) {
 assert(n_threads > 0, "Error: expected n_threads > 0");
 assert((n_threads == 1 && ParallelGCThreads == 0) ||
 n_threads <= (int)ParallelGCThreads,
 "# worker threads != # requested!");
 size_t    lowest_non_clean_chunk_size;
 get_LNC_array_for_space(sp, lowest_non_clean,
 lowest_non_clean_base_chunk_index,
 lowest_non_clean_chunk_size);
-int n_strides = n_threads * StridesPerThread;
+int n_strides = n_threads * ParGCStridesPerThread;
 SequentialSubTasksDone* pst = sp->par_seq_tasks();
 pst->set_n_threads(n_threads);
 pst->set_n_tasks(n_strides);
 int stride = 0;
 while (!pst->is_task_claimed(/* reference */ stride)) {
-process_stride(sp, mr, stride, n_strides, dcto_cl, cl,
+process_stride(sp, mr, stride, n_strides, cl, ct,
 lowest_non_clean,
 lowest_non_clean_base_chunk_index,
 lowest_non_clean_chunk_size);
 }
 if (pst->all_tasks_completed()) {
 void
 CardTableModRefBS::
 process_stride(Space* sp,
 MemRegion used,
 jint stride, int n_strides,
-DirtyCardToOopClosure* dcto_cl,
+OopsInGenClosure* cl,
-ClearNoncleanCardWrapper* cl,
+CardTableRS* ct,
 jbyte** lowest_non_clean,
 uintptr_t lowest_non_clean_base_chunk_index,
 size_t    lowest_non_clean_chunk_size) {
-// We don't have to go downwards here; it wouldn't help anyway,
+// We go from higher to lower addresses here; it wouldn't help that much
-// because of parallelism.
+// because of the strided parallelism pattern used here.
 // Find the first card address of the first chunk in the stride that is
 // at least "bottom" of the used region.
 jbyte*    start_card  = byte_for(used.start());
 jbyte*    end_card    = byte_after(used.last());
 jbyte* chunk_card_start;
 if ((uintptr_t)stride >= start_chunk_stride_num) {
 chunk_card_start = (jbyte*)(start_card +
 (stride - start_chunk_stride_num) *
-CardsPerStrideChunk);
+ParGCCardsPerStrideChunk);
 } else {
 // Go ahead to the next chunk group boundary, then to the requested stride.
 chunk_card_start = (jbyte*)(start_card +
 (n_strides - start_chunk_stride_num + stride) *
-CardsPerStrideChunk);
+ParGCCardsPerStrideChunk);
 }
 while (chunk_card_start < end_card) {
-// We don't have to go downwards here; it wouldn't help anyway,
+// Even though we go from lower to higher addresses below, the
-// because of parallelism.  (We take care with "min_done"; see below.)
+// strided parallelism can interleave the actual processing of the
+// dirty pages in various ways. For a specific chunk within this
+// stride, we take care to avoid double scanning or missing a card
+// by suitably initializing the "min_done" field in process_chunk_boundaries()
+// below, together with the dirty region extension accomplished in
+// DirtyCardToOopClosure::do_MemRegion().
+jbyte*    chunk_card_end = chunk_card_start + ParGCCardsPerStrideChunk;
 // Invariant: chunk_mr should be fully contained within the "used" region.
-jbyte*    chunk_card_end = chunk_card_start + CardsPerStrideChunk;
 MemRegion chunk_mr       = MemRegion(addr_for(chunk_card_start),
 chunk_card_end >= end_card ?
 used.end() : addr_for(chunk_card_end));
 assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)");
 assert(used.contains(chunk_mr), "chunk_mr should be subset of used");
+DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(),
+cl->gen_boundary());
+ClearNoncleanCardWrapper clear_cl(dcto_cl, ct);
 // Process the chunk.
 process_chunk_boundaries(sp,
 dcto_cl,
 chunk_mr,
 used,
 lowest_non_clean,
 lowest_non_clean_base_chunk_index,
 lowest_non_clean_chunk_size);
+// We want the LNC array updates above in process_chunk_boundaries
+// to be visible before any of the card table value changes as a
+// result of the dirty card iteration below.
+OrderAccess::storestore();
 // We do not call the non_clean_card_iterate_serial() version because
-// we want to clear the cards, and the ClearNoncleanCardWrapper closure
+// we want to clear the cards: clear_cl here does the work of finding
-// itself does the work of finding contiguous dirty ranges of cards to
+// contiguous dirty ranges of cards to process and clear.
-// process (and clear).
+clear_cl.do_MemRegion(chunk_mr);
-cl->do_MemRegion(chunk_mr);
 // Find the next chunk of the stride.
-chunk_card_start += CardsPerStrideChunk * n_strides;
+chunk_card_start += ParGCCardsPerStrideChunk * n_strides;
 }
 }
+// If you want a talkative process_chunk_boundaries,
+// then #define NOISY(x) x
+#ifdef NOISY
+#error "Encountered a global preprocessor flag, NOISY, which might clash with local definition to follow"
+#else
+#define NOISY(x)
+#endif
 void
 CardTableModRefBS::
 process_chunk_boundaries(Space* sp,
 DirtyCardToOopClosure* dcto_cl,
 MemRegion used,
 jbyte** lowest_non_clean,
 uintptr_t lowest_non_clean_base_chunk_index,
 size_t    lowest_non_clean_chunk_size)
 {
-// We must worry about the chunk boundaries.
+// We must worry about non-array objects that cross chunk boundaries,
+// because such objects are both precisely and imprecisely marked:
-// First, set our max_to_do:
+// .. if the head of such an object is dirty, the entire object
-HeapWord* max_to_do = NULL;
+//    needs to be scanned, under the interpretation that this
+//    was an imprecise mark
+// .. if the head of such an object is not dirty, we can assume
+//    precise marking and it's efficient to scan just the dirty
+//    cards.
+// In either case, each scanned reference must be scanned precisely
+// once so as to avoid cloning of a young referent. For efficiency,
+// our closures depend on this property and do not protect against
+// double scans.
 uintptr_t cur_chunk_index = addr_to_chunk_index(chunk_mr.start());
 cur_chunk_index           = cur_chunk_index - lowest_non_clean_base_chunk_index;
+NOISY(tty->print_cr("===========================================================================");)
+NOISY(tty->print_cr(" process_chunk_boundary: Called with [" PTR_FORMAT "," PTR_FORMAT ")",
+chunk_mr.start(), chunk_mr.end());)
+// First, set "our" lowest_non_clean entry, which would be
+// used by the thread scanning an adjoining left chunk with
+// a non-array object straddling the mutual boundary.
+// Find the object that spans our boundary, if one exists.
+// first_block is the block possibly straddling our left boundary.
+HeapWord* first_block = sp->block_start(chunk_mr.start());
+assert((chunk_mr.start() != used.start()) || (first_block == chunk_mr.start()),
+"First chunk should always have a co-initial block");
+// Does the block straddle the chunk's left boundary, and is it
+// a non-array object?
+if (first_block < chunk_mr.start()        // first block straddles left bdry
+&& sp->block_is_obj(first_block)      // first block is an object
+&& !(oop(first_block)->is_objArray()  // first block is not an array (arrays are precisely dirtied)
+|| oop(first_block)->is_typeArray())) {
+// Find our least non-clean card, so that a left neighbour
+// does not scan an object straddling the mutual boundary
+// too far to the right, and attempt to scan a portion of
+// that object twice.
+jbyte* first_dirty_card = NULL;
+jbyte* last_card_of_first_obj =
+byte_for(first_block + sp->block_size(first_block) - 1);
+jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
+jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last());
+jbyte* last_card_to_check =
+(jbyte*) MIN2((intptr_t) last_card_of_cur_chunk,
+(intptr_t) last_card_of_first_obj);
+// Note that this does not need to go beyond our last card
+// if our first object completely straddles this chunk.
+for (jbyte* cur = first_card_of_cur_chunk;
+cur <= last_card_to_check; cur++) {
+jbyte val = *cur;
+if (card_will_be_scanned(val)) {
+first_dirty_card = cur; break;
+} else {
+assert(!card_may_have_been_dirty(val), "Error");
+}
+}
+if (first_dirty_card != NULL) {
+NOISY(tty->print_cr(" LNC: Found a dirty card at " PTR_FORMAT " in current chunk",
+first_dirty_card);)
+assert(0 <= cur_chunk_index && cur_chunk_index < lowest_non_clean_chunk_size,
+"Bounds error.");
+assert(lowest_non_clean[cur_chunk_index] == NULL,
+"Write exactly once : value should be stable hereafter for this round");
+lowest_non_clean[cur_chunk_index] = first_dirty_card;
+} NOISY(else {
+tty->print_cr(" LNC: Found no dirty card in current chunk; leaving LNC entry NULL");
+// In the future, we could have this thread look for a non-NULL value to copy from its
+// right neighbour (up to the end of the first object).
+if (last_card_of_cur_chunk < last_card_of_first_obj) {
+tty->print_cr(" LNC: BEWARE!!! first obj straddles past right end of chunk:\n"
+"   might be efficient to get value from right neighbour?");
+}
+})
+} else {
+// In this case we can help our neighbour by just asking them
+// to stop at our first card (even though it may not be dirty).
+NOISY(tty->print_cr(" LNC: first block is not a non-array object; setting LNC to first card of current chunk");)
+assert(lowest_non_clean[cur_chunk_index] == NULL, "Write once : value should be stable hereafter");
+jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
+lowest_non_clean[cur_chunk_index] = first_card_of_cur_chunk;
+}
+NOISY(tty->print_cr(" process_chunk_boundary: lowest_non_clean[" INTPTR_FORMAT "] = " PTR_FORMAT
+"   which corresponds to the heap address " PTR_FORMAT,
+cur_chunk_index, lowest_non_clean[cur_chunk_index],
+(lowest_non_clean[cur_chunk_index] != NULL)
+? addr_for(lowest_non_clean[cur_chunk_index])
+: NULL);)
+NOISY(tty->print_cr("---------------------------------------------------------------------------");)
+// Next, set our own max_to_do, which will strictly/exclusively bound
+// the highest address that we will scan past the right end of our chunk.
+HeapWord* max_to_do = NULL;
 if (chunk_mr.end() < used.end()) {
-// This is not the last chunk in the used region.  What is the last
+// This is not the last chunk in the used region.
-// object?
+// What is our last block? We check the first block of
-HeapWord* last_block = sp->block_start(chunk_mr.end());
+// the next (right) chunk rather than strictly check our last block
+// because it's potentially more efficient to do so.
+HeapWord* const last_block = sp->block_start(chunk_mr.end());
 assert(last_block <= chunk_mr.end(), "In case this property changes.");
-if (last_block == chunk_mr.end()
+if ((last_block == chunk_mr.end())     // our last block does not straddle boundary
-|| !sp->block_is_obj(last_block)) {
+|| !sp->block_is_obj(last_block)   // last_block isn't an object
+|| oop(last_block)->is_objArray()  // last_block is an array (precisely marked)
+|| oop(last_block)->is_typeArray()) {
 max_to_do = chunk_mr.end();
+NOISY(tty->print_cr(" process_chunk_boundary: Last block on this card is not a non-array object;\n"
+"   max_to_do left at " PTR_FORMAT, max_to_do);)
 } else {
-// It is an object and starts before the end of the current chunk.
+assert(last_block < chunk_mr.end(), "Tautology");
+// It is a non-array object that straddles the right boundary of this chunk.
 // last_obj_card is the card corresponding to the start of the last object
 // in the chunk.  Note that the last object may not start in
 // the chunk.
-jbyte* last_obj_card = byte_for(last_block);
+jbyte* const last_obj_card = byte_for(last_block);
-if (!card_may_have_been_dirty(*last_obj_card)) {
+const jbyte val = *last_obj_card;
-// The card containing the head is not dirty.  Any marks in
+if (!card_will_be_scanned(val)) {
+assert(!card_may_have_been_dirty(val), "Error");
+// The card containing the head is not dirty.  Any marks on
 // subsequent cards still in this chunk must have been made
-// precisely; we can cap processing at the end.
+// precisely; we can cap processing at the end of our chunk.
 max_to_do = chunk_mr.end();
+NOISY(tty->print_cr(" process_chunk_boundary: Head of last object on this card is not dirty;\n"
+"   max_to_do left at " PTR_FORMAT,
+max_to_do);)
 } else {
 // The last object must be considered dirty, and extends onto the
 // following chunk.  Look for a dirty card in that chunk that will
 // bound our processing.
 jbyte* limit_card = NULL;
-size_t last_block_size = sp->block_size(last_block);
+const size_t last_block_size = sp->block_size(last_block);
-jbyte* last_card_of_last_obj =
+jbyte* const last_card_of_last_obj =
 byte_for(last_block + last_block_size - 1);
-jbyte* first_card_of_next_chunk = byte_for(chunk_mr.end());
+jbyte* const first_card_of_next_chunk = byte_for(chunk_mr.end());
 // This search potentially goes a long distance looking
-// for the next card that will be scanned.  For example,
+// for the next card that will be scanned, terminating
-// an object that is an array of primitives will not
+// at the end of the last_block, if no earlier dirty card
-// have any cards covering regions interior to the array
+// is found.
-// that will need to be scanned. The scan can be terminated
+assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start()) == ParGCCardsPerStrideChunk,
-// at the last card of the next chunk.  That would leave
+"last card of next chunk may be wrong");
-// limit_card as NULL and would result in "max_to_do"
-// being set with the LNC value or with the end
-// of the last block.
-jbyte* last_card_of_next_chunk = first_card_of_next_chunk +
-CardsPerStrideChunk;
-assert(byte_for(chunk_mr.end()) - byte_for(chunk_mr.start())
-== CardsPerStrideChunk, "last card of next chunk may be wrong");
-jbyte* last_card_to_check = (jbyte*) MIN2(last_card_of_last_obj,
-last_card_of_next_chunk);
 for (jbyte* cur = first_card_of_next_chunk;
-cur <= last_card_to_check; cur++) {
+cur <= last_card_of_last_obj; cur++) {
-if (card_will_be_scanned(*cur)) {
+const jbyte val = *cur;
+if (card_will_be_scanned(val)) {
+NOISY(tty->print_cr(" Found a non-clean card " PTR_FORMAT " with value 0x%x",
+cur, (int)val);)
 limit_card = cur; break;
+} else {
+assert(!card_may_have_been_dirty(val), "Error: card can't be skipped");
 }
 }
-assert(0 <= cur_chunk_index+1 &&
+if (limit_card != NULL) {
-cur_chunk_index+1 < lowest_non_clean_chunk_size,
+max_to_do = addr_for(limit_card);
+assert(limit_card != NULL && max_to_do != NULL, "Error");
+NOISY(tty->print_cr(" process_chunk_boundary: Found a dirty card at " PTR_FORMAT
+"   max_to_do set at " PTR_FORMAT " which is before end of last block in chunk: "
+PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT,
+limit_card, max_to_do, last_block, last_block_size, (last_block+last_block_size));)
+} else {
+// The following is a pessimistic value, because it's possible
+// that a dirty card on a subsequent chunk has been cleared by
+// the time we get to look at it; we'll correct for that further below,
+// using the LNC array which records the least non-clean card
+// before cards were cleared in a particular chunk.
+limit_card = last_card_of_last_obj;
+max_to_do = last_block + last_block_size;
+assert(limit_card != NULL && max_to_do != NULL, "Error");
+NOISY(tty->print_cr(" process_chunk_boundary: Found no dirty card before end of last block in chunk\n"
+"   Setting limit_card to " PTR_FORMAT
+" and max_to_do " PTR_FORMAT " + " PTR_FORMAT " = " PTR_FORMAT,
+limit_card, last_block, last_block_size, max_to_do);)
+}
+assert(0 < cur_chunk_index+1 && cur_chunk_index+1 < lowest_non_clean_chunk_size,
 "Bounds error.");
-// LNC for the next chunk
+// It is possible that a dirty card for the last object may have been
-jbyte* lnc_card = lowest_non_clean[cur_chunk_index+1];
+// cleared before we had a chance to examine it. In that case, the value
-if (limit_card == NULL) {
+// will have been logged in the LNC for that chunk.
-limit_card = lnc_card;
+// We need to examine as many chunks to the right as this object
+// covers.
+const uintptr_t last_chunk_index_to_check = addr_to_chunk_index(last_block + last_block_size - 1)
+- lowest_non_clean_base_chunk_index;
+DEBUG_ONLY(const uintptr_t last_chunk_index = addr_to_chunk_index(used.end())
+- lowest_non_clean_base_chunk_index;)
+assert(last_chunk_index_to_check <= last_chunk_index,
+err_msg("Out of bounds: last_chunk_index_to_check " INTPTR_FORMAT
+" exceeds last_chunk_index " INTPTR_FORMAT,
+last_chunk_index_to_check, last_chunk_index));
+for (uintptr_t lnc_index = cur_chunk_index + 1;
+lnc_index <= last_chunk_index_to_check;
+lnc_index++) {
+jbyte* lnc_card = lowest_non_clean[lnc_index];
+if (lnc_card != NULL) {
+// we can stop at the first non-NULL entry we find
+if (lnc_card <= limit_card) {
+NOISY(tty->print_cr(" process_chunk_boundary: LNC card " PTR_FORMAT " is lower than limit_card " PTR_FORMAT,
+"   max_to_do will be lowered to " PTR_FORMAT " from " PTR_FORMAT,
+lnc_card, limit_card, addr_for(lnc_card), max_to_do);)
+limit_card = lnc_card;
+max_to_do = addr_for(limit_card);
+assert(limit_card != NULL && max_to_do != NULL, "Error");
+}
+// In any case, we break now
+break;
+}  // else continue to look for a non-NULL entry if any
 }
-if (limit_card != NULL) {
+assert(limit_card != NULL && max_to_do != NULL, "Error");
-if (lnc_card != NULL) {
-limit_card = (jbyte*)MIN2((intptr_t)limit_card,
-(intptr_t)lnc_card);
-}
-max_to_do = addr_for(limit_card);
-} else {
-max_to_do = last_block + last_block_size;
-}
 }
+assert(max_to_do != NULL, "OOPS 1 !");
 }
-assert(max_to_do != NULL, "OOPS!");
+assert(max_to_do != NULL, "OOPS 2!");
 } else {
 max_to_do = used.end();
-}
+NOISY(tty->print_cr(" process_chunk_boundary: Last chunk of this space;\n"
+"   max_to_do left at " PTR_FORMAT,
+max_to_do);)
+}
+assert(max_to_do != NULL, "OOPS 3!");
 // Now we can set the closure we're using so it doesn't to beyond
 // max_to_do.
 dcto_cl->set_min_done(max_to_do);
 #ifndef PRODUCT
 dcto_cl->set_last_bottom(max_to_do);
 #endif
+NOISY(tty->print_cr("===========================================================================\n");)
-// Now we set *our" lowest_non_clean entry.
-// Find the object that spans our boundary, if one exists.
-// Nothing to do on the first chunk.
-if (chunk_mr.start() > used.start()) {
-// first_block is the block possibly spanning the chunk start
-HeapWord* first_block = sp->block_start(chunk_mr.start());
-// Does the block span the start of the chunk and is it
-// an object?
-if (first_block < chunk_mr.start() &&
-sp->block_is_obj(first_block)) {
-jbyte* first_dirty_card = NULL;
-jbyte* last_card_of_first_obj =
-byte_for(first_block + sp->block_size(first_block) - 1);
-jbyte* first_card_of_cur_chunk = byte_for(chunk_mr.start());
-jbyte* last_card_of_cur_chunk = byte_for(chunk_mr.last());
-jbyte* last_card_to_check =
-(jbyte*) MIN2((intptr_t) last_card_of_cur_chunk,
-(intptr_t) last_card_of_first_obj);
-for (jbyte* cur = first_card_of_cur_chunk;
-cur <= last_card_to_check; cur++) {
-if (card_will_be_scanned(*cur)) {
-first_dirty_card = cur; break;
-}
-}
-if (first_dirty_card != NULL) {
-assert(0 <= cur_chunk_index &&
-cur_chunk_index < lowest_non_clean_chunk_size,
-"Bounds error.");
-lowest_non_clean[cur_chunk_index] = first_dirty_card;
-}
-}
-}
 }
+#undef NOISY
 void
 CardTableModRefBS::
 get_LNC_array_for_space(Space* sp,
 jbyte**& lowest_non_clean,
 // Only the first thread to obtain the lock will resize the
 // LNC array for the covered region.  Any later expansion can't affect
 // the used_at_save_marks region.
 // (I observed a bug in which the first thread to execute this would
-// resize, and then it would cause "expand_and_allocates" that would
+// resize, and then it would cause "expand_and_allocate" that would
-// Increase the number of chunks in the covered region.  Then a second
+// increase the number of chunks in the covered region.  Then a second
 // thread would come and execute this, see that the size didn't match,
 // and free and allocate again.  So the first thread would be using a
 // freed "_lowest_non_clean" array.)
 // Do a dirty read here. If we pass the conditional then take the rare

Mercurial > hg > truffle

comparison src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp @ 3357:fc2b798ab316