# HG changeset patch
# User ysr
# Date 1303352370 25200
# Node ID c48ad6ab8bdfd9f7f084d986d02b246526f57244
# Parent  49a67202bc671d23d719c9e14752d80295a9e8f6
7037276: Unnecessary double traversal of dirty card windows
Summary: Short-circuited an unnecessary double traversal of dirty card windows when iterating younger refs. Also renamed some cardtable methods for more clarity.
Reviewed-by: jmasa, stefank, poonam

diff -r 49a67202bc67 -r c48ad6ab8bdf src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
--- a/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp	Tue Apr 19 15:46:59 2011 -0400
+++ b/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp	Wed Apr 20 19:19:30 2011 -0700
@@ -33,44 +33,43 @@
 #include "runtime/mutexLocker.hpp"
 #include "runtime/virtualspace.hpp"
 
-void CardTableModRefBS::par_non_clean_card_iterate_work(Space* sp, MemRegion mr,
-                                                        DirtyCardToOopClosure* dcto_cl,
-                                                        MemRegionClosure* cl,
-                                                        int n_threads) {
-  if (n_threads > 0) {
-    assert((n_threads == 1 && ParallelGCThreads == 0) ||
-           n_threads <= (int)ParallelGCThreads,
-           "# worker threads != # requested!");
-    // Make sure the LNC array is valid for the space.
-    jbyte**   lowest_non_clean;
-    uintptr_t lowest_non_clean_base_chunk_index;
-    size_t    lowest_non_clean_chunk_size;
-    get_LNC_array_for_space(sp, lowest_non_clean,
-                            lowest_non_clean_base_chunk_index,
-                            lowest_non_clean_chunk_size);
+void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
+                                                             DirtyCardToOopClosure* dcto_cl,
+                                                             ClearNoncleanCardWrapper* cl,
+                                                             int n_threads) {
+  assert(n_threads > 0, "Error: expected n_threads > 0");
+  assert((n_threads == 1 && ParallelGCThreads == 0) ||
+         n_threads <= (int)ParallelGCThreads,
+         "# worker threads != # requested!");
+  // Make sure the LNC array is valid for the space.
+  jbyte**   lowest_non_clean;
+  uintptr_t lowest_non_clean_base_chunk_index;
+  size_t    lowest_non_clean_chunk_size;
+  get_LNC_array_for_space(sp, lowest_non_clean,
+                          lowest_non_clean_base_chunk_index,
+                          lowest_non_clean_chunk_size);
 
-    int n_strides = n_threads * StridesPerThread;
-    SequentialSubTasksDone* pst = sp->par_seq_tasks();
-    pst->set_n_threads(n_threads);
-    pst->set_n_tasks(n_strides);
+  int n_strides = n_threads * StridesPerThread;
+  SequentialSubTasksDone* pst = sp->par_seq_tasks();
+  pst->set_n_threads(n_threads);
+  pst->set_n_tasks(n_strides);
 
-    int stride = 0;
-    while (!pst->is_task_claimed(/* reference */ stride)) {
-      process_stride(sp, mr, stride, n_strides, dcto_cl, cl,
-                     lowest_non_clean,
-                     lowest_non_clean_base_chunk_index,
-                     lowest_non_clean_chunk_size);
-    }
-    if (pst->all_tasks_completed()) {
-      // Clear lowest_non_clean array for next time.
-      intptr_t first_chunk_index = addr_to_chunk_index(mr.start());
-      uintptr_t last_chunk_index  = addr_to_chunk_index(mr.last());
-      for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
-        intptr_t ind = ch - lowest_non_clean_base_chunk_index;
-        assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
-               "Bounds error");
-        lowest_non_clean[ind] = NULL;
-      }
+  int stride = 0;
+  while (!pst->is_task_claimed(/* reference */ stride)) {
+    process_stride(sp, mr, stride, n_strides, dcto_cl, cl,
+                   lowest_non_clean,
+                   lowest_non_clean_base_chunk_index,
+                   lowest_non_clean_chunk_size);
+  }
+  if (pst->all_tasks_completed()) {
+    // Clear lowest_non_clean array for next time.
+    intptr_t first_chunk_index = addr_to_chunk_index(mr.start());
+    uintptr_t last_chunk_index  = addr_to_chunk_index(mr.last());
+    for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
+      intptr_t ind = ch - lowest_non_clean_base_chunk_index;
+      assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
+             "Bounds error");
+      lowest_non_clean[ind] = NULL;
     }
   }
 }
@@ -81,7 +80,7 @@
                MemRegion used,
                jint stride, int n_strides,
                DirtyCardToOopClosure* dcto_cl,
-               MemRegionClosure* cl,
+               ClearNoncleanCardWrapper* cl,
                jbyte** lowest_non_clean,
                uintptr_t lowest_non_clean_base_chunk_index,
                size_t    lowest_non_clean_chunk_size) {
@@ -127,7 +126,11 @@
                              lowest_non_clean_base_chunk_index,
                              lowest_non_clean_chunk_size);
 
-    non_clean_card_iterate_work(chunk_mr, cl);
+    // We do not call the non_clean_card_iterate_serial() version because
+    // we want to clear the cards, and the ClearNoncleanCardWrapper closure
+    // itself does the work of finding contiguous dirty ranges of cards to
+    // process (and clear).
+    cl->do_MemRegion(chunk_mr);
 
     // Find the next chunk of the stride.
     chunk_card_start += CardsPerStrideChunk * n_strides;
diff -r 49a67202bc67 -r c48ad6ab8bdf src/share/vm/memory/cardTableModRefBS.cpp
--- a/src/share/vm/memory/cardTableModRefBS.cpp	Tue Apr 19 15:46:59 2011 -0400
+++ b/src/share/vm/memory/cardTableModRefBS.cpp	Wed Apr 20 19:19:30 2011 -0700
@@ -456,31 +456,35 @@
 }
 
 
-void CardTableModRefBS::non_clean_card_iterate(Space* sp,
-                                               MemRegion mr,
-                                               DirtyCardToOopClosure* dcto_cl,
-                                               MemRegionClosure* cl) {
+void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space* sp,
+                                                                 MemRegion mr,
+                                                                 DirtyCardToOopClosure* dcto_cl,
+                                                                 ClearNoncleanCardWrapper* cl) {
   if (!mr.is_empty()) {
     int n_threads = SharedHeap::heap()->n_par_threads();
     if (n_threads > 0) {
 #ifndef SERIALGC
-      par_non_clean_card_iterate_work(sp, mr, dcto_cl, cl, n_threads);
+      non_clean_card_iterate_parallel_work(sp, mr, dcto_cl, cl, n_threads);
 #else  // SERIALGC
       fatal("Parallel gc not supported here.");
 #endif // SERIALGC
     } else {
-      non_clean_card_iterate_work(mr, cl);
+      // We do not call the non_clean_card_iterate_serial() version below because
+      // we want to clear the cards (which non_clean_card_iterate_serial() does not
+      // do for us), and the ClearNoncleanCardWrapper closure itself does the work
+      // of finding contiguous dirty ranges of cards to process (and clear).
+      cl->do_MemRegion(mr);
     }
   }
 }
 
-// NOTE: For this to work correctly, it is important that
-// we look for non-clean cards below (so as to catch those
-// marked precleaned), rather than look explicitly for dirty
-// cards (and miss those marked precleaned). In that sense,
-// the name precleaned is currently somewhat of a misnomer.
-void CardTableModRefBS::non_clean_card_iterate_work(MemRegion mr,
-                                                    MemRegionClosure* cl) {
+// The iterator itself is not MT-aware, but
+// MT-aware callers and closures can use this to
+// accomplish dirty card iteration in parallel. The
+// iterator itself does not clear the dirty cards, or
+// change their values in any manner.
+void CardTableModRefBS::non_clean_card_iterate_serial(MemRegion mr,
+                                                      MemRegionClosure* cl) {
   for (int i = 0; i < _cur_covered_regions; i++) {
     MemRegion mri = mr.intersection(_covered[i]);
     if (mri.word_size() > 0) {
@@ -661,7 +665,7 @@
 
 void CardTableModRefBS::verify_clean_region(MemRegion mr) {
   GuaranteeNotModClosure blk(this);
-  non_clean_card_iterate_work(mr, &blk);
+  non_clean_card_iterate_serial(mr, &blk);
 }
 
 // To verify a MemRegion is entirely dirty this closure is passed to
diff -r 49a67202bc67 -r c48ad6ab8bdf src/share/vm/memory/cardTableModRefBS.hpp
--- a/src/share/vm/memory/cardTableModRefBS.hpp	Tue Apr 19 15:46:59 2011 -0400
+++ b/src/share/vm/memory/cardTableModRefBS.hpp	Wed Apr 20 19:19:30 2011 -0700
@@ -44,6 +44,7 @@
 class Generation;
 class OopsInGenClosure;
 class DirtyCardToOopClosure;
+class ClearNoncleanCardWrapper;
 
 class CardTableModRefBS: public ModRefBarrierSet {
   // Some classes get to look at some private stuff.
@@ -165,22 +166,28 @@
 
   // Iterate over the portion of the card-table which covers the given
   // region mr in the given space and apply cl to any dirty sub-regions
-  // of mr. cl and dcto_cl must either be the same closure or cl must
-  // wrap dcto_cl. Both are required - neither may be NULL. Also, dcto_cl
-  // may be modified. Note that this function will operate in a parallel
-  // mode if worker threads are available.
-  void non_clean_card_iterate(Space* sp, MemRegion mr,
-                              DirtyCardToOopClosure* dcto_cl,
-                              MemRegionClosure* cl);
+  // of mr. Dirty cards are _not_ cleared by the iterator method itself,
+  // but closures may arrange to do so on their own should they so wish.
+  void non_clean_card_iterate_serial(MemRegion mr, MemRegionClosure* cl);
 
-  // Utility function used to implement the other versions below.
-  void non_clean_card_iterate_work(MemRegion mr, MemRegionClosure* cl);
+  // A variant of the above that will operate in a parallel mode if
+  // worker threads are available, and clear the dirty cards as it
+  // processes them.
+  // ClearNoncleanCardWrapper cl must wrap the DirtyCardToOopClosure dcto_cl,
+  // which may itself be modified by the method.
+  void non_clean_card_iterate_possibly_parallel(Space* sp, MemRegion mr,
+                                                DirtyCardToOopClosure* dcto_cl,
+                                                ClearNoncleanCardWrapper* cl);
 
-  void par_non_clean_card_iterate_work(Space* sp, MemRegion mr,
-                                       DirtyCardToOopClosure* dcto_cl,
-                                       MemRegionClosure* cl,
-                                       int n_threads);
+ private:
+  // Work method used to implement non_clean_card_iterate_possibly_parallel()
+  // above in the parallel case.
+  void non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
+                                            DirtyCardToOopClosure* dcto_cl,
+                                            ClearNoncleanCardWrapper* cl,
+                                            int n_threads);
 
+ protected:
   // Dirty the bytes corresponding to "mr" (not all of which must be
   // covered.)
   void dirty_MemRegion(MemRegion mr);
@@ -237,7 +244,7 @@
                       MemRegion used,
                       jint stride, int n_strides,
                       DirtyCardToOopClosure* dcto_cl,
-                      MemRegionClosure* cl,
+                      ClearNoncleanCardWrapper* cl,
                       jbyte** lowest_non_clean,
                       uintptr_t lowest_non_clean_base_chunk_index,
                       size_t lowest_non_clean_chunk_size);
@@ -409,14 +416,14 @@
   // marking, where a dirty card may cause scanning, and summarization
   // marking, of objects that extend onto subsequent cards.)
   void mod_card_iterate(MemRegionClosure* cl) {
-    non_clean_card_iterate_work(_whole_heap, cl);
+    non_clean_card_iterate_serial(_whole_heap, cl);
   }
 
   // Like the "mod_cards_iterate" above, except only invokes the closure
   // for cards within the MemRegion "mr" (which is required to be
   // card-aligned and sized.)
   void mod_card_iterate(MemRegion mr, MemRegionClosure* cl) {
-    non_clean_card_iterate_work(mr, cl);
+    non_clean_card_iterate_serial(mr, cl);
   }
 
   static uintx ct_max_alignment_constraint();
@@ -493,4 +500,5 @@
   void set_CTRS(CardTableRS* rs) { _rs = rs; }
 };
 
+
 #endif // SHARE_VM_MEMORY_CARDTABLEMODREFBS_HPP
diff -r 49a67202bc67 -r c48ad6ab8bdf src/share/vm/memory/cardTableRS.cpp
--- a/src/share/vm/memory/cardTableRS.cpp	Tue Apr 19 15:46:59 2011 -0400
+++ b/src/share/vm/memory/cardTableRS.cpp	Wed Apr 20 19:19:30 2011 -0700
@@ -105,107 +105,111 @@
   g->younger_refs_iterate(blk);
 }
 
-class ClearNoncleanCardWrapper: public MemRegionClosure {
-  MemRegionClosure* _dirty_card_closure;
-  CardTableRS* _ct;
-  bool _is_par;
-private:
-  // Clears the given card, return true if the corresponding card should be
-  // processed.
-  bool clear_card(jbyte* entry) {
-    if (_is_par) {
-      while (true) {
-        // In the parallel case, we may have to do this several times.
-        jbyte entry_val = *entry;
-        assert(entry_val != CardTableRS::clean_card_val(),
-               "We shouldn't be looking at clean cards, and this should "
-               "be the only place they get cleaned.");
-        if (CardTableRS::card_is_dirty_wrt_gen_iter(entry_val)
-            || _ct->is_prev_youngergen_card_val(entry_val)) {
-          jbyte res =
-            Atomic::cmpxchg(CardTableRS::clean_card_val(), entry, entry_val);
-          if (res == entry_val) {
-            break;
-          } else {
-            assert(res == CardTableRS::cur_youngergen_and_prev_nonclean_card,
-                   "The CAS above should only fail if another thread did "
-                   "a GC write barrier.");
-          }
-        } else if (entry_val ==
-                   CardTableRS::cur_youngergen_and_prev_nonclean_card) {
-          // Parallelism shouldn't matter in this case.  Only the thread
-          // assigned to scan the card should change this value.
-          *entry = _ct->cur_youngergen_card_val();
-          break;
-        } else {
-          assert(entry_val == _ct->cur_youngergen_card_val(),
-                 "Should be the only possibility.");
-          // In this case, the card was clean before, and become
-          // cur_youngergen only because of processing of a promoted object.
-          // We don't have to look at the card.
-          return false;
-        }
+inline bool ClearNoncleanCardWrapper::clear_card(jbyte* entry) {
+  if (_is_par) {
+    return clear_card_parallel(entry);
+  } else {
+    return clear_card_serial(entry);
+  }
+}
+
+inline bool ClearNoncleanCardWrapper::clear_card_parallel(jbyte* entry) {
+  while (true) {
+    // In the parallel case, we may have to do this several times.
+    jbyte entry_val = *entry;
+    assert(entry_val != CardTableRS::clean_card_val(),
+           "We shouldn't be looking at clean cards, and this should "
+           "be the only place they get cleaned.");
+    if (CardTableRS::card_is_dirty_wrt_gen_iter(entry_val)
+        || _ct->is_prev_youngergen_card_val(entry_val)) {
+      jbyte res =
+        Atomic::cmpxchg(CardTableRS::clean_card_val(), entry, entry_val);
+      if (res == entry_val) {
+        break;
+      } else {
+        assert(res == CardTableRS::cur_youngergen_and_prev_nonclean_card,
+               "The CAS above should only fail if another thread did "
+               "a GC write barrier.");
       }
-      return true;
+    } else if (entry_val ==
+               CardTableRS::cur_youngergen_and_prev_nonclean_card) {
+      // Parallelism shouldn't matter in this case.  Only the thread
+      // assigned to scan the card should change this value.
+      *entry = _ct->cur_youngergen_card_val();
+      break;
     } else {
-      jbyte entry_val = *entry;
-      assert(entry_val != CardTableRS::clean_card_val(),
-             "We shouldn't be looking at clean cards, and this should "
-             "be the only place they get cleaned.");
-      assert(entry_val != CardTableRS::cur_youngergen_and_prev_nonclean_card,
-             "This should be possible in the sequential case.");
-      *entry = CardTableRS::clean_card_val();
-      return true;
+      assert(entry_val == _ct->cur_youngergen_card_val(),
+             "Should be the only possibility.");
+      // In this case, the card was clean before, and become
+      // cur_youngergen only because of processing of a promoted object.
+      // We don't have to look at the card.
+      return false;
     }
   }
+  return true;
+}
 
-public:
-  ClearNoncleanCardWrapper(MemRegionClosure* dirty_card_closure,
-                           CardTableRS* ct) :
+
+inline bool ClearNoncleanCardWrapper::clear_card_serial(jbyte* entry) {
+  jbyte entry_val = *entry;
+  assert(entry_val != CardTableRS::clean_card_val(),
+         "We shouldn't be looking at clean cards, and this should "
+         "be the only place they get cleaned.");
+  assert(entry_val != CardTableRS::cur_youngergen_and_prev_nonclean_card,
+         "This should be possible in the sequential case.");
+  *entry = CardTableRS::clean_card_val();
+  return true;
+}
+
+ClearNoncleanCardWrapper::ClearNoncleanCardWrapper(
+  MemRegionClosure* dirty_card_closure, CardTableRS* ct) :
     _dirty_card_closure(dirty_card_closure), _ct(ct) {
     _is_par = (SharedHeap::heap()->n_par_threads() > 0);
+}
+
+void ClearNoncleanCardWrapper::do_MemRegion(MemRegion mr) {
+  assert(mr.word_size() > 0, "Error");
+  assert(_ct->is_aligned(mr.start()), "mr.start() should be card aligned");
+  // mr.end() may not necessarily be card aligned.
+  jbyte* cur_entry = _ct->byte_for(mr.last());
+  const jbyte* limit = _ct->byte_for(mr.start());
+  HeapWord* end_of_non_clean = mr.end();
+  HeapWord* start_of_non_clean = end_of_non_clean;
+  while (cur_entry >= limit) {
+    HeapWord* cur_hw = _ct->addr_for(cur_entry);
+    if ((*cur_entry != CardTableRS::clean_card_val()) && clear_card(cur_entry)) {
+      // Continue the dirty range by opening the
+      // dirty window one card to the left.
+      start_of_non_clean = cur_hw;
+    } else {
+      // We hit a "clean" card; process any non-empty
+      // "dirty" range accumulated so far.
+      if (start_of_non_clean < end_of_non_clean) {
+        const MemRegion mrd(start_of_non_clean, end_of_non_clean);
+        _dirty_card_closure->do_MemRegion(mrd);
+      }
+      // Reset the dirty window, while continuing to look
+      // for the next dirty card that will start a
+      // new dirty window.
+      end_of_non_clean = cur_hw;
+      start_of_non_clean = cur_hw;
+    }
+    // Note that "cur_entry" leads "start_of_non_clean" in
+    // its leftward excursion after this point
+    // in the loop and, when we hit the left end of "mr",
+    // will point off of the left end of the card-table
+    // for "mr".
+    cur_entry--;
   }
-  void do_MemRegion(MemRegion mr) {
-    // We start at the high end of "mr", walking backwards
-    // while accumulating a contiguous dirty range of cards in
-    // [start_of_non_clean, end_of_non_clean) which we then
-    // process en masse.
-    HeapWord* end_of_non_clean = mr.end();
-    HeapWord* start_of_non_clean = end_of_non_clean;
-    jbyte*       entry = _ct->byte_for(mr.last());
-    const jbyte* first_entry = _ct->byte_for(mr.start());
-    while (entry >= first_entry) {
-      HeapWord* cur = _ct->addr_for(entry);
-      if (!clear_card(entry)) {
-        // We hit a clean card; process any non-empty
-        // dirty range accumulated so far.
-        if (start_of_non_clean < end_of_non_clean) {
-          MemRegion mr2(start_of_non_clean, end_of_non_clean);
-          _dirty_card_closure->do_MemRegion(mr2);
-        }
-        // Reset the dirty window while continuing to
-        // look for the next dirty window to process.
-        end_of_non_clean = cur;
-        start_of_non_clean = end_of_non_clean;
-      }
-      // Open the left end of the window one card to the left.
-      start_of_non_clean = cur;
-      // Note that "entry" leads "start_of_non_clean" in
-      // its leftward excursion after this point
-      // in the loop and, when we hit the left end of "mr",
-      // will point off of the left end of the card-table
-      // for "mr".
-      entry--;
-    }
-    // If the first card of "mr" was dirty, we will have
-    // been left with a dirty window, co-initial with "mr",
-    // which we now process.
-    if (start_of_non_clean < end_of_non_clean) {
-      MemRegion mr2(start_of_non_clean, end_of_non_clean);
-      _dirty_card_closure->do_MemRegion(mr2);
-    }
+  // If the first card of "mr" was dirty, we will have
+  // been left with a dirty window, co-initial with "mr",
+  // which we now process.
+  if (start_of_non_clean < end_of_non_clean) {
+    const MemRegion mrd(start_of_non_clean, end_of_non_clean);
+    _dirty_card_closure->do_MemRegion(mrd);
   }
-};
+}
+
 // clean (by dirty->clean before) ==> cur_younger_gen
 // dirty                          ==> cur_youngergen_and_prev_nonclean_card
 // precleaned                     ==> cur_youngergen_and_prev_nonclean_card
@@ -246,8 +250,8 @@
                                                    cl->gen_boundary());
   ClearNoncleanCardWrapper clear_cl(dcto_cl, this);
 
-  _ct_bs->non_clean_card_iterate(sp, sp->used_region_at_save_marks(),
-                                dcto_cl, &clear_cl);
+  _ct_bs->non_clean_card_iterate_possibly_parallel(sp, sp->used_region_at_save_marks(),
+                                                   dcto_cl, &clear_cl);
 }
 
 void CardTableRS::clear_into_younger(Generation* gen, bool clear_perm) {
diff -r 49a67202bc67 -r c48ad6ab8bdf src/share/vm/memory/cardTableRS.hpp
--- a/src/share/vm/memory/cardTableRS.hpp	Tue Apr 19 15:46:59 2011 -0400
+++ b/src/share/vm/memory/cardTableRS.hpp	Wed Apr 20 19:19:30 2011 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -166,4 +166,21 @@
 
 };
 
+class ClearNoncleanCardWrapper: public MemRegionClosure {
+  MemRegionClosure* _dirty_card_closure;
+  CardTableRS* _ct;
+  bool _is_par;
+private:
+  // Clears the given card, return true if the corresponding card should be
+  // processed.
+  inline bool clear_card(jbyte* entry);
+  // Work methods called by the clear_card()
+  inline bool clear_card_serial(jbyte* entry);
+  inline bool clear_card_parallel(jbyte* entry);
+
+public:
+  ClearNoncleanCardWrapper(MemRegionClosure* dirty_card_closure, CardTableRS* ct);
+  void do_MemRegion(MemRegion mr);
+};
+
 #endif // SHARE_VM_MEMORY_CARDTABLERS_HPP