diff src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp @ 4095:bca17e38de00

6593758: RFE: Enhance GC ergonomics to dynamically choose ParallelGCThreads Summary: Select number of GC threads dynamically based on heap usage and number of Java threads Reviewed-by: johnc, ysr, jcoomes
author jmasa
date Tue, 09 Aug 2011 10:16:01 -0700
parents a88de71c4e3a
children dc467e8b2c5e
line wrap: on
line diff
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Tue Nov 22 04:47:10 2011 -0500
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Tue Aug 09 10:16:01 2011 -0700
@@ -66,6 +66,18 @@
 // apply to TLAB allocation, which is not part of this interface: it
 // is done by clients of this interface.)
 
+// Notes on implementation of parallelism in different tasks.
+//
+// G1ParVerifyTask uses heap_region_par_iterate_chunked() for parallelism.
+// The number of GC workers is passed to heap_region_par_iterate_chunked().
+// It does use run_task() which sets _n_workers in the task.
+// G1ParTask executes g1_process_strong_roots() ->
+// SharedHeap::process_strong_roots() which calls eventuall to
+// CardTableModRefBS::par_non_clean_card_iterate_work() which uses
+// SequentialSubTasksDone.  SharedHeap::process_strong_roots() also
+// directly uses SubTasksDone (_process_strong_tasks field in SharedHeap).
+//
+
 // Local to this file.
 
 class RefineCardTableEntryClosure: public CardTableEntryClosure {
@@ -1156,6 +1168,7 @@
   void work(int i) {
     RebuildRSOutOfRegionClosure rebuild_rs(_g1, i);
     _g1->heap_region_par_iterate_chunked(&rebuild_rs, i,
+                                          _g1->workers()->active_workers(),
                                          HeapRegion::RebuildRSClaimValue);
   }
 };
@@ -1360,12 +1373,32 @@
     }
 
     // Rebuild remembered sets of all regions.
-
     if (G1CollectedHeap::use_parallel_gc_threads()) {
+      int n_workers =
+        AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
+                                       workers()->active_workers(),
+                                       Threads::number_of_non_daemon_threads());
+      assert(UseDynamicNumberOfGCThreads ||
+             n_workers == workers()->total_workers(),
+             "If not dynamic should be using all the  workers");
+      workers()->set_active_workers(n_workers);
+      // Set parallel threads in the heap (_n_par_threads) only
+      // before a parallel phase and always reset it to 0 after
+      // the phase so that the number of parallel threads does
+      // no get carried forward to a serial phase where there
+      // may be code that is "possibly_parallel".
+      set_par_threads(n_workers);
+
       ParRebuildRSTask rebuild_rs_task(this);
       assert(check_heap_region_claim_values(
              HeapRegion::InitialClaimValue), "sanity check");
-      set_par_threads(workers()->total_workers());
+      assert(UseDynamicNumberOfGCThreads ||
+             workers()->active_workers() == workers()->total_workers(),
+        "Unless dynamic should use total workers");
+      // Use the most recent number of  active workers
+      assert(workers()->active_workers() > 0,
+        "Active workers not properly set");
+      set_par_threads(workers()->active_workers());
       workers()->run_task(&rebuild_rs_task);
       set_par_threads(0);
       assert(check_heap_region_claim_values(
@@ -2477,11 +2510,17 @@
 void
 G1CollectedHeap::heap_region_par_iterate_chunked(HeapRegionClosure* cl,
                                                  int worker,
+                                                 int no_of_par_workers,
                                                  jint claim_value) {
   const size_t regions = n_regions();
-  const size_t worker_num = (G1CollectedHeap::use_parallel_gc_threads() ? ParallelGCThreads : 1);
+  const size_t max_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+                             no_of_par_workers :
+                             1);
+  assert(UseDynamicNumberOfGCThreads ||
+         no_of_par_workers == workers()->total_workers(),
+         "Non dynamic should use fixed number of workers");
   // try to spread out the starting points of the workers
-  const size_t start_index = regions / worker_num * (size_t) worker;
+  const size_t start_index = regions / max_workers * (size_t) worker;
 
   // each worker will actually look at all regions
   for (size_t count = 0; count < regions; ++count) {
@@ -2920,6 +2959,7 @@
     HandleMark hm;
     VerifyRegionClosure blk(_allow_dirty, true, _vo);
     _g1h->heap_region_par_iterate_chunked(&blk, worker_i,
+                                          _g1h->workers()->active_workers(),
                                           HeapRegion::ParVerifyClaimValue);
     if (blk.failures()) {
       _failures = true;
@@ -2937,6 +2977,10 @@
   if (SafepointSynchronize::is_at_safepoint() || ! UseTLAB) {
     if (!silent) { gclog_or_tty->print("Roots (excluding permgen) "); }
     VerifyRootsClosure rootsCl(vo);
+
+    assert(Thread::current()->is_VM_thread(),
+      "Expected to be executed serially by the VM thread at this point");
+
     CodeBlobToOopClosure blobsCl(&rootsCl, /*do_marking=*/ false);
 
     // We apply the relevant closures to all the oops in the
@@ -2981,7 +3025,10 @@
              "sanity check");
 
       G1ParVerifyTask task(this, allow_dirty, vo);
-      int n_workers = workers()->total_workers();
+      assert(UseDynamicNumberOfGCThreads ||
+        workers()->active_workers() == workers()->total_workers(),
+        "If not dynamic should be using all the workers");
+      int n_workers = workers()->active_workers();
       set_par_threads(n_workers);
       workers()->run_task(&task);
       set_par_threads(0);
@@ -2989,6 +3036,8 @@
         failures = true;
       }
 
+      // Checks that the expected amount of parallel work was done.
+      // The implication is that n_workers is > 0.
       assert(check_heap_region_claim_values(HeapRegion::ParVerifyClaimValue),
              "sanity check");
 
@@ -3402,6 +3451,10 @@
     assert(check_young_list_well_formed(),
       "young list should be well formed");
 
+    // Don't dynamically change the number of GC threads this early.  A value of
+    // 0 is used to indicate serial work.  When parallel work is done,
+    // it will be set.
+
     { // Call to jvmpi::post_class_unload_events must occur outside of active GC
       IsGCActiveMark x;
 
@@ -3615,7 +3668,8 @@
         double end_time_sec = os::elapsedTime();
         double pause_time_ms = (end_time_sec - start_time_sec) * MILLIUNITS;
         g1_policy()->record_pause_time_ms(pause_time_ms);
-        g1_policy()->record_collection_pause_end();
+        int active_gc_threads = workers()->active_workers();
+        g1_policy()->record_collection_pause_end(active_gc_threads);
 
         MemoryService::track_memory_usage();
 
@@ -4562,13 +4616,13 @@
   }
 
 public:
-  G1ParTask(G1CollectedHeap* g1h, int workers, RefToScanQueueSet *task_queues)
+  G1ParTask(G1CollectedHeap* g1h,
+            RefToScanQueueSet *task_queues)
     : AbstractGangTask("G1 collection"),
       _g1h(g1h),
       _queues(task_queues),
-      _terminator(workers, _queues),
-      _stats_lock(Mutex::leaf, "parallel G1 stats lock", true),
-      _n_workers(workers)
+      _terminator(0, _queues),
+      _stats_lock(Mutex::leaf, "parallel G1 stats lock", true)
   {}
 
   RefToScanQueueSet* queues() { return _queues; }
@@ -4577,6 +4631,20 @@
     return queues()->queue(i);
   }
 
+  ParallelTaskTerminator* terminator() { return &_terminator; }
+
+  virtual void set_for_termination(int active_workers) {
+    // This task calls set_n_termination() in par_non_clean_card_iterate_work()
+    // in the young space (_par_seq_tasks) in the G1 heap
+    // for SequentialSubTasksDone.
+    // This task also uses SubTasksDone in SharedHeap and G1CollectedHeap
+    // both of which need setting by set_n_termination().
+    _g1h->SharedHeap::set_n_termination(active_workers);
+    _g1h->set_n_termination(active_workers);
+    terminator()->reset_for_reuse(active_workers);
+    _n_workers = active_workers;
+  }
+
   void work(int i) {
     if (i >= _n_workers) return;  // no work needed this round
 
@@ -4861,12 +4929,12 @@
 private:
   G1CollectedHeap*   _g1h;
   RefToScanQueueSet* _queues;
-  WorkGang*          _workers;
+  FlexibleWorkGang*  _workers;
   int                _active_workers;
 
 public:
   G1STWRefProcTaskExecutor(G1CollectedHeap* g1h,
-                        WorkGang* workers,
+                        FlexibleWorkGang* workers,
                         RefToScanQueueSet *task_queues,
                         int n_workers) :
     _g1h(g1h),
@@ -5122,11 +5190,13 @@
   // referents points to another object which is also referenced by an
   // object discovered by the STW ref processor.
 
-  int n_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
-                        workers()->total_workers() : 1);
-
-  set_par_threads(n_workers);
-  G1ParPreserveCMReferentsTask keep_cm_referents(this, n_workers, _task_queues);
+  int active_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+                        workers()->active_workers() : 1);
+
+  assert(active_workers == workers()->active_workers(),
+         "Need to reset active_workers");
+  set_par_threads(active_workers);
+  G1ParPreserveCMReferentsTask keep_cm_referents(this, active_workers, _task_queues);
 
   if (G1CollectedHeap::use_parallel_gc_threads()) {
     workers()->run_task(&keep_cm_referents);
@@ -5192,7 +5262,6 @@
                                       NULL);
   } else {
     // Parallel reference processing
-    int active_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
     assert(rp->num_q() == active_workers, "sanity");
     assert(active_workers <= rp->max_num_q(), "sanity");
 
@@ -5225,7 +5294,9 @@
   } else {
     // Parallel reference enqueuing
 
-    int active_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
+    int active_workers = (ParallelGCThreads > 0 ? workers()->active_workers() : 1);
+    assert(active_workers == workers()->active_workers(),
+           "Need to reset active_workers");
     assert(rp->num_q() == active_workers, "sanity");
     assert(active_workers <= rp->max_num_q(), "sanity");
 
@@ -5252,9 +5323,24 @@
   concurrent_g1_refine()->set_use_cache(false);
   concurrent_g1_refine()->clear_hot_cache_claimed_index();
 
-  int n_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
-  set_par_threads(n_workers);
-  G1ParTask g1_par_task(this, n_workers, _task_queues);
+  int n_workers;
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
+    n_workers =
+      AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
+                                     workers()->active_workers(),
+                                     Threads::number_of_non_daemon_threads());
+    assert(UseDynamicNumberOfGCThreads ||
+           n_workers == workers()->total_workers(),
+           "If not dynamic should be using all the  workers");
+    set_par_threads(n_workers);
+  } else {
+    assert(n_par_threads() == 0,
+           "Should be the original non-parallel value");
+    n_workers = 1;
+  }
+  workers()->set_active_workers(n_workers);
+
+  G1ParTask g1_par_task(this, _task_queues);
 
   init_for_evac_failure(NULL);
 
@@ -5267,6 +5353,10 @@
     // The individual threads will set their evac-failure closures.
     StrongRootsScope srs(this);
     if (ParallelGCVerbose) G1ParScanThreadState::print_termination_stats_hdr();
+    // These tasks use ShareHeap::_process_strong_tasks
+    assert(UseDynamicNumberOfGCThreads ||
+           workers()->active_workers() == workers()->total_workers(),
+           "If not dynamic should be using all the  workers");
     workers()->run_task(&g1_par_task);
   } else {
     StrongRootsScope srs(this);
@@ -5275,6 +5365,7 @@
 
   double par_time = (os::elapsedTime() - start_par) * 1000.0;
   g1_policy()->record_par_time(par_time);
+
   set_par_threads(0);
 
   // Process any discovered reference objects - we have
@@ -5905,6 +5996,21 @@
   return _g1h->new_mutator_alloc_region(word_size, force);
 }
 
+void G1CollectedHeap::set_par_threads() {
+  // Don't change the number of workers.  Use the value previously set
+  // in the workgroup.
+  int n_workers = workers()->active_workers();
+    assert(UseDynamicNumberOfGCThreads ||
+           n_workers == workers()->total_workers(),
+      "Otherwise should be using the total number of workers");
+  if (n_workers == 0) {
+    assert(false, "Should have been set in prior evacuation pause.");
+    n_workers = ParallelGCThreads;
+    workers()->set_active_workers(n_workers);
+  }
+  set_par_threads(n_workers);
+}
+
 void MutatorAllocRegion::retire_region(HeapRegion* alloc_region,
                                        size_t allocated_bytes) {
   _g1h->retire_mutator_alloc_region(alloc_region, allocated_bytes);