# HG changeset patch
# User jmasa
# Date 1244671041 25200
# Node ID 7295839252dee314e2914ed1ce4d7ed23c7ce097
# Parent  eacd97c88873a4b832abfe4db8a790d741ebcff0
# Parent  f89cf529c3c772f0435bf0028acc5cb3979b35d2
Merge

diff -r eacd97c88873 -r 7295839252de src/cpu/sparc/vm/assembler_sparc.cpp
--- a/src/cpu/sparc/vm/assembler_sparc.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -4454,43 +4454,26 @@
     delayed()->nop();
   }

-  // Now we decide how to generate the card table write. If we're
-  // enqueueing, we call out to a generated function. Otherwise, we do it
-  // inline here.
-
-  if (G1RSBarrierUseQueue) {
-    // If the "store_addr" register is an "in" or "local" register, move it to
-    // a scratch reg so we can pass it as an argument.
-    bool use_scr = !(store_addr->is_global() || store_addr->is_out());
-    // Pick a scratch register different from "tmp".
-    Register scr = (tmp == G1_scratch ? G3_scratch : G1_scratch);
-    // Make sure we use up the delay slot!
-    if (use_scr) {
-      post_filter_masm->mov(store_addr, scr);
-    } else {
-      post_filter_masm->nop();
-    }
-    generate_dirty_card_log_enqueue_if_necessary(bs->byte_map_base);
-    save_frame(0);
-    call(dirty_card_log_enqueue);
-    if (use_scr) {
-      delayed()->mov(scr, O0);
-    } else {
-      delayed()->mov(store_addr->after_save(), O0);
-    }
-    restore();
-
+  // If the "store_addr" register is an "in" or "local" register, move it to
+  // a scratch reg so we can pass it as an argument.
+  bool use_scr = !(store_addr->is_global() || store_addr->is_out());
+  // Pick a scratch register different from "tmp".
+  Register scr = (tmp == G1_scratch ? G3_scratch : G1_scratch);
+  // Make sure we use up the delay slot!
+  if (use_scr) {
+    post_filter_masm->mov(store_addr, scr);
   } else {
-
-#ifdef _LP64
-    post_filter_masm->srlx(store_addr, CardTableModRefBS::card_shift, store_addr);
-#else
-    post_filter_masm->srl(store_addr, CardTableModRefBS::card_shift, store_addr);
-#endif
-    assert(tmp != store_addr, "need separate temp reg");
-    set(bs->byte_map_base, tmp);
-    stb(G0, tmp, store_addr);
+    post_filter_masm->nop();
   }
+  generate_dirty_card_log_enqueue_if_necessary(bs->byte_map_base);
+  save_frame(0);
+  call(dirty_card_log_enqueue);
+  if (use_scr) {
+    delayed()->mov(scr, O0);
+  } else {
+    delayed()->mov(store_addr->after_save(), O0);
+  }
+  restore();

   bind(filtered);

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -555,6 +555,7 @@
   _collector_policy(cp),
   _should_unload_classes(false),
   _concurrent_cycles_since_last_unload(0),
+  _roots_scanning_options(0),
   _sweep_estimate(CMS_SweepWeight, CMS_SweepPadding)
 {
   if (ExplicitGCInvokesConcurrentAndUnloadsClasses) {

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -545,6 +545,11 @@
   bool unloaded_classes_last_cycle() const {
     return concurrent_cycles_since_last_unload() == 0;
   }

+  // Root scanning options for perm gen
+  int _roots_scanning_options;
+  int roots_scanning_options() const      { return _roots_scanning_options; }
+  void add_root_scanning_option(int o)    { _roots_scanning_options |= o;   }
+  void remove_root_scanning_option(int o) { _roots_scanning_options &= ~o;  }

   // Verification support
   CMSBitMap     _verification_mark_bm;
@@ -719,11 +724,6 @@
   NOT_PRODUCT(bool simulate_overflow();)       // sequential
   NOT_PRODUCT(bool par_simulate_overflow();)   // MT version

-  int _roots_scanning_options;
-  int roots_scanning_options() const      { return _roots_scanning_options; }
-  void add_root_scanning_option(int o)    { _roots_scanning_options |= o;   }
-  void remove_root_scanning_option(int o) { _roots_scanning_options &= ~o;  }
-
   // CMS work methods
   void checkpointRootsInitialWork(bool asynch); // initial checkpoint work

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -25,26 +25,37 @@
 #include "incls/_precompiled.incl"
 #include "incls/_concurrentG1Refine.cpp.incl"

-bool ConcurrentG1Refine::_enabled = false;
-
 ConcurrentG1Refine::ConcurrentG1Refine() :
-  _pya(PYA_continue), _last_pya(PYA_continue),
-  _last_cards_during(), _first_traversal(false),
   _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
   _hot_cache(NULL),
   _def_use_cache(false), _use_cache(false),
-  _n_periods(0), _total_cards(0), _total_travs(0)
+  _n_periods(0), _total_cards(0), _total_travs(0),
+  _threads(NULL), _n_threads(0)
 {
   if (G1ConcRefine) {
-    _cg1rThread = new ConcurrentG1RefineThread(this);
-    assert(cg1rThread() != NULL, "Conc refine should have been created");
-    assert(cg1rThread()->cg1r() == this,
-           "Conc refine thread should refer to this");
-  } else {
-    _cg1rThread = NULL;
+    _n_threads = (int)thread_num();
+    if (_n_threads > 0) {
+      _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads);
+      int worker_id_offset = (int)DirtyCardQueueSet::num_par_ids();
+      ConcurrentG1RefineThread *next = NULL;
+      for (int i = _n_threads - 1; i >= 0; i--) {
+        ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
+        assert(t != NULL, "Conc refine should have been created");
+        assert(t->cg1r() == this, "Conc refine thread should refer to this");
+        _threads[i] = t;
+        next = t;
+      }
+    }
   }
 }

+size_t ConcurrentG1Refine::thread_num() {
+  if (G1ConcRefine) {
+    return (G1ParallelRSetThreads > 0) ? G1ParallelRSetThreads : ParallelGCThreads;
+  }
+  return 0;
+}
+
 void ConcurrentG1Refine::init() {
   if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
     G1CollectedHeap* g1h = G1CollectedHeap::heap();
@@ -75,6 +86,14 @@
   }
 }

+void ConcurrentG1Refine::stop() {
+  if (_threads != NULL) {
+    for (int i = 0; i < _n_threads; i++) {
+      _threads[i]->stop();
+    }
+  }
+}
+
 ConcurrentG1Refine::~ConcurrentG1Refine() {
   if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
     assert(_card_counts != NULL, "Logic");
@@ -88,104 +107,22 @@
     assert(_hot_cache != NULL, "Logic");
     FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
   }
-}
-
-bool ConcurrentG1Refine::refine() {
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  unsigned cards_before = g1h->g1_rem_set()->conc_refine_cards();
-  clear_hot_cache();  // Any previous values in this are now invalid.
-  g1h->g1_rem_set()->concurrentRefinementPass(this);
-  _traversals++;
-  unsigned cards_after = g1h->g1_rem_set()->conc_refine_cards();
-  unsigned cards_during = cards_after-cards_before;
-  // If this is the first traversal in the current enabling
-  // and we did some cards, or if the number of cards found is decreasing
-  // sufficiently quickly, then keep going. Otherwise, sleep a while.
-  bool res =
-    (_first_traversal && cards_during > 0)
-    ||
-    (!_first_traversal && cards_during * 3 < _last_cards_during * 2);
-  _last_cards_during = cards_during;
-  _first_traversal = false;
-  return res;
-}
-
-void ConcurrentG1Refine::enable() {
-  MutexLocker x(G1ConcRefine_mon);
-  if (!_enabled) {
-    _enabled = true;
-    _first_traversal = true; _last_cards_during = 0;
-    G1ConcRefine_mon->notify_all();
-  }
-}
-
-unsigned ConcurrentG1Refine::disable() {
-  MutexLocker x(G1ConcRefine_mon);
-  if (_enabled) {
-    _enabled = false;
-    return _traversals;
-  } else {
-    return 0;
+  if (_threads != NULL) {
+    for (int i = 0; i < _n_threads; i++) {
+      delete _threads[i];
+    }
+    FREE_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _threads);
   }
 }

-void ConcurrentG1Refine::wait_for_ConcurrentG1Refine_enabled() {
-  G1ConcRefine_mon->lock();
-  while (!_enabled) {
-    G1ConcRefine_mon->wait(Mutex::_no_safepoint_check_flag);
-  }
-  G1ConcRefine_mon->unlock();
-  _traversals = 0;
-};
-
-void ConcurrentG1Refine::set_pya_restart() {
-  // If we're using the log-based RS barrier, the above will cause
-  // in-progress traversals of completed log buffers to quit early; we will
-  // also abandon all other buffers.
-  if (G1RSBarrierUseQueue) {
-    DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
-    dcqs.abandon_logs();
-    // Reset the post-yield actions.
-    _pya = PYA_continue;
-    _last_pya = PYA_continue;
-  } else {
-    _pya = PYA_restart;
+void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
+  if (_threads != NULL) {
+    for (int i = 0; i < _n_threads; i++) {
+      tc->do_thread(_threads[i]);
+    }
   }
 }

-void ConcurrentG1Refine::set_pya_cancel() {
-  _pya = PYA_cancel;
-}
-
-PostYieldAction ConcurrentG1Refine::get_pya() {
-  if (_pya != PYA_continue) {
-    jint val = _pya;
-    while (true) {
-      jint val_read = Atomic::cmpxchg(PYA_continue, &_pya, val);
-      if (val_read == val) {
-        PostYieldAction res = (PostYieldAction)val;
-        assert(res != PYA_continue, "Only the refine thread should reset.");
-        _last_pya = res;
-        return res;
-      } else {
-        val = val_read;
-      }
-    }
-  }
-  // QQQ WELL WHAT DO WE RETURN HERE???
-  // make up something!
-  return PYA_continue;
-}
-
-PostYieldAction ConcurrentG1Refine::get_last_pya() {
-  PostYieldAction res = _last_pya;
-  _last_pya = PYA_continue;
-  return res;
-}
-
-bool ConcurrentG1Refine::do_traversal() {
-  return _cg1rThread->do_traversal();
-}

 int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
   size_t card_num = (card_ptr - _ct_bot);

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -26,26 +26,9 @@
 class ConcurrentG1RefineThread;
 class G1RemSet;

-// What to do after a yield:
-enum PostYieldAction {
-  PYA_continue,  // Continue the traversal
-  PYA_restart,   // Restart
-  PYA_cancel     // It's been completed by somebody else: cancel.
-};
-
 class ConcurrentG1Refine: public CHeapObj {
-  ConcurrentG1RefineThread* _cg1rThread;
-
-  volatile jint _pya;
-  PostYieldAction _last_pya;
-
-  static bool _enabled;  // Protected by G1ConcRefine_mon.
-  unsigned _traversals;
-
-  // Number of cards processed during last refinement traversal.
-  unsigned _first_traversal;
-  unsigned _last_cards_during;
-
+  ConcurrentG1RefineThread** _threads;
+  int _n_threads;
   // The cache for card refinement.
   bool     _use_cache;
   bool     _def_use_cache;
@@ -74,37 +57,10 @@
   ~ConcurrentG1Refine();

   void init(); // Accomplish some initialization that has to wait.
-
-  // Enabled Conc refinement, waking up thread if necessary.
-  void enable();
-
-  // Returns the number of traversals performed since this refiner was enabled.
-  unsigned disable();
-
-  // Requires G1ConcRefine_mon to be held.
-  bool enabled() { return _enabled; }
-
-  // Returns only when G1 concurrent refinement has been enabled.
-  void wait_for_ConcurrentG1Refine_enabled();
+  void stop();

-  // Do one concurrent refinement pass over the card table. Returns "true"
-  // if heuristics determine that another pass should be done immediately.
-  bool refine();
-
-  // Indicate that an in-progress refinement pass should start over.
-  void set_pya_restart();
-  // Indicate that an in-progress refinement pass should quit.
-  void set_pya_cancel();
-
-  // Get the appropriate post-yield action. Also sets last_pya.
-  PostYieldAction get_pya();
-
-  // The last PYA read by "get_pya".
-  PostYieldAction get_last_pya();
-
-  bool do_traversal();
-
-  ConcurrentG1RefineThread* cg1rThread() { return _cg1rThread; }
+  // Iterate over the conc refine threads
+  void threads_do(ThreadClosure *tc);

   // If this is the first entry for the slot, writes into the cache and
   // returns NULL. If it causes an eviction, returns the evicted pointer.
@@ -129,4 +85,6 @@

   void clear_and_record_card_counts();
   void print_final_card_counts();
+
+  static size_t thread_num();
 };
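[Illustration, not part of the changeset: with the single cg1rThread() accessor
gone, clients reach the refinement workers only through threads_do() and a
ThreadClosure, mirroring the PrintOnThreadsClosure added later in this patch.
A minimal sketch, using the hypothetical closure name CountThreadsClosure:

  class CountThreadsClosure : public ThreadClosure {
    int _count;
  public:
    CountThreadsClosure() : _count(0) { }
    // Called once per refinement worker.
    virtual void do_thread(Thread* t) { _count++; }
    int count() const { return _count; }
  };

  CountThreadsClosure cl;
  G1CollectedHeap::heap()->concurrent_g1_refine()->threads_do(&cl);
  assert(cl.count() == (int)ConcurrentG1Refine::thread_num(), "sanity");
]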
diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp
--- a/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -30,12 +30,14 @@
 // The CM thread is created when the G1 garbage collector is used

 ConcurrentG1RefineThread::
-ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r) :
+ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next,
+                         int worker_id_offset, int worker_id) :
   ConcurrentGCThread(),
+  _worker_id_offset(worker_id_offset),
+  _worker_id(worker_id),
+  _active(false),
+  _next(next),
   _cg1r(cg1r),
-  _started(false),
-  _in_progress(false),
-  _do_traversal(false),
   _vtime_accum(0.0),
   _co_tracker(G1CRGroup),
   _interval_ms(5.0)
@@ -43,112 +45,6 @@
   create_and_start();
 }

-const long timeout = 200; // ms.
-
-void ConcurrentG1RefineThread::traversalBasedRefinement() {
-  _cg1r->wait_for_ConcurrentG1Refine_enabled();
-  MutexLocker x(G1ConcRefine_mon);
-  while (_cg1r->enabled()) {
-    MutexUnlocker ux(G1ConcRefine_mon);
-    ResourceMark rm;
-    HandleMark   hm;
-
-    if (G1TraceConcurrentRefinement) {
-      gclog_or_tty->print_cr("G1-Refine starting pass");
-    }
-    _sts.join();
-    bool no_sleep = _cg1r->refine();
-    _sts.leave();
-    if (!no_sleep) {
-      MutexLockerEx x(CGC_lock, Mutex::_no_safepoint_check_flag);
-      // We do this only for the timeout; we don't expect this to be signalled.
-      CGC_lock->wait(Mutex::_no_safepoint_check_flag, timeout);
-    }
-  }
-}
-
-void ConcurrentG1RefineThread::queueBasedRefinement() {
-  DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
-  // Wait for completed log buffers to exist.
-  {
-    MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
-    while (!_do_traversal && !dcqs.process_completed_buffers() &&
-           !_should_terminate) {
-      DirtyCardQ_CBL_mon->wait(Mutex::_no_safepoint_check_flag);
-    }
-  }
-
-  if (_should_terminate) {
-    return;
-  }
-
-  // Now we take them off (this doesn't hold locks while it applies
-  // closures.)  (If we did a full collection, then we'll do a full
-  // traversal.
-  _sts.join();
-  if (_do_traversal) {
-    (void)_cg1r->refine();
-    switch (_cg1r->get_last_pya()) {
-    case PYA_cancel: case PYA_continue:
-      // Continue was caught and handled inside "refine". If it's still
-      // "continue" when we get here, we're done.
-      _do_traversal = false;
-      break;
-    case PYA_restart:
-      assert(_do_traversal, "Because of Full GC.");
-      break;
-    }
-  } else {
-    int n_logs = 0;
-    int lower_limit = 0;
-    double start_vtime_sec; // only used when G1SmoothConcRefine is on
-    int prev_buffer_num; // only used when G1SmoothConcRefine is on
-
-    if (G1SmoothConcRefine) {
-      lower_limit = 0;
-      start_vtime_sec = os::elapsedVTime();
-      prev_buffer_num = (int) dcqs.completed_buffers_num();
-    } else {
-      lower_limit = DCQBarrierProcessCompletedThreshold / 4; // For now.
-    }
-    while (dcqs.apply_closure_to_completed_buffer(0, lower_limit)) {
-      double end_vtime_sec;
-      double elapsed_vtime_sec;
-      int elapsed_vtime_ms;
-      int curr_buffer_num;
-
-      if (G1SmoothConcRefine) {
-        end_vtime_sec = os::elapsedVTime();
-        elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
-        elapsed_vtime_ms = (int) (elapsed_vtime_sec * 1000.0);
-        curr_buffer_num = (int) dcqs.completed_buffers_num();
-
-        if (curr_buffer_num > prev_buffer_num ||
-            curr_buffer_num > DCQBarrierProcessCompletedThreshold) {
-          decreaseInterval(elapsed_vtime_ms);
-        } else if (curr_buffer_num < prev_buffer_num) {
-          increaseInterval(elapsed_vtime_ms);
-        }
-      }
-
-      sample_young_list_rs_lengths();
-      _co_tracker.update(false);
-
-      if (G1SmoothConcRefine) {
-        prev_buffer_num = curr_buffer_num;
-        _sts.leave();
-        os::sleep(Thread::current(), (jlong) _interval_ms, false);
-        _sts.join();
-        start_vtime_sec = os::elapsedVTime();
-      }
-      n_logs++;
-    }
-    // Make sure we harvest the PYA, if any.
-    (void)_cg1r->get_pya();
-  }
-  _sts.leave();
-}
-
 void ConcurrentG1RefineThread::sample_young_list_rs_lengths() {
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
   G1CollectorPolicy* g1p = g1h->g1_policy();
@@ -184,15 +80,97 @@
   _co_tracker.start();

   while (!_should_terminate) {
-    // wait until started is set.
-    if (G1RSBarrierUseQueue) {
-      queueBasedRefinement();
+    DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
+    // Wait for completed log buffers to exist.
+    {
+      MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+      while (((_worker_id == 0 && !dcqs.process_completed_buffers()) ||
+              (_worker_id > 0 && !is_active())) &&
+             !_should_terminate) {
+        DirtyCardQ_CBL_mon->wait(Mutex::_no_safepoint_check_flag);
+      }
+    }
+
+    if (_should_terminate) {
+      return;
+    }
+
+    // Now we take buffers off the list (this does not hold locks while
+    // it applies closures).
+    _sts.join();
+    int n_logs = 0;
+    int lower_limit = 0;
+    double start_vtime_sec; // only used when G1SmoothConcRefine is on
+    int prev_buffer_num; // only used when G1SmoothConcRefine is on
+    // This thread's activation threshold
+    int threshold = DCQBarrierProcessCompletedThreshold * _worker_id;
+    // The next thread's activation threshold
+    int next_threshold = threshold + DCQBarrierProcessCompletedThreshold;
+    int deactivation_threshold = MAX2(threshold - DCQBarrierProcessCompletedThreshold / 2, 0);
+
+    if (G1SmoothConcRefine) {
+      lower_limit = 0;
+      start_vtime_sec = os::elapsedVTime();
+      prev_buffer_num = (int) dcqs.completed_buffers_num();
     } else {
-      traversalBasedRefinement();
+      lower_limit = DCQBarrierProcessCompletedThreshold / 4; // For now.
     }
-    _sts.join();
-    _co_tracker.update();
+    while (dcqs.apply_closure_to_completed_buffer(_worker_id + _worker_id_offset, lower_limit)) {
+      double end_vtime_sec;
+      double elapsed_vtime_sec;
+      int elapsed_vtime_ms;
+      int curr_buffer_num = (int) dcqs.completed_buffers_num();
+
+      if (G1SmoothConcRefine) {
+        end_vtime_sec = os::elapsedVTime();
+        elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
+        elapsed_vtime_ms = (int) (elapsed_vtime_sec * 1000.0);
+
+        if (curr_buffer_num > prev_buffer_num ||
+            curr_buffer_num > next_threshold) {
+          decreaseInterval(elapsed_vtime_ms);
+        } else if (curr_buffer_num < prev_buffer_num) {
+          increaseInterval(elapsed_vtime_ms);
+        }
+      }
+      if (_worker_id == 0) {
+        sample_young_list_rs_lengths();
+      } else if (curr_buffer_num < deactivation_threshold) {
+        // If the number of buffers has fallen below our threshold we should
+        // deactivate. The predecessor will reactivate this thread should the
+        // number of buffers cross the threshold again.
+        MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+        deactivate();
+        if (G1TraceConcurrentRefinement) {
+          gclog_or_tty->print_cr("G1-Refine-deactivated worker %d", _worker_id);
+        }
+        break;
+      }
+      _co_tracker.update(false);
+
+      // Check if we need to activate the next thread.
+      if (curr_buffer_num > next_threshold && _next != NULL && !_next->is_active()) {
+        MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+        _next->activate();
+        DirtyCardQ_CBL_mon->notify_all();
+        if (G1TraceConcurrentRefinement) {
+          gclog_or_tty->print_cr("G1-Refine-activated worker %d", _next->_worker_id);
+        }
+      }
+
+      if (G1SmoothConcRefine) {
+        prev_buffer_num = curr_buffer_num;
+        _sts.leave();
+        os::sleep(Thread::current(), (jlong) _interval_ms, false);
+        _sts.join();
+        start_vtime_sec = os::elapsedVTime();
+      }
+      n_logs++;
+    }
+    _co_tracker.update(false);
     _sts.leave();
+
     if (os::supports_vtime()) {
       _vtime_accum = (os::elapsedVTime() - _vtime_start);
     } else {
@@ -240,7 +218,3 @@
   Thread::print();
   gclog_or_tty->cr();
 }
-
-void ConcurrentG1RefineThread::set_do_traversal(bool b) {
-  _do_traversal = b;
-}

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp
--- a/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -33,21 +33,27 @@

   double _vtime_start;  // Initial virtual time.
   double _vtime_accum;  // Initial virtual time.
+  int _worker_id;
+  int _worker_id_offset;
+  // The refinement threads form a linked list. A predecessor can activate
+  // a successor when the number of RSet update buffers crosses a certain
+  // threshold. A successor self-deactivates when the number of buffers
+  // falls below the threshold.
+  bool _active;
+  ConcurrentG1RefineThread * _next;
 public:
   virtual void run();

+  bool is_active()  { return _active;  }
+  void activate()   { _active = true;  }
+  void deactivate() { _active = false; }
+
 private:
   ConcurrentG1Refine* _cg1r;
-  bool _started;
-  bool _in_progress;
-  volatile bool _restart;

   COTracker _co_tracker;
   double _interval_ms;

-  bool _do_traversal;
-
   void decreaseInterval(int processing_time_ms) {
     double min_interval_ms = (double) processing_time_ms;
     _interval_ms = 0.8 * _interval_ms;
@@ -63,16 +69,13 @@

   void sleepBeforeNextCycle();

-  void traversalBasedRefinement();
-
-  void queueBasedRefinement();
-
   // For use by G1CollectedHeap, which is a friend.
   static SuspendibleThreadSet* sts() { return &_sts; }

 public:
   // Constructor
-  ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r);
+  ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next,
+                           int worker_id_offset, int worker_id);

   // Printing
   void print();
@@ -82,23 +85,11 @@

   ConcurrentG1Refine* cg1r() { return _cg1r; }

-
-  void set_started()       { _started = true;   }
-  void clear_started()     { _started = false;  }
-  bool started()           { return _started;   }
-
-  void set_in_progress()   { _in_progress = true;   }
-  void clear_in_progress() { _in_progress = false;  }
-  bool in_progress()       { return _in_progress;   }
-
-  void set_do_traversal(bool b);
-  bool do_traversal() { return _do_traversal; }
-
   void sample_young_list_rs_lengths();

   // Yield for GC
   void yield();

   // shutdown
-  static void stop();
+  void stop();
 };

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp
--- a/src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -80,5 +80,5 @@
   void yield();

   // shutdown
-  static void stop();
+  void stop();
 };

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/concurrentZFThread.hpp
--- a/src/share/vm/gc_implementation/g1/concurrentZFThread.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentZFThread.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -73,7 +73,7 @@
   // while holding the ZF_needed_mon lock.

   // shutdown
-  static void stop();
+  void stop();

   // Stats
   static void note_region_alloc() {_region_allocs++; }
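[Illustration, not part of the changeset: how the per-worker thresholds in the
new ConcurrentG1RefineThread::run() stagger activation. Assuming a made-up
DCQBarrierProcessCompletedThreshold of 8 completed buffers:

  // threshold              = 8 * _worker_id
  // next_threshold         = threshold + 8
  // deactivation_threshold = MAX2(threshold - 4, 0)
  //
  // worker 0: always eligible; wakes worker 1 once the backlog exceeds 8
  // worker 1: activated at 8, deactivates below 4, wakes worker 2 above 16
  // worker 2: activated at 16, deactivates below 12, and so on

Workers therefore come on line one at a time as mutators outpace the currently
active set, and drop off again as the backlog drains.]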
diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp
--- a/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -71,11 +71,11 @@
   _all_active = true;
 }

+// Determines how many mutator threads can process the buffers in parallel.
 size_t DirtyCardQueueSet::num_par_ids() {
-  return MAX2(ParallelGCThreads, (size_t)2);
+  return os::processor_count();
 }

-
 void DirtyCardQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock,
                                    int max_completed_queue,
                                    Mutex* lock, PtrQueueSet* fl_owner) {
@@ -85,8 +85,6 @@
   _shared_dirty_card_queue.set_lock(lock);

   _free_ids = new FreeIdSet((int) num_par_ids(), _cbl_mon);
-  bool b = _free_ids->claim_perm_id(0);
-  guarantee(b, "Must reserve id zero for concurrent refinement thread.");
 }

 void DirtyCardQueueSet::handle_zero_index_for_thread(JavaThread* t) {
@@ -234,7 +232,7 @@
     nd = get_completed_buffer_lock(stop_at);
   }
   bool res = apply_closure_to_completed_buffer_helper(worker_i, nd);
-  if (res) _processed_buffers_rs_thread++;
+  if (res) Atomic::inc(&_processed_buffers_rs_thread);
   return res;
 }
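[Illustration, not part of the changeset: how the parallel ids are partitioned
after this change. num_par_ids() ids are reserved for mutator threads that help
process buffers; refinement workers sit above that range via _worker_id_offset.
On a hypothetical 4-CPU machine running 2 refinement threads:

  // mutator helper ids  : 0 .. 3   (num_par_ids() == os::processor_count() == 4)
  // refinement worker 0 : 4        (_worker_id_offset + _worker_id)
  // refinement worker 1 : 5

This is why ConcurrentG1Refine's constructor passes
DirtyCardQueueSet::num_par_ids() as the worker_id_offset.]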
diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -446,8 +446,61 @@
   gclog_or_tty->print_cr("");
 }

+void G1CollectedHeap::push_dirty_cards_region(HeapRegion* hr)
+{
+  // Claim the right to put the region on the dirty cards region list
+  // by installing a self pointer.
+  HeapRegion* next = hr->get_next_dirty_cards_region();
+  if (next == NULL) {
+    HeapRegion* res = (HeapRegion*)
+      Atomic::cmpxchg_ptr(hr, hr->next_dirty_cards_region_addr(),
+                          NULL);
+    if (res == NULL) {
+      HeapRegion* head;
+      do {
+        // Put the region on the dirty cards region list.
+        head = _dirty_cards_region_list;
+        next = (HeapRegion*)
+          Atomic::cmpxchg_ptr(hr, &_dirty_cards_region_list, head);
+        if (next == head) {
+          assert(hr->get_next_dirty_cards_region() == hr,
+                 "hr->get_next_dirty_cards_region() != hr");
+          if (next == NULL) {
+            // The last region in the list points to itself.
+            hr->set_next_dirty_cards_region(hr);
+          } else {
+            hr->set_next_dirty_cards_region(next);
+          }
+        }
+      } while (next != head);
+    }
+  }
+}
+
+HeapRegion* G1CollectedHeap::pop_dirty_cards_region()
+{
+  HeapRegion* head;
+  HeapRegion* hr;
+  do {
+    head = _dirty_cards_region_list;
+    if (head == NULL) {
+      return NULL;
+    }
+    HeapRegion* new_head = head->get_next_dirty_cards_region();
+    if (head == new_head) {
+      // The last region.
+      new_head = NULL;
+    }
+    hr = (HeapRegion*)Atomic::cmpxchg_ptr(new_head, &_dirty_cards_region_list,
+                                          head);
+  } while (hr != head);
+  assert(hr != NULL, "invariant");
+  hr->set_next_dirty_cards_region(NULL);
+  return hr;
+}
+
 void G1CollectedHeap::stop_conc_gc_threads() {
-  _cg1r->cg1rThread()->stop();
+  _cg1r->stop();
   _czft->stop();
   _cmThread->stop();
 }
@@ -1001,12 +1054,8 @@

   gc_epilogue(true);

-  // Abandon concurrent refinement. This must happen last: in the
-  // dirty-card logging system, some cards may be dirty by weak-ref
-  // processing, and may be enqueued. But the whole card table is
-  // dirtied, so this should abandon those logs, and set "do_traversal"
-  // to true.
-  concurrent_g1_refine()->set_pya_restart();
+  // Discard all rset updates
+  JavaThread::dirty_card_queue_set().abandon_logs();
   assert(!G1DeferredRSUpdate
          || (G1DeferredRSUpdate && (dirty_card_queue_set().completed_buffers_num() == 0)), "Should not be any");
   assert(regions_accounted_for(), "Region leakage!");
@@ -1333,7 +1382,8 @@
   _gc_time_stamp(0),
   _surviving_young_words(NULL),
   _in_cset_fast_test(NULL),
-  _in_cset_fast_test_base(NULL) {
+  _in_cset_fast_test_base(NULL),
+  _dirty_cards_region_list(NULL) {
   _g1h = this; // To catch bugs.
   if (_process_strong_tasks == NULL || !_process_strong_tasks->valid()) {
     vm_exit_during_initialization("Failed necessary allocation.");
@@ -1521,12 +1571,12 @@
                                                SATB_Q_FL_lock,
                                                0,
                                                Shared_SATB_Q_lock);
-  if (G1RSBarrierUseQueue) {
-    JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
-                                                  DirtyCardQ_FL_lock,
-                                                  G1DirtyCardQueueMax,
-                                                  Shared_DirtyCardQ_lock);
-  }
+
+  JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
+                                                DirtyCardQ_FL_lock,
+                                                G1DirtyCardQueueMax,
+                                                Shared_DirtyCardQ_lock);
+
   if (G1DeferredRSUpdate) {
     dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
                                       DirtyCardQ_FL_lock,
@@ -2249,6 +2299,15 @@
   _hrs->iterate(&blk);
 }

+class PrintOnThreadsClosure : public ThreadClosure {
+  outputStream* _st;
+public:
+  PrintOnThreadsClosure(outputStream* st) : _st(st) { }
+  virtual void do_thread(Thread *t) {
+    t->print_on(_st);
+  }
+};
+
 void G1CollectedHeap::print_gc_threads_on(outputStream* st) const {
   if (ParallelGCThreads > 0) {
     workers()->print_worker_threads();
@@ -2256,8 +2315,9 @@
   st->print("\"G1 concurrent mark GC Thread\" ");
   _cmThread->print();
   st->cr();
-  st->print("\"G1 concurrent refinement GC Thread\" ");
-  _cg1r->cg1rThread()->print_on(st);
+  st->print("\"G1 concurrent refinement GC Threads\" ");
+  PrintOnThreadsClosure p(st);
+  _cg1r->threads_do(&p);
   st->cr();
   st->print("\"G1 zero-fill GC Thread\" ");
   _czft->print_on(st);
@@ -2269,7 +2329,7 @@
     workers()->threads_do(tc);
   }
   tc->do_thread(_cmThread);
-  tc->do_thread(_cg1r->cg1rThread());
+  _cg1r->threads_do(tc);
   tc->do_thread(_czft);
 }

@@ -4685,15 +4745,58 @@
   }
 }

+
+class G1ParCleanupCTTask : public AbstractGangTask {
+  CardTableModRefBS* _ct_bs;
+  G1CollectedHeap* _g1h;
+public:
+  G1ParCleanupCTTask(CardTableModRefBS* ct_bs,
+                     G1CollectedHeap* g1h) :
+    AbstractGangTask("G1 Par Cleanup CT Task"),
+    _ct_bs(ct_bs),
+    _g1h(g1h)
+  { }
+
+  void work(int i) {
+    HeapRegion* r;
+    while ((r = _g1h->pop_dirty_cards_region()) != NULL) {
+      clear_cards(r);
+    }
+  }
+  void clear_cards(HeapRegion* r) {
+    // Cards for Survivor and Scan-Only regions will be dirtied later.
+    if (!r->is_scan_only() && !r->is_survivor()) {
+      _ct_bs->clear(MemRegion(r->bottom(), r->end()));
+    }
+  }
+};
+
+
 void G1CollectedHeap::cleanUpCardTable() {
   CardTableModRefBS* ct_bs = (CardTableModRefBS*) (barrier_set());
   double start = os::elapsedTime();

-  ct_bs->clear(_g1_committed);
-
+  // Iterate over the dirty cards region list.
+  G1ParCleanupCTTask cleanup_task(ct_bs, this);
+  if (ParallelGCThreads > 0) {
+    set_par_threads(workers()->total_workers());
+    workers()->run_task(&cleanup_task);
+    set_par_threads(0);
+  } else {
+    while (_dirty_cards_region_list) {
+      HeapRegion* r = _dirty_cards_region_list;
+      cleanup_task.clear_cards(r);
+      _dirty_cards_region_list = r->get_next_dirty_cards_region();
+      if (_dirty_cards_region_list == r) {
+        // The last region.
+        _dirty_cards_region_list = NULL;
+      }
+      r->set_next_dirty_cards_region(NULL);
+    }
+  }
   // now, redirty the cards of the scan-only and survivor regions
   // (it seemed faster to do it this way, instead of iterating over
-  // all regions and then clearing / dirtying as approprite)
+  // all regions and then clearing / dirtying as appropriate)
   dirtyCardsForYoungRegions(ct_bs, _young_list->first_scan_only_region());
   dirtyCardsForYoungRegions(ct_bs, _young_list->first_survivor_region());

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -158,6 +158,7 @@
   friend class RegionSorter;
   friend class CountRCClosure;
   friend class EvacPopObjClosure;
+  friend class G1ParCleanupCTTask;

   // Other related classes.
   friend class G1MarkSweep;
@@ -1191,6 +1192,16 @@
   ConcurrentMark* concurrent_mark() const { return _cm; }
   ConcurrentG1Refine* concurrent_g1_refine() const { return _cg1r; }

+  // The dirty cards region list is used to record a subset of regions
+  // whose cards need clearing. The list is populated during remembered
+  // set scanning and drained during card table cleanup. Although the
+  // methods are reentrant, population/draining phases must not overlap.
+  // For synchronization purposes the last element on the list points to
+  // itself.
+  HeapRegion* _dirty_cards_region_list;
+  void push_dirty_cards_region(HeapRegion* hr);
+  HeapRegion* pop_dirty_cards_region();
+
 public:
   void stop_conc_gc_threads();
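[Illustration, not part of the changeset: the push_dirty_cards_region() /
pop_dirty_cards_region() protocol in miniature. A region with a NULL next
pointer is "not on the list", so the tail cannot use NULL as its terminator
and points to itself instead. A standalone sketch of the push side, using
std::atomic rather than HotSpot's Atomic::cmpxchg_ptr:

  #include <atomic>

  struct Node {
    std::atomic<Node*> next{nullptr};   // nullptr <=> not on any list
  };

  std::atomic<Node*> list_head{nullptr};

  void push(Node* n) {
    Node* expected = nullptr;
    // Claim the node: CAS its next field from nullptr to itself.
    if (!n->next.compare_exchange_strong(expected, n)) return; // already queued
    Node* head = list_head.load();
    do {
      // The last element keeps the self-pointer; otherwise link to old head.
      n->next.store(head == nullptr ? n : head);
    } while (!list_head.compare_exchange_weak(head, n));
  }

Pop mirrors this: swing the head to head->next, treat a self-pointer as "last
element", and only then clear the popped node's next field.]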
diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -167,11 +167,6 @@

   _all_full_gc_times_ms(new NumberSeq()),

-  _conc_refine_enabled(0),
-  _conc_refine_zero_traversals(0),
-  _conc_refine_max_traversals(0),
-  _conc_refine_current_delta(G1ConcRefineInitialDelta),
-
   // G1PausesBtwnConcMark defaults to -1
   // so the hack is to do the cast  QQQ FIXME
   _pauses_btwn_concurrent_mark((size_t)G1PausesBtwnConcMark),
@@ -1634,9 +1629,8 @@
     print_stats(1, "Parallel Time", _cur_collection_par_time_ms);
     print_par_stats(2, "Update RS (Start)", _par_last_update_rs_start_times_ms, false);
     print_par_stats(2, "Update RS", _par_last_update_rs_times_ms);
-    if (G1RSBarrierUseQueue)
-      print_par_buffers(3, "Processed Buffers",
-                        _par_last_update_rs_processed_buffers, true);
+    print_par_buffers(3, "Processed Buffers",
+                      _par_last_update_rs_processed_buffers, true);
     print_par_stats(2, "Ext Root Scanning", _par_last_ext_root_scan_times_ms);
     print_par_stats(2, "Mark Stack Scanning", _par_last_mark_stack_scan_times_ms);
     print_par_stats(2, "Scan-Only Scanning", _par_last_scan_only_times_ms);
@@ -1649,9 +1643,8 @@
     print_stats(1, "Clear CT", _cur_clear_ct_time_ms);
   } else {
     print_stats(1, "Update RS", update_rs_time);
-    if (G1RSBarrierUseQueue)
-      print_stats(2, "Processed Buffers",
-                  (int)update_rs_processed_buffers);
+    print_stats(2, "Processed Buffers",
+                (int)update_rs_processed_buffers);
     print_stats(1, "Ext Root Scanning", ext_root_scan_time);
     print_stats(1, "Mark Stack Scanning", mark_stack_scan_time);
     print_stats(1, "Scan-Only Scanning", scan_only_time);
@@ -2467,18 +2460,6 @@
                            (double) _region_num_young / (double) all_region_num * 100.0,
                            _region_num_tenured,
                            (double) _region_num_tenured / (double) all_region_num * 100.0);
-
-    if (!G1RSBarrierUseQueue) {
-      gclog_or_tty->print_cr("Of %d times conc refinement was enabled, %d (%7.2f%%) "
-                             "did zero traversals.",
-                             _conc_refine_enabled, _conc_refine_zero_traversals,
-                             _conc_refine_enabled > 0 ?
-                             100.0 * (float)_conc_refine_zero_traversals/
-                             (float)_conc_refine_enabled : 0.0);
-      gclog_or_tty->print_cr("  Max # of traversals = %d.",
-                             _conc_refine_max_traversals);
-      gclog_or_tty->print_cr("");
-    }
   }
   if (TraceGen1Time) {
     if (_all_full_gc_times_ms->num() > 0) {
@@ -2500,38 +2481,6 @@
 #endif // PRODUCT
 }

-void G1CollectorPolicy::update_conc_refine_data() {
-  unsigned traversals = _g1->concurrent_g1_refine()->disable();
-  if (traversals == 0) _conc_refine_zero_traversals++;
-  _conc_refine_max_traversals = MAX2(_conc_refine_max_traversals,
-                                     (size_t)traversals);
-
-  if (G1PolicyVerbose > 1)
-    gclog_or_tty->print_cr("Did a CR traversal series: %d traversals.", traversals);
-  double multiplier = 1.0;
-  if (traversals == 0) {
-    multiplier = 4.0;
-  } else if (traversals > (size_t)G1ConcRefineTargTraversals) {
-    multiplier = 1.0/1.5;
-  } else if (traversals < (size_t)G1ConcRefineTargTraversals) {
-    multiplier = 1.5;
-  }
-  if (G1PolicyVerbose > 1) {
-    gclog_or_tty->print_cr("  Multiplier = %7.2f.", multiplier);
-    gclog_or_tty->print("  Delta went from %d regions to ",
-                        _conc_refine_current_delta);
-  }
-  _conc_refine_current_delta =
-    MIN2(_g1->n_regions(),
-         (size_t)(_conc_refine_current_delta * multiplier));
-  _conc_refine_current_delta =
-    MAX2(_conc_refine_current_delta, (size_t)1);
-  if (G1PolicyVerbose > 1) {
-    gclog_or_tty->print_cr("%d regions.", _conc_refine_current_delta);
-  }
-  _conc_refine_enabled++;
-}
-
 bool G1CollectorPolicy::should_add_next_region_to_young_list() {
   assert(in_young_gc_mode(), "should be in young GC mode");

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -637,18 +637,6 @@
   // The number of collection pauses at the end of the last mark.
   size_t _n_pauses_at_mark_end;

-  // ==== This section is for stats related to starting Conc Refinement on time.
-  size_t _conc_refine_enabled;
-  size_t _conc_refine_zero_traversals;
-  size_t _conc_refine_max_traversals;
-  // In # of heap regions.
-  size_t _conc_refine_current_delta;
-
-  // At the beginning of a collection pause, update the variables above,
-  // especially the "delta".
-  void update_conc_refine_data();
-  // ====
-
   // Stash a pointer to the g1 heap.
   G1CollectedHeap* _g1;

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1RemSet.cpp
--- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -105,28 +105,6 @@
   _g1->heap_region_iterate(&rc);
 }

-class UpdateRSOutOfRegionClosure: public HeapRegionClosure {
-  G1CollectedHeap*    _g1h;
-  ModRefBarrierSet*   _mr_bs;
-  UpdateRSOopClosure  _cl;
-  int _worker_i;
-public:
-  UpdateRSOutOfRegionClosure(G1CollectedHeap* g1, int worker_i = 0) :
-    _cl(g1->g1_rem_set()->as_HRInto_G1RemSet(), worker_i),
-    _mr_bs(g1->mr_bs()),
-    _worker_i(worker_i),
-    _g1h(g1)
-    {}
-  bool doHeapRegion(HeapRegion* r) {
-    if (!r->in_collection_set() && !r->continuesHumongous()) {
-      _cl.set_from(r);
-      r->set_next_filter_kind(HeapRegionDCTOC::OutOfRegionFilterKind);
-      _mr_bs->mod_oop_in_space_iterate(r, &_cl, true, true);
-    }
-    return false;
-  }
-};
-
 class VerifyRSCleanCardOopClosure: public OopClosure {
   G1CollectedHeap* _g1;
 public:
@@ -241,6 +219,7 @@
     HeapRegionRemSet* hrrs = r->rem_set();
     if (hrrs->iter_is_complete()) return false; // All done.
     if (!_try_claimed && !hrrs->claim_iter()) return false;
+    _g1h->push_dirty_cards_region(r);
    // If we didn't return above, then
    //   _try_claimed || r->claim_iter()
    // is true: either we're supposed to work on claimed-but-not-complete
@@ -264,6 +243,10 @@
       assert(card_region != NULL, "Yielding cards not in the heap?");
       _cards++;

+      if (!card_region->is_on_dirty_cards_region_list()) {
+        _g1h->push_dirty_cards_region(card_region);
+      }
+
       // If the card is dirty, then we will scan it during updateRS.
       if (!card_region->in_collection_set() && !_ct_bs->is_card_dirty(card_index)) {
         if (!_ct_bs->is_card_claimed(card_index) && _ct_bs->claim_card(card_index)) {
@@ -350,30 +333,17 @@
   double start = os::elapsedTime();
   _g1p->record_update_rs_start_time(worker_i, start * 1000.0);

-  if (G1RSBarrierUseQueue && !cg1r->do_traversal()) {
-    // Apply the appropriate closure to all remaining log entries.
-    _g1->iterate_dirty_card_closure(false, worker_i);
-    // Now there should be no dirty cards.
-    if (G1RSLogCheckCardTable) {
-      CountNonCleanMemRegionClosure cl(_g1);
-      _ct_bs->mod_card_iterate(&cl);
-      // XXX This isn't true any more: keeping cards of young regions
-      // marked dirty broke it. Need some reasonable fix.
-      guarantee(cl.n() == 0, "Card table should be clean.");
-    }
-  } else {
-    UpdateRSOutOfRegionClosure update_rs(_g1, worker_i);
-    _g1->heap_region_iterate(&update_rs);
-    // We did a traversal; no further one is necessary.
-    if (G1RSBarrierUseQueue) {
-      assert(cg1r->do_traversal(), "Or we shouldn't have gotten here.");
-      cg1r->set_pya_cancel();
-    }
-    if (_cg1r->use_cache()) {
-      _cg1r->clear_and_record_card_counts();
-      _cg1r->clear_hot_cache();
-    }
+  // Apply the appropriate closure to all remaining log entries.
+  _g1->iterate_dirty_card_closure(false, worker_i);
+  // Now there should be no dirty cards.
+  if (G1RSLogCheckCardTable) {
+    CountNonCleanMemRegionClosure cl(_g1);
+    _ct_bs->mod_card_iterate(&cl);
+    // XXX This isn't true any more: keeping cards of young regions
+    // marked dirty broke it. Need some reasonable fix.
+    guarantee(cl.n() == 0, "Card table should be clean.");
   }
+
   _g1p->record_update_rs_time(worker_i, (os::elapsedTime() - start) * 1000.0);
 }
@@ -486,11 +456,6 @@
                                                * 1000.0);
 }

-void HRInto_G1RemSet::set_par_traversal(bool b) {
-  _par_traversal_in_progress = b;
-  HeapRegionRemSet::set_par_traversal(b);
-}
-
 void HRInto_G1RemSet::cleanupHRRS() {
   HeapRegionRemSet::cleanup();
 }
@@ -527,7 +492,7 @@
       updateRS(worker_i);
       scanNewRefsRS(oc, worker_i);
     } else {
-      _g1p->record_update_rs_start_time(worker_i, os::elapsedTime());
+      _g1p->record_update_rs_start_time(worker_i, os::elapsedTime() * 1000.0);
       _g1p->record_update_rs_processed_buffers(worker_i, 0.0);
       _g1p->record_update_rs_time(worker_i, 0.0);
       _g1p->record_scan_new_refs_time(worker_i, 0.0);
@@ -535,7 +500,7 @@
     if (G1ParallelRSetScanningEnabled || (worker_i == 0)) {
       scanRS(oc, worker_i);
     } else {
-      _g1p->record_scan_rs_start_time(worker_i, os::elapsedTime());
+      _g1p->record_scan_rs_start_time(worker_i, os::elapsedTime() * 1000.0);
       _g1p->record_scan_rs_time(worker_i, 0.0);
     }
   } else {
@@ -562,11 +527,6 @@
   if (ParallelGCThreads > 0) {
     set_par_traversal(true);
     _seq_task->set_par_threads((int)n_workers());
-    if (cg1r->do_traversal()) {
-      updateRS(0);
-      // Have to do this again after updaters
-      cleanupHRRS();
-    }
   }
   guarantee( _cards_scanned == NULL, "invariant" );
   _cards_scanned = NEW_C_HEAP_ARRAY(size_t, n_workers());
@@ -647,11 +607,8 @@
   _g1->collection_set_iterate(&iterClosure);
   // Set all cards back to clean.
   _g1->cleanUpCardTable();
+
   if (ParallelGCThreads > 0) {
-    ConcurrentG1Refine* cg1r = _g1->concurrent_g1_refine();
-    if (cg1r->do_traversal()) {
-      cg1r->cg1rThread()->set_do_traversal(false);
-    }
     set_par_traversal(false);
   }

@@ -721,139 +678,8 @@
 }

-class ConcRefineRegionClosure: public HeapRegionClosure {
-  G1CollectedHeap* _g1h;
-  CardTableModRefBS* _ctbs;
-  ConcurrentGCThread* _cgc_thrd;
-  ConcurrentG1Refine* _cg1r;
-  unsigned _cards_processed;
-  UpdateRSOopClosure _update_rs_oop_cl;
-public:
-  ConcRefineRegionClosure(CardTableModRefBS* ctbs,
-                          ConcurrentG1Refine* cg1r,
-                          HRInto_G1RemSet* g1rs) :
-    _ctbs(ctbs), _cg1r(cg1r), _cgc_thrd(cg1r->cg1rThread()),
-    _update_rs_oop_cl(g1rs), _cards_processed(0),
-    _g1h(G1CollectedHeap::heap())
-  {}
-
-  bool doHeapRegion(HeapRegion* r) {
-    if (!r->in_collection_set() &&
-        !r->continuesHumongous() &&
-        !r->is_young()) {
-      _update_rs_oop_cl.set_from(r);
-      UpdateRSObjectClosure update_rs_obj_cl(&_update_rs_oop_cl);
-
-      // For each run of dirty card in the region:
-      //   1) Clear the cards.
-      //   2) Process the range corresponding to the run, adding any
-      //      necessary RS entries.
-      // 1 must precede 2, so that a concurrent modification redirties the
-      // card. If a processing attempt does not succeed, because it runs
-      // into an unparseable region, we will do binary search to find the
-      // beginning of the next parseable region.
-      HeapWord* startAddr = r->bottom();
-      HeapWord* endAddr = r->used_region().end();
-      HeapWord* lastAddr;
-      HeapWord* nextAddr;
-
-      for (nextAddr = lastAddr = startAddr;
-           nextAddr < endAddr;
-           nextAddr = lastAddr) {
-        MemRegion dirtyRegion;
-
-        // Get and clear dirty region from card table
-        MemRegion next_mr(nextAddr, endAddr);
-        dirtyRegion =
-          _ctbs->dirty_card_range_after_reset(
-            next_mr,
-            true, CardTableModRefBS::clean_card_val());
-        assert(dirtyRegion.start() >= nextAddr,
-               "returned region inconsistent?");
-
-        if (!dirtyRegion.is_empty()) {
-          HeapWord* stop_point =
-            r->object_iterate_mem_careful(dirtyRegion,
-                                          &update_rs_obj_cl);
-          if (stop_point == NULL) {
-            lastAddr = dirtyRegion.end();
-            _cards_processed +=
-              (int) (dirtyRegion.word_size() / CardTableModRefBS::card_size_in_words);
-          } else {
-            // We're going to skip one or more cards that we can't parse.
-            HeapWord* next_parseable_card =
-              r->next_block_start_careful(stop_point);
-            // Round this up to a card boundary.
-            next_parseable_card =
-              _ctbs->addr_for(_ctbs->byte_after_const(next_parseable_card));
-            // Now we invalidate the intervening cards so we'll see them
-            // again.
-            MemRegion remaining_dirty =
-              MemRegion(stop_point, dirtyRegion.end());
-            MemRegion skipped =
-              MemRegion(stop_point, next_parseable_card);
-            _ctbs->invalidate(skipped.intersection(remaining_dirty));
-
-            // Now start up again where we can parse.
-            lastAddr = next_parseable_card;
-
-            // Count how many we did completely.
-            _cards_processed +=
-              (stop_point - dirtyRegion.start()) /
-              CardTableModRefBS::card_size_in_words;
-          }
-          // Allow interruption at regular intervals.
-          // (Might need to make them more regular, if we get big
-          // dirty regions.)
-          if (_cgc_thrd != NULL) {
-            if (_cgc_thrd->should_yield()) {
-              _cgc_thrd->yield();
-              switch (_cg1r->get_pya()) {
-              case PYA_continue:
-                // This may have changed: re-read.
-                endAddr = r->used_region().end();
-                continue;
-              case PYA_restart: case PYA_cancel:
-                return true;
-              }
-            }
-          }
-        } else {
-          break;
-        }
-      }
-    }
-    // A good yield opportunity.
-    if (_cgc_thrd != NULL) {
-      if (_cgc_thrd->should_yield()) {
-        _cgc_thrd->yield();
-        switch (_cg1r->get_pya()) {
-        case PYA_restart: case PYA_cancel:
-          return true;
-        default:
-          break;
-        }
-      }
-    }
-    return false;
-  }
-
-  unsigned cards_processed() { return _cards_processed; }
-};
-
-
-void HRInto_G1RemSet::concurrentRefinementPass(ConcurrentG1Refine* cg1r) {
-  ConcRefineRegionClosure cr_cl(ct_bs(), cg1r, this);
-  _g1->heap_region_iterate(&cr_cl);
-  _conc_refine_traversals++;
-  _conc_refine_cards += cr_cl.cards_processed();
-}
-
 static IntHistogram out_of_histo(50, 50);

-
 void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
   // If the card is no longer dirty, nothing to do.
   if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
@@ -983,10 +809,16 @@
   HeapRegion* max_mem_sz_region() { return _max_mem_sz_region; }
 };

+class PrintRSThreadVTimeClosure : public ThreadClosure {
+public:
+  virtual void do_thread(Thread *t) {
+    ConcurrentG1RefineThread* crt = (ConcurrentG1RefineThread*) t;
+    gclog_or_tty->print("    %5.2f", crt->vtime_accum());
+  }
+};
+
 void HRInto_G1RemSet::print_summary_info() {
   G1CollectedHeap* g1 = G1CollectedHeap::heap();
-  ConcurrentG1RefineThread* cg1r_thrd =
-    g1->concurrent_g1_refine()->cg1rThread();

 #if CARD_REPEAT_HISTO
   gclog_or_tty->print_cr("\nG1 card_repeat count histogram: ");
@@ -999,15 +831,13 @@
     gclog_or_tty->print_cr("  # of CS ptrs --> # of cards with that number.");
     out_of_histo.print_on(gclog_or_tty);
   }
-  gclog_or_tty->print_cr("\n Concurrent RS processed %d cards in "
-                         "%5.2fs.",
-                         _conc_refine_cards, cg1r_thrd->vtime_accum());
-
+  gclog_or_tty->print_cr("\n Concurrent RS processed %d cards",
+                         _conc_refine_cards);
   DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
   jint tot_processed_buffers =
     dcqs.processed_buffers_mut() + dcqs.processed_buffers_rs_thread();
   gclog_or_tty->print_cr("  Of %d completed buffers:", tot_processed_buffers);
-  gclog_or_tty->print_cr("     %8d (%5.1f%%) by conc RS thread.",
+  gclog_or_tty->print_cr("     %8d (%5.1f%%) by conc RS threads.",
                          dcqs.processed_buffers_rs_thread(),
                          100.0*(float)dcqs.processed_buffers_rs_thread()/
                          (float)tot_processed_buffers);
@@ -1015,15 +845,12 @@
                          dcqs.processed_buffers_mut(),
                          100.0*(float)dcqs.processed_buffers_mut()/
                          (float)tot_processed_buffers);
-  gclog_or_tty->print_cr("   Did %d concurrent refinement traversals.",
-                         _conc_refine_traversals);
-  if (!G1RSBarrierUseQueue) {
-    gclog_or_tty->print_cr("   Scanned %8.2f cards/traversal.",
-                           _conc_refine_traversals > 0 ?
-                           (float)_conc_refine_cards/(float)_conc_refine_traversals :
-                           0);
-  }
+  gclog_or_tty->print_cr("  Conc RS threads times (s)");
+  PrintRSThreadVTimeClosure p;
+  gclog_or_tty->print("     ");
+  g1->concurrent_g1_refine()->threads_do(&p);
   gclog_or_tty->print_cr("");
+
   if (G1UseHRIntoRS) {
     HRRSStatsIter blk;
     g1->heap_region_iterate(&blk);

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1RemSet.hpp
--- a/src/share/vm/gc_implementation/g1/g1RemSet.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -33,15 +33,12 @@
 class G1RemSet: public CHeapObj {
 protected:
   G1CollectedHeap* _g1;
-
-  unsigned _conc_refine_traversals;
   unsigned _conc_refine_cards;
-
   size_t n_workers();

 public:
   G1RemSet(G1CollectedHeap* g1) :
-    _g1(g1), _conc_refine_traversals(0), _conc_refine_cards(0)
+    _g1(g1), _conc_refine_cards(0)
   {}

   // Invoke "blk->do_oop" on all pointers into the CS in object in regions
@@ -81,19 +78,11 @@
   virtual void scrub_par(BitMap* region_bm, BitMap* card_bm,
                          int worker_num, int claim_val) = 0;

-  // Do any "refinement" activity that might be appropriate to the given
-  // G1RemSet. If "refinement" has iterative "passes", do one pass.
-  // If "t" is non-NULL, it is the thread performing the refinement.
-  // Default implementation does nothing.
-  virtual void concurrentRefinementPass(ConcurrentG1Refine* cg1r) {}
-
   // Refine the card corresponding to "card_ptr". If "sts" is non-NULL,
   // join and leave around parts that must be atomic wrt GC. (NULL means
   // being done at a safepoint.)
   virtual void concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {}

-  unsigned conc_refine_cards() { return _conc_refine_cards; }
-
   // Print any relevant summary info.
   virtual void print_summary_info() {}

@@ -153,7 +142,7 @@
   // progress. If so, then cards added to remembered sets should also have
   // their references into the collection summarized in "_new_refs".
   bool _par_traversal_in_progress;
-  void set_par_traversal(bool b);
+  void set_par_traversal(bool b) { _par_traversal_in_progress = b; }
   GrowableArray<oop*>** _new_refs;
   void new_refs_iterate(OopClosure* cl);

@@ -194,7 +183,6 @@
   void scrub_par(BitMap* region_bm, BitMap* card_bm,
                  int worker_num, int claim_val);

-  virtual void concurrentRefinementPass(ConcurrentG1Refine* t);
   virtual void concurrentRefineOneCard(jbyte* card_ptr, int worker_i);

   virtual void print_summary_info();

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/g1_globals.hpp
--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -147,9 +147,6 @@
   develop(bool, G1PrintCTFilterStats, false,                                \
           "If true, print stats on RS filtering effectiveness")             \
                                                                             \
-  develop(bool, G1RSBarrierUseQueue, true,                                  \
-          "If true, use queueing RS barrier")                               \
-                                                                            \
   develop(bool, G1DeferredRSUpdate, true,                                   \
           "If true, use deferred RS updates")                               \
                                                                             \
@@ -253,6 +250,10 @@
                                                                             \
   experimental(bool, G1ParallelRSetScanningEnabled, false,                  \
           "Enables the parallelization of remembered set scanning "         \
-          "during evacuation pauses")
+          "during evacuation pauses")                                       \
+                                                                            \
+  product(uintx, G1ParallelRSetThreads, 0,                                  \
+          "If non-zero, the number of parallel rem set update threads; "    \
+          "otherwise the value is determined ergonomically.")

 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/heapRegion.cpp
--- a/src/share/vm/gc_implementation/g1/heapRegion.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/heapRegion.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -351,6 +351,7 @@
     _claimed(InitialClaimValue), _evacuation_failed(false),
     _prev_marked_bytes(0), _next_marked_bytes(0), _sort_index(-1),
     _young_type(NotYoung), _next_young_region(NULL),
+    _next_dirty_cards_region(NULL),
     _young_index_in_cset(-1), _surv_rate_group(NULL), _age_index(-1),
     _rem_set(NULL), _zfs(NotZeroFilled)
 {
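[Illustration, not part of the changeset: the new flag in use. By default the
refinement thread count follows ParallelGCThreads; the flag overrides it, and
it only matters while G1ConcRefine is enabled, per
ConcurrentG1Refine::thread_num() above:

  # ergonomic: thread_num() == ParallelGCThreads
  java -XX:+UseG1GC ...

  # explicit: four parallel rem set update (refinement) threads
  java -XX:+UseG1GC -XX:G1ParallelRSetThreads=4 ...
]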
diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/heapRegion.hpp
--- a/src/share/vm/gc_implementation/g1/heapRegion.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/heapRegion.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -227,6 +227,9 @@
   // next region in the young "generation" region set
   HeapRegion* _next_young_region;

+  // Next region whose cards need cleaning
+  HeapRegion* _next_dirty_cards_region;
+
   // For parallel heapRegion traversal.
   jint _claimed;
@@ -468,6 +471,11 @@
     _next_young_region = hr;
   }

+  HeapRegion* get_next_dirty_cards_region() const { return _next_dirty_cards_region; }
+  HeapRegion** next_dirty_cards_region_addr() { return &_next_dirty_cards_region; }
+  void set_next_dirty_cards_region(HeapRegion* hr) { _next_dirty_cards_region = hr; }
+  bool is_on_dirty_cards_region_list() const { return get_next_dirty_cards_region() != NULL; }
+
   // Allows logical separation between objects allocated before and after.
   void save_marks();

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp
--- a/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -1052,18 +1052,11 @@
 }

-
-bool HeapRegionRemSet::_par_traversal = false;
-
-void HeapRegionRemSet::set_par_traversal(bool b) {
-  assert(_par_traversal != b, "Proper alternation...");
-  _par_traversal = b;
-}
-
+// Determines how many threads can add records to an rset in parallel.
+// This can be done by either mutator threads together with the
+// concurrent refinement threads or GC threads.
 int HeapRegionRemSet::num_par_rem_sets() {
-  // We always have at least two, so that a mutator thread can claim an
-  // id and add to a rem set.
-  return (int) MAX2(ParallelGCThreads, (size_t)2);
+  return (int)MAX2(DirtyCardQueueSet::num_par_ids() + ConcurrentG1Refine::thread_num(), ParallelGCThreads);
 }

 HeapRegionRemSet::HeapRegionRemSet(G1BlockOffsetSharedArray* bosa,
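[Illustration, not part of the changeset: the arithmetic behind the new
num_par_rem_sets(). On a hypothetical 8-CPU machine with ParallelGCThreads=8
and the ergonomic thread_num() == 8:

  // concurrent phase : num_par_ids() + thread_num() = 8 + 8 = 16 writers
  //                    (mutator helpers) (refinement workers)
  // evacuation pause : ParallelGCThreads            =      8 writers
  // num_par_rem_sets() = MAX2(16, 8)                =     16

Every potential writer gets its own id, so records can be added to a region's
remembered set without further synchronization.]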
diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp
--- a/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -177,8 +177,6 @@
   G1BlockOffsetSharedArray* _bosa;
   G1BlockOffsetSharedArray* bosa() const { return _bosa; }

-  static bool _par_traversal;
-
   OtherRegionsTable _other_regions;

   // One set bit for every region that has an entry for this one.
@@ -211,8 +209,6 @@
                    HeapRegion* hr);

   static int num_par_rem_sets();
-  static bool par_traversal() { return _par_traversal; }
-  static void set_par_traversal(bool b);

   HeapRegion* hr() const {
     return _other_regions.hr();

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/g1/ptrQueue.cpp
--- a/src/share/vm/gc_implementation/g1/ptrQueue.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/ptrQueue.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -172,7 +172,7 @@

   _n_completed_buffers++;
   if (!_process_completed &&
-      _n_completed_buffers == _process_completed_threshold) {
+      _n_completed_buffers >= _process_completed_threshold) {
     _process_completed = true;
     if (_notify_when_complete)
       _cbl_mon->notify_all();

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/includeDB_gc_g1
--- a/src/share/vm/gc_implementation/includeDB_gc_g1	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/includeDB_gc_g1	Wed Jun 10 14:57:21 2009 -0700
@@ -49,6 +49,8 @@
 concurrentG1Refine.hpp			globalDefinitions.hpp
 concurrentG1Refine.hpp			allocation.hpp
+concurrentG1Refine.hpp			thread.hpp
+
 concurrentG1RefineThread.cpp		concurrentG1Refine.hpp
 concurrentG1RefineThread.cpp		concurrentG1RefineThread.hpp
@@ -280,6 +282,7 @@
 heapRegionRemSet.cpp			allocation.hpp
 heapRegionRemSet.cpp			bitMap.inline.hpp
+heapRegionRemSet.cpp			concurrentG1Refine.hpp
 heapRegionRemSet.cpp			g1BlockOffsetTable.inline.hpp
 heapRegionRemSet.cpp			g1CollectedHeap.inline.hpp
 heapRegionRemSet.cpp			heapRegionRemSet.hpp

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/shared/concurrentGCThread.cpp
--- a/src/share/vm/gc_implementation/shared/concurrentGCThread.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/shared/concurrentGCThread.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -27,13 +27,12 @@
 # include "incls/_precompiled.incl"
 # include "incls/_concurrentGCThread.cpp.incl"

-bool ConcurrentGCThread::_should_terminate = false;
-bool ConcurrentGCThread::_has_terminated   = false;
 int  ConcurrentGCThread::_CGC_flag         = CGC_nil;

 SuspendibleThreadSet ConcurrentGCThread::_sts;

-ConcurrentGCThread::ConcurrentGCThread() {
+ConcurrentGCThread::ConcurrentGCThread() :
+  _should_terminate(false), _has_terminated(false) {
   _sts.initialize();
 };

diff -r eacd97c88873 -r 7295839252de src/share/vm/gc_implementation/shared/concurrentGCThread.hpp
--- a/src/share/vm/gc_implementation/shared/concurrentGCThread.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/gc_implementation/shared/concurrentGCThread.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -72,8 +72,8 @@
   friend class VMStructs;

 protected:
-  static bool _should_terminate;
-  static bool _has_terminated;
+  bool _should_terminate;
+  bool _has_terminated;

   enum CGC_flag_type {
     CGC_nil           = 0x0,

diff -r eacd97c88873 -r 7295839252de src/share/vm/memory/cardTableRS.cpp
--- a/src/share/vm/memory/cardTableRS.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/memory/cardTableRS.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -33,12 +33,8 @@
 {
 #ifndef SERIALGC
   if (UseG1GC) {
-    if (G1RSBarrierUseQueue) {
       _ct_bs = new G1SATBCardTableLoggingModRefBS(whole_heap,
                                                   max_covered_regions);
-    } else {
-      _ct_bs = new G1SATBCardTableModRefBS(whole_heap, max_covered_regions);
-    }
   } else {
     _ct_bs = new CardTableModRefBSForCTRS(whole_heap, max_covered_regions);
   }
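[Illustration, not part of the changeset: one way the ptrQueue.cpp test above
can be missed when it requires exact equality. Suppose the notification
threshold is 4 and _process_completed has been cleared while the completed
buffer count is still above it (for example, after a refinement worker
deactivates):

  // count goes 5 -> 6 -> 7 ...
  // old: count == 4  never holds again; waiters are not notified
  // new: count >= 4  fires on the next enqueue

With several refinement workers starting and stopping independently, >= keeps
the wakeup reachable from any state.]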
diff -r eacd97c88873 -r 7295839252de src/share/vm/runtime/mutexLocker.cpp
--- a/src/share/vm/runtime/mutexLocker.cpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/runtime/mutexLocker.cpp	Wed Jun 10 14:57:21 2009 -0700
@@ -70,7 +70,6 @@
 Monitor* CMark_lock                   = NULL;
 Monitor* ZF_mon                       = NULL;
 Monitor* Cleanup_mon                  = NULL;
-Monitor* G1ConcRefine_mon             = NULL;
 Mutex*   SATB_Q_FL_lock               = NULL;
 Monitor* SATB_Q_CBL_mon               = NULL;
 Mutex*   Shared_SATB_Q_lock           = NULL;
@@ -168,7 +167,6 @@
   def(CMark_lock                 , Monitor, nonleaf,     true ); // coordinate concurrent mark thread
   def(ZF_mon                     , Monitor, leaf,        true );
   def(Cleanup_mon                , Monitor, nonleaf,     true );
-  def(G1ConcRefine_mon           , Monitor, nonleaf,     true );
   def(SATB_Q_FL_lock             , Mutex  , special,     true );
   def(SATB_Q_CBL_mon             , Monitor, nonleaf,     true );
   def(Shared_SATB_Q_lock         , Mutex,   nonleaf,     true );

diff -r eacd97c88873 -r 7295839252de src/share/vm/runtime/mutexLocker.hpp
--- a/src/share/vm/runtime/mutexLocker.hpp	Fri Jun 05 10:25:39 2009 -0700
+++ b/src/share/vm/runtime/mutexLocker.hpp	Wed Jun 10 14:57:21 2009 -0700
@@ -63,9 +63,6 @@
 extern Monitor* CMark_lock;                      // used for concurrent mark thread coordination
 extern Monitor* ZF_mon;                          // used for G1 conc zero-fill.
 extern Monitor* Cleanup_mon;                     // used for G1 conc cleanup.
-extern Monitor* G1ConcRefine_mon;                // used for G1 conc-refine
-                                                 // coordination.
-
 extern Mutex*   SATB_Q_FL_lock;                  // Protects SATB Q
                                                  // buffer free list.
 extern Monitor* SATB_Q_CBL_mon;                  // Protects SATB Q