comparison src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp @ 890:6cb8e9df7174

6819077: G1: first GC thread coming late into the GC. Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch based counter. Reviewed-by: iveresov, tonyp
author johnc
date Tue, 04 Aug 2009 16:00:17 -0700
parents 15c5903cf9e1
children 035d2e036a9b
comparison
equal deleted inserted replaced
889:15c5903cf9e1 890:6cb8e9df7174
23 */ 23 */
24 24
25 #include "incls/_precompiled.incl" 25 #include "incls/_precompiled.incl"
26 #include "incls/_concurrentG1Refine.cpp.incl" 26 #include "incls/_concurrentG1Refine.cpp.incl"
27 27
28 // Possible sizes for the card counts cache: odd primes that roughly double in size.
29 // (See jvmtiTagMap.cpp).
30 int ConcurrentG1Refine::_cc_cache_sizes[] = {
31 16381, 32771, 76831, 150001, 307261,
32 614563, 1228891, 2457733, 4915219, 9830479,
33 19660831, 39321619, 78643219, 157286461, -1
34 };
35
28 ConcurrentG1Refine::ConcurrentG1Refine() : 36 ConcurrentG1Refine::ConcurrentG1Refine() :
29 _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL), 37 _card_counts(NULL), _card_epochs(NULL),
38 _n_card_counts(0), _max_n_card_counts(0),
39 _cache_size_index(0), _expand_card_counts(false),
30 _hot_cache(NULL), 40 _hot_cache(NULL),
31 _def_use_cache(false), _use_cache(false), 41 _def_use_cache(false), _use_cache(false),
32 _n_periods(0), _total_cards(0), _total_travs(0), 42 _n_periods(0),
33 _threads(NULL), _n_threads(0) 43 _threads(NULL), _n_threads(0)
34 { 44 {
35 if (G1ConcRefine) { 45 if (G1ConcRefine) {
36 _n_threads = (int)thread_num(); 46 _n_threads = (int)thread_num();
37 if (_n_threads > 0) { 47 if (_n_threads > 0) {
55 } 65 }
56 return 0; 66 return 0;
57 } 67 }
58 68
59 void ConcurrentG1Refine::init() { 69 void ConcurrentG1Refine::init() {
60 G1CollectedHeap* g1h = G1CollectedHeap::heap(); 70 if (G1ConcRSLogCacheSize > 0) {
61 if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) { 71 _g1h = G1CollectedHeap::heap();
62 _n_card_counts = 72 _max_n_card_counts =
63 (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift); 73 (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
64 _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts); 74
65 for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0; 75 size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
66 ModRefBarrierSet* bs = g1h->mr_bs(); 76 guarantee(_max_n_card_counts < max_card_num, "card_num representation");
77
78 int desired = _max_n_card_counts / InitialCacheFraction;
79 for (_cache_size_index = 0;
80 _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) {
81 if (_cc_cache_sizes[_cache_size_index] >= desired) break;
82 }
83 _cache_size_index = MAX2(0, (_cache_size_index - 1));
84
85 int initial_size = _cc_cache_sizes[_cache_size_index];
86 if (initial_size < 0) initial_size = _max_n_card_counts;
87
88 // Make sure we don't go bigger than we will ever need
89 _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts);
90
91 _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
92 _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
93
94 Copy::fill_to_bytes(&_card_counts[0],
95 _n_card_counts * sizeof(CardCountCacheEntry));
96 Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
97
98 ModRefBarrierSet* bs = _g1h->mr_bs();
67 guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition"); 99 guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
68 CardTableModRefBS* ctbs = (CardTableModRefBS*)bs; 100 _ct_bs = (CardTableModRefBS*)bs;
69 _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start()); 101 _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());
70 if (G1ConcRSCountTraversals) { 102
71 _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
72 _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
73 for (int i = 0; i < 256; i++) {
74 _cur_card_count_histo[i] = 0;
75 _cum_card_count_histo[i] = 0;
76 }
77 }
78 }
79 if (G1ConcRSLogCacheSize > 0) {
80 _def_use_cache = true; 103 _def_use_cache = true;
81 _use_cache = true; 104 _use_cache = true;
82 _hot_cache_size = (1 << G1ConcRSLogCacheSize); 105 _hot_cache_size = (1 << G1ConcRSLogCacheSize);
83 _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size); 106 _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size);
84 _n_hot = 0; 107 _n_hot = 0;
85 _hot_cache_idx = 0; 108 _hot_cache_idx = 0;
86 109
87 // For refining the cards in the hot cache in parallel 110 // For refining the cards in the hot cache in parallel
88 int n_workers = (ParallelGCThreads > 0 ? 111 int n_workers = (ParallelGCThreads > 0 ?
89 g1h->workers()->total_workers() : 1); 112 _g1h->workers()->total_workers() : 1);
90 _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers); 113 _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
91 _hot_cache_par_claimed_idx = 0; 114 _hot_cache_par_claimed_idx = 0;
92 } 115 }
93 } 116 }
94 117
99 } 122 }
100 } 123 }
101 } 124 }
102 125
103 ConcurrentG1Refine::~ConcurrentG1Refine() { 126 ConcurrentG1Refine::~ConcurrentG1Refine() {
104 if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) { 127 if (G1ConcRSLogCacheSize > 0) {
105 assert(_card_counts != NULL, "Logic"); 128 assert(_card_counts != NULL, "Logic");
106 FREE_C_HEAP_ARRAY(unsigned char, _card_counts); 129 FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
107 assert(_cur_card_count_histo != NULL, "Logic"); 130 assert(_card_epochs != NULL, "Logic");
108 FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo); 131 FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
109 assert(_cum_card_count_histo != NULL, "Logic");
110 FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo);
111 }
112 if (G1ConcRSLogCacheSize > 0) {
113 assert(_hot_cache != NULL, "Logic"); 132 assert(_hot_cache != NULL, "Logic");
114 FREE_C_HEAP_ARRAY(jbyte*, _hot_cache); 133 FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
115 } 134 }
116 if (_threads != NULL) { 135 if (_threads != NULL) {
117 for (int i = 0; i < _n_threads; i++) { 136 for (int i = 0; i < _n_threads; i++) {
127 tc->do_thread(_threads[i]); 146 tc->do_thread(_threads[i]);
128 } 147 }
129 } 148 }
130 } 149 }
131 150
132 151 bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
133 int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) { 152 HeapWord* start = _ct_bs->addr_for(card_ptr);
134 size_t card_num = (card_ptr - _ct_bot); 153 HeapRegion* r = _g1h->heap_region_containing(start);
135 guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds"); 154 if (r != NULL && r->is_young()) {
136 unsigned char cnt = _card_counts[card_num]; 155 return true;
137 if (cnt < 255) _card_counts[card_num]++; 156 }
138 return cnt; 157 // This card is not associated with a heap region
139 _total_travs++; 158 // so can't be young.
140 } 159 return false;
141 160 }
142 jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) { 161
143 int count = add_card_count(card_ptr); 162 jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
144 // Count previously unvisited cards. 163 unsigned new_card_num = ptr_2_card_num(card_ptr);
145 if (count == 0) _total_cards++; 164 unsigned bucket = hash(new_card_num);
146 // We'll assume a traversal unless we store it in the cache. 165 assert(0 <= bucket && bucket < _n_card_counts, "Bounds");
166
167 CardCountCacheEntry* count_ptr = &_card_counts[bucket];
168 CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];
169
170 // We have to construct a new entry if we haven't updated the counts
171 // during the current period, or if the count was updated for a
172 // different card number.
173 unsigned int new_epoch = (unsigned int) _n_periods;
174 julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);
175
176 while (true) {
177 // Fetch the previous epoch value
178 julong prev_epoch_entry = epoch_ptr->_value;
179 julong cas_res;
180
181 if (extract_epoch(prev_epoch_entry) != new_epoch) {
182 // This entry has not yet been updated during this period.
183 // Note: we update the epoch value atomically to ensure
184 // that there is only one winner that updates the cached
185 // card_ptr value even though all the refine threads share
186 // the same epoch value.
187
188 cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
189 (volatile jlong*)&epoch_ptr->_value,
190 (jlong) prev_epoch_entry);
191
192 if (cas_res == prev_epoch_entry) {
193 // We have successfully won the race to update the
194 // epoch and card_num value. Make it look like the
195 // count and eviction count were previously cleared.
196 count_ptr->_count = 1;
197 count_ptr->_evict_count = 0;
198 *count = 0;
199 // We can defer the processing of card_ptr
200 *defer = true;
201 return card_ptr;
202 }
203 // We did not win the race to update the epoch field, so some other
204 // thread must have done it. The value that gets returned by CAS
205 // should be the new epoch value.
206 assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
207 // We could 'continue' here or just re-read the previous epoch value
208 prev_epoch_entry = epoch_ptr->_value;
209 }
210
211 // The epoch entry for card_ptr has been updated during this period.
212 unsigned old_card_num = extract_card_num(prev_epoch_entry);
213
214 // The card count that will be returned to caller
215 *count = count_ptr->_count;
216
217 // Are we updating the count for the same card?
218 if (new_card_num == old_card_num) {
219 // Same card - just update the count. We could have more than one
220 // thread racing to update count for the current card. It should be
221 // OK not to use a CAS as the only penalty should be some missed
222 // increments of the count which delays identifying the card as "hot".
223
224 if (*count < max_jubyte) count_ptr->_count++;
225 // We can defer the processing of card_ptr
226 *defer = true;
227 return card_ptr;
228 }
229
230 // Different card - evict old card info
231 if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
232 if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
233 // Trigger a resize the next time we clear
234 _expand_card_counts = true;
235 }
236
237 cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
238 (volatile jlong*)&epoch_ptr->_value,
239 (jlong) prev_epoch_entry);
240
241 if (cas_res == prev_epoch_entry) {
242 // We successfully updated the card num value in the epoch entry
243 count_ptr->_count = 0; // initialize counter for new card num
244
245 // Even though the region containg the card at old_card_num was not
246 // in the young list when old_card_num was recorded in the epoch
247 // cache it could have been added to the free list and subsequently
248 // added to the young list in the intervening time. If the evicted
249 // card is in a young region just return the card_ptr and the evicted
250 // card will not be cleaned. See CR 6817995.
251
252 jbyte* old_card_ptr = card_num_2_ptr(old_card_num);
253 if (is_young_card(old_card_ptr)) {
254 *count = 0;
255 // We can defer the processing of card_ptr
256 *defer = true;
257 return card_ptr;
258 }
259
260 // We do not want to defer processing of card_ptr in this case
261 // (we need to refine old_card_ptr and card_ptr)
262 *defer = false;
263 return old_card_ptr;
264 }
265 // Someone else beat us - try again.
266 }
267 }
268
269 jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
270 int count;
271 jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
272 assert(cached_ptr != NULL, "bad cached card ptr");
273 assert(!is_young_card(cached_ptr), "shouldn't get a card in young region");
274
275 // The card pointer we obtained from card count cache is not hot
276 // so do not store it in the cache; return it for immediate
277 // refining.
147 if (count < G1ConcRSHotCardLimit) { 278 if (count < G1ConcRSHotCardLimit) {
148 _total_travs++; 279 return cached_ptr;
149 return card_ptr; 280 }
150 } 281
151 // Otherwise, it's hot. 282 // Otherwise, the pointer we got from the _card_counts is hot.
152 jbyte* res = NULL; 283 jbyte* res = NULL;
153 MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag); 284 MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
154 if (_n_hot == _hot_cache_size) { 285 if (_n_hot == _hot_cache_size) {
155 _total_travs++;
156 res = _hot_cache[_hot_cache_idx]; 286 res = _hot_cache[_hot_cache_idx];
157 _n_hot--; 287 _n_hot--;
158 } 288 }
159 // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx. 289 // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
160 _hot_cache[_hot_cache_idx] = card_ptr; 290 _hot_cache[_hot_cache_idx] = cached_ptr;
161 _hot_cache_idx++; 291 _hot_cache_idx++;
162 if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0; 292 if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
163 _n_hot++; 293 _n_hot++;
294
295 if (res != NULL) {
296 // Even though the region containg res was not in the young list
297 // when it was recorded in the hot cache it could have been added
298 // to the free list and subsequently added to the young list in
299 // the intervening time. If res is in a young region, return NULL
300 // so that res is not cleaned. See CR 6817995.
301
302 if (is_young_card(res)) {
303 res = NULL;
304 }
305 }
306
164 return res; 307 return res;
165 } 308 }
166
167 309
168 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) { 310 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
169 assert(!use_cache(), "cache should be disabled"); 311 assert(!use_cache(), "cache should be disabled");
170 int start_idx; 312 int start_idx;
171 313
184 } 326 }
185 } 327 }
186 } 328 }
187 } 329 }
188 330
331 void ConcurrentG1Refine::expand_card_count_cache() {
332 if (_n_card_counts < _max_n_card_counts) {
333 int new_idx = _cache_size_index+1;
334 int new_size = _cc_cache_sizes[new_idx];
335 if (new_size < 0) new_size = _max_n_card_counts;
336
337 // Make sure we don't go bigger than we will ever need
338 new_size = MIN2((unsigned) new_size, _max_n_card_counts);
339
340 // Expand the card count and card epoch tables
341 if (new_size > (int)_n_card_counts) {
342 // We can just free and allocate a new array as we're
343 // not interested in preserving the contents
344 assert(_card_counts != NULL, "Logic!");
345 assert(_card_epochs != NULL, "Logic!");
346 FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
347 FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
348 _n_card_counts = new_size;
349 _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
350 _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
351 _cache_size_index = new_idx;
352 }
353 }
354 }
355
189 void ConcurrentG1Refine::clear_and_record_card_counts() { 356 void ConcurrentG1Refine::clear_and_record_card_counts() {
190 if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return; 357 if (G1ConcRSLogCacheSize == 0) return;
358
359 #ifndef PRODUCT
360 double start = os::elapsedTime();
361 #endif
362
363 if (_expand_card_counts) {
364 expand_card_count_cache();
365 _expand_card_counts = false;
366 // Only need to clear the epochs.
367 Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
368 }
369
370 int this_epoch = (int) _n_periods;
371 assert((this_epoch+1) <= max_jint, "to many periods");
372 // Update epoch
191 _n_periods++; 373 _n_periods++;
192 if (G1ConcRSCountTraversals) { 374
193 for (size_t i = 0; i < _n_card_counts; i++) { 375 #ifndef PRODUCT
194 unsigned char bucket = _card_counts[i]; 376 double elapsed = os::elapsedTime() - start;
195 _cur_card_count_histo[bucket]++; 377 _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0);
196 _card_counts[i] = 0; 378 #endif
197 } 379 }
198 gclog_or_tty->print_cr("Card counts:");
199 for (int i = 0; i < 256; i++) {
200 if (_cur_card_count_histo[i] > 0) {
201 gclog_or_tty->print_cr(" %3d: %9d", i, _cur_card_count_histo[i]);
202 _cum_card_count_histo[i] += _cur_card_count_histo[i];
203 _cur_card_count_histo[i] = 0;
204 }
205 }
206 } else {
207 assert(G1ConcRSLogCacheSize > 0, "Logic");
208 Copy::fill_to_words((HeapWord*)(&_card_counts[0]),
209 _n_card_counts / HeapWordSize);
210 }
211 }
212
213 void
214 ConcurrentG1Refine::
215 print_card_count_histo_range(unsigned* histo, int from, int to,
216 float& cum_card_pct,
217 float& cum_travs_pct) {
218 unsigned cards = 0;
219 unsigned travs = 0;
220 guarantee(to <= 256, "Precondition");
221 for (int i = from; i < to-1; i++) {
222 cards += histo[i];
223 travs += histo[i] * i;
224 }
225 if (to == 256) {
226 unsigned histo_card_sum = 0;
227 unsigned histo_trav_sum = 0;
228 for (int i = 1; i < 255; i++) {
229 histo_trav_sum += histo[i] * i;
230 }
231 cards += histo[255];
232 // correct traversals for the last one.
233 unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum);
234 travs += travs_255;
235
236 } else {
237 cards += histo[to-1];
238 travs += histo[to-1] * (to-1);
239 }
240 float fperiods = (float)_n_periods;
241 float f_tot_cards = (float)_total_cards/fperiods;
242 float f_tot_travs = (float)_total_travs/fperiods;
243 if (cards > 0) {
244 float fcards = (float)cards/fperiods;
245 float ftravs = (float)travs/fperiods;
246 if (to == 256) {
247 gclog_or_tty->print(" %4d- %10.2f%10.2f", from, fcards, ftravs);
248 } else {
249 gclog_or_tty->print(" %4d-%4d %10.2f%10.2f", from, to-1, fcards, ftravs);
250 }
251 float pct_cards = fcards*100.0/f_tot_cards;
252 cum_card_pct += pct_cards;
253 float pct_travs = ftravs*100.0/f_tot_travs;
254 cum_travs_pct += pct_travs;
255 gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f",
256 pct_cards, cum_card_pct,
257 pct_travs, cum_travs_pct);
258 }
259 }
260
261 void ConcurrentG1Refine::print_final_card_counts() {
262 if (!G1ConcRSCountTraversals) return;
263
264 gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.",
265 _total_travs, _total_cards);
266 float fperiods = (float)_n_periods;
267 gclog_or_tty->print_cr(" This is an average of %8.2f traversals, %8.2f cards, "
268 "per collection.", (float)_total_travs/fperiods,
269 (float)_total_cards/fperiods);
270 gclog_or_tty->print_cr(" This is an average of %8.2f traversals/distinct "
271 "dirty card.\n",
272 _total_cards > 0 ?
273 (float)_total_travs/(float)_total_cards : 0.0);
274
275
276 gclog_or_tty->print_cr("Histogram:\n\n%10s %10s%10s%10s%10s%10s%10s",
277 "range", "# cards", "# travs", "% cards", "(cum)",
278 "% travs", "(cum)");
279 gclog_or_tty->print_cr("------------------------------------------------------------"
280 "-------------");
281 float cum_cards_pct = 0.0;
282 float cum_travs_pct = 0.0;
283 for (int i = 1; i < 10; i++) {
284 print_card_count_histo_range(_cum_card_count_histo, i, i+1,
285 cum_cards_pct, cum_travs_pct);
286 }
287 for (int i = 10; i < 100; i += 10) {
288 print_card_count_histo_range(_cum_card_count_histo, i, i+10,
289 cum_cards_pct, cum_travs_pct);
290 }
291 print_card_count_histo_range(_cum_card_count_histo, 100, 150,
292 cum_cards_pct, cum_travs_pct);
293 print_card_count_histo_range(_cum_card_count_histo, 150, 200,
294 cum_cards_pct, cum_travs_pct);
295 print_card_count_histo_range(_cum_card_count_histo, 150, 255,
296 cum_cards_pct, cum_travs_pct);
297 print_card_count_histo_range(_cum_card_count_histo, 255, 256,
298 cum_cards_pct, cum_travs_pct);
299 }