annotate src/share/vm/opto/coalesce.cpp @ 452:00b023ae2d78
6722113: CMS: Incorrect overflow handling during precleaning of Reference lists
Summary: When we encounter marking stack overflow during precleaning of Reference lists, we were using the overflow list mechanism, which can cause problems on account of mutating the mark word of the header because of conflicts with mutator accesses and updates of that field. Instead we should use the usual mechanism for overflow handling in concurrent phases, namely dirtying of the card on which the overflowed object lies. Since precleaning effectively does a form of discovered list processing, albeit with discovery enabled, we needed to adjust some code to be correct in the face of interleaved processing and discovery.
Reviewed-by: apetrusenko, jcoomes
author | ysr |
date | Thu, 20 Nov 2008 12:27:41 -0800 |
parents | 9ee9cf798b59 |
children | 98cb887364d3 |

rev | line source |
0 | 1 /* |
337 | 2 * Copyright 1997-2008 Sun Microsystems, Inc. All Rights Reserved. |
0 | 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 * | |
5 * This code is free software; you can redistribute it and/or modify it | |
6 * under the terms of the GNU General Public License version 2 only, as | |
7 * published by the Free Software Foundation. | |
8 * | |
9 * This code is distributed in the hope that it will be useful, but WITHOUT | |
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
12 * version 2 for more details (a copy is included in the LICENSE file that | |
13 * accompanied this code). | |
14 * | |
15 * You should have received a copy of the GNU General Public License version | |
16 * 2 along with this work; if not, write to the Free Software Foundation, | |
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | |
18 * | |
19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
20 * CA 95054 USA or visit www.sun.com if you need additional information or | |
21 * have any questions. | |
22 * | |
23 */ | |
24 | |
25 #include "incls/_precompiled.incl" | |
26 #include "incls/_coalesce.cpp.incl" | |
27 | |
28 //============================================================================= | |
29 //------------------------------reset_uf_map----------------------------------- | |
30 void PhaseChaitin::reset_uf_map( uint maxlrg ) { | |
31 _maxlrg = maxlrg; | |
32 // Force the Union-Find mapping to be at least this large | |
33 _uf_map.extend(_maxlrg,0); | |
34 // Initialize it to be the ID mapping. | |
35 for( uint i=0; i<_maxlrg; i++ ) | |
36 _uf_map.map(i,i); | |
37 } | |
38 | |
39 //------------------------------compress_uf_map-------------------------------- | |
40 // Make all Nodes map directly to their final live range; no need for | |
41 // the Union-Find mapping after this call. | |
42 void PhaseChaitin::compress_uf_map_for_nodes( ) { | |
43 // For all Nodes, compress mapping | |
44 uint unique = _names.Size(); | |
45 for( uint i=0; i<unique; i++ ) { | |
46 uint lrg = _names[i]; | |
47 uint compressed_lrg = Find(lrg); | |
48 if( lrg != compressed_lrg ) | |
49 _names.map(i,compressed_lrg); | |
50 } | |
51 } | |
52 | |
53 //------------------------------Find------------------------------------------- | |
54 // Straight out of Tarjan's union-find algorithm | |
55 uint PhaseChaitin::Find_compress( uint lrg ) { | |
56 uint cur = lrg; | |
57 uint next = _uf_map[cur]; | |
58 while( next != cur ) { // Scan chain of equivalences | |
59 assert( next < cur, "always union smaller" ); | |
60 cur = next; // until find a fixed-point | |
61 next = _uf_map[cur]; | |
62 } | |
63 // Core of union-find algorithm: update chain of | |
64 // equivalences to be equal to the root. | |
65 while( lrg != next ) { | |
66 uint tmp = _uf_map[lrg]; | |
67 _uf_map.map(lrg, next); | |
68 lrg = tmp; | |
69 } | |
70 return lrg; | |
71 } | |
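// A minimal standalone sketch of the find-with-path-compression above on a
// plain array -- illustrative only; 'uf' is a hypothetical stand-in for
// _uf_map and this helper is not part of the original file:
#if 0
static uint find_compress_sketch( uint *uf, uint lrg ) {
  uint root = lrg;
  while( uf[root] != root )        // Chase the chain to its self-cycle
    root = uf[root];
  while( uf[lrg] != root ) {       // Re-point the whole chain at the root
    uint parent = uf[lrg];
    uf[lrg] = root;
    lrg = parent;
  }
  return root;
}
// With uf = {0,1,1,2,3}, find_compress_sketch(uf,4) walks 4->3->2->1,
// leaves uf = {0,1,1,1,1} and returns 1; later lookups of 2, 3 or 4
// terminate in a single step.
#endif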
72 | |
73 //------------------------------Find------------------------------------------- | |
74 // Straight out of Tarjan's union-find algorithm | |
75 uint PhaseChaitin::Find_compress( const Node *n ) { | |
76 uint lrg = Find_compress(_names[n->_idx]); | |
77 _names.map(n->_idx,lrg); | |
78 return lrg; | |
79 } | |
80 | |
81 //------------------------------Find_const------------------------------------- | |
82 // Like Find above, but no path compress, so bad asymptotic behavior | |
83 uint PhaseChaitin::Find_const( uint lrg ) const { | |
84 if( !lrg ) return lrg; // Ignore the zero LRG | |
85 // Off the end? This happens during debugging dumps when you got | |
86 // brand new live ranges but have not told the allocator yet. | |
87 if( lrg >= _maxlrg ) return lrg; | |
88 uint next = _uf_map[lrg]; | |
89 while( next != lrg ) { // Scan chain of equivalences | |
90 assert( next < lrg, "always union smaller" ); | |
91 lrg = next; // until find a fixed-point | |
92 next = _uf_map[lrg]; | |
93 } | |
94 return next; | |
95 } | |
96 | |
97 //------------------------------Find------------------------------------------- | |
98 // Like Find above, but no path compress, so bad asymptotic behavior | |
99 uint PhaseChaitin::Find_const( const Node *n ) const { | |
100 if( n->_idx >= _names.Size() ) return 0; // not mapped, usual for debug dump | |
101 return Find_const( _names[n->_idx] ); | |
102 } | |
103 | |
104 //------------------------------Union------------------------------------------ | |
105 // union 2 sets together. | |
106 void PhaseChaitin::Union( const Node *src_n, const Node *dst_n ) { | |
107 uint src = Find(src_n); | |
108 uint dst = Find(dst_n); | |
109 assert( src, "" ); | |
110 assert( dst, "" ); | |
111 assert( src < _maxlrg, "oob" ); | |
112 assert( dst < _maxlrg, "oob" ); | |
113 assert( src < dst, "always union smaller" ); | |
114 _uf_map.map(dst,src); | |
115 } | |
116 | |
117 //------------------------------new_lrg---------------------------------------- | |
118 void PhaseChaitin::new_lrg( const Node *x, uint lrg ) { | |
119 // Make the Node->LRG mapping | |
120 _names.extend(x->_idx,lrg); | |
121 // Make the Union-Find mapping an identity function | |
122 _uf_map.extend(lrg,lrg); | |
123 } | |
124 | |
125 //------------------------------clone_projs------------------------------------ | |
126 // After cloning some rematerialized instruction, clone any MachProj's that | |
127 // follow it. Example: Intel zero is XOR, kills flags. Sparc FP constants | |
128 // use G3 as an address temp. | |
129 int PhaseChaitin::clone_projs( Block *b, uint idx, Node *con, Node *copy, uint &maxlrg ) { | |
130 Block *bcon = _cfg._bbs[con->_idx]; | |
131 uint cindex = bcon->find_node(con); | |
132 Node *con_next = bcon->_nodes[cindex+1]; | |
133 if( con_next->in(0) != con || con_next->Opcode() != Op_MachProj ) | |
134 return false; // No MachProj's follow | |
135 | |
136 // Copy kills after the cloned constant | |
137 Node *kills = con_next->clone(); | |
138 kills->set_req( 0, copy ); | |
139 b->_nodes.insert( idx, kills ); | |
140 _cfg._bbs.map( kills->_idx, b ); | |
141 new_lrg( kills, maxlrg++ ); | |
142 return true; | |
143 } | |
144 | |
145 //------------------------------compact---------------------------------------- | |
146 // Renumber the live ranges to compact them. Makes the IFG smaller. | |
147 void PhaseChaitin::compact() { | |
148 // Currently the _uf_map contains a series of short chains which are headed | |
149 // by a self-cycle. All the chains run from big numbers to little numbers. | |
150 // The Find() call chases the chains & shortens them for the next Find call. | |
151 // We are going to change this structure slightly. Numbers above a moving | |
152 // wave 'i' are unchanged. Numbers below 'j' point directly to their | |
153 // compacted live range with no further chaining. There are no chains or | |
154 // cycles below 'i', so the Find call no longer works. | |
155 uint j=1; | |
156 uint i; | |
157 for( i=1; i < _maxlrg; i++ ) { | |
158 uint lr = _uf_map[i]; | |
159 // Ignore unallocated live ranges | |
160 if( !lr ) continue; | |
161 assert( lr <= i, "" ); | |
162 _uf_map.map(i, ( lr == i ) ? j++ : _uf_map[lr]); | |
163 } | |
164 if( false ) // PrintOptoCompactLiveRanges | |
165 printf("Compacted %d LRs from %d\n",i-j,i); | |
166 // Now change the Node->LR mapping to reflect the compacted names | |
167 uint unique = _names.Size(); | |
168 for( i=0; i<unique; i++ ) | |
169 _names.map(i,_uf_map[_names[i]]); | |
170 | |
171 // Reset the Union-Find mapping | |
172 reset_uf_map(j); | |
173 | |
174 } | |
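// A small worked example of the renumbering above, assuming a hypothetical
// _uf_map of {0,1,1,3,3,0} with _maxlrg == 6 (live range 5 is unallocated):
// the self-cycles 1 and 3 are the roots and receive the compact names 1 and
// 2; the non-roots 2 and 4 pick up _uf_map[lr], which was already rewritten
// to a compact name on an earlier iteration because every chain runs from a
// big number down to a smaller one.  The map becomes {0,1,1,2,2,0}, after
// which reset_uf_map(3) restores the identity mapping for the new names.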
175 | |
176 //============================================================================= | |
177 //------------------------------Dump------------------------------------------- | |
178 #ifndef PRODUCT | |
179 void PhaseCoalesce::dump( Node *n ) const { | |
180 // Being a const function means I cannot use 'Find' | |
181 uint r = _phc.Find(n); | |
182 tty->print("L%d/N%d ",r,n->_idx); | |
183 } | |
184 | |
185 //------------------------------dump------------------------------------------- | |
186 void PhaseCoalesce::dump() const { | |
187 // I know I have a block layout now, so I can print blocks in a loop | |
188 for( uint i=0; i<_phc._cfg._num_blocks; i++ ) { | |
189 uint j; | |
190 Block *b = _phc._cfg._blocks[i]; | |
191 // Print a nice block header | |
192 tty->print("B%d: ",b->_pre_order); | |
193 for( j=1; j<b->num_preds(); j++ ) | |
194 tty->print("B%d ", _phc._cfg._bbs[b->pred(j)->_idx]->_pre_order); | |
195 tty->print("-> "); | |
196 for( j=0; j<b->_num_succs; j++ ) | |
197 tty->print("B%d ",b->_succs[j]->_pre_order); | |
198 tty->print(" IDom: B%d/#%d\n", b->_idom ? b->_idom->_pre_order : 0, b->_dom_depth); | |
199 uint cnt = b->_nodes.size(); | |
200 for( j=0; j<cnt; j++ ) { | |
201 Node *n = b->_nodes[j]; | |
202 dump( n ); | |
203 tty->print("\t%s\t",n->Name()); | |
204 | |
205 // Dump the inputs | |
206 uint k; // Exit value of loop | |
207 for( k=0; k<n->req(); k++ ) // For all required inputs | |
208 if( n->in(k) ) dump( n->in(k) ); | |
209 else tty->print("_ "); | |
210 int any_prec = 0; | |
211 for( ; k<n->len(); k++ ) // For all precedence inputs | |
212 if( n->in(k) ) { | |
213 if( !any_prec++ ) tty->print(" |"); | |
214 dump( n->in(k) ); | |
215 } | |
216 | |
217 // Dump node-specific info | |
218 n->dump_spec(tty); | |
219 tty->print("\n"); | |
220 | |
221 } | |
222 tty->print("\n"); | |
223 } | |
224 } | |
225 #endif | |
226 | |
227 //------------------------------combine_these_two------------------------------ | |
228 // Combine the live ranges def'd by these 2 Nodes. N2 is an input to N1. | |
229 void PhaseCoalesce::combine_these_two( Node *n1, Node *n2 ) { | |
230 uint lr1 = _phc.Find(n1); | |
231 uint lr2 = _phc.Find(n2); | |
232 if( lr1 != lr2 && // Different live ranges already AND | |
233 !_phc._ifg->test_edge_sq( lr1, lr2 ) ) { // Do not interfere | |
234 LRG *lrg1 = &_phc.lrgs(lr1); | |
235 LRG *lrg2 = &_phc.lrgs(lr2); | |
236 // Not an oop->int cast; oop->oop, int->int, AND int->oop are OK. | |
237 | |
238 // Now, why is int->oop OK? We end up declaring a raw-pointer as an oop | |
239 // and in general that's a bad thing. However, int->oop conversions only | |
240 // happen at GC points, so the lifetime of the misclassified raw-pointer | |
241 // is from the CheckCastPP (that converts it to an oop) backwards up | |
242 // through a merge point and into the slow-path call, and around the | |
243 // diamond up to the heap-top check and back down into the slow-path call. | |
244 // The misclassified raw pointer is NOT live across the slow-path call, | |
245 // and so does not appear in any GC info, so the fact that it is | |
246 // misclassified is OK. | |
247 | |
248 if( (lrg1->_is_oop || !lrg2->_is_oop) && // not an oop->int cast AND | |
249 // Compatible final mask | |
250 lrg1->mask().overlap( lrg2->mask() ) ) { | |
251 // Merge larger into smaller. | |
252 if( lr1 > lr2 ) { | |
253 uint tmp = lr1; lr1 = lr2; lr2 = tmp; | |
254 Node *n = n1; n1 = n2; n2 = n; | |
255 LRG *ltmp = lrg1; lrg1 = lrg2; lrg2 = ltmp; | |
256 } | |
257 // Union lr2 into lr1 | |
258 _phc.Union( n1, n2 ); | |
259 if (lrg1->_maxfreq < lrg2->_maxfreq) | |
260 lrg1->_maxfreq = lrg2->_maxfreq; | |
261 // Merge in the IFG | |
262 _phc._ifg->Union( lr1, lr2 ); | |
263 // Combine register restrictions | |
264 lrg1->AND(lrg2->mask()); | |
265 } | |
266 } | |
267 } | |
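// A tiny illustration of the two guards above, using hypothetical register
// masks: live ranges restricted to {rax,rbx} and {rbx,rcx} pass the overlap
// test, and after the union the combined mask is ANDed down to {rbx}; two
// ranges with disjoint masks, say {xmm0} and {rax}, are never combined no
// matter what the interference graph says.  The oop test only rejects the
// oop->int direction, where an oop value would be dropped from the GC info;
// as explained above, the int->oop direction is harmless here because the
// misclassified raw pointer is never live across the slow-path call.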
268 | |
269 //------------------------------coalesce_driver-------------------------------- | |
270 // Copy coalescing | |
271 void PhaseCoalesce::coalesce_driver( ) { | |
272 | |
273 verify(); | |
274 // Coalesce from high frequency to low | |
275 for( uint i=0; i<_phc._cfg._num_blocks; i++ ) | |
276 coalesce( _phc._blks[i] ); | |
277 | |
278 } | |
279 | |
280 //------------------------------insert_copy_with_overlap----------------------- | |
281 // I am inserting copies to come out of SSA form. In the general case, I am | |
282 // doing a parallel renaming. I'm in the Named world now, so I can't do a | |
283 // general parallel renaming. All the copies now use "names" (live-ranges) | |
284 // to carry values instead of the explicit use-def chains. Suppose I need to | |
285 // insert 2 copies into the same block. They copy L161->L128 and L128->L132. | |
286 // If I insert them in the wrong order then L128 will get clobbered before it | |
287 // can get used by the second copy. This cannot happen in the SSA model; | |
288 // direct use-def chains get me the right value. It DOES happen in the named | |
289 // model so I have to handle the reordering of copies. | |
290 // | |
291 // In general, I need to topo-sort the placed copies to avoid conflicts. | |
292 // It's possible to have a closed cycle of copies (e.g., recirculating the same | |
293 // values around a loop). In this case I need a temp to break the cycle. | |
294 void PhaseAggressiveCoalesce::insert_copy_with_overlap( Block *b, Node *copy, uint dst_name, uint src_name ) { | |
295 | |
296 // Scan backwards for the locations of the last use of the dst_name. | |
297 // I am about to clobber the dst_name, so the copy must be inserted | |
298 // after the last use. Last use is really first-use on a backwards scan. | |
299 uint i = b->end_idx()-1; | |
300 while( 1 ) { | |
301 Node *n = b->_nodes[i]; | |
302 // Check for end of virtual copies; this is also the end of the | |
303 // parallel renaming effort. | |
304 if( n->_idx < _unique ) break; | |
305 uint idx = n->is_Copy(); | |
306 assert( idx || n->is_Con() || n->Opcode() == Op_MachProj, "Only copies during parallel renaming" ); | |
307 if( idx && _phc.Find(n->in(idx)) == dst_name ) break; | |
308 i--; | |
309 } | |
310 uint last_use_idx = i; | |
311 | |
312 // Also search for any kill of src_name that exits the block. | |
313 // Since the copy uses src_name, I have to come before any kill. | |
314 uint kill_src_idx = b->end_idx(); | |
315 // There can be only 1 kill that exits any block and that is | |
316 // the last kill. Thus it is the first kill on a backwards scan. | |
317 i = b->end_idx()-1; | |
318 while( 1 ) { | |
319 Node *n = b->_nodes[i]; | |
320 // Check for end of virtual copies; this is also the end of the | |
321 // parallel renaming effort. | |
322 if( n->_idx < _unique ) break; | |
323 assert( n->is_Copy() || n->is_Con() || n->Opcode() == Op_MachProj, "Only copies during parallel renaming" ); | |
324 if( _phc.Find(n) == src_name ) { | |
325 kill_src_idx = i; | |
326 break; | |
327 } | |
328 i--; | |
329 } | |
330 // Need a temp? Last use of dst comes after the kill of src? | |
331 if( last_use_idx >= kill_src_idx ) { | |
332 // Need to break a cycle with a temp | |
333 uint idx = copy->is_Copy(); | |
334 Node *tmp = copy->clone(); | |
335 _phc.new_lrg(tmp,_phc._maxlrg++); | |
336 // Insert new temp between copy and source | |
337 tmp ->set_req(idx,copy->in(idx)); | |
338 copy->set_req(idx,tmp); | |
339 // Save source in temp early, before source is killed | |
340 b->_nodes.insert(kill_src_idx,tmp); | |
341 _phc._cfg._bbs.map( tmp->_idx, b ); | |
342 last_use_idx++; | |
343 } | |
344 | |
345 // Insert just after last use | |
346 b->_nodes.insert(last_use_idx+1,copy); | |
347 } | |
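// A worked example of the ordering problem handled above (the live-range
// numbers are hypothetical): to place the parallel copies L128<-L161 and
// L132<-L128 in one block, the L132<-L128 copy must be emitted first or
// L128 is clobbered before its old value is read; the backwards scan for
// the last use of dst_name produces exactly that ordering.  For a genuine
// cycle such as L128<-L132 together with L132<-L128 no ordering works, so
// the code above clones the copy into a brand-new temporary live range,
// saves the source into the temp before the source is killed, and
// redirects the original copy to read the temp.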
348 | |
349 //------------------------------insert_copies---------------------------------- | |
350 void PhaseAggressiveCoalesce::insert_copies( Matcher &matcher ) { | |
351 // We do LRG compression and fix the liveout data only here, since the other | |
352 // place in Split() is guarded by an assert which we never hit. | |
353 _phc.compress_uf_map_for_nodes(); | |
354 // Fix block's liveout data for compressed live ranges. | |
355 for(uint lrg = 1; lrg < _phc._maxlrg; lrg++ ) { | |
356 uint compressed_lrg = _phc.Find(lrg); | |
357 if( lrg != compressed_lrg ) { | |
358 for( uint bidx = 0; bidx < _phc._cfg._num_blocks; bidx++ ) { | |
359 IndexSet *liveout = _phc._live->live(_phc._cfg._blocks[bidx]); | |
360 if( liveout->member(lrg) ) { | |
361 liveout->remove(lrg); | |
362 liveout->insert(compressed_lrg); | |
363 } | |
364 } | |
365 } | |
366 } | |
367 | |
368 // All new nodes added are actual copies to replace virtual copies. | |
369 // Nodes with index less than '_unique' are original, non-virtual Nodes. | |
370 _unique = C->unique(); | |
371 | |
372 for( uint i=0; i<_phc._cfg._num_blocks; i++ ) { | |
373 Block *b = _phc._cfg._blocks[i]; | |
374 uint cnt = b->num_preds(); // Number of inputs to the Phi | |
375 | |
376 for( uint l = 1; l<b->_nodes.size(); l++ ) { | |
377 Node *n = b->_nodes[l]; | |
378 | |
379 // Do not use removed-copies, use copied value instead | |
380 uint ncnt = n->req(); | |
381 for( uint k = 1; k<ncnt; k++ ) { | |
382 Node *copy = n->in(k); | |
383 uint cidx = copy->is_Copy(); | |
384 if( cidx ) { | |
385 Node *def = copy->in(cidx); | |
386 if( _phc.Find(copy) == _phc.Find(def) ) | |
387 n->set_req(k,def); | |
388 } | |
389 } | |
390 | |
391 // Remove any explicit copies that get coalesced. | |
392 uint cidx = n->is_Copy(); | |
393 if( cidx ) { | |
394 Node *def = n->in(cidx); | |
395 if( _phc.Find(n) == _phc.Find(def) ) { | |
396 n->replace_by(def); | |
397 n->set_req(cidx,NULL); | |
398 b->_nodes.remove(l); | |
399 l--; | |
400 continue; | |
401 } | |
402 } | |
403 | |
404 if( n->is_Phi() ) { | |
405 // Get the chosen name for the Phi | |
406 uint phi_name = _phc.Find( n ); | |
407 // Ignore the pre-allocated specials | |
408 if( !phi_name ) continue; | |
409 // Check for mismatch inputs to Phi | |
410 for( uint j = 1; j<cnt; j++ ) { | |
411 Node *m = n->in(j); | |
412 uint src_name = _phc.Find(m); | |
413 if( src_name != phi_name ) { | |
414 Block *pred = _phc._cfg._bbs[b->pred(j)->_idx]; | |
415 Node *copy; | |
416 assert(!m->is_Con() || m->is_Mach(), "all Con must be Mach"); | |
417 // Rematerialize constants instead of copying them | |
418 if( m->is_Mach() && m->as_Mach()->is_Con() && | |
419 m->as_Mach()->rematerialize() ) { | |
420 copy = m->clone(); | |
421 // Insert the copy in the predecessor basic block | |
422 pred->add_inst(copy); | |
423 // Copy any flags as well | |
424 _phc.clone_projs( pred, pred->end_idx(), m, copy, _phc._maxlrg ); | |
425 } else { | |
426 const RegMask *rm = C->matcher()->idealreg2spillmask[m->ideal_reg()]; | |
427 copy = new (C) MachSpillCopyNode(m,*rm,*rm); | |
428 // Find a good place to insert. Kinda tricky, use a subroutine | |
429 insert_copy_with_overlap(pred,copy,phi_name,src_name); | |
430 } | |
431 // Insert the copy in the use-def chain | |
432 n->set_req( j, copy ); | |
433 _phc._cfg._bbs.map( copy->_idx, pred ); | |
434 // Extend ("register allocate") the names array for the copy. | |
435 _phc._names.extend( copy->_idx, phi_name ); | |
436 } // End of if Phi names do not match | |
437 } // End of for all inputs to Phi | |
438 } else { // End of if Phi | |
439 | |
440 // Now check for 2-address instructions | |
441 uint idx; | |
442 if( n->is_Mach() && (idx=n->as_Mach()->two_adr()) ) { | |
443 // Get the chosen name for the Node | |
444 uint name = _phc.Find( n ); | |
445 assert( name, "no 2-address specials" ); | |
446 // Check for name mis-match on the 2-address input | |
447 Node *m = n->in(idx); | |
448 if( _phc.Find(m) != name ) { | |
449 Node *copy; | |
450 assert(!m->is_Con() || m->is_Mach(), "all Con must be Mach"); | |
451 // At this point it is unsafe to extend live ranges (6550579). | |
452 // Rematerialize only constants as we do for Phi above. | |
453 if( m->is_Mach() && m->as_Mach()->is_Con() && | |
454 m->as_Mach()->rematerialize() ) { | |
455 copy = m->clone(); | |
456 // Insert the copy in the basic block, just before us | |
457 b->_nodes.insert( l++, copy ); | |
458 if( _phc.clone_projs( b, l, m, copy, _phc._maxlrg ) ) | |
459 l++; | |
460 } else { | |
461 const RegMask *rm = C->matcher()->idealreg2spillmask[m->ideal_reg()]; | |
462 copy = new (C) MachSpillCopyNode( m, *rm, *rm ); | |
463 // Insert the copy in the basic block, just before us | |
464 b->_nodes.insert( l++, copy ); | |
465 } | |
466 // Insert the copy in the use-def chain | |
467 n->set_req(idx, copy ); | |
468 // Extend ("register allocate") the names array for the copy. | |
469 _phc._names.extend( copy->_idx, name ); | |
470 _phc._cfg._bbs.map( copy->_idx, b ); | |
471 } | |
472 | |
473 } // End of is two-adr | |
474 | |
475 // Insert a copy at a debug use for a lrg which has high frequency | |
476 if( (b->_freq < OPTO_DEBUG_SPLIT_FREQ) && n->is_MachSafePoint() ) { | |
477 // Walk the debug inputs to the node and check for lrg freq | |
478 JVMState* jvms = n->jvms(); | |
479 uint debug_start = jvms ? jvms->debug_start() : 999999; | |
480 uint debug_end = jvms ? jvms->debug_end() : 999999; | |
481 for(uint inpidx = debug_start; inpidx < debug_end; inpidx++) { | |
482 // Do not split monitors; they are only needed for debug table | |
483 // entries and need no code. | |
484 if( jvms->is_monitor_use(inpidx) ) continue; | |
485 Node *inp = n->in(inpidx); | |
486 uint nidx = _phc.n2lidx(inp); | |
487 LRG &lrg = lrgs(nidx); | |
488 | |
489 // If this lrg has a high frequency use/def | |
490 if( lrg._maxfreq >= OPTO_LRG_HIGH_FREQ ) { | |
491 // If the live range is also live out of this block (like it | |
492 // would be for a fast/slow idiom), the normal spill mechanism | |
493 // does an excellent job. If it is not live out of this block | |
494 // (like it would be for debug info to uncommon trap) splitting | |
495 // the live range now allows a better allocation in the high | |
496 // frequency blocks. | |
497 // Build_IFG_virtual has converted the live sets to | |
498 // live-IN info, not live-OUT info. | |
499 uint k; | |
500 for( k=0; k < b->_num_succs; k++ ) | |
501 if( _phc._live->live(b->_succs[k])->member( nidx ) ) | |
502 break; // Live in to some successor block? | |
503 if( k < b->_num_succs ) | |
504 continue; // Live out; do not pre-split | |
505 // Split the lrg at this use | |
506 const RegMask *rm = C->matcher()->idealreg2spillmask[inp->ideal_reg()]; | |
507 Node *copy = new (C) MachSpillCopyNode( inp, *rm, *rm ); | |
508 // Insert the copy in the use-def chain | |
509 n->set_req(inpidx, copy ); | |
510 // Insert the copy in the basic block, just before us | |
511 b->_nodes.insert( l++, copy ); | |
512 // Extend ("register allocate") the names array for the copy. | |
513 _phc.new_lrg( copy, _phc._maxlrg++ ); | |
514 _phc._cfg._bbs.map( copy->_idx, b ); | |
515 //tty->print_cr("Split a debug use in Aggressive Coalesce"); | |
516 } // End of if high frequency use/def | |
517 } // End of for all debug inputs | |
518 } // End of if low frequency safepoint | |
519 | |
520 } // End of if Phi | |
521 | |
522 } // End of for all instructions | |
523 } // End of for all blocks | |
524 } | |
525 | |
526 //============================================================================= | |
527 //------------------------------coalesce--------------------------------------- | |
528 // Aggressive (but pessimistic) copy coalescing of a single block | |
529 | |
530 // The following coalesce pass represents a single round of aggressive | |
531 // pessimistic coalesce. "Aggressive" means no attempt to preserve | |
532 // colorability when coalescing. This occasionally means more spills, but | |
533 // it also means fewer rounds of coalescing for better code - and that means | |
534 // faster compiles. | |
535 | |
536 // "Pessimistic" means we do not hit the fixed point in one pass (and we are | |
537 // reaching for the least fixed point to boot). This is typically solved | |
538 // with a few more rounds of coalescing, but the compiler must run fast. We | |
539 // could optimistically coalesce everything touching PhiNodes together | |
540 // into one big live range, then check for self-interference. Everywhere | |
541 // the live range interferes with self it would have to be split. Finding | |
542 // the right split points can be done with some heuristics (based on | |
543 // expected frequency of edges in the live range). In short, it's a real | |
544 // research problem and the timeline is too short to allow such research. | |
545 // Further thoughts: (1) build the LR in a pass, (2) find self-interference | |
546 // in another pass, (3) per each self-conflict, split, (4) split by finding | |
547 // the low-cost cut (min-cut) of the LR, (5) edges in the LR are weighted | |
548 // according to the GCM algorithm (or just exec freq on CFG edges). | |
549 | |
550 void PhaseAggressiveCoalesce::coalesce( Block *b ) { | |
551 // Copies are still "virtual" - meaning we have not yet made them explicit | |
552 // copies. Instead, Phi functions of successor blocks have mis-matched | |
553 // live-ranges. If I fail to coalesce, I'll have to insert a copy to line | |
554 // up the live-ranges. Check for Phis in successor blocks. | |
555 uint i; | |
556 for( i=0; i<b->_num_succs; i++ ) { | |
557 Block *bs = b->_succs[i]; | |
558 // Find index of 'b' in 'bs' predecessors | |
559 uint j=1; | |
560 while( _phc._cfg._bbs[bs->pred(j)->_idx] != b ) j++; | |
561 // Visit all the Phis in successor block | |
562 for( uint k = 1; k<bs->_nodes.size(); k++ ) { | |
563 Node *n = bs->_nodes[k]; | |
564 if( !n->is_Phi() ) break; | |
565 combine_these_two( n, n->in(j) ); | |
566 } | |
567 } // End of for all successor blocks | |
568 | |
569 | |
570 // Check _this_ block for 2-address instructions and copies. | |
571 uint cnt = b->end_idx(); | |
572 for( i = 1; i<cnt; i++ ) { | |
573 Node *n = b->_nodes[i]; | |
574 uint idx; | |
575 // 2-address instructions have a virtual Copy matching their input | |
576 // to their output | |
577 if( n->is_Mach() && (idx = n->as_Mach()->two_adr()) ) { | |
578 MachNode *mach = n->as_Mach(); | |
579 combine_these_two( mach, mach->in(idx) ); | |
580 } | |
581 } // End of for all instructions in block | |
582 } | |
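// A sketch of what one aggressive pass does, under hypothetical live-range
// names: for a successor Phi whose result is L40 and whose inputs carry L17
// and L23, combine_these_two is tried on (L40,L17) and (L40,L23); for a
// two-address machine instruction (e.g. an x86 add, whose destination must
// reuse the register of its two_adr() input) the instruction's live range
// is unioned with that input's.  Each union succeeds only if the two live
// ranges do not already interfere and their register masks overlap;
// colorability is not otherwise checked, which is what makes this pass
// "aggressive".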
583 | |
584 //============================================================================= | |
585 //------------------------------PhaseConservativeCoalesce---------------------- | |
586 PhaseConservativeCoalesce::PhaseConservativeCoalesce( PhaseChaitin &chaitin ) : PhaseCoalesce(chaitin) { | |
587 _ulr.initialize(_phc._maxlrg); | |
588 } | |
589 | |
590 //------------------------------verify----------------------------------------- | |
591 void PhaseConservativeCoalesce::verify() { | |
592 #ifdef ASSERT | |
593 _phc.set_was_low(); | |
594 #endif | |
595 } | |
596 | |
597 //------------------------------union_helper----------------------------------- | |
598 void PhaseConservativeCoalesce::union_helper( Node *lr1_node, Node *lr2_node, uint lr1, uint lr2, Node *src_def, Node *dst_copy, Node *src_copy, Block *b, uint bindex ) { | |
599 // Join live ranges. Merge larger into smaller. Union lr2 into lr1 in the | |
600 // union-find tree | |
601 _phc.Union( lr1_node, lr2_node ); | |
602 | |
603 // Single-def live range ONLY if both live ranges are single-def. | |
604 // If both are single def, then src_def powers one live range | |
605 // and def_copy powers the other. After merging, src_def powers | |
606 // the combined live range. | |
295 | 607 lrgs(lr1)._def = (lrgs(lr1).is_multidef() || |
295 | 608 lrgs(lr2).is_multidef() ) |
0 | 609 ? NodeSentinel : src_def; |
610 lrgs(lr2)._def = NULL; // No def for lrg 2 | |
611 lrgs(lr2).Clear(); // Force empty mask for LRG 2 | |
612 //lrgs(lr2)._size = 0; // Live-range 2 goes dead | |
613 lrgs(lr1)._is_oop |= lrgs(lr2)._is_oop; | |
614 lrgs(lr2)._is_oop = 0; // In particular, not an oop for GC info | |
615 | |
616 if (lrgs(lr1)._maxfreq < lrgs(lr2)._maxfreq) | |
617 lrgs(lr1)._maxfreq = lrgs(lr2)._maxfreq; | |
618 | |
619 // Copy original value instead. Intermediate copies go dead, and | |
620 // the dst_copy becomes useless. | |
621 int didx = dst_copy->is_Copy(); | |
622 dst_copy->set_req( didx, src_def ); | |
623 // Add copy to free list | |
624 // _phc.free_spillcopy(b->_nodes[bindex]); | |
625 assert( b->_nodes[bindex] == dst_copy, "" ); | |
626 dst_copy->replace_by( dst_copy->in(didx) ); | |
627 dst_copy->set_req( didx, NULL); | |
628 b->_nodes.remove(bindex); | |
629 if( bindex < b->_ihrp_index ) b->_ihrp_index--; | |
630 if( bindex < b->_fhrp_index ) b->_fhrp_index--; | |
631 | |
632 // Stretched lr1; add it to liveness of intermediate blocks | |
633 Block *b2 = _phc._cfg._bbs[src_copy->_idx]; | |
634 while( b != b2 ) { | |
635 b = _phc._cfg._bbs[b->pred(1)->_idx]; | |
636 _phc._live->live(b)->insert(lr1); | |
637 } | |
638 } | |
639 | |
640 //------------------------------compute_separating_interferences--------------- | |
641 // Factored code from copy_copy that computes extra interferences from | |
642 // lengthening a live range by double-coalescing. | |
643 uint PhaseConservativeCoalesce::compute_separating_interferences(Node *dst_copy, Node *src_copy, Block *b, uint bindex, RegMask &rm, uint reg_degree, uint rm_size, uint lr1, uint lr2 ) { | |
644 | |
645 assert(!lrgs(lr1)._fat_proj, "cannot coalesce fat_proj"); | |
646 assert(!lrgs(lr2)._fat_proj, "cannot coalesce fat_proj"); | |
647 Node *prev_copy = dst_copy->in(dst_copy->is_Copy()); | |
648 Block *b2 = b; | |
649 uint bindex2 = bindex; | |
650 while( 1 ) { | |
651 // Find previous instruction | |
652 bindex2--; // Chain backwards 1 instruction | |
653 while( bindex2 == 0 ) { // At block start, find prior block | |
654 assert( b2->num_preds() == 2, "cannot double coalesce across c-flow" ); | |
655 b2 = _phc._cfg._bbs[b2->pred(1)->_idx]; | |
656 bindex2 = b2->end_idx()-1; | |
657 } | |
658 // Get prior instruction | |
659 assert(bindex2 < b2->_nodes.size(), "index out of bounds"); | |
660 Node *x = b2->_nodes[bindex2]; | |
661 if( x == prev_copy ) { // Previous copy in copy chain? | |
662 if( prev_copy == src_copy)// Found end of chain and all interferences | |
663 break; // So break out of loop | |
664 // Else work back one in copy chain | |
665 prev_copy = prev_copy->in(prev_copy->is_Copy()); | |
666 } else { // Else collect interferences | |
667 uint lidx = _phc.Find(x); | |
668 // Found another def of live-range being stretched? | |
669 if( lidx == lr1 ) return max_juint; | |
670 if( lidx == lr2 ) return max_juint; | |
671 | |
672 // If we attempt to coalesce across a bound def | |
673 if( lrgs(lidx).is_bound() ) { | |
674 // Do not let the coalesced LRG expect to get the bound color | |
675 rm.SUBTRACT( lrgs(lidx).mask() ); | |
676 // Recompute rm_size | |
677 rm_size = rm.Size(); | |
678 //if( rm._flags ) rm_size += 1000000; | |
679 if( reg_degree >= rm_size ) return max_juint; | |
680 } | |
681 if( rm.overlap(lrgs(lidx).mask()) ) { | |
682 // Insert lidx into union LRG; returns TRUE if actually inserted | |
683 if( _ulr.insert(lidx) ) { | |
684 // Infinite-stack neighbors do not alter colorability, as they | |
685 // can always color to some other color. | |
686 if( !lrgs(lidx).mask().is_AllStack() ) { | |
687 // If this coalesce will make any new neighbor uncolorable, | |
688 // do not coalesce. | |
689 if( lrgs(lidx).just_lo_degree() ) | |
690 return max_juint; | |
691 // Bump our degree | |
692 if( ++reg_degree >= rm_size ) | |
693 return max_juint; | |
694 } // End of if not infinite-stack neighbor | |
695 } // End of if actually inserted | |
696 } // End of if live range overlaps | |
697 } // End of else collect interferences for 1 node | |
698 } // End of while forever, scan back for interferences | |
699 return reg_degree; | |
700 } | |
701 | |
702 //------------------------------update_ifg------------------------------------- | |
703 void PhaseConservativeCoalesce::update_ifg(uint lr1, uint lr2, IndexSet *n_lr1, IndexSet *n_lr2) { | |
704 // Some original neighbors of lr1 might have gone away | |
705 // because the constrained register mask prevented them. | |
706 // Remove lr1 from such neighbors. | |
707 IndexSetIterator one(n_lr1); | |
708 uint neighbor; | |
709 LRG &lrg1 = lrgs(lr1); | |
710 while ((neighbor = one.next()) != 0) | |
711 if( !_ulr.member(neighbor) ) | |
712 if( _phc._ifg->neighbors(neighbor)->remove(lr1) ) | |
713 lrgs(neighbor).inc_degree( -lrg1.compute_degree(lrgs(neighbor)) ); | |
714 | |
715 | |
716 // lr2 is now called (coalesced into) lr1. | |
717 // Remove lr2 from the IFG. | |
718 IndexSetIterator two(n_lr2); | |
719 LRG &lrg2 = lrgs(lr2); | |
720 while ((neighbor = two.next()) != 0) | |
721 if( _phc._ifg->neighbors(neighbor)->remove(lr2) ) | |
722 lrgs(neighbor).inc_degree( -lrg2.compute_degree(lrgs(neighbor)) ); | |
723 | |
724 // Some neighbors of intermediate copies now interfere with the | |
725 // combined live range. | |
726 IndexSetIterator three(&_ulr); | |
727 while ((neighbor = three.next()) != 0) | |
728 if( _phc._ifg->neighbors(neighbor)->insert(lr1) ) | |
729 lrgs(neighbor).inc_degree( lrg1.compute_degree(lrgs(neighbor)) ); | |
730 } | |
731 | |
732 //------------------------------record_bias------------------------------------ | |
733 static void record_bias( const PhaseIFG *ifg, int lr1, int lr2 ) { | |
734 // Tag copy bias here | |
735 if( !ifg->lrgs(lr1)._copy_bias ) | |
736 ifg->lrgs(lr1)._copy_bias = lr2; | |
737 if( !ifg->lrgs(lr2)._copy_bias ) | |
738 ifg->lrgs(lr2)._copy_bias = lr1; | |
739 } | |
740 | |
741 //------------------------------copy_copy-------------------------------------- | |
742 // See if I can coalesce a series of multiple copies together. I need the | |
743 // final dest copy and the original src copy. They can be the same Node. | |
744 // Compute the compatible register masks. | |
745 bool PhaseConservativeCoalesce::copy_copy( Node *dst_copy, Node *src_copy, Block *b, uint bindex ) { | |
746 | |
747 if( !dst_copy->is_SpillCopy() ) return false; | |
748 if( !src_copy->is_SpillCopy() ) return false; | |
749 Node *src_def = src_copy->in(src_copy->is_Copy()); | |
750 uint lr1 = _phc.Find(dst_copy); | |
751 uint lr2 = _phc.Find(src_def ); | |
752 | |
753 // Same live ranges already? | |
754 if( lr1 == lr2 ) return false; | |
755 | |
756 // Interfere? | |
757 if( _phc._ifg->test_edge_sq( lr1, lr2 ) ) return false; | |
758 | |
759 // Not an oop->int cast; oop->oop, int->int, AND int->oop are OK. | |
760 if( !lrgs(lr1)._is_oop && lrgs(lr2)._is_oop ) // not an oop->int cast | |
761 return false; | |
762 | |
763 // Coalescing between an aligned live range and a mis-aligned live range? | |
764 // No, no! Alignment changes how we count degree. | |
765 if( lrgs(lr1)._fat_proj != lrgs(lr2)._fat_proj ) | |
766 return false; | |
767 | |
768 // Sort; use smaller live-range number | |
769 Node *lr1_node = dst_copy; | |
770 Node *lr2_node = src_def; | |
771 if( lr1 > lr2 ) { | |
772 uint tmp = lr1; lr1 = lr2; lr2 = tmp; | |
773 lr1_node = src_def; lr2_node = dst_copy; | |
774 } | |
775 | |
776 // Check for compatibility of the 2 live ranges by | |
777 // intersecting their allowed register sets. | |
778 RegMask rm = lrgs(lr1).mask(); | |
779 rm.AND(lrgs(lr2).mask()); | |
780 // Number of bits free | |
781 uint rm_size = rm.Size(); | |
782 | |
783 // If we can use any stack slot, then effective size is infinite | |
784 if( rm.is_AllStack() ) rm_size += 1000000; | |
785 // Incompatible masks, no way to coalesce | |
786 if( rm_size == 0 ) return false; | |
787 | |
788 // Another early bail-out test is when we are double-coalescing and the | |
789 // 2 copies are separated by some control flow. | |
790 if( dst_copy != src_copy ) { | |
791 Block *src_b = _phc._cfg._bbs[src_copy->_idx]; | |
792 Block *b2 = b; | |
793 while( b2 != src_b ) { | |
794 if( b2->num_preds() > 2 ){// Found merge-point | |
795 _phc._lost_opp_cflow_coalesce++; | |
796 // extra record_bias commented out because Chris believes it is not | |
797 // productive. Since we can record only 1 bias, we want to choose one | |
798 // that stands a chance of working and this one probably does not. | |
799 //record_bias( _phc._lrgs, lr1, lr2 ); | |
800 return false; // Too hard to find all interferences | |
801 } | |
802 b2 = _phc._cfg._bbs[b2->pred(1)->_idx]; | |
803 } | |
804 } | |
805 | |
806 // Union the two interference sets together into '_ulr' | |
807 uint reg_degree = _ulr.lrg_union( lr1, lr2, rm_size, _phc._ifg, rm ); | |
808 | |
809 if( reg_degree >= rm_size ) { | |
810 record_bias( _phc._ifg, lr1, lr2 ); | |
811 return false; | |
812 } | |
813 | |
814 // Now I need to compute all the interferences between dst_copy and | |
815 // src_copy. I'm not willing to visit the entire interference graph, so | |
816 // I limit my search to things in dst_copy's block or in a straight | |
817 // line of previous blocks. I give up at merge points or when I get | |
818 // more interferences than my degree. I can stop when I find src_copy. | |
819 if( dst_copy != src_copy ) { | |
820 reg_degree = compute_separating_interferences(dst_copy, src_copy, b, bindex, rm, rm_size, reg_degree, lr1, lr2 ); | |
821 if( reg_degree == max_juint ) { | |
822 record_bias( _phc._ifg, lr1, lr2 ); | |
823 return false; | |
824 } | |
825 } // End of if dst_copy & src_copy are different | |
826 | |
827 | |
828 // ---- THE COMBINED LRG IS COLORABLE ---- | |
829 | |
830 // YEAH - Now coalesce this copy away | |
831 assert( lrgs(lr1).num_regs() == lrgs(lr2).num_regs(), "" ); | |
832 | |
833 IndexSet *n_lr1 = _phc._ifg->neighbors(lr1); | |
834 IndexSet *n_lr2 = _phc._ifg->neighbors(lr2); | |
835 | |
836 // Update the interference graph | |
837 update_ifg(lr1, lr2, n_lr1, n_lr2); | |
838 | |
839 _ulr.remove(lr1); | |
840 | |
841 // Uncomment the following code to trace Coalescing in great detail. | |
842 // | |
843 //if (false) { | |
844 // tty->cr(); | |
845 // tty->print_cr("#######################################"); | |
846 // tty->print_cr("union %d and %d", lr1, lr2); | |
847 // n_lr1->dump(); | |
848 // n_lr2->dump(); | |
849 // tty->print_cr("resulting set is"); | |
850 // _ulr.dump(); | |
851 //} | |
852 | |
853 // Replace n_lr1 with the new combined live range. _ulr will use | |
854 // n_lr1's old memory on the next iteration. n_lr2 is cleared to | |
855 // send its internal memory to the free list. | |
856 _ulr.swap(n_lr1); | |
857 _ulr.clear(); | |
858 n_lr2->clear(); | |
859 | |
860 lrgs(lr1).set_degree( _phc._ifg->effective_degree(lr1) ); | |
861 lrgs(lr2).set_degree( 0 ); | |
862 | |
863 // Join live ranges. Merge larger into smaller. Union lr2 into lr1 in the | |
864 // union-find tree | |
865 union_helper( lr1_node, lr2_node, lr1, lr2, src_def, dst_copy, src_copy, b, bindex ); | |
866 // Combine register restrictions | |
867 lrgs(lr1).set_mask(rm); | |
868 lrgs(lr1).compute_set_mask_size(); | |
869 lrgs(lr1)._cost += lrgs(lr2)._cost; | |
870 lrgs(lr1)._area += lrgs(lr2)._area; | |
871 | |
872 // While it's uncommon to successfully coalesce live ranges that started out | |
873 // being not-lo-degree, it can happen. In any case the combined coalesced | |
874 // live range better Simplify nicely. | |
875 lrgs(lr1)._was_lo = 1; | |
876 | |
877 // kinda expensive to do all the time | |
878 //tty->print_cr("warning: slow verify happening"); | |
879 //_phc._ifg->verify( &_phc ); | |
880 return true; | |
881 } | |
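// A hedged numeric sketch of the colorability test above, with made-up
// values: if the intersected mask rm leaves rm_size == 6 usable registers
// while lrg_union() plus compute_separating_interferences() accumulate
// reg_degree == 4 constraining neighbors, the combined live range is still
// guaranteed a color, so the copy is coalesced away; once reg_degree
// reaches 6 the attempt is abandoned and only a coloring bias is recorded
// via record_bias().  Keeping reg_degree below rm_size is the conservative
// criterion: coalescing should never leave the combined live range harder
// to color than its register mask allows.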
882 | |
883 //------------------------------coalesce--------------------------------------- | |
884 // Conservative (but pessimistic) copy coalescing of a single block | |
885 void PhaseConservativeCoalesce::coalesce( Block *b ) { | |
886 // Bail out on infrequent blocks | |
887 if( b->is_uncommon(_phc._cfg._bbs) ) | |
888 return; | |
889 // Check this block for copies. | |
890 for( uint i = 1; i<b->end_idx(); i++ ) { | |
891 // Check for actual copies on inputs. Coalesce a copy into its | |
892 // input if use and copy's input are compatible. | |
893 Node *copy1 = b->_nodes[i]; | |
894 uint idx1 = copy1->is_Copy(); | |
895 if( !idx1 ) continue; // Not a copy | |
896 | |
897 if( copy_copy(copy1,copy1,b,i) ) { | |
898 i--; // Retry, same location in block | |
899 PhaseChaitin::_conserv_coalesce++; // Collect stats on success | |
900 continue; | |
901 } | |
902 | |
903 /* do not attempt pairs. About 1/2 of all pairs can be removed by | |
904 post-alloc. The other set are too few to bother. | |
905 Node *copy2 = copy1->in(idx1); | |
906 uint idx2 = copy2->is_Copy(); | |
907 if( !idx2 ) continue; | |
908 if( copy_copy(copy1,copy2,b,i) ) { | |
909 i--; // Retry, same location in block | |
910 PhaseChaitin::_conserv_coalesce_pair++; // Collect stats on success | |
911 continue; | |
912 } | |
913 */ | |
914 } | |
915 } |