comparison: src/share/vm/opto/gcm.cpp @ 14412:e2722a66aba7

summary:  Merge
author:   kvn
date:     Thu, 05 Sep 2013 11:04:39 -0700
parents:  d2907f74462e adb9a7d94cb5
children: 2b8e28fdf503

comparing 14411:bdd155477289 (old, left column) with 14412:e2722a66aba7 (new, right column); each row below gives the old line number and text followed by the new line number and text, with rows classed as equal, deleted, inserted, or replaced.
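
The bulk of this changeset replaces direct accesses to PhaseCFG's node-to-block map (the `_bbs` Block_Array) and to the node-latency array with accessor methods: get_block_for_node, map_node_to_block, has_block, unmap_node_from_block, get_latency_for_node and set_latency_for_node. Helper functions that previously took a Block_Array& (assert_dom, find_deepest_input, raise_LCA_above_use, raise_LCA_above_marks, memory_early_block, Node_Backward_Iterator) now take the PhaseCFG instead. The following is a minimal sketch of that accessor pattern, for orientation only: it uses simplified stand-in types and an assumed std::vector backing store, not HotSpot's actual Block_Array/GrowableArray implementation, and the class name is hypothetical.

// Illustrative sketch only -- simplified stand-ins, not HotSpot's real types.
#include <vector>

struct Node  { unsigned _idx; };   // every IR node carries a unique index
struct Block { /* basic-block payload elided */ };

class PhaseCFGSketch {
private:
  // Assumed backing stores, indexed by Node::_idx, standing in for the
  // Block_Array (_bbs) and GrowableArray<uint> (_node_latency) that the
  // old code touched directly.
  std::vector<Block*>   _node_to_block;
  std::vector<unsigned> _node_latency;

public:
  // get_block_for_node(n) replaces the old `_bbs[n->_idx]` reads.
  Block* get_block_for_node(const Node* n) const {
    return (n->_idx < _node_to_block.size()) ? _node_to_block[n->_idx] : nullptr;
  }

  // map_node_to_block(n, b) replaces the old `_bbs.map(n->_idx, b)` writes.
  void map_node_to_block(const Node* n, Block* b) {
    if (n->_idx >= _node_to_block.size()) {
      _node_to_block.resize(n->_idx + 1, nullptr);
    }
    _node_to_block[n->_idx] = b;
  }

  // has_block(n) replaces the old `_bbs.lookup(n->_idx)` membership test.
  bool has_block(const Node* n) const {
    return get_block_for_node(n) != nullptr;
  }

  // unmap_node_from_block(n) replaces the old `_bbs.map(n->_idx, NULL)`.
  void unmap_node_from_block(const Node* n) {
    map_node_to_block(n, nullptr);
  }

  // get/set_latency_for_node replace `_node_latency->at_grow(idx)` and
  // `_node_latency->at_put_grow(idx, value)`.
  unsigned get_latency_for_node(const Node* n) const {
    return (n->_idx < _node_latency.size()) ? _node_latency[n->_idx] : 0;
  }
  void set_latency_for_node(const Node* n, unsigned latency) {
    if (n->_idx >= _node_latency.size()) {
      _node_latency.resize(n->_idx + 1, 0);
    }
    _node_latency[n->_idx] = latency;
  }
};
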
68 //----------------------------schedule_node_into_block------------------------- 68 //----------------------------schedule_node_into_block-------------------------
69 // Insert node n into block b. Look for projections of n and make sure they 69 // Insert node n into block b. Look for projections of n and make sure they
70 // are in b also. 70 // are in b also.
71 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) { 71 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
72 // Set basic block of n, Add n to b, 72 // Set basic block of n, Add n to b,
73 _bbs.map(n->_idx, b); 73 map_node_to_block(n, b);
74 b->add_inst(n); 74 b->add_inst(n);
75 75
76 // After Matching, nearly any old Node may have projections trailing it. 76 // After Matching, nearly any old Node may have projections trailing it.
77 // These are usually machine-dependent flags. In any case, they might 77 // These are usually machine-dependent flags. In any case, they might
78 // float to another block below this one. Move them up. 78 // float to another block below this one. Move them up.
79 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { 79 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
80 Node* use = n->fast_out(i); 80 Node* use = n->fast_out(i);
81 if (use->is_Proj()) { 81 if (use->is_Proj()) {
82 Block* buse = _bbs[use->_idx]; 82 Block* buse = get_block_for_node(use);
83 if (buse != b) { // In wrong block? 83 if (buse != b) { // In wrong block?
84 if (buse != NULL) 84 if (buse != NULL) {
85 buse->find_remove(use); // Remove from wrong block 85 buse->find_remove(use); // Remove from wrong block
86 _bbs.map(use->_idx, b); // Re-insert in this block 86 }
87 map_node_to_block(use, b);
87 b->add_inst(use); 88 b->add_inst(use);
88 } 89 }
89 } 90 }
90 } 91 }
91 } 92 }
99 assert(in0 != NULL, "Only control-dependent"); 100 assert(in0 != NULL, "Only control-dependent");
100 const Node *p = in0->is_block_proj(); 101 const Node *p = in0->is_block_proj();
101 if (p != NULL && p != n) { // Control from a block projection? 102 if (p != NULL && p != n) { // Control from a block projection?
102 assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here"); 103 assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");
103 // Find trailing Region 104 // Find trailing Region
104 Block *pb = _bbs[in0->_idx]; // Block-projection already has basic block 105 Block *pb = get_block_for_node(in0); // Block-projection already has basic block
105 uint j = 0; 106 uint j = 0;
106 if (pb->_num_succs != 1) { // More then 1 successor? 107 if (pb->_num_succs != 1) { // More then 1 successor?
107 // Search for successor 108 // Search for successor
108 uint max = pb->_nodes.size(); 109 uint max = pb->_nodes.size();
109 assert( max > 1, "" ); 110 assert( max > 1, "" );
122 } 123 }
123 124
124 125
125 //------------------------------schedule_pinned_nodes-------------------------- 126 //------------------------------schedule_pinned_nodes--------------------------
126 // Set the basic block for Nodes pinned into blocks 127 // Set the basic block for Nodes pinned into blocks
127 void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) { 128 void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) {
128 // Allocate node stack of size C->unique()+8 to avoid frequent realloc 129 // Allocate node stack of size C->unique()+8 to avoid frequent realloc
129 GrowableArray <Node *> spstack(C->unique()+8); 130 GrowableArray <Node *> spstack(C->unique() + 8);
130 spstack.push(_root); 131 spstack.push(_root);
131 while ( spstack.is_nonempty() ) { 132 while (spstack.is_nonempty()) {
132 Node *n = spstack.pop(); 133 Node* node = spstack.pop();
133 if( !visited.test_set(n->_idx) ) { // Test node and flag it as visited 134 if (!visited.test_set(node->_idx)) { // Test node and flag it as visited
134 if( n->pinned() && !_bbs.lookup(n->_idx) ) { // Pinned? Nail it down! 135 if (node->pinned() && !has_block(node)) { // Pinned? Nail it down!
135 assert( n->in(0), "pinned Node must have Control" ); 136 assert(node->in(0), "pinned Node must have Control");
136 // Before setting block replace block_proj control edge 137 // Before setting block replace block_proj control edge
137 replace_block_proj_ctrl(n); 138 replace_block_proj_ctrl(node);
138 Node *input = n->in(0); 139 Node* input = node->in(0);
139 while( !input->is_block_start() ) 140 while (!input->is_block_start()) {
140 input = input->in(0); 141 input = input->in(0);
141 Block *b = _bbs[input->_idx]; // Basic block of controlling input 142 }
142 schedule_node_into_block(n, b); 143 Block* block = get_block_for_node(input); // Basic block of controlling input
143 } 144 schedule_node_into_block(node, block);
144 for( int i = n->req() - 1; i >= 0; --i ) { // For all inputs 145 }
145 if( n->in(i) != NULL ) 146
146 spstack.push(n->in(i)); 147 // process all inputs that are non NULL
148 for (int i = node->req() - 1; i >= 0; --i) {
149 if (node->in(i) != NULL) {
150 spstack.push(node->in(i));
151 }
147 } 152 }
148 } 153 }
149 } 154 }
150 } 155 }
151 156
152 #ifdef ASSERT 157 #ifdef ASSERT
153 // Assert that new input b2 is dominated by all previous inputs. 158 // Assert that new input b2 is dominated by all previous inputs.
154 // Check this by by seeing that it is dominated by b1, the deepest 159 // Check this by by seeing that it is dominated by b1, the deepest
155 // input observed until b2. 160 // input observed until b2.
156 static void assert_dom(Block* b1, Block* b2, Node* n, Block_Array &bbs) { 161 static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {
157 if (b1 == NULL) return; 162 if (b1 == NULL) return;
158 assert(b1->_dom_depth < b2->_dom_depth, "sanity"); 163 assert(b1->_dom_depth < b2->_dom_depth, "sanity");
159 Block* tmp = b2; 164 Block* tmp = b2;
160 while (tmp != b1 && tmp != NULL) { 165 while (tmp != b1 && tmp != NULL) {
161 tmp = tmp->_idom; 166 tmp = tmp->_idom;
164 // Detected an unschedulable graph. Print some nice stuff and die. 169 // Detected an unschedulable graph. Print some nice stuff and die.
165 tty->print_cr("!!! Unschedulable graph !!!"); 170 tty->print_cr("!!! Unschedulable graph !!!");
166 for (uint j=0; j<n->len(); j++) { // For all inputs 171 for (uint j=0; j<n->len(); j++) { // For all inputs
167 Node* inn = n->in(j); // Get input 172 Node* inn = n->in(j); // Get input
168 if (inn == NULL) continue; // Ignore NULL, missing inputs 173 if (inn == NULL) continue; // Ignore NULL, missing inputs
169 Block* inb = bbs[inn->_idx]; 174 Block* inb = cfg->get_block_for_node(inn);
170 tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order, 175 tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
171 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth); 176 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
172 inn->dump(); 177 inn->dump();
173 } 178 }
174 tty->print("Failing node: "); 179 tty->print("Failing node: ");
176 assert(false, "unscheduable graph"); 181 assert(false, "unscheduable graph");
177 } 182 }
178 } 183 }
179 #endif 184 #endif
180 185
181 static Block* find_deepest_input(Node* n, Block_Array &bbs) { 186 static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
182 // Find the last input dominated by all other inputs. 187 // Find the last input dominated by all other inputs.
183 Block* deepb = NULL; // Deepest block so far 188 Block* deepb = NULL; // Deepest block so far
184 int deepb_dom_depth = 0; 189 int deepb_dom_depth = 0;
185 for (uint k = 0; k < n->len(); k++) { // For all inputs 190 for (uint k = 0; k < n->len(); k++) { // For all inputs
186 Node* inn = n->in(k); // Get input 191 Node* inn = n->in(k); // Get input
187 if (inn == NULL) continue; // Ignore NULL, missing inputs 192 if (inn == NULL) continue; // Ignore NULL, missing inputs
188 Block* inb = bbs[inn->_idx]; 193 Block* inb = cfg->get_block_for_node(inn);
189 assert(inb != NULL, "must already have scheduled this input"); 194 assert(inb != NULL, "must already have scheduled this input");
190 if (deepb_dom_depth < (int) inb->_dom_depth) { 195 if (deepb_dom_depth < (int) inb->_dom_depth) {
191 // The new inb must be dominated by the previous deepb. 196 // The new inb must be dominated by the previous deepb.
192 // The various inputs must be linearly ordered in the dom 197 // The various inputs must be linearly ordered in the dom
193 // tree, or else there will not be a unique deepest block. 198 // tree, or else there will not be a unique deepest block.
194 DEBUG_ONLY(assert_dom(deepb, inb, n, bbs)); 199 DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));
195 deepb = inb; // Save deepest block 200 deepb = inb; // Save deepest block
196 deepb_dom_depth = deepb->_dom_depth; 201 deepb_dom_depth = deepb->_dom_depth;
197 } 202 }
198 } 203 }
199 assert(deepb != NULL, "must be at least one input to n"); 204 assert(deepb != NULL, "must be at least one input to n");
205 // Find the earliest Block any instruction can be placed in. Some instructions 210 // Find the earliest Block any instruction can be placed in. Some instructions
206 // are pinned into Blocks. Unpinned instructions can appear in last block in 211 // are pinned into Blocks. Unpinned instructions can appear in last block in
207 // which all their inputs occur. 212 // which all their inputs occur.
208 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) { 213 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
209 // Allocate stack with enough space to avoid frequent realloc 214 // Allocate stack with enough space to avoid frequent realloc
210 Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats 215 Node_Stack nstack(roots.Size() + 8);
211 // roots.push(_root); _root will be processed among C->top() inputs 216 // _root will be processed among C->top() inputs
212 roots.push(C->top()); 217 roots.push(C->top());
213 visited.set(C->top()->_idx); 218 visited.set(C->top()->_idx);
214 219
215 while (roots.size() != 0) { 220 while (roots.size() != 0) {
216 // Use local variables nstack_top_n & nstack_top_i to cache values 221 // Use local variables nstack_top_n & nstack_top_i to cache values
217 // on stack's top. 222 // on stack's top.
218 Node *nstack_top_n = roots.pop(); 223 Node* parent_node = roots.pop();
219 uint nstack_top_i = 0; 224 uint input_index = 0;
220 //while_nstack_nonempty: 225
221 while (true) { 226 while (true) {
222 // Get parent node and next input's index from stack's top. 227 if (input_index == 0) {
223 Node *n = nstack_top_n;
224 uint i = nstack_top_i;
225
226 if (i == 0) {
227 // Fixup some control. Constants without control get attached 228 // Fixup some control. Constants without control get attached
228 // to root and nodes that use is_block_proj() nodes should be attached 229 // to root and nodes that use is_block_proj() nodes should be attached
229 // to the region that starts their block. 230 // to the region that starts their block.
230 const Node *in0 = n->in(0); 231 const Node* control_input = parent_node->in(0);
231 if (in0 != NULL) { // Control-dependent? 232 if (control_input != NULL) {
232 replace_block_proj_ctrl(n); 233 replace_block_proj_ctrl(parent_node);
233 } else { // n->in(0) == NULL 234 } else {
234 if (n->req() == 1) { // This guy is a constant with NO inputs? 235 // Is a constant with NO inputs?
235 n->set_req(0, _root); 236 if (parent_node->req() == 1) {
237 parent_node->set_req(0, _root);
236 } 238 }
237 } 239 }
238 } 240 }
239 241
240 // First, visit all inputs and force them to get a block. If an 242 // First, visit all inputs and force them to get a block. If an
241 // input is already in a block we quit following inputs (to avoid 243 // input is already in a block we quit following inputs (to avoid
242 // cycles). Instead we put that Node on a worklist to be handled 244 // cycles). Instead we put that Node on a worklist to be handled
243 // later (since IT'S inputs may not have a block yet). 245 // later (since IT'S inputs may not have a block yet).
244 bool done = true; // Assume all n's inputs will be processed 246
245 while (i < n->len()) { // For all inputs 247 // Assume all n's inputs will be processed
246 Node *in = n->in(i); // Get input 248 bool done = true;
247 ++i; 249
248 if (in == NULL) continue; // Ignore NULL, missing inputs 250 while (input_index < parent_node->len()) {
251 Node* in = parent_node->in(input_index++);
252 if (in == NULL) {
253 continue;
254 }
255
249 int is_visited = visited.test_set(in->_idx); 256 int is_visited = visited.test_set(in->_idx);
250 if (!_bbs.lookup(in->_idx)) { // Missing block selection? 257 if (!has_block(in)) {
251 if (is_visited) { 258 if (is_visited) {
252 // assert( !visited.test(in->_idx), "did not schedule early" );
253 return false; 259 return false;
254 } 260 }
255 nstack.push(n, i); // Save parent node and next input's index. 261 // Save parent node and next input's index.
256 nstack_top_n = in; // Process current input now. 262 nstack.push(parent_node, input_index);
257 nstack_top_i = 0; 263 // Process current input now.
258 done = false; // Not all n's inputs processed. 264 parent_node = in;
259 break; // continue while_nstack_nonempty; 265 input_index = 0;
260 } else if (!is_visited) { // Input not yet visited? 266 // Not all n's inputs processed.
261 roots.push(in); // Visit this guy later, using worklist 267 done = false;
268 break;
269 } else if (!is_visited) {
270 // Visit this guy later, using worklist
271 roots.push(in);
262 } 272 }
263 } 273 }
274
264 if (done) { 275 if (done) {
265 // All of n's inputs have been processed, complete post-processing. 276 // All of n's inputs have been processed, complete post-processing.
266 277
267 // Some instructions are pinned into a block. These include Region, 278 // Some instructions are pinned into a block. These include Region,
268 // Phi, Start, Return, and other control-dependent instructions and 279 // Phi, Start, Return, and other control-dependent instructions and
269 // any projections which depend on them. 280 // any projections which depend on them.
270 if (!n->pinned()) { 281 if (!parent_node->pinned()) {
271 // Set earliest legal block. 282 // Set earliest legal block.
272 _bbs.map(n->_idx, find_deepest_input(n, _bbs)); 283 Block* earliest_block = find_deepest_input(parent_node, this);
284 map_node_to_block(parent_node, earliest_block);
273 } else { 285 } else {
274 assert(_bbs[n->_idx] == _bbs[n->in(0)->_idx], "Pinned Node should be at the same block as its control edge"); 286 assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge");
275 } 287 }
276 288
277 if (nstack.is_empty()) { 289 if (nstack.is_empty()) {
278 // Finished all nodes on stack. 290 // Finished all nodes on stack.
279 // Process next node on the worklist 'roots'. 291 // Process next node on the worklist 'roots'.
280 break; 292 break;
281 } 293 }
282 // Get saved parent node and next input's index. 294 // Get saved parent node and next input's index.
283 nstack_top_n = nstack.node(); 295 parent_node = nstack.node();
284 nstack_top_i = nstack.index(); 296 input_index = nstack.index();
285 nstack.pop(); 297 nstack.pop();
286 } // if (done) 298 }
287 } // while (true) 299 }
288 } // while (roots.size() != 0) 300 }
289 return true; 301 return true;
290 } 302 }
291 303
292 //------------------------------dom_lca---------------------------------------- 304 //------------------------------dom_lca----------------------------------------
293 // Find least common ancestor in dominator tree 305 // Find least common ancestor in dominator tree
315 //--------------------------raise_LCA_above_use-------------------------------- 327 //--------------------------raise_LCA_above_use--------------------------------
316 // We are placing a definition, and have been given a def->use edge. 328 // We are placing a definition, and have been given a def->use edge.
317 // The definition must dominate the use, so move the LCA upward in the 329 // The definition must dominate the use, so move the LCA upward in the
318 // dominator tree to dominate the use. If the use is a phi, adjust 330 // dominator tree to dominate the use. If the use is a phi, adjust
319 // the LCA only with the phi input paths which actually use this def. 331 // the LCA only with the phi input paths which actually use this def.
320 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, Block_Array &bbs) { 332 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {
321 Block* buse = bbs[use->_idx]; 333 Block* buse = cfg->get_block_for_node(use);
322 if (buse == NULL) return LCA; // Unused killing Projs have no use block 334 if (buse == NULL) return LCA; // Unused killing Projs have no use block
323 if (!use->is_Phi()) return buse->dom_lca(LCA); 335 if (!use->is_Phi()) return buse->dom_lca(LCA);
324 uint pmax = use->req(); // Number of Phi inputs 336 uint pmax = use->req(); // Number of Phi inputs
325 // Why does not this loop just break after finding the matching input to 337 // Why does not this loop just break after finding the matching input to
326 // the Phi? Well...it's like this. I do not have true def-use/use-def 338 // the Phi? Well...it's like this. I do not have true def-use/use-def
331 // which use-def edge I should find the predecessor block for. So I find 343 // which use-def edge I should find the predecessor block for. So I find
332 // them all. Means I do a little extra work if a Phi uses the same value 344 // them all. Means I do a little extra work if a Phi uses the same value
333 // more than once. 345 // more than once.
334 for (uint j=1; j<pmax; j++) { // For all inputs 346 for (uint j=1; j<pmax; j++) { // For all inputs
335 if (use->in(j) == def) { // Found matching input? 347 if (use->in(j) == def) { // Found matching input?
336 Block* pred = bbs[buse->pred(j)->_idx]; 348 Block* pred = cfg->get_block_for_node(buse->pred(j));
337 LCA = pred->dom_lca(LCA); 349 LCA = pred->dom_lca(LCA);
338 } 350 }
339 } 351 }
340 return LCA; 352 return LCA;
341 } 353 }
344 // Return a new LCA that dominates LCA and any of its marked predecessors. 356 // Return a new LCA that dominates LCA and any of its marked predecessors.
345 // Search all my parents up to 'early' (exclusive), looking for predecessors 357 // Search all my parents up to 'early' (exclusive), looking for predecessors
346 // which are marked with the given index. Return the LCA (in the dom tree) 358 // which are marked with the given index. Return the LCA (in the dom tree)
347 // of all marked blocks. If there are none marked, return the original 359 // of all marked blocks. If there are none marked, return the original
348 // LCA. 360 // LCA.
349 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, 361 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
350 Block* early, Block_Array &bbs) {
351 Block_List worklist; 362 Block_List worklist;
352 worklist.push(LCA); 363 worklist.push(LCA);
353 while (worklist.size() > 0) { 364 while (worklist.size() > 0) {
354 Block* mid = worklist.pop(); 365 Block* mid = worklist.pop();
355 if (mid == early) continue; // stop searching here 366 if (mid == early) continue; // stop searching here
368 if (LCA == mid) 379 if (LCA == mid)
369 continue; // Don't mark as visited to avoid early termination. 380 continue; // Don't mark as visited to avoid early termination.
370 } else { 381 } else {
371 // Keep searching through this block's predecessors. 382 // Keep searching through this block's predecessors.
372 for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) { 383 for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
373 Block* mid_parent = bbs[ mid->pred(j)->_idx ]; 384 Block* mid_parent = cfg->get_block_for_node(mid->pred(j));
374 worklist.push(mid_parent); 385 worklist.push(mid_parent);
375 } 386 }
376 } 387 }
377 mid->set_raise_LCA_visited(mark); 388 mid->set_raise_LCA_visited(mark);
378 } 389 }
386 // 397 //
387 // Because a subset of edges are considered, the resulting block will 398 // Because a subset of edges are considered, the resulting block will
388 // be earlier (at a shallower dom_depth) than the true schedule_early 399 // be earlier (at a shallower dom_depth) than the true schedule_early
389 // point of the node. We compute this earlier block as a more permissive 400 // point of the node. We compute this earlier block as a more permissive
390 // site for anti-dependency insertion, but only if subsume_loads is enabled. 401 // site for anti-dependency insertion, but only if subsume_loads is enabled.
391 static Block* memory_early_block(Node* load, Block* early, Block_Array &bbs) { 402 static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {
392 Node* base; 403 Node* base;
393 Node* index; 404 Node* index;
394 Node* store = load->in(MemNode::Memory); 405 Node* store = load->in(MemNode::Memory);
395 load->as_Mach()->memory_inputs(base, index); 406 load->as_Mach()->memory_inputs(base, index);
396 407
414 if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0); 425 if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
415 426
416 Block* deepb = NULL; // Deepest block so far 427 Block* deepb = NULL; // Deepest block so far
417 int deepb_dom_depth = 0; 428 int deepb_dom_depth = 0;
418 for (int i = 0; i < mem_inputs_length; i++) { 429 for (int i = 0; i < mem_inputs_length; i++) {
419 Block* inb = bbs[mem_inputs[i]->_idx]; 430 Block* inb = cfg->get_block_for_node(mem_inputs[i]);
420 if (deepb_dom_depth < (int) inb->_dom_depth) { 431 if (deepb_dom_depth < (int) inb->_dom_depth) {
421 // The new inb must be dominated by the previous deepb. 432 // The new inb must be dominated by the previous deepb.
422 // The various inputs must be linearly ordered in the dom 433 // The various inputs must be linearly ordered in the dom
423 // tree, or else there will not be a unique deepest block. 434 // tree, or else there will not be a unique deepest block.
424 DEBUG_ONLY(assert_dom(deepb, inb, load, bbs)); 435 DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));
425 deepb = inb; // Save deepest block 436 deepb = inb; // Save deepest block
426 deepb_dom_depth = deepb->_dom_depth; 437 deepb_dom_depth = deepb->_dom_depth;
427 } 438 }
428 } 439 }
429 early = deepb; 440 early = deepb;
490 // Note the earliest legal placement of 'load', as determined by 501 // Note the earliest legal placement of 'load', as determined by
491 // by the unique point in the dom tree where all memory effects 502 // by the unique point in the dom tree where all memory effects
492 // and other inputs are first available. (Computed by schedule_early.) 503 // and other inputs are first available. (Computed by schedule_early.)
493 // For normal loads, 'early' is the shallowest place (dom graph wise) 504 // For normal loads, 'early' is the shallowest place (dom graph wise)
494 // to look for anti-deps between this load and any store. 505 // to look for anti-deps between this load and any store.
495 Block* early = _bbs[load_index]; 506 Block* early = get_block_for_node(load);
496 507
497 // If we are subsuming loads, compute an "early" block that only considers 508 // If we are subsuming loads, compute an "early" block that only considers
498 // memory or address inputs. This block may be different than the 509 // memory or address inputs. This block may be different than the
499 // schedule_early block in that it could be at an even shallower depth in the 510 // schedule_early block in that it could be at an even shallower depth in the
500 // dominator tree, and allow for a broader discovery of anti-dependences. 511 // dominator tree, and allow for a broader discovery of anti-dependences.
501 if (C->subsume_loads()) { 512 if (C->subsume_loads()) {
502 early = memory_early_block(load, early, _bbs); 513 early = memory_early_block(load, early, this);
503 } 514 }
504 515
505 ResourceArea *area = Thread::current()->resource_area(); 516 ResourceArea *area = Thread::current()->resource_area();
506 Node_List worklist_mem(area); // prior memory state to store 517 Node_List worklist_mem(area); // prior memory state to store
507 Node_List worklist_store(area); // possible-def to explore 518 Node_List worklist_store(area); // possible-def to explore
621 632
622 // Identify a block that the current load must be above, 633 // Identify a block that the current load must be above,
623 // or else observe that 'store' is all the way up in the 634 // or else observe that 'store' is all the way up in the
624 // earliest legal block for 'load'. In the latter case, 635 // earliest legal block for 'load'. In the latter case,
625 // immediately insert an anti-dependence edge. 636 // immediately insert an anti-dependence edge.
626 Block* store_block = _bbs[store->_idx]; 637 Block* store_block = get_block_for_node(store);
627 assert(store_block != NULL, "unused killing projections skipped above"); 638 assert(store_block != NULL, "unused killing projections skipped above");
628 639
629 if (store->is_Phi()) { 640 if (store->is_Phi()) {
630 // 'load' uses memory which is one (or more) of the Phi's inputs. 641 // 'load' uses memory which is one (or more) of the Phi's inputs.
631 // It must be scheduled not before the Phi, but rather before 642 // It must be scheduled not before the Phi, but rather before
639 // PhiNode may be at start of block 'early' with backedge to 'early' 650 // PhiNode may be at start of block 'early' with backedge to 'early'
640 DEBUG_ONLY(bool found_match = false); 651 DEBUG_ONLY(bool found_match = false);
641 for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) { 652 for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
642 if (store->in(j) == mem) { // Found matching input? 653 if (store->in(j) == mem) { // Found matching input?
643 DEBUG_ONLY(found_match = true); 654 DEBUG_ONLY(found_match = true);
644 Block* pred_block = _bbs[store_block->pred(j)->_idx]; 655 Block* pred_block = get_block_for_node(store_block->pred(j));
645 if (pred_block != early) { 656 if (pred_block != early) {
646 // If any predecessor of the Phi matches the load's "early block", 657 // If any predecessor of the Phi matches the load's "early block",
647 // we do not need a precedence edge between the Phi and 'load' 658 // we do not need a precedence edge between the Phi and 'load'
648 // since the load will be forced into a block preceding the Phi. 659 // since the load will be forced into a block preceding the Phi.
649 pred_block->set_raise_LCA_mark(load_index); 660 pred_block->set_raise_LCA_mark(load_index);
713 // 724 //
714 // The raised LCA will be a lower bound for placing the load, 725 // The raised LCA will be a lower bound for placing the load,
715 // preventing the load from sinking past any block containing 726 // preventing the load from sinking past any block containing
716 // a store that may invalidate the memory state required by 'load'. 727 // a store that may invalidate the memory state required by 'load'.
717 if (must_raise_LCA) 728 if (must_raise_LCA)
718 LCA = raise_LCA_above_marks(LCA, load->_idx, early, _bbs); 729 LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
719 if (LCA == early) return LCA; 730 if (LCA == early) return LCA;
720 731
721 // Insert anti-dependence edges from 'load' to each store 732 // Insert anti-dependence edges from 'load' to each store
722 // in the non-early LCA block. 733 // in the non-early LCA block.
723 // Mine the non_early_stores list for such stores. 734 // Mine the non_early_stores list for such stores.
724 if (LCA->raise_LCA_mark() == load_index) { 735 if (LCA->raise_LCA_mark() == load_index) {
725 while (non_early_stores.size() > 0) { 736 while (non_early_stores.size() > 0) {
726 Node* store = non_early_stores.pop(); 737 Node* store = non_early_stores.pop();
727 Block* store_block = _bbs[store->_idx]; 738 Block* store_block = get_block_for_node(store);
728 if (store_block == LCA) { 739 if (store_block == LCA) {
729 // add anti_dependence from store to load in its own block 740 // add anti_dependence from store to load in its own block
730 assert(store != load->in(0), "dependence cycle found"); 741 assert(store != load->in(0), "dependence cycle found");
731 if (verify) { 742 if (verify) {
732 assert(store->find_edge(load) != -1, "missing precedence edge"); 743 assert(store->find_edge(load) != -1, "missing precedence edge");
756 private: 767 private:
757 Node_Backward_Iterator(); 768 Node_Backward_Iterator();
758 769
759 public: 770 public:
760 // Constructor for the iterator 771 // Constructor for the iterator
761 Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs); 772 Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);
762 773
763 // Postincrement operator to iterate over the nodes 774 // Postincrement operator to iterate over the nodes
764 Node *next(); 775 Node *next();
765 776
766 private: 777 private:
767 VectorSet &_visited; 778 VectorSet &_visited;
768 Node_List &_stack; 779 Node_List &_stack;
769 Block_Array &_bbs; 780 PhaseCFG &_cfg;
770 }; 781 };
771 782
772 // Constructor for the Node_Backward_Iterator 783 // Constructor for the Node_Backward_Iterator
773 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs ) 784 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)
774 : _visited(visited), _stack(stack), _bbs(bbs) { 785 : _visited(visited), _stack(stack), _cfg(cfg) {
775 // The stack should contain exactly the root 786 // The stack should contain exactly the root
776 stack.clear(); 787 stack.clear();
777 stack.push(root); 788 stack.push(root);
778 789
779 // Clear the visited bits 790 // Clear the visited bits
799 while( 1 ) { 810 while( 1 ) {
800 811
801 _visited.set(self->_idx); 812 _visited.set(self->_idx);
802 813
803 // Now schedule all uses as late as possible. 814 // Now schedule all uses as late as possible.
804 uint src = self->is_Proj() ? self->in(0)->_idx : self->_idx; 815 const Node* src = self->is_Proj() ? self->in(0) : self;
805 uint src_rpo = _bbs[src]->_rpo; 816 uint src_rpo = _cfg.get_block_for_node(src)->_rpo;
806 817
807 // Schedule all nodes in a post-order visit 818 // Schedule all nodes in a post-order visit
808 Node *unvisited = NULL; // Unvisited anti-dependent Node, if any 819 Node *unvisited = NULL; // Unvisited anti-dependent Node, if any
809 820
810 // Scan for unvisited nodes 821 // Scan for unvisited nodes
816 if ( _visited.test(n->_idx) ) 827 if ( _visited.test(n->_idx) )
817 continue; 828 continue;
818 829
819 // do not traverse backward control edges 830 // do not traverse backward control edges
820 Node *use = n->is_Proj() ? n->in(0) : n; 831 Node *use = n->is_Proj() ? n->in(0) : n;
821 uint use_rpo = _bbs[use->_idx]->_rpo; 832 uint use_rpo = _cfg.get_block_for_node(use)->_rpo;
822 833
823 if ( use_rpo < src_rpo ) 834 if ( use_rpo < src_rpo )
824 continue; 835 continue;
825 836
826 // Phi nodes always precede uses in a basic block 837 // Phi nodes always precede uses in a basic block
848 return self; 859 return self;
849 } 860 }
850 861
851 //------------------------------ComputeLatenciesBackwards---------------------- 862 //------------------------------ComputeLatenciesBackwards----------------------
852 // Compute the latency of all the instructions. 863 // Compute the latency of all the instructions.
853 void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) { 864 void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_List &stack) {
854 #ifndef PRODUCT 865 #ifndef PRODUCT
855 if (trace_opto_pipelining()) 866 if (trace_opto_pipelining())
856 tty->print("\n#---- ComputeLatenciesBackwards ----\n"); 867 tty->print("\n#---- ComputeLatenciesBackwards ----\n");
857 #endif 868 #endif
858 869
859 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs); 870 Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
860 Node *n; 871 Node *n;
861 872
862 // Walk over all the nodes from last to first 873 // Walk over all the nodes from last to first
863 while (n = iter.next()) { 874 while (n = iter.next()) {
864 // Set the latency for the definitions of this instruction 875 // Set the latency for the definitions of this instruction
871 // a number that increases as we approach the beginning of the routine. 882 // a number that increases as we approach the beginning of the routine.
872 void PhaseCFG::partial_latency_of_defs(Node *n) { 883 void PhaseCFG::partial_latency_of_defs(Node *n) {
873 // Set the latency for this instruction 884 // Set the latency for this instruction
874 #ifndef PRODUCT 885 #ifndef PRODUCT
875 if (trace_opto_pipelining()) { 886 if (trace_opto_pipelining()) {
876 tty->print("# latency_to_inputs: node_latency[%d] = %d for node", 887 tty->print("# latency_to_inputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
877 n->_idx, _node_latency->at_grow(n->_idx));
878 dump(); 888 dump();
879 } 889 }
880 #endif 890 #endif
881 891
882 if (n->is_Proj()) 892 if (n->is_Proj()) {
883 n = n->in(0); 893 n = n->in(0);
884 894 }
885 if (n->is_Root()) 895
896 if (n->is_Root()) {
886 return; 897 return;
898 }
887 899
888 uint nlen = n->len(); 900 uint nlen = n->len();
889 uint use_latency = _node_latency->at_grow(n->_idx); 901 uint use_latency = get_latency_for_node(n);
890 uint use_pre_order = _bbs[n->_idx]->_pre_order; 902 uint use_pre_order = get_block_for_node(n)->_pre_order;
891 903
892 for ( uint j=0; j<nlen; j++ ) { 904 for (uint j = 0; j < nlen; j++) {
893 Node *def = n->in(j); 905 Node *def = n->in(j);
894 906
895 if (!def || def == n) 907 if (!def || def == n) {
896 continue; 908 continue;
909 }
897 910
898 // Walk backwards thru projections 911 // Walk backwards thru projections
899 if (def->is_Proj()) 912 if (def->is_Proj()) {
900 def = def->in(0); 913 def = def->in(0);
914 }
901 915
902 #ifndef PRODUCT 916 #ifndef PRODUCT
903 if (trace_opto_pipelining()) { 917 if (trace_opto_pipelining()) {
904 tty->print("# in(%2d): ", j); 918 tty->print("# in(%2d): ", j);
905 def->dump(); 919 def->dump();
906 } 920 }
907 #endif 921 #endif
908 922
909 // If the defining block is not known, assume it is ok 923 // If the defining block is not known, assume it is ok
910 Block *def_block = _bbs[def->_idx]; 924 Block *def_block = get_block_for_node(def);
911 uint def_pre_order = def_block ? def_block->_pre_order : 0; 925 uint def_pre_order = def_block ? def_block->_pre_order : 0;
912 926
913 if ( (use_pre_order < def_pre_order) || 927 if ((use_pre_order < def_pre_order) || (use_pre_order == def_pre_order && n->is_Phi())) {
914 (use_pre_order == def_pre_order && n->is_Phi()) )
915 continue; 928 continue;
929 }
916 930
917 uint delta_latency = n->latency(j); 931 uint delta_latency = n->latency(j);
918 uint current_latency = delta_latency + use_latency; 932 uint current_latency = delta_latency + use_latency;
919 933
920 if (_node_latency->at_grow(def->_idx) < current_latency) { 934 if (get_latency_for_node(def) < current_latency) {
921 _node_latency->at_put_grow(def->_idx, current_latency); 935 set_latency_for_node(def, current_latency);
922 } 936 }
923 937
924 #ifndef PRODUCT 938 #ifndef PRODUCT
925 if (trace_opto_pipelining()) { 939 if (trace_opto_pipelining()) {
926 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d", 940 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d", use_latency, j, delta_latency, current_latency, def->_idx, get_latency_for_node(def));
927 use_latency, j, delta_latency, current_latency, def->_idx,
928 _node_latency->at_grow(def->_idx));
929 } 941 }
930 #endif 942 #endif
931 } 943 }
932 } 944 }
933 945
934 //------------------------------latency_from_use------------------------------- 946 //------------------------------latency_from_use-------------------------------
935 // Compute the latency of a specific use 947 // Compute the latency of a specific use
936 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) { 948 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
937 // If self-reference, return no latency 949 // If self-reference, return no latency
938 if (use == n || use->is_Root()) 950 if (use == n || use->is_Root()) {
939 return 0; 951 return 0;
940 952 }
941 uint def_pre_order = _bbs[def->_idx]->_pre_order; 953
954 uint def_pre_order = get_block_for_node(def)->_pre_order;
942 uint latency = 0; 955 uint latency = 0;
943 956
944 // If the use is not a projection, then it is simple... 957 // If the use is not a projection, then it is simple...
945 if (!use->is_Proj()) { 958 if (!use->is_Proj()) {
946 #ifndef PRODUCT 959 #ifndef PRODUCT
948 tty->print("# out(): "); 961 tty->print("# out(): ");
949 use->dump(); 962 use->dump();
950 } 963 }
951 #endif 964 #endif
952 965
953 uint use_pre_order = _bbs[use->_idx]->_pre_order; 966 uint use_pre_order = get_block_for_node(use)->_pre_order;
954 967
955 if (use_pre_order < def_pre_order) 968 if (use_pre_order < def_pre_order)
956 return 0; 969 return 0;
957 970
958 if (use_pre_order == def_pre_order && use->is_Phi()) 971 if (use_pre_order == def_pre_order && use->is_Phi())
959 return 0; 972 return 0;
960 973
961 uint nlen = use->len(); 974 uint nlen = use->len();
962 uint nl = _node_latency->at_grow(use->_idx); 975 uint nl = get_latency_for_node(use);
963 976
964 for ( uint j=0; j<nlen; j++ ) { 977 for ( uint j=0; j<nlen; j++ ) {
965 if (use->in(j) == n) { 978 if (use->in(j) == n) {
966 // Change this if we want local latencies 979 // Change this if we want local latencies
967 uint ul = use->latency(j); 980 uint ul = use->latency(j);
992 // routine. 1005 // routine.
993 void PhaseCFG::latency_from_uses(Node *n) { 1006 void PhaseCFG::latency_from_uses(Node *n) {
994 // Set the latency for this instruction 1007 // Set the latency for this instruction
995 #ifndef PRODUCT 1008 #ifndef PRODUCT
996 if (trace_opto_pipelining()) { 1009 if (trace_opto_pipelining()) {
997 tty->print("# latency_from_outputs: node_latency[%d] = %d for node", 1010 tty->print("# latency_from_outputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
998 n->_idx, _node_latency->at_grow(n->_idx));
999 dump(); 1011 dump();
1000 } 1012 }
1001 #endif 1013 #endif
1002 uint latency=0; 1014 uint latency=0;
1003 const Node *def = n->is_Proj() ? n->in(0): n; 1015 const Node *def = n->is_Proj() ? n->in(0): n;
1006 uint l = latency_from_use(n, def, n->fast_out(i)); 1018 uint l = latency_from_use(n, def, n->fast_out(i));
1007 1019
1008 if (latency < l) latency = l; 1020 if (latency < l) latency = l;
1009 } 1021 }
1010 1022
1011 _node_latency->at_put_grow(n->_idx, latency); 1023 set_latency_for_node(n, latency);
1012 } 1024 }
1013 1025
1014 //------------------------------hoist_to_cheaper_block------------------------- 1026 //------------------------------hoist_to_cheaper_block-------------------------
1015 // Pick a block for node self, between early and LCA, that is a cheaper 1027 // Pick a block for node self, between early and LCA, that is a cheaper
1016 // alternative to LCA. 1028 // alternative to LCA.
1017 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) { 1029 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
1018 const double delta = 1+PROB_UNLIKELY_MAG(4); 1030 const double delta = 1+PROB_UNLIKELY_MAG(4);
1019 Block* least = LCA; 1031 Block* least = LCA;
1020 double least_freq = least->_freq; 1032 double least_freq = least->_freq;
1021 uint target = _node_latency->at_grow(self->_idx); 1033 uint target = get_latency_for_node(self);
1022 uint start_latency = _node_latency->at_grow(LCA->_nodes[0]->_idx); 1034 uint start_latency = get_latency_for_node(LCA->_nodes[0]);
1023 uint end_latency = _node_latency->at_grow(LCA->_nodes[LCA->end_idx()]->_idx); 1035 uint end_latency = get_latency_for_node(LCA->_nodes[LCA->end_idx()]);
1024 bool in_latency = (target <= start_latency); 1036 bool in_latency = (target <= start_latency);
1025 const Block* root_block = _bbs[_root->_idx]; 1037 const Block* root_block = get_block_for_node(_root);
1026 1038
1027 // Turn off latency scheduling if scheduling is just plain off 1039 // Turn off latency scheduling if scheduling is just plain off
1028 if (!C->do_scheduling()) 1040 if (!C->do_scheduling())
1029 in_latency = true; 1041 in_latency = true;
1030 1042
1035 if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty()) 1047 if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
1036 in_latency = true; 1048 in_latency = true;
1037 1049
1038 #ifndef PRODUCT 1050 #ifndef PRODUCT
1039 if (trace_opto_pipelining()) { 1051 if (trace_opto_pipelining()) {
1040 tty->print("# Find cheaper block for latency %d: ", 1052 tty->print("# Find cheaper block for latency %d: ", get_latency_for_node(self));
1041 _node_latency->at_grow(self->_idx));
1042 self->dump(); 1053 self->dump();
1043 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g", 1054 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1044 LCA->_pre_order, 1055 LCA->_pre_order,
1045 LCA->_nodes[0]->_idx, 1056 LCA->_nodes[0]->_idx,
1046 start_latency, 1057 start_latency,
1065 1076
1066 // Don't hoist machine instructions to the root basic block 1077 // Don't hoist machine instructions to the root basic block
1067 if (mach && LCA == root_block) 1078 if (mach && LCA == root_block)
1068 break; 1079 break;
1069 1080
1070 uint start_lat = _node_latency->at_grow(LCA->_nodes[0]->_idx); 1081 uint start_lat = get_latency_for_node(LCA->_nodes[0]);
1071 uint end_idx = LCA->end_idx(); 1082 uint end_idx = LCA->end_idx();
1072 uint end_lat = _node_latency->at_grow(LCA->_nodes[end_idx]->_idx); 1083 uint end_lat = get_latency_for_node(LCA->_nodes[end_idx]);
1073 double LCA_freq = LCA->_freq; 1084 double LCA_freq = LCA->_freq;
1074 #ifndef PRODUCT 1085 #ifndef PRODUCT
1075 if (trace_opto_pipelining()) { 1086 if (trace_opto_pipelining()) {
1076 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g", 1087 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1077 LCA->_pre_order, LCA->_nodes[0]->_idx, start_lat, end_idx, end_lat, LCA_freq); 1088 LCA->_pre_order, LCA->_nodes[0]->_idx, start_lat, end_idx, end_lat, LCA_freq);
1109 #ifndef PRODUCT 1120 #ifndef PRODUCT
1110 if (trace_opto_pipelining()) { 1121 if (trace_opto_pipelining()) {
1111 tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency); 1122 tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
1112 } 1123 }
1113 #endif 1124 #endif
1114 _node_latency->at_put_grow(self->_idx, end_latency); 1125 set_latency_for_node(self, end_latency);
1115 partial_latency_of_defs(self); 1126 partial_latency_of_defs(self);
1116 } 1127 }
1117 1128
1118 return least; 1129 return least;
1119 } 1130 }
1128 #ifndef PRODUCT 1139 #ifndef PRODUCT
1129 if (trace_opto_pipelining()) 1140 if (trace_opto_pipelining())
1130 tty->print("\n#---- schedule_late ----\n"); 1141 tty->print("\n#---- schedule_late ----\n");
1131 #endif 1142 #endif
1132 1143
1133 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs); 1144 Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
1134 Node *self; 1145 Node *self;
1135 1146
1136 // Walk over all the nodes from last to first 1147 // Walk over all the nodes from last to first
1137 while (self = iter.next()) { 1148 while (self = iter.next()) {
1138 Block* early = _bbs[self->_idx]; // Earliest legal placement 1149 Block* early = get_block_for_node(self); // Earliest legal placement
1139 1150
1140 if (self->is_top()) { 1151 if (self->is_top()) {
1141 // Top node goes in bb #2 with other constants. 1152 // Top node goes in bb #2 with other constants.
1142 // It must be special-cased, because it has no out edges. 1153 // It must be special-cased, because it has no out edges.
1143 early->add_inst(self); 1154 early->add_inst(self);
1181 Block *LCA = NULL; 1192 Block *LCA = NULL;
1182 { 1193 {
1183 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) { 1194 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1184 // For all uses, find LCA 1195 // For all uses, find LCA
1185 Node* use = self->fast_out(i); 1196 Node* use = self->fast_out(i);
1186 LCA = raise_LCA_above_use(LCA, use, self, _bbs); 1197 LCA = raise_LCA_above_use(LCA, use, self, this);
1187 } 1198 }
1188 } // (Hide defs of imax, i from rest of block.) 1199 } // (Hide defs of imax, i from rest of block.)
1189 1200
1190 // Place temps in the block of their use. This isn't a 1201 // Place temps in the block of their use. This isn't a
1191 // requirement for correctness but it reduces useless 1202 // requirement for correctness but it reduces useless
1192 // interference between temps and other nodes. 1203 // interference between temps and other nodes.
1193 if (mach != NULL && mach->is_MachTemp()) { 1204 if (mach != NULL && mach->is_MachTemp()) {
1194 _bbs.map(self->_idx, LCA); 1205 map_node_to_block(self, LCA);
1195 LCA->add_inst(self); 1206 LCA->add_inst(self);
1196 continue; 1207 continue;
1197 } 1208 }
1198 1209
1199 // Check if 'self' could be anti-dependent on memory 1210 // Check if 'self' could be anti-dependent on memory
1255 } // Loop until all nodes have been visited 1266 } // Loop until all nodes have been visited
1256 1267
1257 } // end ScheduleLate 1268 } // end ScheduleLate
1258 1269
1259 //------------------------------GlobalCodeMotion------------------------------- 1270 //------------------------------GlobalCodeMotion-------------------------------
1260 void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) { 1271 void PhaseCFG::global_code_motion() {
1261 ResourceMark rm; 1272 ResourceMark rm;
1262 1273
1263 #ifndef PRODUCT 1274 #ifndef PRODUCT
1264 if (trace_opto_pipelining()) { 1275 if (trace_opto_pipelining()) {
1265 tty->print("\n---- Start GlobalCodeMotion ----\n"); 1276 tty->print("\n---- Start GlobalCodeMotion ----\n");
1266 } 1277 }
1267 #endif 1278 #endif
1268 1279
1269 // Initialize the bbs.map for things on the proj_list 1280 // Initialize the node to block mapping for things on the proj_list
1270 uint i; 1281 for (uint i = 0; i < _matcher.number_of_projections(); i++) {
1271 for( i=0; i < proj_list.size(); i++ ) 1282 unmap_node_from_block(_matcher.get_projection(i));
1272 _bbs.map(proj_list[i]->_idx, NULL); 1283 }
1273 1284
1274 // Set the basic block for Nodes pinned into blocks 1285 // Set the basic block for Nodes pinned into blocks
1275 Arena *a = Thread::current()->resource_area(); 1286 Arena* arena = Thread::current()->resource_area();
1276 VectorSet visited(a); 1287 VectorSet visited(arena);
1277 schedule_pinned_nodes( visited ); 1288 schedule_pinned_nodes(visited);
1278 1289
1279 // Find the earliest Block any instruction can be placed in. Some 1290 // Find the earliest Block any instruction can be placed in. Some
1280 // instructions are pinned into Blocks. Unpinned instructions can 1291 // instructions are pinned into Blocks. Unpinned instructions can
1281 // appear in last block in which all their inputs occur. 1292 // appear in last block in which all their inputs occur.
1282 visited.Clear(); 1293 visited.Clear();
1283 Node_List stack(a); 1294 Node_List stack(arena);
1284 stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list 1295 // Pre-grow the list
1296 stack.map((C->unique() >> 1) + 16, NULL);
1285 if (!schedule_early(visited, stack)) { 1297 if (!schedule_early(visited, stack)) {
1286 // Bailout without retry 1298 // Bailout without retry
1287 C->record_method_not_compilable("early schedule failed"); 1299 C->record_method_not_compilable("early schedule failed");
1288 return; 1300 return;
1289 } 1301 }
1290 1302
1291 // Build Def-Use edges. 1303 // Build Def-Use edges.
1292 proj_list.push(_root); // Add real root as another root
1293 proj_list.pop();
1294
1295 // Compute the latency information (via backwards walk) for all the 1304 // Compute the latency information (via backwards walk) for all the
1296 // instructions in the graph 1305 // instructions in the graph
1297 _node_latency = new GrowableArray<uint>(); // resource_area allocation 1306 _node_latency = new GrowableArray<uint>(); // resource_area allocation
1298 1307
1299 if( C->do_scheduling() ) 1308 if (C->do_scheduling()) {
1300 ComputeLatenciesBackwards(visited, stack); 1309 compute_latencies_backwards(visited, stack);
1310 }
1301 1311
1302 // Now schedule all codes as LATE as possible. This is the LCA in the 1312 // Now schedule all codes as LATE as possible. This is the LCA in the
1303 // dominator tree of all USES of a value. Pick the block with the least 1313 // dominator tree of all USES of a value. Pick the block with the least
1304 // loop nesting depth that is lowest in the dominator tree. 1314 // loop nesting depth that is lowest in the dominator tree.
1305 // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() ) 1315 // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
1306 schedule_late(visited, stack); 1316 schedule_late(visited, stack);
1307 if( C->failing() ) { 1317 if (C->failing()) {
1308 // schedule_late fails only when graph is incorrect. 1318 // schedule_late fails only when graph is incorrect.
1309 assert(!VerifyGraphEdges, "verification should have failed"); 1319 assert(!VerifyGraphEdges, "verification should have failed");
1310 return; 1320 return;
1311 } 1321 }
1312
1313 unique = C->unique();
1314 1322
1315 #ifndef PRODUCT 1323 #ifndef PRODUCT
1316 if (trace_opto_pipelining()) { 1324 if (trace_opto_pipelining()) {
1317 tty->print("\n---- Detect implicit null checks ----\n"); 1325 tty->print("\n---- Detect implicit null checks ----\n");
1318 } 1326 }
1332 allowed_reasons |= nth_bit(reason); 1340 allowed_reasons |= nth_bit(reason);
1333 } 1341 }
1334 // By reversing the loop direction we get a very minor gain on mpegaudio. 1342 // By reversing the loop direction we get a very minor gain on mpegaudio.
1335 // Feel free to revert to a forward loop for clarity. 1343 // Feel free to revert to a forward loop for clarity.
1336 // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) { 1344 // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1337 for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) { 1345 for (int i = _matcher._null_check_tests.size() - 2; i >= 0; i -= 2) {
1338 Node *proj = matcher._null_check_tests[i ]; 1346 Node* proj = _matcher._null_check_tests[i];
1339 Node *val = matcher._null_check_tests[i+1]; 1347 Node* val = _matcher._null_check_tests[i + 1];
1340 _bbs[proj->_idx]->implicit_null_check(this, proj, val, allowed_reasons); 1348 Block* block = get_block_for_node(proj);
1349 block->implicit_null_check(this, proj, val, allowed_reasons);
1341 // The implicit_null_check will only perform the transformation 1350 // The implicit_null_check will only perform the transformation
1342 // if the null branch is truly uncommon, *and* it leads to an 1351 // if the null branch is truly uncommon, *and* it leads to an
1343 // uncommon trap. Combined with the too_many_traps guards 1352 // uncommon trap. Combined with the too_many_traps guards
1344 // above, this prevents SEGV storms reported in 6366351, 1353 // above, this prevents SEGV storms reported in 6366351,
1345 // by recompiling offending methods without this optimization. 1354 // by recompiling offending methods without this optimization.
1352 } 1361 }
1353 #endif 1362 #endif
1354 1363
1355 // Schedule locally. Right now a simple topological sort. 1364 // Schedule locally. Right now a simple topological sort.
1356 // Later, do a real latency aware scheduler. 1365 // Later, do a real latency aware scheduler.
1357 uint max_idx = C->unique(); 1366 GrowableArray<int> ready_cnt(C->unique(), C->unique(), -1);
1358 GrowableArray<int> ready_cnt(max_idx, max_idx, -1);
1359 visited.Clear(); 1367 visited.Clear();
1360 for (i = 0; i < _num_blocks; i++) { 1368 for (uint i = 0; i < number_of_blocks(); i++) {
1361 if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) { 1369 Block* block = get_block(i);
1370 if (!block->schedule_local(this, _matcher, ready_cnt, visited)) {
1362 if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) { 1371 if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1363 C->record_method_not_compilable("local schedule failed"); 1372 C->record_method_not_compilable("local schedule failed");
1364 } 1373 }
1365 return; 1374 return;
1366 } 1375 }
1367 } 1376 }
1368 1377
1369 // If we inserted any instructions between a Call and his CatchNode, 1378 // If we inserted any instructions between a Call and his CatchNode,
1370 // clone the instructions on all paths below the Catch. 1379 // clone the instructions on all paths below the Catch.
1371 for( i=0; i < _num_blocks; i++ ) 1380 for (uint i = 0; i < number_of_blocks(); i++) {
1372 _blocks[i]->call_catch_cleanup(_bbs, C); 1381 Block* block = get_block(i);
1382 block->call_catch_cleanup(this, C);
1383 }
1373 1384
1374 #ifndef PRODUCT 1385 #ifndef PRODUCT
1375 if (trace_opto_pipelining()) { 1386 if (trace_opto_pipelining()) {
1376 tty->print("\n---- After GlobalCodeMotion ----\n"); 1387 tty->print("\n---- After GlobalCodeMotion ----\n");
1377 for (uint i = 0; i < _num_blocks; i++) { 1388 for (uint i = 0; i < number_of_blocks(); i++) {
1378 _blocks[i]->dump(); 1389 Block* block = get_block(i);
1390 block->dump();
1379 } 1391 }
1380 } 1392 }
1381 #endif 1393 #endif
1382 // Dead. 1394 // Dead.
1383 _node_latency = (GrowableArray<uint> *)0xdeadbeef; 1395 _node_latency = (GrowableArray<uint> *)0xdeadbeef;
1384 } 1396 }
1385 1397
1398 bool PhaseCFG::do_global_code_motion() {
1399
1400 build_dominator_tree();
1401 if (C->failing()) {
1402 return false;
1403 }
1404
1405 NOT_PRODUCT( C->verify_graph_edges(); )
1406
1407 estimate_block_frequency();
1408
1409 global_code_motion();
1410
1411 if (C->failing()) {
1412 return false;
1413 }
1414
1415 return true;
1416 }
1386 1417
1387 //------------------------------Estimate_Block_Frequency----------------------- 1418 //------------------------------Estimate_Block_Frequency-----------------------
1388 // Estimate block frequencies based on IfNode probabilities. 1419 // Estimate block frequencies based on IfNode probabilities.
1389 void PhaseCFG::Estimate_Block_Frequency() { 1420 void PhaseCFG::estimate_block_frequency() {
1390 1421
1391 // Force conditional branches leading to uncommon traps to be unlikely, 1422 // Force conditional branches leading to uncommon traps to be unlikely,
1392 // not because we get to the uncommon_trap with less relative frequency, 1423 // not because we get to the uncommon_trap with less relative frequency,
1393 // but because an uncommon_trap typically causes a deopt, so we only get 1424 // but because an uncommon_trap typically causes a deopt, so we only get
1394 // there once. 1425 // there once.
1395 if (C->do_freq_based_layout()) { 1426 if (C->do_freq_based_layout()) {
1396 Block_List worklist; 1427 Block_List worklist;
1397 Block* root_blk = _blocks[0]; 1428 Block* root_blk = get_block(0);
1398 for (uint i = 1; i < root_blk->num_preds(); i++) { 1429 for (uint i = 1; i < root_blk->num_preds(); i++) {
1399 Block *pb = _bbs[root_blk->pred(i)->_idx]; 1430 Block *pb = get_block_for_node(root_blk->pred(i));
1400 if (pb->has_uncommon_code()) { 1431 if (pb->has_uncommon_code()) {
1401 worklist.push(pb); 1432 worklist.push(pb);
1402 } 1433 }
1403 } 1434 }
1404 while (worklist.size() > 0) { 1435 while (worklist.size() > 0) {
1405 Block* uct = worklist.pop(); 1436 Block* uct = worklist.pop();
1406 if (uct == _broot) continue; 1437 if (uct == get_root_block()) {
1438 continue;
1439 }
1407 for (uint i = 1; i < uct->num_preds(); i++) { 1440 for (uint i = 1; i < uct->num_preds(); i++) {
1408 Block *pb = _bbs[uct->pred(i)->_idx]; 1441 Block *pb = get_block_for_node(uct->pred(i));
1409 if (pb->_num_succs == 1) { 1442 if (pb->_num_succs == 1) {
1410 worklist.push(pb); 1443 worklist.push(pb);
1411 } else if (pb->num_fall_throughs() == 2) { 1444 } else if (pb->num_fall_throughs() == 2) {
1412 pb->update_uncommon_branch(uct); 1445 pb->update_uncommon_branch(uct);
1413 } 1446 }
1425 // Adjust all frequencies to be relative to a single method entry 1458 // Adjust all frequencies to be relative to a single method entry
1426 _root_loop->_freq = 1.0; 1459 _root_loop->_freq = 1.0;
1427 _root_loop->scale_freq(); 1460 _root_loop->scale_freq();
1428 1461
1429 // Save outermost loop frequency for LRG frequency threshold 1462 // Save outermost loop frequency for LRG frequency threshold
1430 _outer_loop_freq = _root_loop->outer_loop_freq(); 1463 _outer_loop_frequency = _root_loop->outer_loop_freq();
1431 1464
1432 // force paths ending at uncommon traps to be infrequent 1465 // force paths ending at uncommon traps to be infrequent
1433 if (!C->do_freq_based_layout()) { 1466 if (!C->do_freq_based_layout()) {
1434 Block_List worklist; 1467 Block_List worklist;
1435 Block* root_blk = _blocks[0]; 1468 Block* root_blk = get_block(0);
1436 for (uint i = 1; i < root_blk->num_preds(); i++) { 1469 for (uint i = 1; i < root_blk->num_preds(); i++) {
1437 Block *pb = _bbs[root_blk->pred(i)->_idx]; 1470 Block *pb = get_block_for_node(root_blk->pred(i));
1438 if (pb->has_uncommon_code()) { 1471 if (pb->has_uncommon_code()) {
1439 worklist.push(pb); 1472 worklist.push(pb);
1440 } 1473 }
1441 } 1474 }
1442 while (worklist.size() > 0) { 1475 while (worklist.size() > 0) {
1443 Block* uct = worklist.pop(); 1476 Block* uct = worklist.pop();
1444 uct->_freq = PROB_MIN; 1477 uct->_freq = PROB_MIN;
1445 for (uint i = 1; i < uct->num_preds(); i++) { 1478 for (uint i = 1; i < uct->num_preds(); i++) {
1446 Block *pb = _bbs[uct->pred(i)->_idx]; 1479 Block *pb = get_block_for_node(uct->pred(i));
1447 if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) { 1480 if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1448 worklist.push(pb); 1481 worklist.push(pb);
1449 } 1482 }
1450 } 1483 }
1451 } 1484 }
1452 } 1485 }
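The !C->do_freq_based_layout() branch above walks backwards from blocks that contain uncommon code and clamps every block reaching them through a chain of single-successor predecessors down to PROB_MIN. The following standalone sketch shows the same worklist walk with simplified stand-in types and an assumed placeholder value for PROB_MIN; it illustrates the idea and is not the HotSpot code.

#include <cstdio>
#include <vector>

static const double PROB_MIN = 1e-6;   // placeholder value; stands in for the real constant

struct Block {
  double freq = 1.0;                    // estimated execution frequency
  int num_succs = 1;                    // number of control-flow successors
  bool has_uncommon_code = false;       // block leads only to an uncommon trap
  std::vector<Block*> preds;            // control-flow predecessors
};

// Clamp the frequency of every block that falls through unconditionally into an
// uncommon trap: pop a block, force it to PROB_MIN, and queue any single-successor
// predecessor that has not been clamped yet.
static void force_uncommon_paths_infrequent(std::vector<Block*> worklist) {
  while (!worklist.empty()) {
    Block* uct = worklist.back();
    worklist.pop_back();
    uct->freq = PROB_MIN;
    for (Block* pb : uct->preds) {
      if (pb->num_succs == 1 && pb->freq > PROB_MIN) {
        worklist.push_back(pb);
      }
    }
  }
}

int main() {
  // entry (two successors) -> a -> trap
  Block entry, a, trap;
  entry.num_succs = 2;
  a.preds.push_back(&entry);
  trap.preds.push_back(&a);
  trap.has_uncommon_code = true;

  std::vector<Block*> seeds;
  if (trap.has_uncommon_code) {
    seeds.push_back(&trap);
  }
  force_uncommon_paths_infrequent(seeds);

  // 'a' is clamped along with the trap block; 'entry' keeps its frequency because
  // the uncommon path is only one of its two successors.
  std::printf("trap=%g a=%g entry=%g\n", trap.freq, a.freq, entry.freq);
  return 0;
}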
1453 1486
1454 #ifdef ASSERT 1487 #ifdef ASSERT
1455 for (uint i = 0; i < _num_blocks; i++ ) { 1488 for (uint i = 0; i < number_of_blocks(); i++) {
1456 Block *b = _blocks[i]; 1489 Block* b = get_block(i);
1457 assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency"); 1490 assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1458 } 1491 }
1459 #endif 1492 #endif
1460 1493
1461 #ifndef PRODUCT 1494 #ifndef PRODUCT
1475 //----------------------------create_loop_tree-------------------------------- 1508 //----------------------------create_loop_tree--------------------------------
1476 // Create a loop tree from the CFG 1509 // Create a loop tree from the CFG
1477 CFGLoop* PhaseCFG::create_loop_tree() { 1510 CFGLoop* PhaseCFG::create_loop_tree() {
1478 1511
1479 #ifdef ASSERT 1512 #ifdef ASSERT
1480 assert( _blocks[0] == _broot, "" ); 1513 assert(get_block(0) == get_root_block(), "first block should be root block");
1481 for (uint i = 0; i < _num_blocks; i++ ) { 1514 for (uint i = 0; i < number_of_blocks(); i++) {
1482 Block *b = _blocks[i]; 1515 Block* block = get_block(i);
1483 // Check that _loop fields are clear...we could clear them if not. 1516 // Check that _loop fields are clear...we could clear them if not.
1484 assert(b->_loop == NULL, "clear _loop expected"); 1517 assert(block->_loop == NULL, "clear _loop expected");
1485 // Sanity check that the RPO numbering is reflected in the _blocks array. 1518 // Sanity check that the RPO numbering is reflected in the _blocks array.
1486 // It doesn't have to be for the loop tree to be built, but if it is not, 1519 // It doesn't have to be for the loop tree to be built, but if it is not,
1487 // then the blocks have been reordered since dom graph building...which 1520 // then the blocks have been reordered since dom graph building...which
1488 // may question the RPO numbering 1521 // may question the RPO numbering
1489 assert(b->_rpo == i, "unexpected reverse post order number"); 1522 assert(block->_rpo == i, "unexpected reverse post order number");
1490 } 1523 }
1491 #endif 1524 #endif
1492 1525
1493 int idct = 0; 1526 int idct = 0;
1494 CFGLoop* root_loop = new CFGLoop(idct++); 1527 CFGLoop* root_loop = new CFGLoop(idct++);
1495 1528
1496 Block_List worklist; 1529 Block_List worklist;
1497 1530
1498 // Assign blocks to loops 1531 // Assign blocks to loops
1499 for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block 1532 for (uint i = number_of_blocks() - 1; i > 0; i--) { // skip Root block
1500 Block *b = _blocks[i]; 1533 Block* block = get_block(i);
1501 1534
1502 if (b->head()->is_Loop()) { 1535 if (block->head()->is_Loop()) {
1503 Block* loop_head = b; 1536 Block* loop_head = block;
1504 assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors"); 1537 assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1505 Node* tail_n = loop_head->pred(LoopNode::LoopBackControl); 1538 Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1506 Block* tail = _bbs[tail_n->_idx]; 1539 Block* tail = get_block_for_node(tail_n);
1507 1540
1508 // Defensively filter out Loop nodes for non-single-entry loops. 1541 // Defensively filter out Loop nodes for non-single-entry loops.
1509 // For all reasonable loops, the head occurs before the tail in RPO. 1542 // For all reasonable loops, the head occurs before the tail in RPO.
1510 if (i <= tail->_rpo) { 1543 if (i <= tail->_rpo) {
1511 1544
1516 CFGLoop* nloop = new CFGLoop(idct++); 1549 CFGLoop* nloop = new CFGLoop(idct++);
1517 assert(loop_head->_loop == NULL, "just checking"); 1550 assert(loop_head->_loop == NULL, "just checking");
1518 loop_head->_loop = nloop; 1551 loop_head->_loop = nloop;
1519 // Add to nloop so push_pred() will skip over inner loops 1552 // Add to nloop so push_pred() will skip over inner loops
1520 nloop->add_member(loop_head); 1553 nloop->add_member(loop_head);
1521 nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, _bbs); 1554 nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this);
1522 1555
1523 while (worklist.size() > 0) { 1556 while (worklist.size() > 0) {
1524 Block* member = worklist.pop(); 1557 Block* member = worklist.pop();
1525 if (member != loop_head) { 1558 if (member != loop_head) {
1526 for (uint j = 1; j < member->num_preds(); j++) { 1559 for (uint j = 1; j < member->num_preds(); j++) {
1527 nloop->push_pred(member, j, worklist, _bbs); 1560 nloop->push_pred(member, j, worklist, this);
1528 } 1561 }
1529 } 1562 }
1530 } 1563 }
1531 } 1564 }
1532 } 1565 }
1533 } 1566 }
1534 1567
1535 // Create a member list for each loop consisting 1568 // Create a member list for each loop consisting
1536 // of both blocks and (immediate child) loops. 1569 // of both blocks and (immediate child) loops.
1537 for (uint i = 0; i < _num_blocks; i++) { 1570 for (uint i = 0; i < number_of_blocks(); i++) {
1538 Block *b = _blocks[i]; 1571 Block* block = get_block(i);
1539 CFGLoop* lp = b->_loop; 1572 CFGLoop* lp = block->_loop;
1540 if (lp == NULL) { 1573 if (lp == NULL) {
1541 // Not assigned to a loop. Add it to the method's pseudo loop. 1574 // Not assigned to a loop. Add it to the method's pseudo loop.
1542 b->_loop = root_loop; 1575 block->_loop = root_loop;
1543 lp = root_loop; 1576 lp = root_loop;
1544 } 1577 }
1545 if (lp == root_loop || b != lp->head()) { // loop heads are already members 1578 if (lp == root_loop || block != lp->head()) { // loop heads are already members
1546 lp->add_member(b); 1579 lp->add_member(block);
1547 } 1580 }
1548 if (lp != root_loop) { 1581 if (lp != root_loop) {
1549 if (lp->parent() == NULL) { 1582 if (lp->parent() == NULL) {
1550 // Not a nested loop. Make it a child of the method's pseudo loop. 1583 // Not a nested loop. Make it a child of the method's pseudo loop.
1551 root_loop->add_nested_loop(lp); 1584 root_loop->add_nested_loop(lp);
1552 } 1585 }
1553 if (b == lp->head()) { 1586 if (block == lp->head()) {
1554 // Add nested loop to member list of parent loop. 1587 // Add nested loop to member list of parent loop.
1555 lp->parent()->add_member(lp); 1588 lp->parent()->add_member(lp);
1556 } 1589 }
1557 } 1590 }
1558 } 1591 }
1559 1592
1560 return root_loop; 1593 return root_loop;
1561 } 1594 }
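create_loop_tree visits blocks from the last one back toward the root, so inner loop heads are handled before the outer heads that enclose them, and each loop's body is gathered by walking predecessors from the back-edge tail until the head is reached again; push_pred (shown just below) either pulls an unassigned block into the current loop or, when it hits a block already owned by an inner loop, nests that loop and resumes from its entry predecessor. A simplified standalone sketch of that walk follows; the Block/Loop structs and the preds[0]-is-entry convention are assumptions made for illustration, and the final pass that attaches loose blocks to the method's pseudo loop is omitted.

#include <cstdio>
#include <vector>

struct Loop;

struct Block {
  int rpo = 0;                 // reverse post order number
  std::vector<Block*> preds;   // for a loop head: preds[0] = entry, preds[1] = back edge
  Loop* loop = nullptr;        // innermost loop this block belongs to, if any
};

struct Loop {
  Block* head = nullptr;
  Loop* parent = nullptr;
  std::vector<Block*> members;
  std::vector<Loop*> children;
};

static void add_nested_loop(Loop* outer, Loop* inner) {
  inner->parent = outer;
  outer->children.push_back(inner);
}

// Simplified push_pred: pull an unassigned predecessor into 'lp' if it lies after the
// loop head in RPO (the non-single-entry filter); if the predecessor already belongs
// to another, not yet parented loop, nest that loop under 'lp' and continue the walk
// from the inner loop's entry predecessor instead of re-visiting its body.
static void push_pred(Loop* lp, Block* pred, std::vector<Block*>& worklist) {
  Loop* pred_loop = pred->loop;
  if (pred_loop == nullptr) {
    if (pred->rpo > lp->head->rpo) {
      pred->loop = lp;
      lp->members.push_back(pred);
      worklist.push_back(pred);
    }
  } else if (pred_loop != lp && pred_loop->parent == nullptr) {
    add_nested_loop(lp, pred_loop);
    push_pred(lp, pred_loop->head->preds[0], worklist);
  }
}

// Collect the body of the loop headed by lp->head, starting from its back-edge tail,
// mirroring the per-loop worklist in create_loop_tree.
static void collect_loop_members(Loop* lp, Block* tail) {
  lp->head->loop = lp;
  lp->members.push_back(lp->head);
  std::vector<Block*> worklist;
  push_pred(lp, tail, worklist);
  while (!worklist.empty()) {
    Block* member = worklist.back();
    worklist.pop_back();
    for (Block* p : member->preds) {
      push_pred(lp, p, worklist);   // stops at the head, which is already assigned
    }
  }
}

int main() {
  // b0 -> b1 (outer head) -> b2 (inner head) <-> b3 (inner body), b2 -> b4 -> back to b1
  Block b0, b1, b2, b3, b4;
  b0.rpo = 0; b1.rpo = 1; b2.rpo = 2; b3.rpo = 3; b4.rpo = 4;
  b1.preds = { &b0, &b4 };   // outer loop: entry b0, back edge from b4
  b2.preds = { &b1, &b3 };   // inner loop: entry b1, back edge from b3
  b3.preds = { &b2 };
  b4.preds = { &b2 };

  // Blocks are visited in reverse RPO, so the inner loop is built before the outer one.
  Loop inner;  inner.head = &b2;
  collect_loop_members(&inner, &b3);
  Loop outer;  outer.head = &b1;
  collect_loop_members(&outer, &b4);

  std::printf("outer: %zu members, %zu nested loop(s); inner: %zu members\n",
              outer.members.size(), outer.children.size(), inner.members.size());
  // prints: outer: 2 members, 1 nested loop(s); inner: 2 members
  return 0;
}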
1562 1595
1563 //------------------------------push_pred-------------------------------------- 1596 //------------------------------push_pred--------------------------------------
1564 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, Block_Array& node_to_blk) { 1597 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) {
1565 Node* pred_n = blk->pred(i); 1598 Node* pred_n = blk->pred(i);
1566 Block* pred = node_to_blk[pred_n->_idx]; 1599 Block* pred = cfg->get_block_for_node(pred_n);
1567 CFGLoop *pred_loop = pred->_loop; 1600 CFGLoop *pred_loop = pred->_loop;
1568 if (pred_loop == NULL) { 1601 if (pred_loop == NULL) {
1569 // Filter out blocks for non-single-entry loops. 1602 // Filter out blocks for non-single-entry loops.
1570 // For all reasonable loops, the head occurs before the tail in RPO. 1603 // For all reasonable loops, the head occurs before the tail in RPO.
1571 if (pred->_rpo > head()->_rpo) { 1604 if (pred->_rpo > head()->_rpo) {
1582 add_nested_loop(pred_loop); 1615 add_nested_loop(pred_loop);
1583 // Continue with loop entry predecessor. 1616 // Continue with loop entry predecessor.
1584 Block* pred_head = pred_loop->head(); 1617 Block* pred_head = pred_loop->head();
1585 assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors"); 1618 assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1586 assert(pred_head != head(), "loop head in only one loop"); 1619 assert(pred_head != head(), "loop head in only one loop");
1587 push_pred(pred_head, LoopNode::EntryControl, worklist, node_to_blk); 1620 push_pred(pred_head, LoopNode::EntryControl, worklist, cfg);
1588 } else { 1621 } else {
1589 assert(pred_loop->_parent == this && _parent == NULL, "just checking"); 1622 assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1590 } 1623 }
1591 } 1624 }
1592 } 1625 }
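Resuming from the inner loop's entry predecessor, as in the sketch above, keeps every block owned by exactly one loop, its innermost one: the outer walk records the inner loop as a nested child and then only collects the blocks feeding into it, never re-visiting the inner body. The two-predecessor asserts encode the same single-entry assumption as the RPO filter; blocks that fail the filter are not pulled into any loop here and end up in the method's pseudo loop instead.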