comparison src/share/vm/opto/superword.cpp @ 6179:8c92982cbbc4

7119644: Increase superword's vector size up to 256 bits Summary: Increase vector size up to 256-bits for YMM AVX registers on x86. Reviewed-by: never, twisti, roland
author kvn
date Fri, 15 Jun 2012 01:25:19 -0700
parents 5e990493719e
children 6f8f439e247d
comparison
equal deleted inserted replaced
6146:eba1d5bce9e8 6179:8c92982cbbc4
1 /* 1 /*
2 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 2 * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 * 4 *
5 * This code is free software; you can redistribute it and/or modify it 5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as 6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
65 _iv(NULL) // induction var 65 _iv(NULL) // induction var
66 {} 66 {}
67 67
68 //------------------------------transform_loop--------------------------- 68 //------------------------------transform_loop---------------------------
69 void SuperWord::transform_loop(IdealLoopTree* lpt) { 69 void SuperWord::transform_loop(IdealLoopTree* lpt) {
70 assert(UseSuperWord, "should be");
71 // Do vectors exist on this architecture?
72 if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
73
70 assert(lpt->_head->is_CountedLoop(), "must be"); 74 assert(lpt->_head->is_CountedLoop(), "must be");
71 CountedLoopNode *cl = lpt->_head->as_CountedLoop(); 75 CountedLoopNode *cl = lpt->_head->as_CountedLoop();
72 76
73 if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop 77 if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
74 78
87 CountedLoopEndNode* pre_end = get_pre_loop_end(cl); 91 CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
88 if (pre_end == NULL) return; 92 if (pre_end == NULL) return;
89 Node *pre_opaq1 = pre_end->limit(); 93 Node *pre_opaq1 = pre_end->limit();
90 if (pre_opaq1->Opcode() != Op_Opaque1) return; 94 if (pre_opaq1->Opcode() != Op_Opaque1) return;
91 95
92 // Do vectors exist on this architecture?
93 if (vector_width_in_bytes() == 0) return;
94
95 init(); // initialize data structures 96 init(); // initialize data structures
96 97
97 set_lpt(lpt); 98 set_lpt(lpt);
98 set_lp(cl); 99 set_lp(cl);
99 100
100 // For now, define one block which is the entire loop body 101 // For now, define one block which is the entire loop body
101 set_bb(cl); 102 set_bb(cl);
102 103
103 assert(_packset.length() == 0, "packset must be empty"); 104 assert(_packset.length() == 0, "packset must be empty");
104 SLP_extract(); 105 SLP_extract();
105 } 106 }
175 void SuperWord::find_adjacent_refs() { 176 void SuperWord::find_adjacent_refs() {
176 // Get list of memory operations 177 // Get list of memory operations
177 Node_List memops; 178 Node_List memops;
178 for (int i = 0; i < _block.length(); i++) { 179 for (int i = 0; i < _block.length(); i++) {
179 Node* n = _block.at(i); 180 Node* n = _block.at(i);
180 if (n->is_Mem() && in_bb(n) && 181 if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
181 is_java_primitive(n->as_Mem()->memory_type())) { 182 is_java_primitive(n->as_Mem()->memory_type())) {
182 int align = memory_alignment(n->as_Mem(), 0); 183 int align = memory_alignment(n->as_Mem(), 0);
183 if (align != bottom_align) { 184 if (align != bottom_align) {
184 memops.push(n); 185 memops.push(n);
185 } 186 }
186 } 187 }
187 } 188 }
188 if (memops.size() == 0) return; 189
189 190 Node_List align_to_refs;
190 // Find a memory reference to align to. The pre-loop trip count 191 int best_iv_adjustment = 0;
191 // is modified to align this reference to a vector-aligned address 192 MemNode* best_align_to_mem_ref = NULL;
192 find_align_to_ref(memops); 193
193 if (align_to_ref() == NULL) return; 194 while (memops.size() != 0) {
194 195 // Find a memory reference to align to.
195 SWPointer align_to_ref_p(align_to_ref(), this); 196 MemNode* mem_ref = find_align_to_ref(memops);
196 int offset = align_to_ref_p.offset_in_bytes(); 197 if (mem_ref == NULL) break;
197 int scale = align_to_ref_p.scale_in_bytes(); 198 align_to_refs.push(mem_ref);
198 int vw = vector_width_in_bytes(); 199 int iv_adjustment = get_iv_adjustment(mem_ref);
199 int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 200
200 int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw; 201 if (best_align_to_mem_ref == NULL) {
201 202 // Set memory reference which is the best from all memory operations
202 #ifndef PRODUCT 203 // to be used for alignment. The pre-loop trip count is modified to align
203 if (TraceSuperWord) 204 // this reference to a vector-aligned address.
204 tty->print_cr("\noffset = %d iv_adjustment = %d elt_align = %d scale = %d iv_stride = %d", 205 best_align_to_mem_ref = mem_ref;
205 offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride()); 206 best_iv_adjustment = iv_adjustment;
206 #endif 207 }
207 208
208 // Set alignment relative to "align_to_ref" 209 SWPointer align_to_ref_p(mem_ref, this);
209 for (int i = memops.size() - 1; i >= 0; i--) { 210 // Set alignment relative to "align_to_ref" for all related memory operations.
210 MemNode* s = memops.at(i)->as_Mem(); 211 for (int i = memops.size() - 1; i >= 0; i--) {
211 SWPointer p2(s, this); 212 MemNode* s = memops.at(i)->as_Mem();
212 if (p2.comparable(align_to_ref_p)) { 213 if (isomorphic(s, mem_ref)) {
213 int align = memory_alignment(s, iv_adjustment); 214 SWPointer p2(s, this);
214 set_alignment(s, align); 215 if (p2.comparable(align_to_ref_p)) {
215 } else { 216 int align = memory_alignment(s, iv_adjustment);
216 memops.remove(i); 217 set_alignment(s, align);
217 } 218 }
218 } 219 }
219 220 }
220 // Create initial pack pairs of memory operations 221
221 for (uint i = 0; i < memops.size(); i++) { 222 // Create initial pack pairs of memory operations for which
222 Node* s1 = memops.at(i); 223 // alignment is set and vectors will be aligned.
223 for (uint j = 0; j < memops.size(); j++) { 224 bool create_pack = true;
224 Node* s2 = memops.at(j); 225 if (memory_alignment(mem_ref, best_iv_adjustment) != 0) {
225 if (s1 != s2 && are_adjacent_refs(s1, s2)) { 226 if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
227 // Can't allow vectorization of unaligned memory accesses with the
228 // same type since it could be overlapped accesses to the same array.
229 create_pack = false;
230 } else {
231 // Allow independent (different type) unaligned memory operations
232 // if HW supports them.
233 if (!Matcher::misaligned_vectors_ok()) {
234 create_pack = false;
235 } else {
236 // Check if packs of the same memory type but
237 // with a different alignment were created before.
238 for (uint i = 0; i < align_to_refs.size(); i++) {
239 MemNode* mr = align_to_refs.at(i)->as_Mem();
240 if (same_velt_type(mr, mem_ref) &&
241 memory_alignment(mr, iv_adjustment) != 0)
242 create_pack = false;
243 }
244 }
245 }
246 }
247 if (create_pack) {
248 for (uint i = 0; i < memops.size(); i++) {
249 Node* s1 = memops.at(i);
226 int align = alignment(s1); 250 int align = alignment(s1);
227 if (stmts_can_pack(s1, s2, align)) { 251 if (align == top_align) continue;
228 Node_List* pair = new Node_List(); 252 for (uint j = 0; j < memops.size(); j++) {
229 pair->push(s1); 253 Node* s2 = memops.at(j);
230 pair->push(s2); 254 if (alignment(s2) == top_align) continue;
231 _packset.append(pair); 255 if (s1 != s2 && are_adjacent_refs(s1, s2)) {
232 } 256 if (stmts_can_pack(s1, s2, align)) {
233 } 257 Node_List* pair = new Node_List();
234 } 258 pair->push(s1);
235 } 259 pair->push(s2);
260 _packset.append(pair);
261 }
262 }
263 }
264 }
265 } else { // Don't create unaligned pack
266 // First, remove remaining memory ops of the same type from the list.
267 for (int i = memops.size() - 1; i >= 0; i--) {
268 MemNode* s = memops.at(i)->as_Mem();
269 if (same_velt_type(s, mem_ref)) {
270 memops.remove(i);
271 }
272 }
273
274 // Second, remove already constructed packs of the same type.
275 for (int i = _packset.length() - 1; i >= 0; i--) {
276 Node_List* p = _packset.at(i);
277 MemNode* s = p->at(0)->as_Mem();
278 if (same_velt_type(s, mem_ref)) {
279 remove_pack_at(i);
280 }
281 }
282
283 // If needed find the best memory reference for loop alignment again.
284 if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
285 // Put memory ops from remaining packs back on memops list for
286 // the best alignment search.
287 uint orig_msize = memops.size();
288 for (int i = 0; i < _packset.length(); i++) {
289 Node_List* p = _packset.at(i);
290 MemNode* s = p->at(0)->as_Mem();
291 assert(!same_velt_type(s, mem_ref), "sanity");
292 memops.push(s);
293 }
294 MemNode* best_align_to_mem_ref = find_align_to_ref(memops);
295 if (best_align_to_mem_ref == NULL) break;
296 best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
297 // Restore list.
298 while (memops.size() > orig_msize)
299 (void)memops.pop();
300 }
301 } // unaligned memory accesses
302
303 // Remove used mem nodes.
304 for (int i = memops.size() - 1; i >= 0; i--) {
305 MemNode* m = memops.at(i)->as_Mem();
306 if (alignment(m) != top_align) {
307 memops.remove(i);
308 }
309 }
310
311 } // while (memops.size() != 0
312 set_align_to_ref(best_align_to_mem_ref);
236 313
237 #ifndef PRODUCT 314 #ifndef PRODUCT
238 if (TraceSuperWord) { 315 if (TraceSuperWord) {
239 tty->print_cr("\nAfter find_adjacent_refs"); 316 tty->print_cr("\nAfter find_adjacent_refs");
240 print_packset(); 317 print_packset();
244 321
245 //------------------------------find_align_to_ref--------------------------- 322 //------------------------------find_align_to_ref---------------------------
246 // Find a memory reference to align the loop induction variable to. 323 // Find a memory reference to align the loop induction variable to.
247 // Looks first at stores then at loads, looking for a memory reference 324 // Looks first at stores then at loads, looking for a memory reference
248 // with the largest number of references similar to it. 325 // with the largest number of references similar to it.
249 void SuperWord::find_align_to_ref(Node_List &memops) { 326 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
250 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0); 327 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
251 328
252 // Count number of comparable memory ops 329 // Count number of comparable memory ops
253 for (uint i = 0; i < memops.size(); i++) { 330 for (uint i = 0; i < memops.size(); i++) {
254 MemNode* s1 = memops.at(i)->as_Mem(); 331 MemNode* s1 = memops.at(i)->as_Mem();
268 } 345 }
269 } 346 }
270 } 347 }
271 } 348 }
272 349
273 // Find Store (or Load) with the greatest number of "comparable" references 350 // Find Store (or Load) with the greatest number of "comparable" references,
351 // biggest vector size, smallest data size and smallest iv offset.
274 int max_ct = 0; 352 int max_ct = 0;
353 int max_vw = 0;
275 int max_idx = -1; 354 int max_idx = -1;
276 int min_size = max_jint; 355 int min_size = max_jint;
277 int min_iv_offset = max_jint; 356 int min_iv_offset = max_jint;
278 for (uint j = 0; j < memops.size(); j++) { 357 for (uint j = 0; j < memops.size(); j++) {
279 MemNode* s = memops.at(j)->as_Mem(); 358 MemNode* s = memops.at(j)->as_Mem();
280 if (s->is_Store()) { 359 if (s->is_Store()) {
360 int vw = vector_width_in_bytes(velt_basic_type(s));
361 assert(vw > 1, "sanity");
281 SWPointer p(s, this); 362 SWPointer p(s, this);
282 if (cmp_ct.at(j) > max_ct || 363 if (cmp_ct.at(j) > max_ct ||
283 cmp_ct.at(j) == max_ct && (data_size(s) < min_size || 364 cmp_ct.at(j) == max_ct &&
284 data_size(s) == min_size && 365 (vw > max_vw ||
285 p.offset_in_bytes() < min_iv_offset)) { 366 vw == max_vw &&
367 (data_size(s) < min_size ||
368 data_size(s) == min_size &&
369 (p.offset_in_bytes() < min_iv_offset)))) {
286 max_ct = cmp_ct.at(j); 370 max_ct = cmp_ct.at(j);
371 max_vw = vw;
287 max_idx = j; 372 max_idx = j;
288 min_size = data_size(s); 373 min_size = data_size(s);
289 min_iv_offset = p.offset_in_bytes(); 374 min_iv_offset = p.offset_in_bytes();
290 } 375 }
291 } 376 }
293 // If no stores, look at loads 378 // If no stores, look at loads
294 if (max_ct == 0) { 379 if (max_ct == 0) {
295 for (uint j = 0; j < memops.size(); j++) { 380 for (uint j = 0; j < memops.size(); j++) {
296 MemNode* s = memops.at(j)->as_Mem(); 381 MemNode* s = memops.at(j)->as_Mem();
297 if (s->is_Load()) { 382 if (s->is_Load()) {
383 int vw = vector_width_in_bytes(velt_basic_type(s));
384 assert(vw > 1, "sanity");
298 SWPointer p(s, this); 385 SWPointer p(s, this);
299 if (cmp_ct.at(j) > max_ct || 386 if (cmp_ct.at(j) > max_ct ||
300 cmp_ct.at(j) == max_ct && (data_size(s) < min_size || 387 cmp_ct.at(j) == max_ct &&
301 data_size(s) == min_size && 388 (vw > max_vw ||
302 p.offset_in_bytes() < min_iv_offset)) { 389 vw == max_vw &&
390 (data_size(s) < min_size ||
391 data_size(s) == min_size &&
392 (p.offset_in_bytes() < min_iv_offset)))) {
303 max_ct = cmp_ct.at(j); 393 max_ct = cmp_ct.at(j);
394 max_vw = vw;
304 max_idx = j; 395 max_idx = j;
305 min_size = data_size(s); 396 min_size = data_size(s);
306 min_iv_offset = p.offset_in_bytes(); 397 min_iv_offset = p.offset_in_bytes();
307 } 398 }
308 } 399 }
309 } 400 }
310 } 401 }
311 402
312 if (max_ct > 0) 403 #ifdef ASSERT
313 set_align_to_ref(memops.at(max_idx)->as_Mem());
314
315 #ifndef PRODUCT
316 if (TraceSuperWord && Verbose) { 404 if (TraceSuperWord && Verbose) {
317 tty->print_cr("\nVector memops after find_align_to_refs"); 405 tty->print_cr("\nVector memops after find_align_to_refs");
318 for (uint i = 0; i < memops.size(); i++) { 406 for (uint i = 0; i < memops.size(); i++) {
319 MemNode* s = memops.at(i)->as_Mem(); 407 MemNode* s = memops.at(i)->as_Mem();
320 s->dump(); 408 s->dump();
321 } 409 }
322 } 410 }
323 #endif 411 #endif
412
413 if (max_ct > 0) {
414 #ifdef ASSERT
415 if (TraceSuperWord) {
416 tty->print("\nVector align to node: ");
417 memops.at(max_idx)->as_Mem()->dump();
418 }
419 #endif
420 return memops.at(max_idx)->as_Mem();
421 }
422 return NULL;
324 } 423 }
325 424
326 //------------------------------ref_is_alignable--------------------------- 425 //------------------------------ref_is_alignable---------------------------
327 // Can the preloop align the reference to position zero in the vector? 426 // Can the preloop align the reference to position zero in the vector?
328 bool SuperWord::ref_is_alignable(SWPointer& p) { 427 bool SuperWord::ref_is_alignable(SWPointer& p) {
339 if (ABS(span) == p.memory_size()) 438 if (ABS(span) == p.memory_size())
340 return true; 439 return true;
341 440
342 // If initial offset from start of object is computable, 441 // If initial offset from start of object is computable,
343 // compute alignment within the vector. 442 // compute alignment within the vector.
344 int vw = vector_width_in_bytes(); 443 BasicType bt = velt_basic_type(p.mem());
444 int vw = vector_width_in_bytes(bt);
445 assert(vw > 1, "sanity");
345 if (vw % span == 0) { 446 if (vw % span == 0) {
346 Node* init_nd = pre_end->init_trip(); 447 Node* init_nd = pre_end->init_trip();
347 if (init_nd->is_Con() && p.invar() == NULL) { 448 if (init_nd->is_Con() && p.invar() == NULL) {
348 int init = init_nd->bottom_type()->is_int()->get_con(); 449 int init = init_nd->bottom_type()->is_int()->get_con();
349 450
357 return (init_offset % vw) % -span == 0; 458 return (init_offset % vw) % -span == 0;
358 } 459 }
359 } 460 }
360 } 461 }
361 return false; 462 return false;
463 }
464
465 //---------------------------get_iv_adjustment---------------------------
466 // Calculate loop's iv adjustment for this memory ops.
467 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
468 SWPointer align_to_ref_p(mem_ref, this);
469 int offset = align_to_ref_p.offset_in_bytes();
470 int scale = align_to_ref_p.scale_in_bytes();
471 BasicType bt = velt_basic_type(mem_ref);
472 int vw = vector_width_in_bytes(bt);
473 assert(vw > 1, "sanity");
474 int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
475 int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
476
477 #ifndef PRODUCT
478 if (TraceSuperWord)
479 tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
480 offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
481 #endif
482 return iv_adjustment;
362 } 483 }
363 484
364 //---------------------------dependence_graph--------------------------- 485 //---------------------------dependence_graph---------------------------
365 // Construct dependency graph. 486 // Construct dependency graph.
366 // Add dependence edges to load/store nodes for memory dependence 487 // Add dependence edges to load/store nodes for memory dependence
486 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and 607 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
487 // s1 aligned at "align" 608 // s1 aligned at "align"
488 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { 609 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
489 610
490 // Do not use superword for non-primitives 611 // Do not use superword for non-primitives
491 if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) || 612 BasicType bt1 = velt_basic_type(s1);
492 (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type()))) 613 BasicType bt2 = velt_basic_type(s2);
614 if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
493 return false; 615 return false;
616 if (Matcher::max_vector_size(bt1) < 2) {
617 return false; // No vectors for this type
618 }
494 619
495 if (isomorphic(s1, s2)) { 620 if (isomorphic(s1, s2)) {
496 if (independent(s1, s2)) { 621 if (independent(s1, s2)) {
497 if (!exists_at(s1, 0) && !exists_at(s2, 1)) { 622 if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
498 if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) { 623 if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
550 // Are s1 and s2 similar? 675 // Are s1 and s2 similar?
551 bool SuperWord::isomorphic(Node* s1, Node* s2) { 676 bool SuperWord::isomorphic(Node* s1, Node* s2) {
552 if (s1->Opcode() != s2->Opcode()) return false; 677 if (s1->Opcode() != s2->Opcode()) return false;
553 if (s1->req() != s2->req()) return false; 678 if (s1->req() != s2->req()) return false;
554 if (s1->in(0) != s2->in(0)) return false; 679 if (s1->in(0) != s2->in(0)) return false;
555 if (velt_type(s1) != velt_type(s2)) return false; 680 if (!same_velt_type(s1, s2)) return false;
556 return true; 681 return true;
557 } 682 }
558 683
559 //------------------------------independent--------------------------- 684 //------------------------------independent---------------------------
560 // Is there no data path from s1 to s2 or s2 to s1? 685 // Is there no data path from s1 to s2 or s2 to s1?
593 } 718 }
594 719
595 //------------------------------set_alignment--------------------------- 720 //------------------------------set_alignment---------------------------
596 void SuperWord::set_alignment(Node* s1, Node* s2, int align) { 721 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
597 set_alignment(s1, align); 722 set_alignment(s1, align);
598 set_alignment(s2, align + data_size(s1)); 723 if (align == top_align || align == bottom_align) {
724 set_alignment(s2, align);
725 } else {
726 set_alignment(s2, align + data_size(s1));
727 }
599 } 728 }
600 729
601 //------------------------------data_size--------------------------- 730 //------------------------------data_size---------------------------
602 int SuperWord::data_size(Node* s) { 731 int SuperWord::data_size(Node* s) {
603 const Type* t = velt_type(s); 732 int bsize = type2aelembytes(velt_basic_type(s));
604 BasicType bt = t->array_element_basic_type();
605 int bsize = type2aelembytes(bt);
606 assert(bsize != 0, "valid size"); 733 assert(bsize != 0, "valid size");
607 return bsize; 734 return bsize;
608 } 735 }
609 736
610 //------------------------------extend_packlist--------------------------- 737 //------------------------------extend_packlist---------------------------
629 } 756 }
630 757
631 //------------------------------follow_use_defs--------------------------- 758 //------------------------------follow_use_defs---------------------------
632 // Extend the packset by visiting operand definitions of nodes in pack p 759 // Extend the packset by visiting operand definitions of nodes in pack p
633 bool SuperWord::follow_use_defs(Node_List* p) { 760 bool SuperWord::follow_use_defs(Node_List* p) {
761 assert(p->size() == 2, "just checking");
634 Node* s1 = p->at(0); 762 Node* s1 = p->at(0);
635 Node* s2 = p->at(1); 763 Node* s2 = p->at(1);
636 assert(p->size() == 2, "just checking");
637 assert(s1->req() == s2->req(), "just checking"); 764 assert(s1->req() == s2->req(), "just checking");
638 assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking"); 765 assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
639 766
640 if (s1->is_Load()) return false; 767 if (s1->is_Load()) return false;
641 768
716 uint i2 = 0; 843 uint i2 = 0;
717 do { 844 do {
718 for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break; 845 for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
719 for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break; 846 for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
720 if (i1 != i2) { 847 if (i1 != i2) {
721 return false; 848 if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
849 // Further analysis relies on operands position matching.
850 u2->swap_edges(i1, i2);
851 } else {
852 return false;
853 }
722 } 854 }
723 } while (i1 < ct); 855 } while (i1 < ct);
724 return true; 856 return true;
725 } 857 }
726 858
727 //------------------------------est_savings--------------------------- 859 //------------------------------est_savings---------------------------
728 // Estimate the savings from executing s1 and s2 as a pack 860 // Estimate the savings from executing s1 and s2 as a pack
729 int SuperWord::est_savings(Node* s1, Node* s2) { 861 int SuperWord::est_savings(Node* s1, Node* s2) {
730 int save = 2 - 1; // 2 operations per instruction in packed form 862 int save_in = 2 - 1; // 2 operations per instruction in packed form
731 863
732 // inputs 864 // inputs
733 for (uint i = 1; i < s1->req(); i++) { 865 for (uint i = 1; i < s1->req(); i++) {
734 Node* x1 = s1->in(i); 866 Node* x1 = s1->in(i);
735 Node* x2 = s2->in(i); 867 Node* x2 = s2->in(i);
736 if (x1 != x2) { 868 if (x1 != x2) {
737 if (are_adjacent_refs(x1, x2)) { 869 if (are_adjacent_refs(x1, x2)) {
738 save += adjacent_profit(x1, x2); 870 save_in += adjacent_profit(x1, x2);
739 } else if (!in_packset(x1, x2)) { 871 } else if (!in_packset(x1, x2)) {
740 save -= pack_cost(2); 872 save_in -= pack_cost(2);
741 } else { 873 } else {
742 save += unpack_cost(2); 874 save_in += unpack_cost(2);
743 } 875 }
744 } 876 }
745 } 877 }
746 878
747 // uses of result 879 // uses of result
748 uint ct = 0; 880 uint ct = 0;
881 int save_use = 0;
749 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { 882 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
750 Node* s1_use = s1->fast_out(i); 883 Node* s1_use = s1->fast_out(i);
751 for (int j = 0; j < _packset.length(); j++) { 884 for (int j = 0; j < _packset.length(); j++) {
752 Node_List* p = _packset.at(j); 885 Node_List* p = _packset.at(j);
753 if (p->at(0) == s1_use) { 886 if (p->at(0) == s1_use) {
754 for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) { 887 for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
755 Node* s2_use = s2->fast_out(k); 888 Node* s2_use = s2->fast_out(k);
756 if (p->at(p->size()-1) == s2_use) { 889 if (p->at(p->size()-1) == s2_use) {
757 ct++; 890 ct++;
758 if (are_adjacent_refs(s1_use, s2_use)) { 891 if (are_adjacent_refs(s1_use, s2_use)) {
759 save += adjacent_profit(s1_use, s2_use); 892 save_use += adjacent_profit(s1_use, s2_use);
760 } 893 }
761 } 894 }
762 } 895 }
763 } 896 }
764 } 897 }
765 } 898 }
766 899
767 if (ct < s1->outcnt()) save += unpack_cost(1); 900 if (ct < s1->outcnt()) save_use += unpack_cost(1);
768 if (ct < s2->outcnt()) save += unpack_cost(1); 901 if (ct < s2->outcnt()) save_use += unpack_cost(1);
769 902
770 return save; 903 return MAX2(save_in, save_use);
771 } 904 }
772 905
773 //------------------------------costs--------------------------- 906 //------------------------------costs---------------------------
774 int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; } 907 int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
775 int SuperWord::pack_cost(int ct) { return ct; } 908 int SuperWord::pack_cost(int ct) { return ct; }
776 int SuperWord::unpack_cost(int ct) { return ct; } 909 int SuperWord::unpack_cost(int ct) { return ct; }
777 910
778 //------------------------------combine_packs--------------------------- 911 //------------------------------combine_packs---------------------------
779 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last 912 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
780 void SuperWord::combine_packs() { 913 void SuperWord::combine_packs() {
781 bool changed; 914 bool changed = true;
782 do { 915 // Combine packs regardless max vector size.
916 while (changed) {
783 changed = false; 917 changed = false;
784 for (int i = 0; i < _packset.length(); i++) { 918 for (int i = 0; i < _packset.length(); i++) {
785 Node_List* p1 = _packset.at(i); 919 Node_List* p1 = _packset.at(i);
786 if (p1 == NULL) continue; 920 if (p1 == NULL) continue;
787 for (int j = 0; j < _packset.length(); j++) { 921 for (int j = 0; j < _packset.length(); j++) {
788 Node_List* p2 = _packset.at(j); 922 Node_List* p2 = _packset.at(j);
789 if (p2 == NULL) continue; 923 if (p2 == NULL) continue;
924 if (i == j) continue;
790 if (p1->at(p1->size()-1) == p2->at(0)) { 925 if (p1->at(p1->size()-1) == p2->at(0)) {
791 for (uint k = 1; k < p2->size(); k++) { 926 for (uint k = 1; k < p2->size(); k++) {
792 p1->push(p2->at(k)); 927 p1->push(p2->at(k));
793 } 928 }
794 _packset.at_put(j, NULL); 929 _packset.at_put(j, NULL);
795 changed = true; 930 changed = true;
796 } 931 }
797 } 932 }
798 } 933 }
799 } while (changed); 934 }
800 935
936 // Split packs which have size greater then max vector size.
937 for (int i = 0; i < _packset.length(); i++) {
938 Node_List* p1 = _packset.at(i);
939 if (p1 != NULL) {
940 BasicType bt = velt_basic_type(p1->at(0));
941 uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
942 assert(is_power_of_2(max_vlen), "sanity");
943 uint psize = p1->size();
944 if (!is_power_of_2(psize)) {
945 // Skip pack which can't be vector.
946 // case1: for(...) { a[i] = i; } elements values are different (i+x)
947 // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store
948 _packset.at_put(i, NULL);
949 continue;
950 }
951 if (psize > max_vlen) {
952 Node_List* pack = new Node_List();
953 for (uint j = 0; j < psize; j++) {
954 pack->push(p1->at(j));
955 if (pack->size() >= max_vlen) {
956 assert(is_power_of_2(pack->size()), "sanity");
957 _packset.append(pack);
958 pack = new Node_List();
959 }
960 }
961 _packset.at_put(i, NULL);
962 }
963 }
964 }
965
966 // Compress list.
801 for (int i = _packset.length() - 1; i >= 0; i--) { 967 for (int i = _packset.length() - 1; i >= 0; i--) {
802 Node_List* p1 = _packset.at(i); 968 Node_List* p1 = _packset.at(i);
803 if (p1 == NULL) { 969 if (p1 == NULL) {
804 _packset.remove_at(i); 970 _packset.remove_at(i);
805 } 971 }
878 1044
879 //------------------------------implemented--------------------------- 1045 //------------------------------implemented---------------------------
880 // Can code be generated for pack p? 1046 // Can code be generated for pack p?
881 bool SuperWord::implemented(Node_List* p) { 1047 bool SuperWord::implemented(Node_List* p) {
882 Node* p0 = p->at(0); 1048 Node* p0 = p->at(0);
883 int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0)); 1049 return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
884 return vopc > 0 && Matcher::has_match_rule(vopc);
885 } 1050 }
886 1051
887 //------------------------------profitable--------------------------- 1052 //------------------------------profitable---------------------------
888 // For pack p, are all operands and all uses (with in the block) vector? 1053 // For pack p, are all operands and all uses (with in the block) vector?
889 bool SuperWord::profitable(Node_List* p) { 1054 bool SuperWord::profitable(Node_List* p) {
937 co_locate_pack(_packset.at(i)); 1102 co_locate_pack(_packset.at(i));
938 } 1103 }
939 } 1104 }
940 1105
941 //-------------------------------remove_and_insert------------------- 1106 //-------------------------------remove_and_insert-------------------
942 //remove "current" from its current position in the memory graph and insert 1107 // Remove "current" from its current position in the memory graph and insert
943 //it after the appropriate insertion point (lip or uip) 1108 // it after the appropriate insertion point (lip or uip).
944 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, 1109 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
945 Node *uip, Unique_Node_List &sched_before) { 1110 Node *uip, Unique_Node_List &sched_before) {
946 Node* my_mem = current->in(MemNode::Memory); 1111 Node* my_mem = current->in(MemNode::Memory);
947 _igvn.rehash_node_delayed(current); 1112 bool sched_up = sched_before.member(current);
948 _igvn.hash_delete(my_mem); 1113
949 1114 // remove current_store from its current position in the memmory graph
950 //remove current_store from its current position in the memmory graph
951 for (DUIterator i = current->outs(); current->has_out(i); i++) { 1115 for (DUIterator i = current->outs(); current->has_out(i); i++) {
952 Node* use = current->out(i); 1116 Node* use = current->out(i);
953 if (use->is_Mem()) { 1117 if (use->is_Mem()) {
954 assert(use->in(MemNode::Memory) == current, "must be"); 1118 assert(use->in(MemNode::Memory) == current, "must be");
955 _igvn.rehash_node_delayed(use);
956 if (use == prev) { // connect prev to my_mem 1119 if (use == prev) { // connect prev to my_mem
957 use->set_req(MemNode::Memory, my_mem); 1120 _igvn.replace_input_of(use, MemNode::Memory, my_mem);
1121 --i; //deleted this edge; rescan position
958 } else if (sched_before.member(use)) { 1122 } else if (sched_before.member(use)) {
959 _igvn.hash_delete(uip); 1123 if (!sched_up) { // Will be moved together with current
960 use->set_req(MemNode::Memory, uip); 1124 _igvn.replace_input_of(use, MemNode::Memory, uip);
1125 --i; //deleted this edge; rescan position
1126 }
961 } else { 1127 } else {
962 _igvn.hash_delete(lip); 1128 if (sched_up) { // Will be moved together with current
963 use->set_req(MemNode::Memory, lip); 1129 _igvn.replace_input_of(use, MemNode::Memory, lip);
964 } 1130 --i; //deleted this edge; rescan position
965 --i; //deleted this edge; rescan position 1131 }
966 } 1132 }
967 } 1133 }
968 1134 }
969 bool sched_up = sched_before.member(current); 1135
970 Node *insert_pt = sched_up ? uip : lip; 1136 Node *insert_pt = sched_up ? uip : lip;
971 _igvn.hash_delete(insert_pt);
972 1137
973 // all uses of insert_pt's memory state should use current's instead 1138 // all uses of insert_pt's memory state should use current's instead
974 for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) { 1139 for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
975 Node* use = insert_pt->out(i); 1140 Node* use = insert_pt->out(i);
976 if (use->is_Mem()) { 1141 if (use->is_Mem()) {
986 --i; 1151 --i;
987 } 1152 }
988 } 1153 }
989 1154
990 //connect current to insert_pt 1155 //connect current to insert_pt
991 current->set_req(MemNode::Memory, insert_pt); 1156 _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
992 } 1157 }
993 1158
994 //------------------------------co_locate_pack---------------------------------- 1159 //------------------------------co_locate_pack----------------------------------
995 // To schedule a store pack, we need to move any sandwiched memory ops either before 1160 // To schedule a store pack, we need to move any sandwiched memory ops either before
996 // or after the pack, based upon dependence information: 1161 // or after the pack, based upon dependence information:
1023 for (DUIterator i = current->outs(); current->has_out(i); i++) { 1188 for (DUIterator i = current->outs(); current->has_out(i); i++) {
1024 Node* use = current->out(i); 1189 Node* use = current->out(i);
1025 if (use->is_Mem() && use != previous) 1190 if (use->is_Mem() && use != previous)
1026 memops.push(use); 1191 memops.push(use);
1027 } 1192 }
1028 if(current == first) break; 1193 if (current == first) break;
1029 previous = current; 1194 previous = current;
1030 current = current->in(MemNode::Memory)->as_Mem(); 1195 current = current->in(MemNode::Memory)->as_Mem();
1031 } 1196 }
1032 1197
1033 // determine which memory operations should be scheduled before the pack 1198 // determine which memory operations should be scheduled before the pack
1036 if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) { 1201 if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
1037 for (uint j = 0; j< i; j++) { 1202 for (uint j = 0; j< i; j++) {
1038 Node *s2 = memops.at(j); 1203 Node *s2 = memops.at(j);
1039 if (!independent(s1, s2)) { 1204 if (!independent(s1, s2)) {
1040 if (in_pack(s2, pk) || schedule_before_pack.member(s2)) { 1205 if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
1041 schedule_before_pack.push(s1); //s1 must be scheduled before 1206 schedule_before_pack.push(s1); // s1 must be scheduled before
1042 Node_List* mem_pk = my_pack(s1); 1207 Node_List* mem_pk = my_pack(s1);
1043 if (mem_pk != NULL) { 1208 if (mem_pk != NULL) {
1044 for (uint ii = 0; ii < mem_pk->size(); ii++) { 1209 for (uint ii = 0; ii < mem_pk->size(); ii++) {
1045 Node* s = mem_pk->at(ii); // follow partner 1210 Node* s = mem_pk->at(ii); // follow partner
1046 if (memops.member(s) && !schedule_before_pack.member(s)) 1211 if (memops.member(s) && !schedule_before_pack.member(s))
1047 schedule_before_pack.push(s); 1212 schedule_before_pack.push(s);
1048 } 1213 }
1049 } 1214 }
1215 break;
1050 } 1216 }
1051 } 1217 }
1052 } 1218 }
1053 } 1219 }
1054 } 1220 }
1055 1221
1222 Node* upper_insert_pt = first->in(MemNode::Memory);
1223 // Following code moves loads connected to upper_insert_pt below aliased stores.
1224 // Collect such loads here and reconnect them back to upper_insert_pt later.
1225 memops.clear();
1226 for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
1227 Node* use = upper_insert_pt->out(i);
1228 if (!use->is_Store())
1229 memops.push(use);
1230 }
1231
1056 MemNode* lower_insert_pt = last; 1232 MemNode* lower_insert_pt = last;
1057 Node* upper_insert_pt = first->in(MemNode::Memory);
1058 previous = last; //previous store in pk 1233 previous = last; //previous store in pk
1059 current = last->in(MemNode::Memory)->as_Mem(); 1234 current = last->in(MemNode::Memory)->as_Mem();
1060 1235
1061 //start scheduling from "last" to "first" 1236 // start scheduling from "last" to "first"
1062 while (true) { 1237 while (true) {
1063 assert(in_bb(current), "stay in block"); 1238 assert(in_bb(current), "stay in block");
1064 assert(in_pack(previous, pk), "previous stays in pack"); 1239 assert(in_pack(previous, pk), "previous stays in pack");
1065 Node* my_mem = current->in(MemNode::Memory); 1240 Node* my_mem = current->in(MemNode::Memory);
1066 1241
1067 if (in_pack(current, pk)) { 1242 if (in_pack(current, pk)) {
1068 // Forward users of my memory state (except "previous) to my input memory state 1243 // Forward users of my memory state (except "previous) to my input memory state
1069 _igvn.hash_delete(current);
1070 for (DUIterator i = current->outs(); current->has_out(i); i++) { 1244 for (DUIterator i = current->outs(); current->has_out(i); i++) {
1071 Node* use = current->out(i); 1245 Node* use = current->out(i);
1072 if (use->is_Mem() && use != previous) { 1246 if (use->is_Mem() && use != previous) {
1073 assert(use->in(MemNode::Memory) == current, "must be"); 1247 assert(use->in(MemNode::Memory) == current, "must be");
1074 if (schedule_before_pack.member(use)) { 1248 if (schedule_before_pack.member(use)) {
1075 _igvn.hash_delete(upper_insert_pt);
1076 _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt); 1249 _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
1077 } else { 1250 } else {
1078 _igvn.hash_delete(lower_insert_pt);
1079 _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt); 1251 _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
1080 } 1252 }
1081 --i; // deleted this edge; rescan position 1253 --i; // deleted this edge; rescan position
1082 } 1254 }
1083 } 1255 }
1087 } 1259 }
1088 1260
1089 if (current == first) break; 1261 if (current == first) break;
1090 current = my_mem->as_Mem(); 1262 current = my_mem->as_Mem();
1091 } // end while 1263 } // end while
1264
1265 // Reconnect loads back to upper_insert_pt.
1266 for (uint i = 0; i < memops.size(); i++) {
1267 Node *ld = memops.at(i);
1268 if (ld->in(MemNode::Memory) != upper_insert_pt) {
1269 _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
1270 }
1271 }
1092 } else if (pk->at(0)->is_Load()) { //load 1272 } else if (pk->at(0)->is_Load()) { //load
1093 // all loads in the pack should have the same memory state. By default, 1273 // all loads in the pack should have the same memory state. By default,
1094 // we use the memory state of the last load. However, if any load could 1274 // we use the memory state of the last load. However, if any load could
1095 // not be moved down due to the dependence constraint, we use the memory 1275 // not be moved down due to the dependence constraint, we use the memory
1096 // state of the first load. 1276 // state of the first load.
1147 if (p && n == executed_last(p)) { 1327 if (p && n == executed_last(p)) {
1148 uint vlen = p->size(); 1328 uint vlen = p->size();
1149 Node* vn = NULL; 1329 Node* vn = NULL;
1150 Node* low_adr = p->at(0); 1330 Node* low_adr = p->at(0);
1151 Node* first = executed_first(p); 1331 Node* first = executed_first(p);
1332 int opc = n->Opcode();
1152 if (n->is_Load()) { 1333 if (n->is_Load()) {
1153 int opc = n->Opcode();
1154 Node* ctl = n->in(MemNode::Control); 1334 Node* ctl = n->in(MemNode::Control);
1155 Node* mem = first->in(MemNode::Memory); 1335 Node* mem = first->in(MemNode::Memory);
1156 Node* adr = low_adr->in(MemNode::Address); 1336 Node* adr = low_adr->in(MemNode::Address);
1157 const TypePtr* atyp = n->adr_type(); 1337 const TypePtr* atyp = n->adr_type();
1158 vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen); 1338 vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
1159
1160 } else if (n->is_Store()) { 1339 } else if (n->is_Store()) {
1161 // Promote value to be stored to vector 1340 // Promote value to be stored to vector
1162 Node* val = vector_opd(p, MemNode::ValueIn); 1341 Node* val = vector_opd(p, MemNode::ValueIn);
1163
1164 int opc = n->Opcode();
1165 Node* ctl = n->in(MemNode::Control); 1342 Node* ctl = n->in(MemNode::Control);
1166 Node* mem = first->in(MemNode::Memory); 1343 Node* mem = first->in(MemNode::Memory);
1167 Node* adr = low_adr->in(MemNode::Address); 1344 Node* adr = low_adr->in(MemNode::Address);
1168 const TypePtr* atyp = n->adr_type(); 1345 const TypePtr* atyp = n->adr_type();
1169 vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen); 1346 vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
1170
1171 } else if (n->req() == 3) { 1347 } else if (n->req() == 3) {
1172 // Promote operands to vector 1348 // Promote operands to vector
1173 Node* in1 = vector_opd(p, 1); 1349 Node* in1 = vector_opd(p, 1);
1174 Node* in2 = vector_opd(p, 2); 1350 Node* in2 = vector_opd(p, 2);
1175 vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n)); 1351 vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
1176
1177 } else { 1352 } else {
1178 ShouldNotReachHere(); 1353 ShouldNotReachHere();
1179 } 1354 }
1180 1355 assert(vn != NULL, "sanity");
1181 _phase->_igvn.register_new_node_with_optimizer(vn); 1356 _phase->_igvn.register_new_node_with_optimizer(vn);
1182 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0))); 1357 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
1183 for (uint j = 0; j < p->size(); j++) { 1358 for (uint j = 0; j < p->size(); j++) {
1184 Node* pm = p->at(j); 1359 Node* pm = p->at(j);
1185 _igvn.replace_node(pm, vn); 1360 _igvn.replace_node(pm, vn);
1186 } 1361 }
1187 _igvn._worklist.push(vn); 1362 _igvn._worklist.push(vn);
1363 #ifdef ASSERT
1364 if (TraceSuperWord) {
1365 tty->print("new Vector node: ");
1366 vn->dump();
1367 }
1368 #endif
1188 } 1369 }
1189 } 1370 }
1190 } 1371 }
1191 1372
1192 //------------------------------vector_opd--------------------------- 1373 //------------------------------vector_opd---------------------------
1205 break; 1386 break;
1206 } 1387 }
1207 } 1388 }
1208 1389
1209 if (same_opd) { 1390 if (same_opd) {
1210 if (opd->is_Vector() || opd->is_VectorLoad()) { 1391 if (opd->is_Vector() || opd->is_LoadVector()) {
1211 return opd; // input is matching vector 1392 return opd; // input is matching vector
1212 } 1393 }
1213 assert(!opd->is_VectorStore(), "such vector is not expected here"); 1394 assert(!opd->is_StoreVector(), "such vector is not expected here");
1214 // Convert scalar input to vector with the same number of elements as 1395 // Convert scalar input to vector with the same number of elements as
1215 // p0's vector. Use p0's type because size of operand's container in 1396 // p0's vector. Use p0's type because size of operand's container in
1216 // vector should match p0's size regardless operand's size. 1397 // vector should match p0's size regardless operand's size.
1217 const Type* p0_t = velt_type(p0); 1398 const Type* p0_t = velt_type(p0);
1218 VectorNode* vn = VectorNode::scalar2vector(_phase->C, opd, vlen, p0_t); 1399 VectorNode* vn = VectorNode::scalar2vector(_phase->C, opd, vlen, p0_t);
1219 1400
1220 _phase->_igvn.register_new_node_with_optimizer(vn); 1401 _phase->_igvn.register_new_node_with_optimizer(vn);
1221 _phase->set_ctrl(vn, _phase->get_ctrl(opd)); 1402 _phase->set_ctrl(vn, _phase->get_ctrl(opd));
1403 #ifdef ASSERT
1404 if (TraceSuperWord) {
1405 tty->print("new Vector node: ");
1406 vn->dump();
1407 }
1408 #endif
1222 return vn; 1409 return vn;
1223 } 1410 }
1224 1411
1225 // Insert pack operation 1412 // Insert pack operation
1226 const Type* p0_t = velt_type(p0); 1413 BasicType bt = velt_basic_type(p0);
1227 PackNode* pk = PackNode::make(_phase->C, opd, p0_t); 1414 PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt);
1228 DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); ) 1415 DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
1229 1416
1230 for (uint i = 1; i < vlen; i++) { 1417 for (uint i = 1; i < vlen; i++) {
1231 Node* pi = p->at(i); 1418 Node* pi = p->at(i);
1232 Node* in = pi->in(opd_idx); 1419 Node* in = pi->in(opd_idx);
1233 assert(my_pack(in) == NULL, "Should already have been unpacked"); 1420 assert(my_pack(in) == NULL, "Should already have been unpacked");
1234 assert(opd_bt == in->bottom_type()->basic_type(), "all same type"); 1421 assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
1235 pk->add_opd(in); 1422 pk->add_opd(i, in);
1236 } 1423 }
1237 _phase->_igvn.register_new_node_with_optimizer(pk); 1424 _phase->_igvn.register_new_node_with_optimizer(pk);
1238 _phase->set_ctrl(pk, _phase->get_ctrl(opd)); 1425 _phase->set_ctrl(pk, _phase->get_ctrl(opd));
1426 #ifdef ASSERT
1427 if (TraceSuperWord) {
1428 tty->print("new Pack node: ");
1429 pk->dump();
1430 }
1431 #endif
1239 return pk; 1432 return pk;
1240 } 1433 }
1241 1434
1242 //------------------------------insert_extracts--------------------------- 1435 //------------------------------insert_extracts---------------------------
1243 // If a use of pack p is not a vector use, then replace the 1436 // If a use of pack p is not a vector use, then replace the
1271 Node* def = use->in(idx); 1464 Node* def = use->in(idx);
1272 1465
1273 // Insert extract operation 1466 // Insert extract operation
1274 _igvn.hash_delete(def); 1467 _igvn.hash_delete(def);
1275 int def_pos = alignment(def) / data_size(def); 1468 int def_pos = alignment(def) / data_size(def);
1276 const Type* def_t = velt_type(def); 1469
1277 1470 Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def));
1278 Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t);
1279 _phase->_igvn.register_new_node_with_optimizer(ex); 1471 _phase->_igvn.register_new_node_with_optimizer(ex);
1280 _phase->set_ctrl(ex, _phase->get_ctrl(def)); 1472 _phase->set_ctrl(ex, _phase->get_ctrl(def));
1281 _igvn.replace_input_of(use, idx, ex); 1473 _igvn.replace_input_of(use, idx, ex);
1282 _igvn._worklist.push(def); 1474 _igvn._worklist.push(def);
1283 1475
1284 bb_insert_after(ex, bb_idx(def)); 1476 bb_insert_after(ex, bb_idx(def));
1285 set_velt_type(ex, def_t); 1477 set_velt_type(ex, velt_type(def));
1286 } 1478 }
1287 } 1479 }
1288 1480
1289 //------------------------------is_vector_use--------------------------- 1481 //------------------------------is_vector_use---------------------------
1290 // Is use->in(u_idx) a vector use? 1482 // Is use->in(u_idx) a vector use?
1507 #endif 1699 #endif
1508 1700
1509 // Initial type 1701 // Initial type
1510 for (int i = 0; i < _block.length(); i++) { 1702 for (int i = 0; i < _block.length(); i++) {
1511 Node* n = _block.at(i); 1703 Node* n = _block.at(i);
1512 const Type* t = n->is_Mem() ? Type::get_const_basic_type(n->as_Mem()->memory_type()) 1704 set_velt_type(n, container_type(n));
1513 : _igvn.type(n);
1514 const Type* vt = container_type(t);
1515 set_velt_type(n, vt);
1516 } 1705 }
1517 1706
1518 // Propagate narrowed type backwards through operations 1707 // Propagate narrowed type backwards through operations
1519 // that don't depend on higher order bits 1708 // that don't depend on higher order bits
1520 for (int i = _block.length() - 1; i >= 0; i--) { 1709 for (int i = _block.length() - 1; i >= 0; i--) {
1541 case Op_CMoveI: case Op_CMoveL: 1730 case Op_CMoveI: case Op_CMoveL:
1542 if (in_bb(in)) { 1731 if (in_bb(in)) {
1543 bool same_type = true; 1732 bool same_type = true;
1544 for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { 1733 for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
1545 Node *use = in->fast_out(k); 1734 Node *use = in->fast_out(k);
1546 if (!in_bb(use) || velt_type(use) != vt) { 1735 if (!in_bb(use) || !same_velt_type(use, n)) {
1547 same_type = false; 1736 same_type = false;
1548 break; 1737 break;
1549 } 1738 }
1550 } 1739 }
1551 if (same_type) { 1740 if (same_type) {
1573 int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) { 1762 int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
1574 SWPointer p(s, this); 1763 SWPointer p(s, this);
1575 if (!p.valid()) { 1764 if (!p.valid()) {
1576 return bottom_align; 1765 return bottom_align;
1577 } 1766 }
1767 int vw = vector_width_in_bytes(velt_basic_type(s));
1768 if (vw < 2) {
1769 return bottom_align; // No vectors for this type
1770 }
1578 int offset = p.offset_in_bytes(); 1771 int offset = p.offset_in_bytes();
1579 offset += iv_adjust_in_bytes; 1772 offset += iv_adjust_in_bytes;
1580 int off_rem = offset % vector_width_in_bytes(); 1773 int off_rem = offset % vw;
1581 int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes(); 1774 int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
1582 return off_mod; 1775 return off_mod;
1583 } 1776 }
1584 1777
1585 //---------------------------container_type--------------------------- 1778 //---------------------------container_type---------------------------
1586 // Smallest type containing range of values 1779 // Smallest type containing range of values
1587 const Type* SuperWord::container_type(const Type* t) { 1780 const Type* SuperWord::container_type(Node* n) {
1588 const Type* tp = t->make_ptr(); 1781 if (n->is_Mem()) {
1589 if (tp && tp->isa_aryptr()) { 1782 return Type::get_const_basic_type(n->as_Mem()->memory_type());
1590 t = tp->is_aryptr()->elem(); 1783 }
1591 } 1784 const Type* t = _igvn.type(n);
1592 if (t->basic_type() == T_INT) { 1785 if (t->basic_type() == T_INT) {
1593 if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL; 1786 if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL;
1594 if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE; 1787 if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE;
1595 if (t->higher_equal(TypeInt::CHAR)) return TypeInt::CHAR; 1788 if (t->higher_equal(TypeInt::CHAR)) return TypeInt::CHAR;
1596 if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT; 1789 if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT;
1597 return TypeInt::INT; 1790 return TypeInt::INT;
1598 } 1791 }
1599 return t; 1792 return t;
1600 } 1793 }
1601 1794
1795 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
1796 const Type* vt1 = velt_type(n1);
1797 const Type* vt2 = velt_type(n1);
1798 if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
1799 // Compare vectors element sizes for integer types.
1800 return data_size(n1) == data_size(n2);
1801 }
1802 return vt1 == vt2;
1803 }
1804
1602 //-------------------------vector_opd_range----------------------- 1805 //-------------------------vector_opd_range-----------------------
1603 // (Start, end] half-open range defining which operands are vector 1806 // (Start, end] half-open range defining which operands are vector
1604 void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) { 1807 void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
1605 switch (n->Opcode()) { 1808 switch (n->Opcode()) {
1606 case Op_LoadB: case Op_LoadUS: 1809 case Op_LoadB: case Op_LoadUB:
1810 case Op_LoadS: case Op_LoadUS:
1607 case Op_LoadI: case Op_LoadL: 1811 case Op_LoadI: case Op_LoadL:
1608 case Op_LoadF: case Op_LoadD: 1812 case Op_LoadF: case Op_LoadD:
1609 case Op_LoadP: 1813 case Op_LoadP:
1610 *start = 0; 1814 *start = 0;
1611 *end = 0; 1815 *end = 0;
1719 // pre-loop Opaque1 node. 1923 // pre-loop Opaque1 node.
1720 Node *orig_limit = pre_opaq->original_loop_limit(); 1924 Node *orig_limit = pre_opaq->original_loop_limit();
1721 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, ""); 1925 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
1722 1926
1723 SWPointer align_to_ref_p(align_to_ref, this); 1927 SWPointer align_to_ref_p(align_to_ref, this);
1928 assert(align_to_ref_p.valid(), "sanity");
1724 1929
1725 // Given: 1930 // Given:
1726 // lim0 == original pre loop limit 1931 // lim0 == original pre loop limit
1727 // V == v_align (power of 2) 1932 // V == v_align (power of 2)
1728 // invar == extra invariant piece of the address expression 1933 // invar == extra invariant piece of the address expression
1771 // Solving for lim: 1976 // Solving for lim:
1772 // (e - lim0 + N) % V == 0 1977 // (e - lim0 + N) % V == 0
1773 // N = (V - (e - lim0)) % V 1978 // N = (V - (e - lim0)) % V
1774 // lim = lim0 - (V - (e - lim0)) % V 1979 // lim = lim0 - (V - (e - lim0)) % V
1775 1980
1981 int vw = vector_width_in_bytes(velt_basic_type(align_to_ref));
1982 assert(vw > 1, "sanity");
1776 int stride = iv_stride(); 1983 int stride = iv_stride();
1777 int scale = align_to_ref_p.scale_in_bytes(); 1984 int scale = align_to_ref_p.scale_in_bytes();
1778 int elt_size = align_to_ref_p.memory_size(); 1985 int elt_size = align_to_ref_p.memory_size();
1779 int v_align = vector_width_in_bytes() / elt_size; 1986 int v_align = vw / elt_size;
1780 int k = align_to_ref_p.offset_in_bytes() / elt_size; 1987 int k = align_to_ref_p.offset_in_bytes() / elt_size;
1781 1988
1782 Node *kn = _igvn.intcon(k); 1989 Node *kn = _igvn.intcon(k);
1783 1990
1784 Node *e = kn; 1991 Node *e = kn;
1791 if (align_to_ref_p.negate_invar()) { 1998 if (align_to_ref_p.negate_invar()) {
1792 e = new (_phase->C, 3) SubINode(e, aref); 1999 e = new (_phase->C, 3) SubINode(e, aref);
1793 } else { 2000 } else {
1794 e = new (_phase->C, 3) AddINode(e, aref); 2001 e = new (_phase->C, 3) AddINode(e, aref);
1795 } 2002 }
2003 _phase->_igvn.register_new_node_with_optimizer(e);
2004 _phase->set_ctrl(e, pre_ctrl);
2005 }
2006 if (vw > ObjectAlignmentInBytes) {
2007 // incorporate base e +/- base && Mask >>> log2(elt)
2008 Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
2009 Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
2010 _phase->_igvn.register_new_node_with_optimizer(xbase);
2011 Node* masked_xbase = new (_phase->C, 3) AndXNode(xbase, mask);
2012 _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
2013 #ifdef _LP64
2014 masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase);
2015 _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
2016 #endif
2017 Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
2018 Node* bref = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
2019 _phase->_igvn.register_new_node_with_optimizer(bref);
2020 _phase->set_ctrl(bref, pre_ctrl);
2021 e = new (_phase->C, 3) AddINode(e, bref);
1796 _phase->_igvn.register_new_node_with_optimizer(e); 2022 _phase->_igvn.register_new_node_with_optimizer(e);
1797 _phase->set_ctrl(e, pre_ctrl); 2023 _phase->set_ctrl(e, pre_ctrl);
1798 } 2024 }
1799 2025
1800 // compute e +/- lim0 2026 // compute e +/- lim0