comparison src/share/vm/opto/superword.cpp @ 23015:95dbbc0431d9

8078497: C2's superword optimization causes unaligned memory accesses
Summary: Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed.
Reviewed-by: kvn
author thartmann
date Fri, 08 May 2015 12:19:17 +0200
parents 84d55f179e24
children c1c199dde5c9
comparison of 23014:84d55f179e24 and 23015:95dbbc0431d9
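Editorial note on the summary above, with a standalone C++ illustration (not part of the changeset; vw, scale, inv1 and inv2 are made-up values, and the unknown invariant difference is modeled as a fixed 8 bytes purely for illustration): if two memory references in the same loop have invariant offsets that differ by something that is not a multiple of the vector width, no single pre-loop trip count can align both of them, so one reference would be accessed misaligned.

#include <cstdio>

int main() {
  const int vw    = 16;          // assumed vector width in bytes
  const int scale = 4;           // 4-byte elements: each pre-loop iteration adds 4 bytes
  const int inv1  = 0, inv2 = 8; // two invariant byte offsets, differing by 8 (not a multiple of 16)
  for (int i = 0; i <= vw / scale; i++) {
    // Offsets of the two references after i pre-loop iterations, modulo vw:
    std::printf("i=%d: ref1 %% vw = %2d, ref2 %% vw = %2d\n",
                i, (inv1 + scale * i) % vw, (inv2 + scale * i) % vw);
    // The two remainders are never both 0, so one vector access stays unaligned.
  }
  return 0;
}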
@@ -230,10 +230,17 @@
         if (vw > vw_best) {
           // Do not vectorize a memory access with more elements per vector
           // if unaligned memory access is not allowed because number of
           // iterations in pre-loop will be not enough to align it.
           create_pack = false;
+        } else {
+          SWPointer p2(best_align_to_mem_ref, this);
+          if (align_to_ref_p.invar() != p2.invar()) {
+            // Do not vectorize memory accesses with different invariants
+            // if unaligned memory accesses are not allowed.
+            create_pack = false;
+          }
         }
       }
     } else {
       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
         // Can't allow vectorization of unaligned memory accesses with the
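The hunk above adds the guard that rejects pack creation when the two candidate references carry different invariants. A minimal mock of that decision (NodeMock, SWPointerMock and may_create_pack are hypothetical stand-ins, not HotSpot's SWPointer API):

// Hypothetical stand-ins, for illustration only.
struct NodeMock {};

struct SWPointerMock {
  NodeMock* invar;   // loop-invariant part of the address, or NULL
  int       offset;  // constant byte offset
};

// Whether a pack may be created for alignment purposes on a target that
// does not tolerate misaligned vector accesses.
static bool may_create_pack(const SWPointerMock& align_to_ref,
                            const SWPointerMock& best_align_to_ref,
                            bool misaligned_vectors_ok) {
  if (misaligned_vectors_ok) {
    return true;  // hardware tolerates misalignment
  }
  // The pre-loop can compensate for only one invariant, so both references
  // must share the same invariant node.
  return align_to_ref.invar == best_align_to_ref.invar;
}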
@@ -450,28 +457,54 @@
   // Stride one accesses are alignable if offset is aligned to memory operation size.
   // Offset can be unaligned when UseUnalignedAccesses is used.
   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
     return true;
   }
-  // If initial offset from start of object is computable,
-  // compute alignment within the vector.
+  // If the initial offset from start of the object is computable,
+  // check if the pre-loop can align the final offset accordingly.
+  //
+  // In other words: Can we find an i such that the offset
+  // after i pre-loop iterations is aligned to vw?
+  //   (init_offset + pre_loop) % vw == 0              (1)
+  // where
+  //   pre_loop = i * span
+  // is the number of bytes added to the offset by i pre-loop iterations.
+  //
+  // For this to hold we need pre_loop to increase init_offset by
+  //   pre_loop = vw - (init_offset % vw)
+  //
+  // This is only possible if pre_loop is divisible by span because each
+  // pre-loop iteration increases the initial offset by 'span' bytes:
+  //   (vw - (init_offset % vw)) % span == 0
+  //
   int vw = vector_width_in_bytes(p.mem());
   assert(vw > 1, "sanity");
-  if (vw % span == 0) {
-    Node* init_nd = pre_end->init_trip();
-    if (init_nd->is_Con() && p.invar() == NULL) {
-      int init = init_nd->bottom_type()->is_int()->get_con();
-
-      int init_offset = init * p.scale_in_bytes() + offset;
-      assert(init_offset >= 0, "positive offset from object start");
-
+  Node* init_nd = pre_end->init_trip();
+  if (init_nd->is_Con() && p.invar() == NULL) {
+    int init = init_nd->bottom_type()->is_int()->get_con();
+    int init_offset = init * p.scale_in_bytes() + offset;
+    assert(init_offset >= 0, "positive offset from object start");
+    if (vw % span == 0) {
+      // If vw is a multiple of span, we use formula (1).
       if (span > 0) {
         return (vw - (init_offset % vw)) % span == 0;
       } else {
         assert(span < 0, "nonzero stride * scale");
         return (init_offset % vw) % -span == 0;
       }
+    } else if (span % vw == 0) {
+      // If span is a multiple of vw, we can simplify formula (1) to:
+      //   (init_offset + i * span) % vw == 0
+      //     =>
+      //   (init_offset % vw) + ((i * span) % vw) == 0
+      //     =>
+      //   init_offset % vw == 0
+      //
+      // Because we add a multiple of vw to the initial offset, the final
+      // offset is a multiple of vw if and only if init_offset is a multiple.
+      //
+      return (init_offset % vw) == 0;
     }
   }
   return false;
 }
 
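The comment block added above derives two closed-form alignability checks from formula (1). As a sanity check, here is a standalone C++ program (editorial, not part of the changeset; vw and the span values are made up) that compares both closed forms against a brute-force search for an aligning pre-loop iteration count i:

#include <cassert>

// Brute force: is there an i >= 0 with (init_offset + i * span) % vw == 0?
static bool alignable_brute_force(int init_offset, int span, int vw) {
  for (int i = 0; i <= 4 * vw; i++) {
    if ((init_offset + i * span) % vw == 0) return true;
  }
  return false;
}

// The two closed forms used in ref_is_alignable (span != 0 assumed).
static bool alignable_formula(int init_offset, int span, int vw) {
  if (vw % span == 0) {
    // vw is a multiple of span: solve formula (1) for i directly.
    if (span > 0) return (vw - (init_offset % vw)) % span == 0;
    return (init_offset % vw) % -span == 0;
  } else if (span % vw == 0) {
    // span is a multiple of vw: the alignment never changes across
    // iterations, so the initial offset must already be aligned.
    return (init_offset % vw) == 0;
  }
  return false;
}

int main() {
  const int vw = 16;                                  // e.g. 16-byte vectors
  const int spans[] = { 4, 8, 16, 32, -4, -8, -32 };  // span = scale * pre-loop stride
  for (int span : spans) {
    for (int init_offset = 0; init_offset < 64; init_offset += 2) {
      assert(alignable_formula(init_offset, span, vw) ==
             alignable_brute_force(init_offset, span, vw));
    }
  }
  return 0;
}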
@@ -479,21 +512,27 @@
 // Calculate loop's iv adjustment for this memory op.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
   SWPointer align_to_ref_p(mem_ref, this);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale = align_to_ref_p.scale_in_bytes();
+  int elt_size = align_to_ref_p.memory_size();
   int vw = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
-  int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
-  // At least one iteration is executed in pre-loop by default. As a result
-  // several iterations are needed to align memory operations in main-loop even
-  // if offset is 0.
-  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
-  int elt_size = align_to_ref_p.memory_size();
-  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
-         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
-  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  int iv_adjustment;
+  if (scale != 0) {
+    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+    // At least one iteration is executed in pre-loop by default. As a result
+    // several iterations are needed to align memory operations in main-loop even
+    // if offset is 0.
+    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+    assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+           err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+    iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  } else {
+    // This memory op is not dependent on iv (scale == 0)
+    iv_adjustment = 0;
+  }
 
 #ifndef PRODUCT
   if (TraceSuperWord)
     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
                   offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
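The restructuring above makes get_iv_adjustment() compute a byte adjustment only when the address depends on the induction variable, and return 0 for iv-independent (scale == 0) references. A standalone sketch of the same arithmetic with made-up inputs (not HotSpot code):

#include <cstdio>

// Sketch of the arithmetic above: iv adjustment in elements,
// 0 when the address is iv-independent.
static int iv_adjustment(int offset, int scale, int elt_size, int iv_stride, int vw) {
  if (scale == 0) {
    return 0;  // new path: the address does not depend on the iv
  }
  int stride_sign = (scale * iv_stride) > 0 ? 1 : -1;
  int iv_adjustment_in_bytes = stride_sign * vw - (offset % vw);
  return iv_adjustment_in_bytes / elt_size;
}

int main() {
  // 16-byte vectors, 4-byte elements, constant offset 8, positive stride:
  // (16 - 8) / 4 = 2.
  std::printf("%d\n", iv_adjustment(8, 4, 4, 1, 16));  // prints 2
  std::printf("%d\n", iv_adjustment(8, 0, 4, 1, 16));  // prints 0 (scale == 0)
  return 0;
}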