comparison src/share/vm/opto/superword.cpp @ 23015:95dbbc0431d9
8078497: C2's superword optimization causes unaligned memory accesses
Summary: Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed.
Reviewed-by: kvn
author    thartmann
date      Fri, 08 May 2015 12:19:17 +0200
parents   84d55f179e24
children  c1c199dde5c9
--- a/src/share/vm/opto/superword.cpp	(23014:84d55f179e24)
+++ b/src/share/vm/opto/superword.cpp	(23015:95dbbc0431d9)
@@ -230,10 +230,17 @@
         if (vw > vw_best) {
           // Do not vectorize a memory access with more elements per vector
           // if unaligned memory access is not allowed because number of
           // iterations in pre-loop will be not enough to align it.
           create_pack = false;
+        } else {
+          SWPointer p2(best_align_to_mem_ref, this);
+          if (align_to_ref_p.invar() != p2.invar()) {
+            // Do not vectorize memory accesses with different invariants
+            // if unaligned memory accesses are not allowed.
+            create_pack = false;
+          }
         }
       }
     } else {
       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
         // Can't allow vectorization of unaligned memory accesses with the
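
Editor's note: the new else-branch rejects a pack when the two memory references carry different invariant offsets. A minimal standalone sketch (not HotSpot code) of why: the pre-loop can peel iterations to align one reference, but a second reference whose invariant offset differs by a non-multiple of the vector width is then necessarily misaligned. vw, esize, inv1 and inv2 below are made-up sample values.

#include <cstdio>

int main() {
  const int vw    = 16; // assumed vector width in bytes
  const int esize = 4;  // assumed element size in bytes (e.g. jint)
  const int inv1  = 0;  // invariant offset of first reference, in elements
  const int inv2  = 1;  // invariant offset of second reference, in elements

  // Try every pre-loop trip count that could align the first reference.
  for (int i = 0; i < vw / esize; i++) {
    int off1 = (i + inv1) * esize % vw; // first reference's offset mod vw
    int off2 = (i + inv2) * esize % vw; // second reference's offset mod vw
    printf("i=%d: ref1 %% vw = %2d, ref2 %% vw = %2d\n", i, off1, off2);
    // For these values, whenever off1 == 0, off2 == 4: aligning the first
    // reference leaves the second misaligned, so on targets where unaligned
    // vector accesses are not allowed the pack must be rejected.
  }
  return 0;
}
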
@@ -450,28 +457,54 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
   // Stride one accesses are alignable if offset is aligned to memory operation size.
   // Offset can be unaligned when UseUnalignedAccesses is used.
   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
     return true;
   }
-  // If initial offset from start of object is computable,
-  // compute alignment within the vector.
+  // If the initial offset from start of the object is computable,
+  // check if the pre-loop can align the final offset accordingly.
+  //
+  // In other words: Can we find an i such that the offset
+  // after i pre-loop iterations is aligned to vw?
+  //   (init_offset + pre_loop) % vw == 0              (1)
+  // where
+  //   pre_loop = i * span
+  // is the number of bytes added to the offset by i pre-loop iterations.
+  //
+  // For this to hold we need pre_loop to increase init_offset by
+  //   pre_loop = vw - (init_offset % vw)
+  //
+  // This is only possible if pre_loop is divisible by span because each
+  // pre-loop iteration increases the initial offset by 'span' bytes:
+  //   (vw - (init_offset % vw)) % span == 0
+  //
   int vw = vector_width_in_bytes(p.mem());
   assert(vw > 1, "sanity");
-  if (vw % span == 0) {
-    Node* init_nd = pre_end->init_trip();
-    if (init_nd->is_Con() && p.invar() == NULL) {
-      int init = init_nd->bottom_type()->is_int()->get_con();
-
-      int init_offset = init * p.scale_in_bytes() + offset;
-      assert(init_offset >= 0, "positive offset from object start");
-
+  Node* init_nd = pre_end->init_trip();
+  if (init_nd->is_Con() && p.invar() == NULL) {
+    int init = init_nd->bottom_type()->is_int()->get_con();
+    int init_offset = init * p.scale_in_bytes() + offset;
+    assert(init_offset >= 0, "positive offset from object start");
+    if (vw % span == 0) {
+      // If vw is a multiple of span, we use formula (1).
       if (span > 0) {
         return (vw - (init_offset % vw)) % span == 0;
       } else {
         assert(span < 0, "nonzero stride * scale");
         return (init_offset % vw) % -span == 0;
       }
+    } else if (span % vw == 0) {
+      // If span is a multiple of vw, we can simplify formula (1) to:
+      //   (init_offset + i * span) % vw == 0
+      //     =>
+      //   (init_offset % vw) + ((i * span) % vw) == 0
+      //     =>
+      //   init_offset % vw == 0
+      //
+      // Because we add a multiple of vw to the initial offset, the final
+      // offset is a multiple of vw if and only if init_offset is a multiple.
+      //
+      return (init_offset % vw) == 0;
     }
   }
   return false;
 }
 
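Editor's note: the equivalence the new comments claim can be sanity-checked exhaustively. The following standalone sketch (not HotSpot code; brute_force and closed_form are hypothetical helper names) brute-forces formula (1) for small non-negative init_offset values and positive spans and compares it with the two closed-form tests added above; the negative-span case uses the mirrored formula and is analogous.

#include <cassert>

// Does some number of pre-loop iterations align the offset to vw?
static bool brute_force(int init_offset, int span, int vw) {
  for (int i = 0; i <= 1000; i++) {
    if ((init_offset + i * span) % vw == 0) return true;
  }
  return false;
}

static bool closed_form(int init_offset, int span, int vw) {
  if (vw % span == 0) {
    // Formula (1): the pre-loop must add exactly vw - (init_offset % vw) bytes.
    return (vw - (init_offset % vw)) % span == 0;
  } else if (span % vw == 0) {
    // Each iteration adds a multiple of vw, so only init_offset matters.
    return (init_offset % vw) == 0;
  }
  return false; // neither divides the other: not alignable in general
}

int main() {
  for (int vw = 2; vw <= 64; vw *= 2) {
    for (int span = 1; span <= 128; span++) {
      if (vw % span != 0 && span % vw != 0) continue;
      for (int init_offset = 0; init_offset < 256; init_offset++) {
        assert(brute_force(init_offset, span, vw) ==
               closed_form(init_offset, span, vw));
      }
    }
  }
  return 0;
}
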
@@ -479,21 +512,27 @@
 // Calculate loop's iv adjustment for this memory op.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
   SWPointer align_to_ref_p(mem_ref, this);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale = align_to_ref_p.scale_in_bytes();
+  int elt_size = align_to_ref_p.memory_size();
   int vw = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
-  int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
-  // At least one iteration is executed in pre-loop by default. As result
-  // several iterations are needed to align memory operations in main-loop even
-  // if offset is 0.
-  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
-  int elt_size = align_to_ref_p.memory_size();
-  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
-         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
-  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  int iv_adjustment;
+  if (scale != 0) {
+    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+    // At least one iteration is executed in pre-loop by default. As result
+    // several iterations are needed to align memory operations in main-loop even
+    // if offset is 0.
+    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+    assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+           err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+    iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  } else {
+    // This memory op is not dependent on iv (scale == 0)
+    iv_adjustment = 0;
+  }
 
 #ifndef PRODUCT
   if (TraceSuperWord)
     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
                   offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
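
Editor's note: a standalone sketch (not HotSpot code; the free function below mirrors, but is not, the real SuperWord::get_iv_adjustment) of the adjustment arithmetic, including the new scale == 0 guard. With the old code, scale == 0 would yield stride_sign == -1 and a meaningless negative adjustment; the fix returns 0, since such a memory op does not depend on the iv. The sample values in main are made up: a 16-byte vector of 4-byte elements at byte offset 4.

#include <cstdio>
#include <cstdlib>

static int get_iv_adjustment(int offset, int scale, int elt_size,
                             int vw, int iv_stride) {
  if (scale == 0) {
    return 0; // address does not depend on iv; nothing to adjust
  }
  int stride_sign = (scale * iv_stride) > 0 ? 1 : -1;
  int iv_adjustment_in_bytes = stride_sign * vw - (offset % vw);
  // The adjustment must cover a whole number of elements.
  if (abs(iv_adjustment_in_bytes) % elt_size != 0) abort();
  return iv_adjustment_in_bytes / elt_size;
}

int main() {
  // iv-dependent access: adjust by (16 - 4) / 4 = 3 iterations.
  printf("%d\n", get_iv_adjustment(4, 4, 4, 16, 1)); // prints 3
  // iv-independent access (scale == 0): the guard returns 0.
  printf("%d\n", get_iv_adjustment(4, 0, 4, 16, 1)); // prints 0
  return 0;
}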