comparison src/share/vm/opto/superword.cpp @ 23015:95dbbc0431d9

8078497: C2's superword optimization causes unaligned memory accesses
Summary: Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed.
Reviewed-by: kvn
author thartmann
date Fri, 08 May 2015 12:19:17 +0200
parents 84d55f179e24
children c1c199dde5c9
comparison of 23014:84d55f179e24 and 23015:95dbbc0431d9
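Editorial note on the summary above, with a standalone C++ illustration (not part of the changeset; vw, scale, inv1 and inv2 are made-up values, and the unknown invariant difference is modeled as a fixed 8 bytes purely for illustration): if two memory references in the same loop have invariant offsets that differ by something that is not a multiple of the vector width, no single pre-loop trip count can align both of them, so one reference would be accessed misaligned.

#include <cstdio>

int main() {
  const int vw    = 16;          // assumed vector width in bytes
  const int scale = 4;           // 4-byte elements: each pre-loop iteration adds 4 bytes
  const int inv1  = 0, inv2 = 8; // two invariant byte offsets, differing by 8 (not a multiple of 16)
  for (int i = 0; i <= vw / scale; i++) {
    // Offsets of the two references after i pre-loop iterations, modulo vw:
    std::printf("i=%d: ref1 %% vw = %2d, ref2 %% vw = %2d\n",
                i, (inv1 + scale * i) % vw, (inv2 + scale * i) % vw);
    // The two remainders are never both 0, so one vector access stays unaligned.
  }
  return 0;
}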
@@ -230,10 +230,17 @@
         if (vw > vw_best) {
           // Do not vectorize a memory access with more elements per vector
           // if unaligned memory access is not allowed because number of
           // iterations in pre-loop will be not enough to align it.
           create_pack = false;
+        } else {
+          SWPointer p2(best_align_to_mem_ref, this);
+          if (align_to_ref_p.invar() != p2.invar()) {
+            // Do not vectorize memory accesses with different invariants
+            // if unaligned memory accesses are not allowed.
+            create_pack = false;
+          }
         }
       }
     } else {
       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
         // Can't allow vectorization of unaligned memory accesses with the
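The hunk above adds the guard that rejects pack creation when the two candidate references carry different invariants. A minimal mock of that decision (NodeMock, SWPointerMock and may_create_pack are hypothetical stand-ins, not HotSpot's SWPointer API):

// Hypothetical stand-ins, for illustration only.
struct NodeMock {};

struct SWPointerMock {
  NodeMock* invar;   // loop-invariant part of the address, or NULL
  int       offset;  // constant byte offset
};

// Whether a pack may be created for alignment purposes on a target that
// does not tolerate misaligned vector accesses.
static bool may_create_pack(const SWPointerMock& align_to_ref,
                            const SWPointerMock& best_align_to_ref,
                            bool misaligned_vectors_ok) {
  if (misaligned_vectors_ok) {
    return true;  // hardware tolerates misalignment
  }
  // The pre-loop can compensate for only one invariant, so both references
  // must share the same invariant node.
  return align_to_ref.invar == best_align_to_ref.invar;
}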
@@ -450,28 +457,54 @@
   // Stride one accesses are alignable if offset is aligned to memory operation size.
   // Offset can be unaligned when UseUnalignedAccesses is used.
   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
     return true;
   }
-  // If initial offset from start of object is computable,
-  // compute alignment within the vector.
+  // If the initial offset from start of the object is computable,
+  // check if the pre-loop can align the final offset accordingly.
+  //
+  // In other words: Can we find an i such that the offset
+  // after i pre-loop iterations is aligned to vw?
+  //   (init_offset + pre_loop) % vw == 0              (1)
+  // where
+  //   pre_loop = i * span
+  // is the number of bytes added to the offset by i pre-loop iterations.
+  //
+  // For this to hold we need pre_loop to increase init_offset by
+  //   pre_loop = vw - (init_offset % vw)
+  //
+  // This is only possible if pre_loop is divisible by span because each
+  // pre-loop iteration increases the initial offset by 'span' bytes:
+  //   (vw - (init_offset % vw)) % span == 0
+  //
   int vw = vector_width_in_bytes(p.mem());
   assert(vw > 1, "sanity");
-  if (vw % span == 0) {
-    Node* init_nd = pre_end->init_trip();
-    if (init_nd->is_Con() && p.invar() == NULL) {
-      int init = init_nd->bottom_type()->is_int()->get_con();
-
-      int init_offset = init * p.scale_in_bytes() + offset;
-      assert(init_offset >= 0, "positive offset from object start");
-
+  Node* init_nd = pre_end->init_trip();
+  if (init_nd->is_Con() && p.invar() == NULL) {
+    int init = init_nd->bottom_type()->is_int()->get_con();
+    int init_offset = init * p.scale_in_bytes() + offset;
+    assert(init_offset >= 0, "positive offset from object start");
+    if (vw % span == 0) {
+      // If vw is a multiple of span, we use formula (1).
       if (span > 0) {
         return (vw - (init_offset % vw)) % span == 0;
       } else {
         assert(span < 0, "nonzero stride * scale");
         return (init_offset % vw) % -span == 0;
       }
+    } else if (span % vw == 0) {
+      // If span is a multiple of vw, we can simplify formula (1) to:
+      //   (init_offset + i * span) % vw == 0
+      //     =>
+      //   (init_offset % vw) + ((i * span) % vw) == 0
+      //     =>
+      //   init_offset % vw == 0
+      //
+      // Because we add a multiple of vw to the initial offset, the final
+      // offset is a multiple of vw if and only if init_offset is a multiple.
+      //
+      return (init_offset % vw) == 0;
     }
   }
   return false;
 }
 
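The comment block added above derives two closed-form alignability checks from formula (1). As a sanity check, here is a standalone C++ program (editorial, not part of the changeset; vw and the span values are made up) that compares both closed forms against a brute-force search for an aligning pre-loop iteration count i:

#include <cassert>

// Brute force: is there an i >= 0 with (init_offset + i * span) % vw == 0?
static bool alignable_brute_force(int init_offset, int span, int vw) {
  for (int i = 0; i <= 4 * vw; i++) {
    if ((init_offset + i * span) % vw == 0) return true;
  }
  return false;
}

// The two closed forms used in ref_is_alignable (span != 0 assumed).
static bool alignable_formula(int init_offset, int span, int vw) {
  if (vw % span == 0) {
    // vw is a multiple of span: solve formula (1) for i directly.
    if (span > 0) return (vw - (init_offset % vw)) % span == 0;
    return (init_offset % vw) % -span == 0;
  } else if (span % vw == 0) {
    // span is a multiple of vw: the alignment never changes across
    // iterations, so the initial offset must already be aligned.
    return (init_offset % vw) == 0;
  }
  return false;
}

int main() {
  const int vw = 16;                                  // e.g. 16-byte vectors
  const int spans[] = { 4, 8, 16, 32, -4, -8, -32 };  // span = scale * pre-loop stride
  for (int span : spans) {
    for (int init_offset = 0; init_offset < 64; init_offset += 2) {
      assert(alignable_formula(init_offset, span, vw) ==
             alignable_brute_force(init_offset, span, vw));
    }
  }
  return 0;
}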
@@ -479,21 +512,27 @@
 // Calculate loop's iv adjustment for this memory op.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
   SWPointer align_to_ref_p(mem_ref, this);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale = align_to_ref_p.scale_in_bytes();
+  int elt_size = align_to_ref_p.memory_size();
   int vw = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
-  int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
-  // At least one iteration is executed in pre-loop by default. As a result
-  // several iterations are needed to align memory operations in main-loop even
-  // if offset is 0.
-  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
-  int elt_size = align_to_ref_p.memory_size();
-  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
-         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
-  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  int iv_adjustment;
+  if (scale != 0) {
+    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+    // At least one iteration is executed in pre-loop by default. As a result
+    // several iterations are needed to align memory operations in main-loop even
+    // if offset is 0.
+    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+    assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+           err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+    iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  } else {
+    // This memory op is not dependent on iv (scale == 0)
+    iv_adjustment = 0;
+  }
 
 #ifndef PRODUCT
   if (TraceSuperWord)
     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
                   offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
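The restructuring above makes get_iv_adjustment() compute a byte adjustment only when the address depends on the induction variable, and return 0 for iv-independent (scale == 0) references. A standalone sketch of the same arithmetic with made-up inputs (not HotSpot code):

#include <cstdio>

// Sketch of the arithmetic above: iv adjustment in elements,
// 0 when the address is iv-independent.
static int iv_adjustment(int offset, int scale, int elt_size, int iv_stride, int vw) {
  if (scale == 0) {
    return 0;  // new path: the address does not depend on the iv
  }
  int stride_sign = (scale * iv_stride) > 0 ? 1 : -1;
  int iv_adjustment_in_bytes = stride_sign * vw - (offset % vw);
  return iv_adjustment_in_bytes / elt_size;
}

int main() {
  // 16-byte vectors, 4-byte elements, constant offset 8, positive stride:
  // (16 - 8) / 4 = 2.
  std::printf("%d\n", iv_adjustment(8, 4, 4, 1, 16));  // prints 2
  std::printf("%d\n", iv_adjustment(8, 0, 4, 1, 16));  // prints 0 (scale == 0)
  return 0;
}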