# HG changeset patch # User kvn # Date 1348098626 25200 # Node ID 8ae8f9dd70990411a3e6d538a13076307df88b4d # Parent 9d89c76b0505db315adc8e42a875a8cbb8e74c3a 7199010: incorrect vector alignment Summary: Fixed vectors alignment when several arrays are accessed in one loop. Reviewed-by: roland, twisti diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/cpu/x86/vm/vm_version_x86.cpp --- a/src/cpu/x86/vm/vm_version_x86.cpp Wed Sep 19 10:38:12 2012 -0700 +++ b/src/cpu/x86/vm/vm_version_x86.cpp Wed Sep 19 16:50:26 2012 -0700 @@ -562,10 +562,10 @@ AllocatePrefetchInstr = 3; } // On family 15h processors use XMM and UnalignedLoadStores for Array Copy - if( supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { + if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { UseXMMForArrayCopy = true; } - if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { + if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { UseUnalignedLoadStores = true; } } @@ -612,16 +612,16 @@ MaxLoopPad = 11; } #endif // COMPILER2 - if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { + if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus } - if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus - if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { + if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus + if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus } } - if( supports_sse4_2() && UseSSE >= 4 ) { - if( FLAG_IS_DEFAULT(UseSSE42Intrinsics)) { + if (supports_sse4_2() && UseSSE >= 4) { + if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) { UseSSE42Intrinsics = true; } } @@ -638,6 +638,13 @@ FLAG_SET_DEFAULT(UsePopCountInstruction, false); } +#ifdef COMPILER2 + if (FLAG_IS_DEFAULT(AlignVector)) { + // Modern processors allow misaligned memory operations for vectors. + AlignVector = !UseUnalignedLoadStores; + } +#endif // COMPILER2 + assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value"); assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value"); diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/share/vm/opto/c2_globals.hpp --- a/src/share/vm/opto/c2_globals.hpp Wed Sep 19 10:38:12 2012 -0700 +++ b/src/share/vm/opto/c2_globals.hpp Wed Sep 19 16:50:26 2012 -0700 @@ -85,7 +85,7 @@ "Max vector size in bytes, " \ "actual size could be less depending on elements type") \ \ - product(bool, AlignVector, false, \ + product(bool, AlignVector, true, \ "Perform vector store/load alignment in loop") \ \ product(intx, NumberOfLoopInstrToAlign, 4, \ diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/share/vm/opto/superword.cpp --- a/src/share/vm/opto/superword.cpp Wed Sep 19 10:38:12 2012 -0700 +++ b/src/share/vm/opto/superword.cpp Wed Sep 19 16:50:26 2012 -0700 @@ -179,6 +179,7 @@ for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) && + n->Opcode() != Op_LoadUI2L && is_java_primitive(n->as_Mem()->memory_type())) { int align = memory_alignment(n->as_Mem(), 0); if (align != bottom_align) { @@ -481,12 +482,19 @@ int vw = vector_width_in_bytes(mem_ref); assert(vw > 1, "sanity"); int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; - int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw; + // At least one iteration is executed in pre-loop by default. As result + // several iterations are needed to align memory operations in main-loop even + // if offset is 0. + int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); + int elt_size = align_to_ref_p.memory_size(); + assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), + err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); + int iv_adjustment = iv_adjustment_in_bytes/elt_size; #ifndef PRODUCT if (TraceSuperWord) tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d", - offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw); + offset, iv_adjustment, elt_size, scale, iv_stride(), vw); #endif return iv_adjustment; } @@ -1816,7 +1824,7 @@ //------------------------------memory_alignment--------------------------- // Alignment within a vector memory reference -int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) { +int SuperWord::memory_alignment(MemNode* s, int iv_adjust) { SWPointer p(s, this); if (!p.valid()) { return bottom_align; @@ -1826,7 +1834,7 @@ return bottom_align; // No vectors for this type } int offset = p.offset_in_bytes(); - offset += iv_adjust_in_bytes; + offset += iv_adjust*p.memory_size(); int off_rem = offset % vw; int off_mod = off_rem >= 0 ? off_rem : off_rem + vw; return off_mod; @@ -1849,7 +1857,7 @@ bool SuperWord::same_velt_type(Node* n1, Node* n2) { const Type* vt1 = velt_type(n1); - const Type* vt2 = velt_type(n1); + const Type* vt2 = velt_type(n2); if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) { // Compare vectors element sizes for integer types. return data_size(n1) == data_size(n2); diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/share/vm/opto/superword.hpp --- a/src/share/vm/opto/superword.hpp Wed Sep 19 10:38:12 2012 -0700 +++ b/src/share/vm/opto/superword.hpp Wed Sep 19 16:50:26 2012 -0700 @@ -400,7 +400,7 @@ // Return the node executed last in pack p. Node* executed_last(Node_List* p); // Alignment within a vector memory reference - int memory_alignment(MemNode* s, int iv_adjust_in_bytes); + int memory_alignment(MemNode* s, int iv_adjust); // (Start, end] half-open range defining which operands are vector void vector_opd_range(Node* n, uint* start, uint* end); // Smallest type containing range of values