# HG changeset patch
# User kvn
# Date 1348098626 25200
# Node ID 8ae8f9dd70990411a3e6d538a13076307df88b4d
# Parent  9d89c76b0505db315adc8e42a875a8cbb8e74c3a
7199010: incorrect vector alignment
Summary: Fixed vectors alignment when several arrays are accessed in one loop.
Reviewed-by: roland, twisti

diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/cpu/x86/vm/vm_version_x86.cpp
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Wed Sep 19 10:38:12 2012 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Wed Sep 19 16:50:26 2012 -0700
@@ -562,10 +562,10 @@
         AllocatePrefetchInstr = 3;
       }
       // On family 15h processors use XMM and UnalignedLoadStores for Array Copy
-      if( supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+      if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
         UseXMMForArrayCopy = true;
       }
-      if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+      if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
         UseUnalignedLoadStores = true;
       }
     }
@@ -612,16 +612,16 @@
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
-      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
         UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
       }
-      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
-        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+      if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
           UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
         }
       }
-      if( supports_sse4_2() && UseSSE >= 4 ) {
-        if( FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
+      if (supports_sse4_2() && UseSSE >= 4) {
+        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
           UseSSE42Intrinsics = true;
         }
       }
@@ -638,6 +638,13 @@
     FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
+#ifdef COMPILER2
+  if (FLAG_IS_DEFAULT(AlignVector)) {
+    // Modern processors allow misaligned memory operations for vectors.
+    AlignVector = !UseUnalignedLoadStores;
+  }
+#endif // COMPILER2
+
   assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value");
   assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value");
 
diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/share/vm/opto/c2_globals.hpp
--- a/src/share/vm/opto/c2_globals.hpp	Wed Sep 19 10:38:12 2012 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Wed Sep 19 16:50:26 2012 -0700
@@ -85,7 +85,7 @@
           "Max vector size in bytes, "                                      \
           "actual size could be less depending on elements type")           \
                                                                             \
-  product(bool, AlignVector, false,                                         \
+  product(bool, AlignVector, true,                                          \
           "Perform vector store/load alignment in loop")                    \
                                                                             \
   product(intx, NumberOfLoopInstrToAlign, 4,                                \
diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/share/vm/opto/superword.cpp
--- a/src/share/vm/opto/superword.cpp	Wed Sep 19 10:38:12 2012 -0700
+++ b/src/share/vm/opto/superword.cpp	Wed Sep 19 16:50:26 2012 -0700
@@ -179,6 +179,7 @@
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
+        n->Opcode() != Op_LoadUI2L &&
         is_java_primitive(n->as_Mem()->memory_type())) {
       int align = memory_alignment(n->as_Mem(), 0);
       if (align != bottom_align) {
@@ -481,12 +482,19 @@
   int vw       = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
   int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
-  int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
+  // At least one iteration is executed in pre-loop by default. As result
+  // several iterations are needed to align memory operations in main-loop even
+  // if offset is 0.
+  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+  int elt_size = align_to_ref_p.memory_size();
+  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
 
 #ifndef PRODUCT
   if (TraceSuperWord)
     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
-                  offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
+                  offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 #endif
   return iv_adjustment;
 }
@@ -1816,7 +1824,7 @@
 
 //------------------------------memory_alignment---------------------------
 // Alignment within a vector memory reference
-int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
+int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
   SWPointer p(s, this);
   if (!p.valid()) {
     return bottom_align;
@@ -1826,7 +1834,7 @@
     return bottom_align; // No vectors for this type
   }
   int offset  = p.offset_in_bytes();
-  offset     += iv_adjust_in_bytes;
+  offset     += iv_adjust*p.memory_size();
   int off_rem = offset % vw;
   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
   return off_mod;
@@ -1849,7 +1857,7 @@
 
 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
   const Type* vt1 = velt_type(n1);
-  const Type* vt2 = velt_type(n1);
+  const Type* vt2 = velt_type(n2);
   if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
     // Compare vectors element sizes for integer types.
     return data_size(n1) == data_size(n2);
diff -r 9d89c76b0505 -r 8ae8f9dd7099 src/share/vm/opto/superword.hpp
--- a/src/share/vm/opto/superword.hpp	Wed Sep 19 10:38:12 2012 -0700
+++ b/src/share/vm/opto/superword.hpp	Wed Sep 19 16:50:26 2012 -0700
@@ -400,7 +400,7 @@
   // Return the node executed last in pack p.
   Node* executed_last(Node_List* p);
   // Alignment within a vector memory reference
-  int memory_alignment(MemNode* s, int iv_adjust_in_bytes);
+  int memory_alignment(MemNode* s, int iv_adjust);
   // (Start, end] half-open range defining which operands are vector
   void vector_opd_range(Node* n, uint* start, uint* end);
   // Smallest type containing range of values