diff src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 405:2649e5276dd7
6532536: Optimize arraycopy stubs for Intel cpus
Summary: Use SSE2 movdqu in arraycopy stubs on newest Intel's cpus
Reviewed-by: rasbold
author   | kvn
date     | Tue, 14 Oct 2008 15:10:26 -0700
parents  | f8199438385b
children | 67e8b4d06369
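
Editorial note, not part of the changeset: the heart of the change is the new xmm_copy_forward stub generator in the diff below, which moves 64 bytes per loop iteration with unaligned 128-bit movdqu loads and stores when UseUnalignedLoadStores is set, instead of eight separate 8-byte moves. As a rough C++ intrinsics sketch of the copy pattern the generated code follows (the helper name and the plain forward loop are illustrative only, not HotSpot code):

    #include <emmintrin.h>   // SSE2: __m128i, _mm_loadu_si128, _mm_storeu_si128
    #include <cstddef>

    // Hypothetical sketch: 64 bytes per iteration via four unaligned 16-byte
    // moves (movdqu), then an 8-byte tail loop (movq) for whatever is left.
    static void sse2_copy_forward(const char* from, char* to, std::ptrdiff_t qwords) {
      std::ptrdiff_t bytes = qwords * 8;
      std::ptrdiff_t i = 0;
      for (; i + 64 <= bytes; i += 64) {
        __m128i x0 = _mm_loadu_si128((const __m128i*)(from + i));
        _mm_storeu_si128((__m128i*)(to + i), x0);
        __m128i x1 = _mm_loadu_si128((const __m128i*)(from + i + 16));
        _mm_storeu_si128((__m128i*)(to + i + 16), x1);
        __m128i x2 = _mm_loadu_si128((const __m128i*)(from + i + 32));
        _mm_storeu_si128((__m128i*)(to + i + 32), x2);
        __m128i x3 = _mm_loadu_si128((const __m128i*)(from + i + 48));
        _mm_storeu_si128((__m128i*)(to + i + 48), x3);
      }
      for (; i < bytes; i += 8) {
        __m128i q = _mm_loadl_epi64((const __m128i*)(from + i));  // 8-byte movq-style move
        _mm_storel_epi64((__m128i*)(to + i), q);
      }
    }

The stub itself keeps the destination as a to-minus-from offset and decrements the qword count, but the data movement per iteration is the same.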
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Oct 14 06:58:58 2008 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Oct 14 15:10:26 2008 -0700
@@ -791,6 +791,69 @@
     }
   }
 
+
+  // Copy 64 bytes chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-bytes element count, negative
+  //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
+
   // Copy 64 bytes chunks
   //
   // Inputs:
@@ -799,6 +862,7 @@
   //   qword_count - 8-bytes element count, negative
   //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
@@ -876,7 +940,7 @@
     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at 4 bytes address boundary
       if (t == T_BYTE) {
         // One byte misalignment happens only for byte arrays
@@ -906,20 +970,26 @@
       __ mov(count, rax); // restore 'count'
       __ jmpb(L_copy_2_bytes); // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4 byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4 byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1); // 8 bytes chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
     // copy tailing dword
   __ BIND(L_copy_4_bytes);
@@ -1069,13 +1139,20 @@
     __ align(16);
     // Move 8 bytes
   __ BIND(L_copy_8_bytes_loop);
-    __ movq(mmx0, Address(from, count, sf, 0));
-    __ movq(Address(to, count, sf, 0), mmx0);
+    if (UseXMMForArrayCopy) {
+      __ movq(xmm0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), xmm0);
+    } else {
+      __ movq(mmx0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), mmx0);
+    }
   __ BIND(L_copy_8_bytes);
     __ subl(count, 2<<shift);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
     __ addl(count, 2<<shift);
-    __ emms();
+    if (!UseXMMForArrayCopy) {
+      __ emms();
+    }
   }
 __ BIND(L_copy_4_bytes);
   // copy prefix qword
@@ -1143,7 +1220,11 @@
 
     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
@@ -1196,8 +1277,13 @@
     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
@@ -1206,7 +1292,7 @@
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
 
-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
       __ emms();
     }
     inc_copy_counter_np(T_LONG);
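
A closing editorial note on the emms changes, not from the changeset itself: the MMX registers alias the x87 floating-point register stack, so a copy loop that goes through mm0-mm7 has to end with emms before ordinary floating-point code runs again, while the XMM registers are independent of x87 state. That is why the patch only emits __ emms() when the MMX path was taken, guarding it with !UseXMMForArrayCopy. A minimal sketch of the distinction, using a hypothetical helper and flag (not HotSpot code):

    #include <mmintrin.h>    // MMX: __m64, _mm_empty (emms)
    #include <emmintrin.h>   // SSE2: __m128i, _mm_loadl_epi64, _mm_storel_epi64
    #include <cstddef>

    // Copy 'qwords' 8-byte elements through either an XMM or an MMX register.
    static void copy_qwords(const char* from, char* to, std::ptrdiff_t qwords, bool use_xmm) {
      if (use_xmm) {
        for (std::ptrdiff_t i = 0; i < qwords; ++i) {
          __m128i q = _mm_loadl_epi64((const __m128i*)(from + i * 8));  // movq load
          _mm_storel_epi64((__m128i*)(to + i * 8), q);                  // movq store
        }
        // No cleanup needed: XMM state does not touch the x87 stack.
      } else {
        for (std::ptrdiff_t i = 0; i < qwords; ++i) {
          *(__m64*)(to + i * 8) = *(const __m64*)(from + i * 8);        // MMX movq
        }
        _mm_empty();  // emms: clear the x87 tag word after using MMX registers
      }
    }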