comparison src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 405:2649e5276dd7

6532536: Optimize arraycopy stubs for Intel cpus
Summary: Use SSE2 movdqu in arraycopy stubs on newest Intel CPUs
Reviewed-by: rasbold
author kvn
date Tue, 14 Oct 2008 15:10:26 -0700
parents f8199438385b
children 67e8b4d06369
comparing 404:78c058bc5cdc with 405:2649e5276dd7
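The heart of the change: on CPUs where unaligned 128-bit loads and stores are cheap (gated by the new UseUnalignedLoadStores flag), the 64-byte copy loop is emitted as four movdqu load/store pairs instead of eight 8-byte movq pairs, halving the instruction count per chunk. A minimal intrinsics sketch of the new inner loop, assuming SSE2 and non-overlapping arrays (my own illustration, not the stub generator's code):

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <cstddef>

    // One pass of the stub's 64-byte loop: four unaligned 16-byte loads
    // and stores (movdqu) rather than eight 8-byte movq pairs.
    static void copy_64_byte_chunks(const char* from, char* to, std::size_t chunks) {
      for (std::size_t c = 0; c < chunks; ++c, from += 64, to += 64) {
        __m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from +  0));
        __m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 16));
        __m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 32));
        __m128i x3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 48));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to +  0), x0);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 16), x1);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 32), x2);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 48), x3);
      }
    }

The stub interleaves each load with its store; batching them as above is equally valid for a disjoint copy.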
@@ -789 +789 @@
       ShouldNotReachHere();

     }
   }

+
   // Copy 64-byte chunks
   //
   // Inputs:
   //   from        - source array address
   //   to_from     - destination array address - from
   //   qword_count - 8-byte element count, negative
   //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if (UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
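Note the addressing discipline shared by both copy routines: to_from holds the precomputed difference to - from, so the destination operand Address(from, to_from, Address::times_1, disp) resolves to from + to_from + disp == to + disp, and a single addl(from, 64) advances both cursors. A plain-C sketch of the same control flow, assuming disjoint arrays (memcpy stands in for the movdqu/movq pairs; illustration only):

    #include <cstddef>
    #include <cstring>

    // Control-flow sketch of xmm_copy_forward/mmx_copy_forward: 64-byte
    // chunks first, then an 8-byte tail. 'to_from' is the (to - from)
    // delta, so 'from + to_from' is always the current destination.
    static void copy_forward_sketch(char* from, std::ptrdiff_t to_from,
                                    long qword_count) {
      while ((qword_count -= 8) >= 0) {          // subl(qword_count, 8); jcc(greaterEqual)
        std::memcpy(from + to_from, from, 64);   // four movdqu (or eight movq) pairs
        from += 64;                              // addl(from, 64)
      }
      qword_count += 8;                          // addl(qword_count, 8)
      while (qword_count-- > 0) {                // jccb(zero, L_exit), then qword loop
        std::memcpy(from + to_from, from, 8);    // single movq pair
        from += 8;
      }
    }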
+
+  // Copy 64-byte chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-byte element count, negative
+  //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
     __ align(16);
   __ BIND(L_copy_64_bytes_loop);
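mmx_copy_forward now asserts its CPU precondition, matching the assert in xmm_copy_forward above; the two routines share one register contract, so the generator can emit either at stub-generation time. Outside the JVM, the same pattern is a one-time feature probe. A sketch using GCC/Clang's __builtin_cpu_supports (the helper names here are hypothetical, not HotSpot's; memcpy stands in for the real loops):

    #include <cstddef>
    #include <cstring>

    // Hypothetical stand-ins for the two generated stubs (plain memcpy
    // here; the real stubs emit movq/movdqu loops).
    static void copy_fwd_mmx(const char* from, char* to, std::size_t n)  { std::memcpy(to, from, n); }
    static void copy_fwd_sse2(const char* from, char* to, std::size_t n) { std::memcpy(to, from, n); }

    // Probe the CPU once and cache the answer, mirroring how the stub
    // generator decides between mmx_copy_forward and xmm_copy_forward.
    void copy_forward(const char* from, char* to, std::size_t n) {
      static const bool has_sse2 = __builtin_cpu_supports("sse2");
      (has_sse2 ? copy_fwd_sse2 : copy_fwd_mmx)(from, to, n);
    }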
@@ -874 +938 @@
     BLOCK_COMMENT("Entry:");

     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at a 4-byte address boundary
       if (t == T_BYTE) {
         // One-byte misalignment happens only for byte arrays
         __ testl(from, 1);
         __ jccb(Assembler::zero, L_skip_align1);
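The widened guard skips the byte/short alignment pre-loop whenever UseUnalignedLoadStores is set: if movdqu is fast on misaligned addresses, peeling elements to reach a 4-byte source boundary buys nothing. A sketch of what that pre-loop does when it is still taken (illustrative only; n is a byte count, and the stub itself tracks the to-from delta rather than a separate 'to' cursor):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Peel up to one byte and one 16-bit element until 'from' is 4-byte
    // aligned -- what the stub's testl(from, 1) / testl(from, 2) pre-loop
    // does for byte and short arrays.
    static void align_source(const unsigned char*& from, unsigned char*& to,
                             std::size_t& n) {
      if ((reinterpret_cast<std::uintptr_t>(from) & 1) && n >= 1) {
        *to++ = *from++;             // one stray byte (byte arrays only)
        n -= 1;
      }
      if ((reinterpret_cast<std::uintptr_t>(from) & 2) && n >= 2) {
        std::memcpy(to, from, 2);    // one stray short
        to += 2; from += 2; n -= 2;
      }
      // 'from' is now 4-byte aligned; the main loop can copy whole words.
    }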
       __ rep_mov();
       __ subptr(to_from, from); // restore 'to_from'
       __ mov(count, rax);       // restore 'count'
       __ jmpb(L_copy_2_bytes);  // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4-byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4-byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1); // 8-byte chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
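Both branches feed the same (from, to_from, qword_count) triple, so only the selected routine changes; the comment's mention of MMX registers should now be read as "MMX or XMM". The x86 [base + index*scale + disp] operand form is what makes the shared contract work. A tiny self-check of the arithmetic (my own illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      std::uintptr_t from    = 0x1000;     // source cursor (advanced in the loop)
      std::uintptr_t to      = 0x5000;     // destination base
      std::uintptr_t to_from = to - from;  // set once by subptr(to, from)

      // [from + to_from*1 + disp] is always the matching destination
      // offset, so the loop only ever increments 'from'.
      for (int disp = 0; disp < 64; disp += 16) {
        assert(from + to_from + disp == to + disp);
      }
      from += 64;                          // addl(from, 64) advances both cursors
      assert(from + to_from == to + 64);
      return 0;
    }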
     // copy trailing dword
   __ BIND(L_copy_4_bytes);
     __ testl(count, 1<<shift);
     __ jccb(Assembler::zero, L_copy_2_bytes);
@@ -1067 +1137 @@
     __ jmpb(L_copy_8_bytes);

     __ align(16);
     // Move 8 bytes
   __ BIND(L_copy_8_bytes_loop);
-    __ movq(mmx0, Address(from, count, sf, 0));
-    __ movq(Address(to, count, sf, 0), mmx0);
+    if (UseXMMForArrayCopy) {
+      __ movq(xmm0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), xmm0);
+    } else {
+      __ movq(mmx0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), mmx0);
+    }
   __ BIND(L_copy_8_bytes);
     __ subl(count, 2<<shift);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
     __ addl(count, 2<<shift);
-    __ emms();
+    if (!UseXMMForArrayCopy) {
+      __ emms();
+    }
   }
   __ BIND(L_copy_4_bytes);
     // copy prefix qword
     __ testl(count, 1<<shift);
     __ jccb(Assembler::zero, L_copy_2_bytes);
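The backward-copy loop gets the same XMM alternative, and emms() becomes conditional: the MMX registers alias the x87 floating-point stack, so an MMX copy must execute emms before ordinary FP code runs, while the XMM path never touches x87 state. The intrinsic equivalent, as a sketch assuming an MMX-capable target (compile with -mmmx on GCC/Clang):

    #include <mmintrin.h>   // MMX intrinsics

    // Copy n qwords through an MMX register, then clear the x87 tag word --
    // the intrinsic counterpart of the stub's movq loop followed by emms().
    static void mmx_qword_copy(const __m64* from, __m64* to, long n) {
      for (long i = 0; i < n; ++i) {
        to[i] = from[i];    // movq load + movq store
      }
      _mm_empty();          // emms: mandatory before any x87 FP code runs
    }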
@@ -1141 +1218 @@
     *entry = __ pc(); // Entry point from conjoint arraycopy stub.
     BLOCK_COMMENT("Entry:");

     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
     __ BIND(L_copy_8_bytes_loop);
       __ fild_d(Address(from, 0));
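The long-copy paths never fall back to 32-bit moves: each jlong travels in a single 64-bit access, via movq (MMX or XMM) or, on pre-MMX CPUs, the x87 fild_d/fistp_d pair, so a racing Java reader can never observe a torn, half-written long. A sketch of the guarantee being preserved, using C++11 atomics purely as illustration (not the stub's actual code):

    #include <atomic>
    #include <cstdint>

    // Each element moves in one 64-bit access, so a concurrent reader sees
    // either the old or the new value, never a mix of two 32-bit halves.
    static void copy_longs(const std::atomic<std::int64_t>* from,
                           std::atomic<std::int64_t>* to, long count) {
      for (long i = 0; i < count; ++i) {
        to[i].store(from[i].load(std::memory_order_relaxed),
                    std::memory_order_relaxed);
      }
    }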
@@ -1194 +1275 @@
     __ jmpb(L_copy_8_bytes);

     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
     }
   __ BIND(L_copy_8_bytes);
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);

-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
       __ emms();
     }
     inc_copy_counter_np(T_LONG);
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ xorptr(rax, rax); // return 0