diff src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 405:2649e5276dd7
6532536: Optimize arraycopy stubs for Intel cpus
Summary: Use SSE2 movdqu in arraycopy stubs on newest Intel's cpus
Reviewed-by: rasbold
author   | kvn
date     | Tue, 14 Oct 2008 15:10:26 -0700
parents  | f8199438385b
children | 67e8b4d06369
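
Editorial note, not part of the changeset: the heart of the change is the new xmm_copy_forward stub generator in the diff below, which moves 64 bytes per loop iteration with unaligned 128-bit movdqu loads and stores when UseUnalignedLoadStores is set, instead of eight separate 8-byte moves. As a rough C++ intrinsics sketch of the copy pattern the generated code follows (the helper name and the plain forward loop are illustrative only, not HotSpot code):

    #include <emmintrin.h>   // SSE2: __m128i, _mm_loadu_si128, _mm_storeu_si128
    #include <cstddef>

    // Hypothetical sketch: 64 bytes per iteration via four unaligned 16-byte
    // moves (movdqu), then an 8-byte tail loop (movq) for whatever is left.
    static void sse2_copy_forward(const char* from, char* to, std::ptrdiff_t qwords) {
      std::ptrdiff_t bytes = qwords * 8;
      std::ptrdiff_t i = 0;
      for (; i + 64 <= bytes; i += 64) {
        __m128i x0 = _mm_loadu_si128((const __m128i*)(from + i));
        _mm_storeu_si128((__m128i*)(to + i), x0);
        __m128i x1 = _mm_loadu_si128((const __m128i*)(from + i + 16));
        _mm_storeu_si128((__m128i*)(to + i + 16), x1);
        __m128i x2 = _mm_loadu_si128((const __m128i*)(from + i + 32));
        _mm_storeu_si128((__m128i*)(to + i + 32), x2);
        __m128i x3 = _mm_loadu_si128((const __m128i*)(from + i + 48));
        _mm_storeu_si128((__m128i*)(to + i + 48), x3);
      }
      for (; i < bytes; i += 8) {
        __m128i q = _mm_loadl_epi64((const __m128i*)(from + i));  // 8-byte movq-style move
        _mm_storel_epi64((__m128i*)(to + i), q);
      }
    }

The stub itself keeps the destination as a to-minus-from offset and decrements the qword count, but the data movement per iteration is the same.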
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Oct 14 06:58:58 2008 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Oct 14 15:10:26 2008 -0700
@@ -791,6 +791,69 @@
     }
   }
 
+
+  // Copy 64 bytes chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-bytes element count, negative
+  //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
+
   // Copy 64 bytes chunks
   //
   // Inputs:
@@ -799,6 +862,7 @@
   //   qword_count - 8-bytes element count, negative
   //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
@@ -876,7 +940,7 @@
     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at 4 bytes address boundary
       if (t == T_BYTE) {
         // One byte misalignment happens only for byte arrays
@@ -906,20 +970,26 @@
       __ mov(count, rax); // restore 'count'
       __ jmpb(L_copy_2_bytes); // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4 byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4 byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1); // 8 bytes chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
     // copy tailing dword
   __ BIND(L_copy_4_bytes);
@@ -1069,13 +1139,20 @@
     __ align(16);
     // Move 8 bytes
   __ BIND(L_copy_8_bytes_loop);
-    __ movq(mmx0, Address(from, count, sf, 0));
-    __ movq(Address(to, count, sf, 0), mmx0);
+    if (UseXMMForArrayCopy) {
+      __ movq(xmm0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), xmm0);
+    } else {
+      __ movq(mmx0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), mmx0);
+    }
   __ BIND(L_copy_8_bytes);
     __ subl(count, 2<<shift);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
     __ addl(count, 2<<shift);
-    __ emms();
+    if (!UseXMMForArrayCopy) {
+      __ emms();
+    }
   }
 __ BIND(L_copy_4_bytes);
   // copy prefix qword
@@ -1143,7 +1220,11 @@
 
     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
@@ -1196,8 +1277,13 @@
     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
@@ -1206,7 +1292,7 @@
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
 
-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
       __ emms();
     }
     inc_copy_counter_np(T_LONG);
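
A closing editorial note on the emms changes, not from the changeset itself: the MMX registers alias the x87 floating-point register stack, so a copy loop that goes through mm0-mm7 has to end with emms before ordinary floating-point code runs again, while the XMM registers are independent of x87 state. That is why the patch only emits __ emms() when the MMX path was taken, guarding it with !UseXMMForArrayCopy. A minimal sketch of the distinction, using a hypothetical helper and flag (not HotSpot code):

    #include <mmintrin.h>    // MMX: __m64, _mm_empty (emms)
    #include <emmintrin.h>   // SSE2: __m128i, _mm_loadl_epi64, _mm_storel_epi64
    #include <cstddef>

    // Copy 'qwords' 8-byte elements through either an XMM or an MMX register.
    static void copy_qwords(const char* from, char* to, std::ptrdiff_t qwords, bool use_xmm) {
      if (use_xmm) {
        for (std::ptrdiff_t i = 0; i < qwords; ++i) {
          __m128i q = _mm_loadl_epi64((const __m128i*)(from + i * 8));  // movq load
          _mm_storel_epi64((__m128i*)(to + i * 8), q);                  // movq store
        }
        // No cleanup needed: XMM state does not touch the x87 stack.
      } else {
        for (std::ptrdiff_t i = 0; i < qwords; ++i) {
          *(__m64*)(to + i * 8) = *(const __m64*)(from + i * 8);        // MMX movq
        }
        _mm_empty();  // emms: clear the x87 tag word after using MMX registers
      }
    }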