comparison src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 405:2649e5276dd7

6532536: Optimize arraycopy stubs for Intel cpus
Summary: Use SSE2 movdqu in arraycopy stubs on newest Intel CPUs
Reviewed-by: rasbold
author kvn
date Tue, 14 Oct 2008 15:10:26 -0700
parents f8199438385b
children 67e8b4d06369
comparing 404:78c058bc5cdc with 405:2649e5276dd7
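The heart of the change: on CPUs where unaligned 128-bit loads and stores are cheap (gated by the new UseUnalignedLoadStores flag), the 64-byte copy loop is emitted as four movdqu load/store pairs instead of eight 8-byte movq pairs, halving the instruction count per chunk. A minimal intrinsics sketch of the new inner loop, assuming SSE2 and non-overlapping arrays (my own illustration, not the stub generator's code):

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <cstddef>

    // One pass of the stub's 64-byte loop: four unaligned 16-byte loads
    // and stores (movdqu) rather than eight 8-byte movq pairs.
    static void copy_64_byte_chunks(const char* from, char* to, std::size_t chunks) {
      for (std::size_t c = 0; c < chunks; ++c, from += 64, to += 64) {
        __m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from +  0));
        __m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 16));
        __m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 32));
        __m128i x3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 48));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to +  0), x0);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 16), x1);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 32), x2);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 48), x3);
      }
    }

The stub interleaves each load with its store; batching them as above is equally valid for a disjoint copy.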
@@ -789 +789 @@
       ShouldNotReachHere();

     }
   }

+
   // Copy 64-byte chunks
   //
   // Inputs:
   //   from        - source array address
   //   to_from     - destination array address - from
   //   qword_count - 8-byte element count, negative
   //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if (UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
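Note the addressing discipline shared by both copy routines: to_from holds the precomputed difference to - from, so the destination operand Address(from, to_from, Address::times_1, disp) resolves to from + to_from + disp == to + disp, and a single addl(from, 64) advances both cursors. A plain-C sketch of the same control flow, assuming disjoint arrays (memcpy stands in for the movdqu/movq pairs; illustration only):

    #include <cstddef>
    #include <cstring>

    // Control-flow sketch of xmm_copy_forward/mmx_copy_forward: 64-byte
    // chunks first, then an 8-byte tail. 'to_from' is the (to - from)
    // delta, so 'from + to_from' is always the current destination.
    static void copy_forward_sketch(char* from, std::ptrdiff_t to_from,
                                    long qword_count) {
      while ((qword_count -= 8) >= 0) {          // subl(qword_count, 8); jcc(greaterEqual)
        std::memcpy(from + to_from, from, 64);   // four movdqu (or eight movq) pairs
        from += 64;                              // addl(from, 64)
      }
      qword_count += 8;                          // addl(qword_count, 8)
      while (qword_count-- > 0) {                // jccb(zero, L_exit), then qword loop
        std::memcpy(from + to_from, from, 8);    // single movq pair
        from += 8;
      }
    }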
+
+  // Copy 64-byte chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-byte element count, negative
+  //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
     __ align(16);
   __ BIND(L_copy_64_bytes_loop);
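mmx_copy_forward now asserts its CPU precondition, matching the assert in xmm_copy_forward above; the two routines share one register contract, so the generator can emit either at stub-generation time. Outside the JVM, the same pattern is a one-time feature probe. A sketch using GCC/Clang's __builtin_cpu_supports (the helper names here are hypothetical, not HotSpot's; memcpy stands in for the real loops):

    #include <cstddef>
    #include <cstring>

    // Hypothetical stand-ins for the two generated stubs (plain memcpy
    // here; the real stubs emit movq/movdqu loops).
    static void copy_fwd_mmx(const char* from, char* to, std::size_t n)  { std::memcpy(to, from, n); }
    static void copy_fwd_sse2(const char* from, char* to, std::size_t n) { std::memcpy(to, from, n); }

    // Probe the CPU once and cache the answer, mirroring how the stub
    // generator decides between mmx_copy_forward and xmm_copy_forward.
    void copy_forward(const char* from, char* to, std::size_t n) {
      static const bool has_sse2 = __builtin_cpu_supports("sse2");
      (has_sse2 ? copy_fwd_sse2 : copy_fwd_mmx)(from, to, n);
    }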
@@ -874 +938 @@
     BLOCK_COMMENT("Entry:");

     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at a 4-byte address boundary
       if (t == T_BYTE) {
         // One-byte misalignment happens only for byte arrays
         __ testl(from, 1);
         __ jccb(Assembler::zero, L_skip_align1);
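The widened guard skips the byte/short alignment pre-loop whenever UseUnalignedLoadStores is set: if movdqu is fast on misaligned addresses, peeling elements to reach a 4-byte source boundary buys nothing. A sketch of what that pre-loop does when it is still taken (illustrative only; n is a byte count, and the stub itself tracks the to-from delta rather than a separate 'to' cursor):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Peel up to one byte and one 16-bit element until 'from' is 4-byte
    // aligned -- what the stub's testl(from, 1) / testl(from, 2) pre-loop
    // does for byte and short arrays.
    static void align_source(const unsigned char*& from, unsigned char*& to,
                             std::size_t& n) {
      if ((reinterpret_cast<std::uintptr_t>(from) & 1) && n >= 1) {
        *to++ = *from++;             // one stray byte (byte arrays only)
        n -= 1;
      }
      if ((reinterpret_cast<std::uintptr_t>(from) & 2) && n >= 2) {
        std::memcpy(to, from, 2);    // one stray short
        to += 2; from += 2; n -= 2;
      }
      // 'from' is now 4-byte aligned; the main loop can copy whole words.
    }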
       __ rep_mov();
       __ subptr(to_from, from); // restore 'to_from'
       __ mov(count, rax);       // restore 'count'
       __ jmpb(L_copy_2_bytes);  // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4-byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4-byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1); // 8-byte chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
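Both branches feed the same (from, to_from, qword_count) triple, so only the selected routine changes; the comment's mention of MMX registers should now be read as "MMX or XMM". The x86 [base + index*scale + disp] operand form is what makes the shared contract work. A tiny self-check of the arithmetic (my own illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      std::uintptr_t from    = 0x1000;     // source cursor (advanced in the loop)
      std::uintptr_t to      = 0x5000;     // destination base
      std::uintptr_t to_from = to - from;  // set once by subptr(to, from)

      // [from + to_from*1 + disp] is always the matching destination
      // offset, so the loop only ever increments 'from'.
      for (int disp = 0; disp < 64; disp += 16) {
        assert(from + to_from + disp == to + disp);
      }
      from += 64;                          // addl(from, 64) advances both cursors
      assert(from + to_from == to + 64);
      return 0;
    }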
     // copy trailing dword
   __ BIND(L_copy_4_bytes);
     __ testl(count, 1<<shift);
     __ jccb(Assembler::zero, L_copy_2_bytes);
@@ -1067 +1137 @@
     __ jmpb(L_copy_8_bytes);

     __ align(16);
     // Move 8 bytes
   __ BIND(L_copy_8_bytes_loop);
-    __ movq(mmx0, Address(from, count, sf, 0));
-    __ movq(Address(to, count, sf, 0), mmx0);
+    if (UseXMMForArrayCopy) {
+      __ movq(xmm0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), xmm0);
+    } else {
+      __ movq(mmx0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), mmx0);
+    }
   __ BIND(L_copy_8_bytes);
     __ subl(count, 2<<shift);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
     __ addl(count, 2<<shift);
-    __ emms();
+    if (!UseXMMForArrayCopy) {
+      __ emms();
+    }
   }
   __ BIND(L_copy_4_bytes);
     // copy prefix qword
     __ testl(count, 1<<shift);
     __ jccb(Assembler::zero, L_copy_2_bytes);
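The backward-copy loop gets the same XMM alternative, and emms() becomes conditional: the MMX registers alias the x87 floating-point stack, so an MMX copy must execute emms before ordinary FP code runs, while the XMM path never touches x87 state. The intrinsic equivalent, as a sketch assuming an MMX-capable target (compile with -mmmx on GCC/Clang):

    #include <mmintrin.h>   // MMX intrinsics

    // Copy n qwords through an MMX register, then clear the x87 tag word --
    // the intrinsic counterpart of the stub's movq loop followed by emms().
    static void mmx_qword_copy(const __m64* from, __m64* to, long n) {
      for (long i = 0; i < n; ++i) {
        to[i] = from[i];    // movq load + movq store
      }
      _mm_empty();          // emms: mandatory before any x87 FP code runs
    }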
@@ -1141 +1218 @@
     *entry = __ pc(); // Entry point from conjoint arraycopy stub.
     BLOCK_COMMENT("Entry:");

     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
     __ BIND(L_copy_8_bytes_loop);
       __ fild_d(Address(from, 0));
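The long-copy paths never fall back to 32-bit moves: each jlong travels in a single 64-bit access, via movq (MMX or XMM) or, on pre-MMX CPUs, the x87 fild_d/fistp_d pair, so a racing Java reader can never observe a torn, half-written long. A sketch of the guarantee being preserved, using C++11 atomics purely as illustration (not the stub's actual code):

    #include <atomic>
    #include <cstdint>

    // Each element moves in one 64-bit access, so a concurrent reader sees
    // either the old or the new value, never a mix of two 32-bit halves.
    static void copy_longs(const std::atomic<std::int64_t>* from,
                           std::atomic<std::int64_t>* to, long count) {
      for (long i = 0; i < count; ++i) {
        to[i].store(from[i].load(std::memory_order_relaxed),
                    std::memory_order_relaxed);
      }
    }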
@@ -1194 +1275 @@
     __ jmpb(L_copy_8_bytes);

     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
     }
   __ BIND(L_copy_8_bytes);
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);

-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
       __ emms();
     }
     inc_copy_counter_np(T_LONG);
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ xorptr(rax, rax); // return 0