comparison src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 405:2649e5276dd7
6532536: Optimize arraycopy stubs for Intel cpus
Summary: Use SSE2 movdqu in arraycopy stubs on the newest Intel CPUs
Reviewed-by: rasbold
author: kvn
date: Tue, 14 Oct 2008 15:10:26 -0700
parents: f8199438385b
children: 67e8b4d06369
404:78c058bc5cdc | 405:2649e5276dd7 |
---|---|
789 ShouldNotReachHere(); | 789 ShouldNotReachHere(); |
790 | 790 |
791 } | 791 } |
792 } | 792 } |
793 | 793 |
794 | |
794 // Copy 64 bytes chunks | 795 // Copy 64 bytes chunks |
795 // | 796 // |
796 // Inputs: | 797 // Inputs: |
797 // from - source array address | 798 // from - source array address |
798 // to_from - destination array address - from | 799 // to_from - destination array address - from |
799 // qword_count - 8-bytes element count, negative | 800 // qword_count - 8-bytes element count, negative |
800 // | 801 // |
802 void xmm_copy_forward(Register from, Register to_from, Register qword_count) { | |
803 assert( UseSSE >= 2, "supported cpu only" ); | |
804 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; | |
805 // Copy 64-byte chunks | |
806 __ jmpb(L_copy_64_bytes); | |
807 __ align(16); | |
808 __ BIND(L_copy_64_bytes_loop); | |
809 | |
810 if(UseUnalignedLoadStores) { | |
811 __ movdqu(xmm0, Address(from, 0)); | |
812 __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0); | |
813 __ movdqu(xmm1, Address(from, 16)); | |
814 __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1); | |
815 __ movdqu(xmm2, Address(from, 32)); | |
816 __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2); | |
817 __ movdqu(xmm3, Address(from, 48)); | |
818 __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3); | |
819 | |
820 } else { | |
821 __ movq(xmm0, Address(from, 0)); | |
822 __ movq(Address(from, to_from, Address::times_1, 0), xmm0); | |
823 __ movq(xmm1, Address(from, 8)); | |
824 __ movq(Address(from, to_from, Address::times_1, 8), xmm1); | |
825 __ movq(xmm2, Address(from, 16)); | |
826 __ movq(Address(from, to_from, Address::times_1, 16), xmm2); | |
827 __ movq(xmm3, Address(from, 24)); | |
828 __ movq(Address(from, to_from, Address::times_1, 24), xmm3); | |
829 __ movq(xmm4, Address(from, 32)); | |
830 __ movq(Address(from, to_from, Address::times_1, 32), xmm4); | |
831 __ movq(xmm5, Address(from, 40)); | |
832 __ movq(Address(from, to_from, Address::times_1, 40), xmm5); | |
833 __ movq(xmm6, Address(from, 48)); | |
834 __ movq(Address(from, to_from, Address::times_1, 48), xmm6); | |
835 __ movq(xmm7, Address(from, 56)); | |
836 __ movq(Address(from, to_from, Address::times_1, 56), xmm7); | |
837 } | |
838 | |
839 __ addl(from, 64); | |
840 __ BIND(L_copy_64_bytes); | |
841 __ subl(qword_count, 8); | |
842 __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); | |
843 __ addl(qword_count, 8); | |
844 __ jccb(Assembler::zero, L_exit); | |
845 // | |
846 // length is too short, just copy qwords | |
847 // | |
848 __ BIND(L_copy_8_bytes); | |
849 __ movq(xmm0, Address(from, 0)); | |
850 __ movq(Address(from, to_from, Address::times_1), xmm0); | |
851 __ addl(from, 8); | |
852 __ decrement(qword_count); | |
853 __ jcc(Assembler::greater, L_copy_8_bytes); | |
854 __ BIND(L_exit); | |
855 } | |
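
For readers not fluent in the MacroAssembler DSL: below is an illustrative SSE2-intrinsics sketch (not part of the changeset; the helper name is hypothetical) of the 64-byte body that `UseUnalignedLoadStores` selects. `_mm_loadu_si128`/`_mm_storeu_si128` are the intrinsic forms of the `movdqu` loads and stores emitted above; plain `from`/`to` pointers stand in for the stub's `from + to_from` addressing trick, which keeps the destination as a delta so advancing one register moves both cursors.

```cpp
#include <emmintrin.h>   // SSE2 intrinsics

// Illustrative sketch only: one iteration of the movdqu-based 64-byte copy,
// mirroring the stub's load/store pair order. The real stub addresses the
// destination as from + to_from rather than carrying a second pointer.
static void copy64_unaligned(const char* from, char* to) {
  __m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from +  0));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to +  0), x0);
  __m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 16));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 16), x1);
  __m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 32));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 32), x2);
  __m128i x3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from + 48));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to + 48), x3);
}
```
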
856 | |
857 // Copy 64 bytes chunks | |
858 // | |
859 // Inputs: | |
860 // from - source array address | |
861 // to_from - destination array address - from | |
862 // qword_count - 8-bytes element count, negative | |
863 // | |
801 void mmx_copy_forward(Register from, Register to_from, Register qword_count) { | 864 void mmx_copy_forward(Register from, Register to_from, Register qword_count) { |
865 assert( VM_Version::supports_mmx(), "supported cpu only" ); | |
802 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; | 866 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; |
803 // Copy 64-byte chunks | 867 // Copy 64-byte chunks |
804 __ jmpb(L_copy_64_bytes); | 868 __ jmpb(L_copy_64_bytes); |
805 __ align(16); | 869 __ align(16); |
806 __ BIND(L_copy_64_bytes_loop); | 870 __ BIND(L_copy_64_bytes_loop); |
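
The rest of mmx_copy_forward is unchanged and elided by the hunk. Both helpers share the same skeleton, which this hedged C++ sketch tries to make explicit (hypothetical names; plain pointers instead of the `to_from` delta): a main loop moving 8 qwords per iteration, then a one-qword tail for short remainders.

```cpp
#include <cstddef>
#include <cstring>

// Illustrative loop skeleton shared by xmm_copy_forward and mmx_copy_forward.
static void copy_forward_sketch(const char* from, char* to, size_t qword_count) {
  while (qword_count >= 8) {      // L_copy_64_bytes_loop
    std::memcpy(to, from, 64);    // 64-byte chunk body (xmm or mmx moves)
    from += 64;  to += 64;  qword_count -= 8;
  }
  while (qword_count > 0) {       // L_copy_8_bytes tail
    std::memcpy(to, from, 8);
    from += 8;   to += 8;   qword_count -= 1;
  }
}
```
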
874 BLOCK_COMMENT("Entry:"); | 938 BLOCK_COMMENT("Entry:"); |
875 | 939 |
876 __ subptr(to, from); // to --> to_from | 940 __ subptr(to, from); // to --> to_from |
877 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element | 941 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element |
878 __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp | 942 __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp |
879 if (!aligned && (t == T_BYTE || t == T_SHORT)) { | 943 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { |
880 // align source address at 4 bytes address boundary | 944 // align source address at 4 bytes address boundary |
881 if (t == T_BYTE) { | 945 if (t == T_BYTE) { |
882 // One byte misalignment happens only for byte arrays | 946 // One byte misalignment happens only for byte arrays |
883 __ testl(from, 1); | 947 __ testl(from, 1); |
884 __ jccb(Assembler::zero, L_skip_align1); | 948 __ jccb(Assembler::zero, L_skip_align1); |
904 __ rep_mov(); | 968 __ rep_mov(); |
905 __ subptr(to_from, from);// restore 'to_from' | 969 __ subptr(to_from, from);// restore 'to_from' |
906 __ mov(count, rax); // restore 'count' | 970 __ mov(count, rax); // restore 'count' |
907 __ jmpb(L_copy_2_bytes); // all dwords were copied | 971 __ jmpb(L_copy_2_bytes); // all dwords were copied |
908 } else { | 972 } else { |
909 // align to 8 bytes, we know we are 4 byte aligned to start | 973 if (!UseUnalignedLoadStores) { |
910 __ testptr(from, 4); | 974 // align to 8 bytes, we know we are 4 byte aligned to start |
911 __ jccb(Assembler::zero, L_copy_64_bytes); | 975 __ testptr(from, 4); |
912 __ movl(rax, Address(from, 0)); | 976 __ jccb(Assembler::zero, L_copy_64_bytes); |
913 __ movl(Address(from, to_from, Address::times_1, 0), rax); | 977 __ movl(rax, Address(from, 0)); |
914 __ addptr(from, 4); | 978 __ movl(Address(from, to_from, Address::times_1, 0), rax); |
915 __ subl(count, 1<<shift); | 979 __ addptr(from, 4); |
980 __ subl(count, 1<<shift); | |
981 } | |
916 __ BIND(L_copy_64_bytes); | 982 __ BIND(L_copy_64_bytes); |
917 __ mov(rax, count); | 983 __ mov(rax, count); |
918 __ shrl(rax, shift+1); // 8 bytes chunk count | 984 __ shrl(rax, shift+1); // 8 bytes chunk count |
919 // | 985 // |
920 // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop | 986 // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop |
921 // | 987 // |
922 mmx_copy_forward(from, to_from, rax); | 988 if (UseXMMForArrayCopy) { |
989 xmm_copy_forward(from, to_from, rax); | |
990 } else { | |
991 mmx_copy_forward(from, to_from, rax); | |
992 } | |
923 } | 993 } |
924 // copy tailing dword | 994 // copy tailing dword |
925 __ BIND(L_copy_4_bytes); | 995 __ BIND(L_copy_4_bytes); |
926 __ testl(count, 1<<shift); | 996 __ testl(count, 1<<shift); |
927 __ jccb(Assembler::zero, L_copy_2_bytes); | 997 __ jccb(Assembler::zero, L_copy_2_bytes); |
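
The point of the new `!UseUnalignedLoadStores` guards in this hunk: the aligned-only `movq` path must first nudge the source to an 8-byte boundary with byte/word/dword prefix copies, while `movdqu` tolerates any alignment and can enter the bulk loop immediately. A hedged sketch of just that decision (hypothetical helper, not HotSpot code):

```cpp
#include <cstddef>
#include <cstdint>

// How many leading bytes must be copied element-by-element before the bulk
// loop may start. With unaligned SSE2 moves the answer is always zero.
static size_t prefix_bytes_to_align8(const void* from, bool use_unaligned) {
  if (use_unaligned) return 0;                 // movdqu: enter bulk loop directly
  uintptr_t a = reinterpret_cast<uintptr_t>(from);
  return static_cast<size_t>((-a) & 7);        // bytes to next 8-byte boundary
}
```
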
1067 __ jmpb(L_copy_8_bytes); | 1137 __ jmpb(L_copy_8_bytes); |
1068 | 1138 |
1069 __ align(16); | 1139 __ align(16); |
1070 // Move 8 bytes | 1140 // Move 8 bytes |
1071 __ BIND(L_copy_8_bytes_loop); | 1141 __ BIND(L_copy_8_bytes_loop); |
1072 __ movq(mmx0, Address(from, count, sf, 0)); | 1142 if (UseXMMForArrayCopy) { |
1073 __ movq(Address(to, count, sf, 0), mmx0); | 1143 __ movq(xmm0, Address(from, count, sf, 0)); |
1144 __ movq(Address(to, count, sf, 0), xmm0); | |
1145 } else { | |
1146 __ movq(mmx0, Address(from, count, sf, 0)); | |
1147 __ movq(Address(to, count, sf, 0), mmx0); | |
1148 } | |
1074 __ BIND(L_copy_8_bytes); | 1149 __ BIND(L_copy_8_bytes); |
1075 __ subl(count, 2<<shift); | 1150 __ subl(count, 2<<shift); |
1076 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); | 1151 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); |
1077 __ addl(count, 2<<shift); | 1152 __ addl(count, 2<<shift); |
1078 __ emms(); | 1153 if (!UseXMMForArrayCopy) { |
1154 __ emms(); | |
1155 } | |
1079 } | 1156 } |
1080 __ BIND(L_copy_4_bytes); | 1157 __ BIND(L_copy_4_bytes); |
1081 // copy prefix qword | 1158 // copy prefix qword |
1082 __ testl(count, 1<<shift); | 1159 __ testl(count, 1<<shift); |
1083 __ jccb(Assembler::zero, L_copy_2_bytes); | 1160 __ jccb(Assembler::zero, L_copy_2_bytes); |
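
Why `emms()` is now conditional: MMX registers alias the x87 floating-point stack, so any `movq` through `mmx0` must be followed by EMMS before FP code runs again; XMM registers are independent state, so the `UseXMMForArrayCopy` path can skip it. A hedged sketch, assuming a compiler/target that still exposes MMX intrinsics (the helper name is hypothetical):

```cpp
#include <mmintrin.h>   // MMX intrinsics (x86 targets that still expose them)

// One 8-byte element copy through an MMX register; codegen may vary, but
// this is the movq load/store pair the non-XMM path emits.
static void copy8_mmx(const __m64* from, __m64* to) {
  *to = *from;          // movq load + movq store
  _mm_empty();          // emms: clear the x87 tag word before FP code runs
}
```
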
1141 *entry = __ pc(); // Entry point from conjoint arraycopy stub. | 1218 *entry = __ pc(); // Entry point from conjoint arraycopy stub. |
1142 BLOCK_COMMENT("Entry:"); | 1219 BLOCK_COMMENT("Entry:"); |
1143 | 1220 |
1144 __ subptr(to, from); // to --> to_from | 1221 __ subptr(to, from); // to --> to_from |
1145 if (VM_Version::supports_mmx()) { | 1222 if (VM_Version::supports_mmx()) { |
1146 mmx_copy_forward(from, to_from, count); | 1223 if (UseXMMForArrayCopy) { |
1224 xmm_copy_forward(from, to_from, count); | |
1225 } else { | |
1226 mmx_copy_forward(from, to_from, count); | |
1227 } | |
1147 } else { | 1228 } else { |
1148 __ jmpb(L_copy_8_bytes); | 1229 __ jmpb(L_copy_8_bytes); |
1149 __ align(16); | 1230 __ align(16); |
1150 __ BIND(L_copy_8_bytes_loop); | 1231 __ BIND(L_copy_8_bytes_loop); |
1151 __ fild_d(Address(from, 0)); | 1232 __ fild_d(Address(from, 0)); |
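
Taken together, the long-copy stubs now pick one of three 8-byte move sequences at generation time. A hedged C++ model of that ladder (illustrative names only; the real choice is baked into the emitted stub, not evaluated per element):

```cpp
// Strategy ladder for copying one jlong (8 bytes) on 32-bit x86.
enum class Copy8 { XMM_MOVQ, MMX_MOVQ, X87_FILD_FISTP };

static Copy8 pick_long_copy(bool supports_mmx, bool use_xmm_for_arraycopy) {
  if (supports_mmx) {
    return use_xmm_for_arraycopy ? Copy8::XMM_MOVQ   // no EMMS needed
                                 : Copy8::MMX_MOVQ;  // EMMS before return
  }
  return Copy8::X87_FILD_FISTP;  // pre-MMX fallback via the x87 stack
}
```

The `fild_d`/`fistp_d` fallback pushes the 8 bytes through the x87 stack, giving a single 64-bit load and a single 64-bit store per element even on CPUs without MMX or SSE.
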
1194 __ jmpb(L_copy_8_bytes); | 1275 __ jmpb(L_copy_8_bytes); |
1195 | 1276 |
1196 __ align(16); | 1277 __ align(16); |
1197 __ BIND(L_copy_8_bytes_loop); | 1278 __ BIND(L_copy_8_bytes_loop); |
1198 if (VM_Version::supports_mmx()) { | 1279 if (VM_Version::supports_mmx()) { |
1199 __ movq(mmx0, Address(from, count, Address::times_8)); | 1280 if (UseXMMForArrayCopy) { |
1200 __ movq(Address(to, count, Address::times_8), mmx0); | 1281 __ movq(xmm0, Address(from, count, Address::times_8)); |
1282 __ movq(Address(to, count, Address::times_8), xmm0); | |
1283 } else { | |
1284 __ movq(mmx0, Address(from, count, Address::times_8)); | |
1285 __ movq(Address(to, count, Address::times_8), mmx0); | |
1286 } | |
1201 } else { | 1287 } else { |
1202 __ fild_d(Address(from, count, Address::times_8)); | 1288 __ fild_d(Address(from, count, Address::times_8)); |
1203 __ fistp_d(Address(to, count, Address::times_8)); | 1289 __ fistp_d(Address(to, count, Address::times_8)); |
1204 } | 1290 } |
1205 __ BIND(L_copy_8_bytes); | 1291 __ BIND(L_copy_8_bytes); |
1206 __ decrement(count); | 1292 __ decrement(count); |
1207 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); | 1293 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); |
1208 | 1294 |
1209 if (VM_Version::supports_mmx()) { | 1295 if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) { |
1210 __ emms(); | 1296 __ emms(); |
1211 } | 1297 } |
1212 inc_copy_counter_np(T_LONG); | 1298 inc_copy_counter_np(T_LONG); |
1213 __ leave(); // required for proper stackwalking of RuntimeStub frame | 1299 __ leave(); // required for proper stackwalking of RuntimeStub frame |
1214 __ xorptr(rax, rax); // return 0 | 1300 __ xorptr(rax, rax); // return 0 |