Mercurial > hg > graal-jvmci-8
comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 3903:2f9b79ddb05c
7039731: arraycopy could use prefetch on SPARC
Summary: Use BIS and prefetch in arraycopy stubs for Sparc (BIS for T4 only).
Reviewed-by: never, iveresov
author | kvn |
---|---|
date | Fri, 02 Sep 2011 12:13:33 -0700 |
parents | baf763f388e6 |
children | c565834fb592 |
comparison
equal
deleted
inserted
replaced
3902:11a4af030e4b | 3903:2f9b79ddb05c |
---|---|
1122 default: | 1122 default: |
1123 ShouldNotReachHere(); | 1123 ShouldNotReachHere(); |
1124 } | 1124 } |
1125 } | 1125 } |
1126 | 1126 |
1127 // | |
1128 // Generate main code for disjoint arraycopy | |
1129 // | |
1130 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, | |
1131 Label& L_loop, bool use_prefetch, bool use_bis); | |
1132 | |
1133 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, | |
1134 int iter_size, CopyLoopFunc copy_loop_func) { | |
1135 Label L_copy; | |
1136 | |
1137 assert(log2_elem_size <= 3, "the following code should be changed"); | |
1138 int count_dec = 16>>log2_elem_size; | |
1139 | |
1140 int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance); | |
1141 assert(prefetch_dist < 4096, "invalid value"); | |
1142 prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size | |
1143 int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count | |
1144 | |
1145 if (UseBlockCopy) { | |
1146 Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy; | |
1147 | |
1148 // 64 bytes tail + bytes copied in one loop iteration | |
1149 int tail_size = 64 + iter_size; | |
1150 int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size; | |
1151 // Use BIS copy only for big arrays since it requires membar. | |
1152 __ set(block_copy_count, O4); | |
1153 __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy); | |
1154 // This code is for disjoint source and destination: | |
1155 // to <= from || to >= from+count | |
1156 // but BIS will stomp over 'from' if (to > from-tail_size && to <= from) | |
1157 __ sub(from, to, O4); | |
1158 __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for imm. | |
1159 __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy); | |
1160 | |
1161 __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY); | |
1162 // BIS should not be used to copy tail (64 bytes+iter_size) | |
1163 // to avoid zeroing of following values. | |
1164 __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0 | |
1165 | |
1166 if (prefetch_count > 0) { // rounded up to one iteration count | |
1167 // Do prefetching only if copy size is bigger | |
1168 // than prefetch distance. | |
1169 __ set(prefetch_count, O4); | |
1170 __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy); | |
1171 __ sub(count, prefetch_count, count); | |
1172 | |
1173 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true); | |
1174 __ add(count, prefetch_count, count); // restore count | |
1175 | |
1176 } // prefetch_count > 0 | |
1177 | |
1178 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true); | |
1179 __ add(count, (tail_size>>log2_elem_size), count); // restore count | |
1180 | |
1181 __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT); | |
1182 // BIS needs membar. | |
1183 __ membar(Assembler::StoreLoad); | |
1184 // Copy tail | |
1185 __ ba_short(L_copy); | |
1186 | |
1187 __ BIND(L_skip_block_copy); | |
1188 } // UseBlockCopy | |
1189 | |
1190 if (prefetch_count > 0) { // rounded up to one iteration count | |
1191 // Do prefetching only if copy size is bigger | |
1192 // than prefetch distance. | |
1193 __ set(prefetch_count, O4); | |
1194 __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy); | |
1195 __ sub(count, prefetch_count, count); | |
1196 | |
1197 Label L_copy_prefetch; | |
1198 (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false); | |
1199 __ add(count, prefetch_count, count); // restore count | |
1200 | |
1201 } // prefetch_count > 0 | |
1202 | |
1203 (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false); | |
1204 } | |
1205 | |
1206 | |
1207 | |
1208 // | |
1209 // Helper methods for copy_16_bytes_forward_with_shift() | |
1210 // | |
1211 void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec, | |
1212 Label& L_loop, bool use_prefetch, bool use_bis) { | |
1213 | |
1214 const Register left_shift = G1; // left shift bit counter | |
1215 const Register right_shift = G5; // right shift bit counter | |
1216 | |
1217 __ align(OptoLoopAlignment); | |
1218 __ BIND(L_loop); | |
1219 if (use_prefetch) { | |
1220 if (ArraycopySrcPrefetchDistance > 0) { | |
1221 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); | |
1222 } | |
1223 if (ArraycopyDstPrefetchDistance > 0) { | |
1224 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); | |
1225 } | |
1226 } | |
1227 __ ldx(from, 0, O4); | |
1228 __ ldx(from, 8, G4); | |
1229 __ inc(to, 16); | |
1230 __ inc(from, 16); | |
1231 __ deccc(count, count_dec); // Can we do next iteration after this one? | |
1232 __ srlx(O4, right_shift, G3); | |
1233 __ bset(G3, O3); | |
1234 __ sllx(O4, left_shift, O4); | |
1235 __ srlx(G4, right_shift, G3); | |
1236 __ bset(G3, O4); | |
1237 if (use_bis) { | |
1238 __ stxa(O3, to, -16); | |
1239 __ stxa(O4, to, -8); | |
1240 } else { | |
1241 __ stx(O3, to, -16); | |
1242 __ stx(O4, to, -8); | |
1243 } | |
1244 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); | |
1245 __ delayed()->sllx(G4, left_shift, O3); | |
1246 } | |
1127 | 1247 |
1128 // Copy big chunks forward with shift | 1248 // Copy big chunks forward with shift |
1129 // | 1249 // |
1130 // Inputs: | 1250 // Inputs: |
1131 // from - source arrays | 1251 // from - source arrays |
1133 // count - elements count to copy >= the count equivalent to 16 bytes | 1253 // count - elements count to copy >= the count equivalent to 16 bytes |
1134 // count_dec - elements count's decrement equivalent to 16 bytes | 1254 // count_dec - elements count's decrement equivalent to 16 bytes |
1135 // L_copy_bytes - copy exit label | 1255 // L_copy_bytes - copy exit label |
1136 // | 1256 // |
1137 void copy_16_bytes_forward_with_shift(Register from, Register to, | 1257 void copy_16_bytes_forward_with_shift(Register from, Register to, |
1138 Register count, int count_dec, Label& L_copy_bytes) { | 1258 Register count, int log2_elem_size, Label& L_copy_bytes) { |
1139 Label L_loop, L_aligned_copy, L_copy_last_bytes; | 1259 Label L_aligned_copy, L_copy_last_bytes; |
1260 assert(log2_elem_size <= 3, "the following code should be changed"); | |
1261 int count_dec = 16>>log2_elem_size; | |
1140 | 1262 |
1141 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy | 1263 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy |
1142 __ andcc(from, 7, G1); // misaligned bytes | 1264 __ andcc(from, 7, G1); // misaligned bytes |
1143 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); | 1265 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); |
1144 __ delayed()->nop(); | 1266 __ delayed()->nop(); |
1145 | 1267 |
1146 const Register left_shift = G1; // left shift bit counter | 1268 const Register left_shift = G1; // left shift bit counter |
1147 const Register right_shift = G5; // right shift bit counter | 1269 const Register right_shift = G5; // right shift bit counter |
1148 | 1270 |
1149 __ sll(G1, LogBitsPerByte, left_shift); | 1271 __ sll(G1, LogBitsPerByte, left_shift); |
1150 __ mov(64, right_shift); | 1272 __ mov(64, right_shift); |
1151 __ sub(right_shift, left_shift, right_shift); | 1273 __ sub(right_shift, left_shift, right_shift); |
1152 | 1274 |
1153 // | 1275 // |
1154 // Load 2 aligned 8-bytes chunks and use one from previous iteration | 1276 // Load 2 aligned 8-bytes chunks and use one from previous iteration |
1155 // to form 2 aligned 8-bytes chunks to store. | 1277 // to form 2 aligned 8-bytes chunks to store. |
1156 // | 1278 // |
1157 __ deccc(count, count_dec); // Pre-decrement 'count' | 1279 __ dec(count, count_dec); // Pre-decrement 'count' |
1158 __ andn(from, 7, from); // Align address | 1280 __ andn(from, 7, from); // Align address |
1159 __ ldx(from, 0, O3); | 1281 __ ldx(from, 0, O3); |
1160 __ inc(from, 8); | 1282 __ inc(from, 8); |
1161 __ align(OptoLoopAlignment); | 1283 __ sllx(O3, left_shift, O3); |
1162 __ BIND(L_loop); | 1284 |
1163 __ ldx(from, 0, O4); | 1285 disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop); |
1164 __ deccc(count, count_dec); // Can we do next iteration after this one? | 1286 |
1165 __ ldx(from, 8, G4); | 1287 __ inccc(count, count_dec>>1 ); // + 8 bytes |
1166 __ inc(to, 16); | 1288 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); |
1167 __ inc(from, 16); | 1289 __ delayed()->inc(count, count_dec>>1); // restore 'count' |
1168 __ sllx(O3, left_shift, O3); | 1290 |
1169 __ srlx(O4, right_shift, G3); | 1291 // copy 8 bytes, part of them already loaded in O3 |
1170 __ bset(G3, O3); | 1292 __ ldx(from, 0, O4); |
1171 __ stx(O3, to, -16); | 1293 __ inc(to, 8); |
1172 __ sllx(O4, left_shift, O4); | 1294 __ inc(from, 8); |
1173 __ srlx(G4, right_shift, G3); | 1295 __ srlx(O4, right_shift, G3); |
1174 __ bset(G3, O4); | 1296 __ bset(O3, G3); |
1175 __ stx(O4, to, -8); | 1297 __ stx(G3, to, -8); |
1176 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); | |
1177 __ delayed()->mov(G4, O3); | |
1178 | |
1179 __ inccc(count, count_dec>>1 ); // + 8 bytes | |
1180 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); | |
1181 __ delayed()->inc(count, count_dec>>1); // restore 'count' | |
1182 | |
1183 // copy 8 bytes, part of them already loaded in O3 | |
1184 __ ldx(from, 0, O4); | |
1185 __ inc(to, 8); | |
1186 __ inc(from, 8); | |
1187 __ sllx(O3, left_shift, O3); | |
1188 __ srlx(O4, right_shift, G3); | |
1189 __ bset(O3, G3); | |
1190 __ stx(G3, to, -8); | |
1191 | 1298 |
1192 __ BIND(L_copy_last_bytes); | 1299 __ BIND(L_copy_last_bytes); |
1193 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes | 1300 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes |
1194 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); | 1301 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); |
1195 __ delayed()->sub(from, right_shift, from); // restore address | 1302 __ delayed()->sub(from, right_shift, from); // restore address |
1196 | 1303 |
1197 __ BIND(L_aligned_copy); | 1304 __ BIND(L_aligned_copy); |
1198 } | 1305 } |
1199 | 1306 |
1200 // Copy big chunks backward with shift | 1307 // Copy big chunks backward with shift |
1346 // the same alignment mod 8, otherwise fall through to the next | 1453 // the same alignment mod 8, otherwise fall through to the next |
1347 // code for aligned copy. | 1454 // code for aligned copy. |
1348 // The compare above (count >= 23) guarantees 'count' >= 16 bytes. | 1455 // The compare above (count >= 23) guarantees 'count' >= 16 bytes. |
1349 // Also jump over aligned copy after the copy with shift completed. | 1456 // Also jump over aligned copy after the copy with shift completed. |
1350 | 1457 |
1351 copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte); | 1458 copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte); |
1352 } | 1459 } |
1353 | 1460 |
1354 // Both arrays are 8 bytes aligned, copy 16 bytes at a time | 1461 // Both arrays are 8 bytes aligned, copy 16 bytes at a time |
1355 __ and3(count, 7, G4); // Save count | 1462 __ and3(count, 7, G4); // Save count |
1356 __ srl(count, 3, count); | 1463 __ srl(count, 3, count); |
1574 // the same alignment mod 8, otherwise fall through to the next | 1681 // the same alignment mod 8, otherwise fall through to the next |
1575 // code for aligned copy. | 1682 // code for aligned copy. |
1576 // The compare above (count >= 11) guarantees 'count' >= 16 bytes. | 1683 // The compare above (count >= 11) guarantees 'count' >= 16 bytes. |
1577 // Also jump over aligned copy after the copy with shift completed. | 1684 // Also jump over aligned copy after the copy with shift completed. |
1578 | 1685 |
1579 copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes); | 1686 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); |
1580 } | 1687 } |
1581 | 1688 |
1582 // Both arrays are 8 bytes aligned, copy 16 bytes at a time | 1689 // Both arrays are 8 bytes aligned, copy 16 bytes at a time |
1583 __ and3(count, 3, G4); // Save | 1690 __ and3(count, 3, G4); // Save |
1584 __ srl(count, 2, count); | 1691 __ srl(count, 2, count); |
1948 __ delayed()->mov(G0, O0); // return 0 | 2055 __ delayed()->mov(G0, O0); // return 0 |
1949 return start; | 2056 return start; |
1950 } | 2057 } |
1951 | 2058 |
1952 // | 2059 // |
2060 // Helper methods for generate_disjoint_int_copy_core() | |
2061 // | |
2062 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, | |
2063 Label& L_loop, bool use_prefetch, bool use_bis) { | |
2064 | |
2065 __ align(OptoLoopAlignment); | |
2066 __ BIND(L_loop); | |
2067 if (use_prefetch) { | |
2068 if (ArraycopySrcPrefetchDistance > 0) { | |
2069 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); | |
2070 } | |
2071 if (ArraycopyDstPrefetchDistance > 0) { | |
2072 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); | |
2073 } | |
2074 } | |
2075 __ ldx(from, 4, O4); | |
2076 __ ldx(from, 12, G4); | |
2077 __ inc(to, 16); | |
2078 __ inc(from, 16); | |
2079 __ deccc(count, 4); // Can we do next iteration after this one? | |
2080 | |
2081 __ srlx(O4, 32, G3); | |
2082 __ bset(G3, O3); | |
2083 __ sllx(O4, 32, O4); | |
2084 __ srlx(G4, 32, G3); | |
2085 __ bset(G3, O4); | |
2086 if (use_bis) { | |
2087 __ stxa(O3, to, -16); | |
2088 __ stxa(O4, to, -8); | |
2089 } else { | |
2090 __ stx(O3, to, -16); | |
2091 __ stx(O4, to, -8); | |
2092 } | |
2093 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); | |
2094 __ delayed()->sllx(G4, 32, O3); | |
2095 | |
2096 } | |
2097 | |
2098 // | |
1953 // Generate core code for disjoint int copy (and oop copy on 32-bit). | 2099 // Generate core code for disjoint int copy (and oop copy on 32-bit). |
1954 // If "aligned" is true, the "from" and "to" addresses are assumed | 2100 // If "aligned" is true, the "from" and "to" addresses are assumed |
1955 // to be heapword aligned. | 2101 // to be heapword aligned. |
1956 // | 2102 // |
1957 // Arguments: | 2103 // Arguments: |
1960 // count: O2 treated as signed | 2106 // count: O2 treated as signed |
1961 // | 2107 // |
1962 void generate_disjoint_int_copy_core(bool aligned) { | 2108 void generate_disjoint_int_copy_core(bool aligned) { |
1963 | 2109 |
1964 Label L_skip_alignment, L_aligned_copy; | 2110 Label L_skip_alignment, L_aligned_copy; |
1965 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; | 2111 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; |
1966 | 2112 |
1967 const Register from = O0; // source array address | 2113 const Register from = O0; // source array address |
1968 const Register to = O1; // destination array address | 2114 const Register to = O1; // destination array address |
1969 const Register count = O2; // elements count | 2115 const Register count = O2; // elements count |
1970 const Register offset = O5; // offset from start of arrays | 2116 const Register offset = O5; // offset from start of arrays |
2011 // copy_16_bytes_forward_with_shift() is not used here since this | 2157 // copy_16_bytes_forward_with_shift() is not used here since this |
2012 // code is more optimal. | 2158 // code is more optimal. |
2013 | 2159 |
2014 // copy with shift 4 elements (16 bytes) at a time | 2160 // copy with shift 4 elements (16 bytes) at a time |
2015 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 | 2161 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 |
2016 | 2162 __ sllx(O3, 32, O3); |
2017 __ align(OptoLoopAlignment); | 2163 |
2018 __ BIND(L_copy_16_bytes); | 2164 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); |
2019 __ ldx(from, 4, O4); | |
2020 __ deccc(count, 4); // Can we do next iteration after this one? | |
2021 __ ldx(from, 12, G4); | |
2022 __ inc(to, 16); | |
2023 __ inc(from, 16); | |
2024 __ sllx(O3, 32, O3); | |
2025 __ srlx(O4, 32, G3); | |
2026 __ bset(G3, O3); | |
2027 __ stx(O3, to, -16); | |
2028 __ sllx(O4, 32, O4); | |
2029 __ srlx(G4, 32, G3); | |
2030 __ bset(G3, O4); | |
2031 __ stx(O4, to, -8); | |
2032 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); | |
2033 __ delayed()->mov(G4, O3); | |
2034 | 2165 |
2035 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); | 2166 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); |
2036 __ delayed()->inc(count, 4); // restore 'count' | 2167 __ delayed()->inc(count, 4); // restore 'count' |
2037 | 2168 |
2038 __ BIND(L_aligned_copy); | 2169 __ BIND(L_aligned_copy); |
2039 } | 2170 } // !aligned |
2171 | |
2040 // copy 4 elements (16 bytes) at a time | 2172 // copy 4 elements (16 bytes) at a time |
2041 __ and3(count, 1, G4); // Save | 2173 __ and3(count, 1, G4); // Save |
2042 __ srl(count, 1, count); | 2174 __ srl(count, 1, count); |
2043 generate_disjoint_long_copy_core(aligned); | 2175 generate_disjoint_long_copy_core(aligned); |
2044 __ mov(G4, count); // Restore | 2176 __ mov(G4, count); // Restore |
2221 __ delayed()->mov(G0, O0); // return 0 | 2353 __ delayed()->mov(G0, O0); // return 0 |
2222 return start; | 2354 return start; |
2223 } | 2355 } |
2224 | 2356 |
2225 // | 2357 // |
2358 // Helper methods for generate_disjoint_long_copy_core() | |
2359 // | |
2360 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, | |
2361 Label& L_loop, bool use_prefetch, bool use_bis) { | |
2362 __ align(OptoLoopAlignment); | |
2363 __ BIND(L_loop); | |
2364 for (int off = 0; off < 64; off += 16) { | |
2365 if (use_prefetch && (off & 31) == 0) { | |
2366 if (ArraycopySrcPrefetchDistance > 0) { | |
2367 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); | |
2368 } | |
2369 if (ArraycopyDstPrefetchDistance > 0) { | |
2370 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); | |
2371 } | |
2372 } | |
2373 __ ldx(from, off+0, O4); | |
2374 __ ldx(from, off+8, O5); | |
2375 if (use_bis) { | |
2376 __ stxa(O4, to, off+0); | |
2377 __ stxa(O5, to, off+8); | |
2378 } else { | |
2379 __ stx(O4, to, off+0); | |
2380 __ stx(O5, to, off+8); | |
2381 } | |
2382 } | |
2383 __ deccc(count, 8); | |
2384 __ inc(from, 64); | |
2385 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); | |
2386 __ delayed()->inc(to, 64); | |
2387 } | |
2388 | |
2389 // | |
2226 // Generate core code for disjoint long copy (and oop copy on 64-bit). | 2390 // Generate core code for disjoint long copy (and oop copy on 64-bit). |
2227 // "aligned" is ignored, because we must make the stronger | 2391 // "aligned" is ignored, because we must make the stronger |
2228 // assumption that both addresses are always 64-bit aligned. | 2392 // assumption that both addresses are always 64-bit aligned. |
2229 // | 2393 // |
2230 // Arguments: | 2394 // Arguments: |
2259 const Register to = O1; // destination array address | 2423 const Register to = O1; // destination array address |
2260 const Register count = O2; // elements count | 2424 const Register count = O2; // elements count |
2261 const Register offset0 = O4; // element offset | 2425 const Register offset0 = O4; // element offset |
2262 const Register offset8 = O5; // next element offset | 2426 const Register offset8 = O5; // next element offset |
2263 | 2427 |
2264 __ deccc(count, 2); | 2428 __ deccc(count, 2); |
2265 __ mov(G0, offset0); // offset from start of arrays (0) | 2429 __ mov(G0, offset0); // offset from start of arrays (0) |
2266 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); | 2430 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); |
2267 __ delayed()->add(offset0, 8, offset8); | 2431 __ delayed()->add(offset0, 8, offset8); |
2268 | 2432 |
2269 // Copy by 64 bytes chunks | 2433 // Copy by 64 bytes chunks |
2270 Label L_copy_64_bytes; | 2434 |
2271 const Register from64 = O3; // source address | 2435 const Register from64 = O3; // source address |
2272 const Register to64 = G3; // destination address | 2436 const Register to64 = G3; // destination address |
2273 __ subcc(count, 6, O3); | 2437 __ subcc(count, 6, O3); |
2274 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); | 2438 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); |
2275 __ delayed()->mov(to, to64); | 2439 __ delayed()->mov(to, to64); |
2276 // Now we can use O4(offset0), O5(offset8) as temps | 2440 // Now we can use O4(offset0), O5(offset8) as temps |
2277 __ mov(O3, count); | 2441 __ mov(O3, count); |
2278 __ mov(from, from64); | 2442 // count >= 0 (original count - 8) |
2279 | 2443 __ mov(from, from64); |
2280 __ align(OptoLoopAlignment); | 2444 |
2281 __ BIND(L_copy_64_bytes); | 2445 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); |
2282 for( int off = 0; off < 64; off += 16 ) { | |
2283 __ ldx(from64, off+0, O4); | |
2284 __ ldx(from64, off+8, O5); | |
2285 __ stx(O4, to64, off+0); | |
2286 __ stx(O5, to64, off+8); | |
2287 } | |
2288 __ deccc(count, 8); | |
2289 __ inc(from64, 64); | |
2290 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes); | |
2291 __ delayed()->inc(to64, 64); | |
2292 | 2446 |
2293 // Restore O4(offset0), O5(offset8) | 2447 // Restore O4(offset0), O5(offset8) |
2294 __ sub(from64, from, offset0); | 2448 __ sub(from64, from, offset0); |
2295 __ inccc(count, 6); | 2449 __ inccc(count, 6); // restore count |
2296 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); | 2450 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); |
2297 __ delayed()->add(offset0, 8, offset8); | 2451 __ delayed()->add(offset0, 8, offset8); |
2298 | 2452 |
2299 // Copy by 16 bytes chunks | 2453 // Copy by 16 bytes chunks |
2300 __ align(OptoLoopAlignment); | 2454 __ align(OptoLoopAlignment); |