comparison src/cpu/x86/vm/stubGenerator_x86_64.cpp @ 7475:e2e6bf86682c

8005544: Use 256bit YMM registers in arraycopy stubs on x86
Summary: Use YMM registers in arraycopy and array_fill stubs.
Reviewed-by: roland, twisti
author kvn
date Thu, 03 Jan 2013 16:30:47 -0800
parents 2c7f594145dc
children 989155e2d07a e961c11b85fe
comparison
legend: equal | deleted | inserted | replaced
7474:00af3a3a8df4 7475:e2e6bf86682c
1284 // Inputs: 1284 // Inputs:
1285 // end_from - source arrays end address 1285 // end_from - source arrays end address
1286 // end_to - destination array end address 1286 // end_to - destination array end address
1287 // qword_count - 64-bits element count, negative 1287 // qword_count - 64-bits element count, negative
1288 // to - scratch 1288 // to - scratch
1289 // L_copy_32_bytes - entry label 1289 // L_copy_bytes - entry label
1290 // L_copy_8_bytes - exit label 1290 // L_copy_8_bytes - exit label
1291 // 1291 //
1292 void copy_32_bytes_forward(Register end_from, Register end_to, 1292 void copy_bytes_forward(Register end_from, Register end_to,
1293 Register qword_count, Register to, 1293 Register qword_count, Register to,
1294 Label& L_copy_32_bytes, Label& L_copy_8_bytes) { 1294 Label& L_copy_bytes, Label& L_copy_8_bytes) {
1295 DEBUG_ONLY(__ stop("enter at entry label, not here")); 1295 DEBUG_ONLY(__ stop("enter at entry label, not here"));
1296 Label L_loop; 1296 Label L_loop;
1297 __ align(OptoLoopAlignment); 1297 __ align(OptoLoopAlignment);
1298 __ BIND(L_loop); 1298 if (UseUnalignedLoadStores) {
1299 if(UseUnalignedLoadStores) { 1299 Label L_end;
1300 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); 1300 // Copy 64-bytes per iteration
1301 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); 1301 __ BIND(L_loop);
1302 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); 1302 if (UseAVX >= 2) {
1303 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); 1303 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1304 1304 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1305 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1306 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1307 } else {
1308 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1309 __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1310 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1311 __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1312 __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1313 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1314 __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1315 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1316 }
1317 __ BIND(L_copy_bytes);
1318 __ addptr(qword_count, 8);
1319 __ jcc(Assembler::lessEqual, L_loop);
1320 __ subptr(qword_count, 4); // sub(8) and add(4)
1321 __ jccb(Assembler::greater, L_end);
1322 // Copy trailing 32 bytes
1323 if (UseAVX >= 2) {
1324 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1325 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1326 } else {
1327 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1328 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1329 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1330 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1331 }
1332 __ addptr(qword_count, 4);
1333 __ BIND(L_end);
1305 } else { 1334 } else {
1335 // Copy 32-bytes per iteration
1336 __ BIND(L_loop);
1306 __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); 1337 __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1307 __ movq(Address(end_to, qword_count, Address::times_8, -24), to); 1338 __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1308 __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); 1339 __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1309 __ movq(Address(end_to, qword_count, Address::times_8, -16), to); 1340 __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1310 __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); 1341 __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1311 __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); 1342 __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1312 __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); 1343 __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1313 __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); 1344 __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1314 } 1345
1315 __ BIND(L_copy_32_bytes); 1346 __ BIND(L_copy_bytes);
1316 __ addptr(qword_count, 4); 1347 __ addptr(qword_count, 4);
1317 __ jcc(Assembler::lessEqual, L_loop); 1348 __ jcc(Assembler::lessEqual, L_loop);
1349 }
1318 __ subptr(qword_count, 4); 1350 __ subptr(qword_count, 4);
1319 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords 1351 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1320 } 1352 }
1321
1322 1353
1323 // Copy big chunks backward 1354 // Copy big chunks backward
1324 // 1355 //
1325 // Inputs: 1356 // Inputs:
1326 // from - source arrays address 1357 // from - source arrays address
1327 // dest - destination array address 1358 // dest - destination array address
1328 // qword_count - 64-bits element count 1359 // qword_count - 64-bits element count
1329 // to - scratch 1360 // to - scratch
1330 // L_copy_32_bytes - entry label 1361 // L_copy_bytes - entry label
1331 // L_copy_8_bytes - exit label 1362 // L_copy_8_bytes - exit label
1332 // 1363 //
1333 void copy_32_bytes_backward(Register from, Register dest, 1364 void copy_bytes_backward(Register from, Register dest,
1334 Register qword_count, Register to, 1365 Register qword_count, Register to,
1335 Label& L_copy_32_bytes, Label& L_copy_8_bytes) { 1366 Label& L_copy_bytes, Label& L_copy_8_bytes) {
1336 DEBUG_ONLY(__ stop("enter at entry label, not here")); 1367 DEBUG_ONLY(__ stop("enter at entry label, not here"));
1337 Label L_loop; 1368 Label L_loop;
1338 __ align(OptoLoopAlignment); 1369 __ align(OptoLoopAlignment);
1339 __ BIND(L_loop); 1370 if (UseUnalignedLoadStores) {
1340 if(UseUnalignedLoadStores) { 1371 Label L_end;
1341 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); 1372 // Copy 64-bytes per iteration
1342 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); 1373 __ BIND(L_loop);
1343 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 1374 if (UseAVX >= 2) {
1344 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 1375 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1345 1376 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1377 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
1378 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
1379 } else {
1380 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1381 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1382 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1383 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1384 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1385 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1386 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
1387 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
1388 }
1389 __ BIND(L_copy_bytes);
1390 __ subptr(qword_count, 8);
1391 __ jcc(Assembler::greaterEqual, L_loop);
1392
1393 __ addptr(qword_count, 4); // add(8) and sub(4)
1394 __ jccb(Assembler::less, L_end);
1395 // Copy trailing 32 bytes
1396 if (UseAVX >= 2) {
1397 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1398 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1399 } else {
1400 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1401 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1402 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
1403 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
1404 }
1405 __ subptr(qword_count, 4);
1406 __ BIND(L_end);
1346 } else { 1407 } else {
1408 // Copy 32-bytes per iteration
1409 __ BIND(L_loop);
1347 __ movq(to, Address(from, qword_count, Address::times_8, 24)); 1410 __ movq(to, Address(from, qword_count, Address::times_8, 24));
1348 __ movq(Address(dest, qword_count, Address::times_8, 24), to); 1411 __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1349 __ movq(to, Address(from, qword_count, Address::times_8, 16)); 1412 __ movq(to, Address(from, qword_count, Address::times_8, 16));
1350 __ movq(Address(dest, qword_count, Address::times_8, 16), to); 1413 __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1351 __ movq(to, Address(from, qword_count, Address::times_8, 8)); 1414 __ movq(to, Address(from, qword_count, Address::times_8, 8));
1352 __ movq(Address(dest, qword_count, Address::times_8, 8), to); 1415 __ movq(Address(dest, qword_count, Address::times_8, 8), to);
1353 __ movq(to, Address(from, qword_count, Address::times_8, 0)); 1416 __ movq(to, Address(from, qword_count, Address::times_8, 0));
1354 __ movq(Address(dest, qword_count, Address::times_8, 0), to); 1417 __ movq(Address(dest, qword_count, Address::times_8, 0), to);
1355 } 1418
1356 __ BIND(L_copy_32_bytes); 1419 __ BIND(L_copy_bytes);
1357 __ subptr(qword_count, 4); 1420 __ subptr(qword_count, 4);
1358 __ jcc(Assembler::greaterEqual, L_loop); 1421 __ jcc(Assembler::greaterEqual, L_loop);
1422 }
1359 __ addptr(qword_count, 4); 1423 __ addptr(qword_count, 4);
1360 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords 1424 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1361 } 1425 }
1362 1426
1363 1427
1383 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1447 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1384 __ align(CodeEntryAlignment); 1448 __ align(CodeEntryAlignment);
1385 StubCodeMark mark(this, "StubRoutines", name); 1449 StubCodeMark mark(this, "StubRoutines", name);
1386 address start = __ pc(); 1450 address start = __ pc();
1387 1451
1388 Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1452 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1389 Label L_copy_byte, L_exit; 1453 Label L_copy_byte, L_exit;
1390 const Register from = rdi; // source array address 1454 const Register from = rdi; // source array address
1391 const Register to = rsi; // destination array address 1455 const Register to = rsi; // destination array address
1392 const Register count = rdx; // elements count 1456 const Register count = rdx; // elements count
1393 const Register byte_count = rcx; 1457 const Register byte_count = rcx;
1415 1479
1416 // Copy from low to high addresses. Use 'to' as scratch. 1480 // Copy from low to high addresses. Use 'to' as scratch.
1417 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1481 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1418 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1482 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1419 __ negptr(qword_count); // make the count negative 1483 __ negptr(qword_count); // make the count negative
1420 __ jmp(L_copy_32_bytes); 1484 __ jmp(L_copy_bytes);
1421 1485
1422 // Copy trailing qwords 1486 // Copy trailing qwords
1423 __ BIND(L_copy_8_bytes); 1487 __ BIND(L_copy_8_bytes);
1424 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1488 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1425 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1489 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1458 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1522 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1459 __ xorptr(rax, rax); // return 0 1523 __ xorptr(rax, rax); // return 0
1460 __ leave(); // required for proper stackwalking of RuntimeStub frame 1524 __ leave(); // required for proper stackwalking of RuntimeStub frame
1461 __ ret(0); 1525 __ ret(0);
1462 1526
1463 // Copy in 32-bytes chunks 1527 // Copy in multi-bytes chunks
1464 copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 1528 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1465 __ jmp(L_copy_4_bytes); 1529 __ jmp(L_copy_4_bytes);
1466 1530
1467 return start; 1531 return start;
1468 } 1532 }
1469 1533
1486 address* entry, const char *name) { 1550 address* entry, const char *name) {
1487 __ align(CodeEntryAlignment); 1551 __ align(CodeEntryAlignment);
1488 StubCodeMark mark(this, "StubRoutines", name); 1552 StubCodeMark mark(this, "StubRoutines", name);
1489 address start = __ pc(); 1553 address start = __ pc();
1490 1554
1491 Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1555 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1492 const Register from = rdi; // source array address 1556 const Register from = rdi; // source array address
1493 const Register to = rsi; // destination array address 1557 const Register to = rsi; // destination array address
1494 const Register count = rdx; // elements count 1558 const Register count = rdx; // elements count
1495 const Register byte_count = rcx; 1559 const Register byte_count = rcx;
1496 const Register qword_count = count; 1560 const Register qword_count = count;
1529 __ movw(Address(to, byte_count, Address::times_1, -2), rax); 1593 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1530 1594
1531 // Check for and copy trailing dword 1595 // Check for and copy trailing dword
1532 __ BIND(L_copy_4_bytes); 1596 __ BIND(L_copy_4_bytes);
1533 __ testl(byte_count, 4); 1597 __ testl(byte_count, 4);
1534 __ jcc(Assembler::zero, L_copy_32_bytes); 1598 __ jcc(Assembler::zero, L_copy_bytes);
1535 __ movl(rax, Address(from, qword_count, Address::times_8)); 1599 __ movl(rax, Address(from, qword_count, Address::times_8));
1536 __ movl(Address(to, qword_count, Address::times_8), rax); 1600 __ movl(Address(to, qword_count, Address::times_8), rax);
1537 __ jmp(L_copy_32_bytes); 1601 __ jmp(L_copy_bytes);
1538 1602
1539 // Copy trailing qwords 1603 // Copy trailing qwords
1540 __ BIND(L_copy_8_bytes); 1604 __ BIND(L_copy_8_bytes);
1541 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1605 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1542 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1606 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1547 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1611 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1548 __ xorptr(rax, rax); // return 0 1612 __ xorptr(rax, rax); // return 0
1549 __ leave(); // required for proper stackwalking of RuntimeStub frame 1613 __ leave(); // required for proper stackwalking of RuntimeStub frame
1550 __ ret(0); 1614 __ ret(0);
1551 1615
1552 // Copy in 32-bytes chunks 1616 // Copy in multi-bytes chunks
1553 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 1617 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1554 1618
1555 restore_arg_regs(); 1619 restore_arg_regs();
1556 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1620 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1557 __ xorptr(rax, rax); // return 0 1621 __ xorptr(rax, rax); // return 0
1558 __ leave(); // required for proper stackwalking of RuntimeStub frame 1622 __ leave(); // required for proper stackwalking of RuntimeStub frame
1583 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { 1647 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1584 __ align(CodeEntryAlignment); 1648 __ align(CodeEntryAlignment);
1585 StubCodeMark mark(this, "StubRoutines", name); 1649 StubCodeMark mark(this, "StubRoutines", name);
1586 address start = __ pc(); 1650 address start = __ pc();
1587 1651
1588 Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; 1652 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1589 const Register from = rdi; // source array address 1653 const Register from = rdi; // source array address
1590 const Register to = rsi; // destination array address 1654 const Register to = rsi; // destination array address
1591 const Register count = rdx; // elements count 1655 const Register count = rdx; // elements count
1592 const Register word_count = rcx; 1656 const Register word_count = rcx;
1593 const Register qword_count = count; 1657 const Register qword_count = count;
1614 1678
1615 // Copy from low to high addresses. Use 'to' as scratch. 1679 // Copy from low to high addresses. Use 'to' as scratch.
1616 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1680 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1617 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1681 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1618 __ negptr(qword_count); 1682 __ negptr(qword_count);
1619 __ jmp(L_copy_32_bytes); 1683 __ jmp(L_copy_bytes);
1620 1684
1621 // Copy trailing qwords 1685 // Copy trailing qwords
1622 __ BIND(L_copy_8_bytes); 1686 __ BIND(L_copy_8_bytes);
1623 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1687 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1624 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1688 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1650 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 1714 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1651 __ xorptr(rax, rax); // return 0 1715 __ xorptr(rax, rax); // return 0
1652 __ leave(); // required for proper stackwalking of RuntimeStub frame 1716 __ leave(); // required for proper stackwalking of RuntimeStub frame
1653 __ ret(0); 1717 __ ret(0);
1654 1718
1655 // Copy in 32-bytes chunks 1719 // Copy in multi-bytes chunks
1656 copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 1720 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1657 __ jmp(L_copy_4_bytes); 1721 __ jmp(L_copy_4_bytes);
1658 1722
1659 return start; 1723 return start;
1660 } 1724 }
1661 1725
1698 address *entry, const char *name) { 1762 address *entry, const char *name) {
1699 __ align(CodeEntryAlignment); 1763 __ align(CodeEntryAlignment);
1700 StubCodeMark mark(this, "StubRoutines", name); 1764 StubCodeMark mark(this, "StubRoutines", name);
1701 address start = __ pc(); 1765 address start = __ pc();
1702 1766
1703 Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes; 1767 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1704 const Register from = rdi; // source array address 1768 const Register from = rdi; // source array address
1705 const Register to = rsi; // destination array address 1769 const Register to = rsi; // destination array address
1706 const Register count = rdx; // elements count 1770 const Register count = rdx; // elements count
1707 const Register word_count = rcx; 1771 const Register word_count = rcx;
1708 const Register qword_count = count; 1772 const Register qword_count = count;
1733 __ movw(Address(to, word_count, Address::times_2, -2), rax); 1797 __ movw(Address(to, word_count, Address::times_2, -2), rax);
1734 1798
1735 // Check for and copy trailing dword 1799 // Check for and copy trailing dword
1736 __ BIND(L_copy_4_bytes); 1800 __ BIND(L_copy_4_bytes);
1737 __ testl(word_count, 2); 1801 __ testl(word_count, 2);
1738 __ jcc(Assembler::zero, L_copy_32_bytes); 1802 __ jcc(Assembler::zero, L_copy_bytes);
1739 __ movl(rax, Address(from, qword_count, Address::times_8)); 1803 __ movl(rax, Address(from, qword_count, Address::times_8));
1740 __ movl(Address(to, qword_count, Address::times_8), rax); 1804 __ movl(Address(to, qword_count, Address::times_8), rax);
1741 __ jmp(L_copy_32_bytes); 1805 __ jmp(L_copy_bytes);
1742 1806
1743 // Copy trailing qwords 1807 // Copy trailing qwords
1744 __ BIND(L_copy_8_bytes); 1808 __ BIND(L_copy_8_bytes);
1745 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1809 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1746 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1810 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1751 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 1815 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1752 __ xorptr(rax, rax); // return 0 1816 __ xorptr(rax, rax); // return 0
1753 __ leave(); // required for proper stackwalking of RuntimeStub frame 1817 __ leave(); // required for proper stackwalking of RuntimeStub frame
1754 __ ret(0); 1818 __ ret(0);
1755 1819
1756 // Copy in 32-bytes chunks 1820 // Copy in multi-bytes chunks
1757 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 1821 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1758 1822
1759 restore_arg_regs(); 1823 restore_arg_regs();
1760 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 1824 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1761 __ xorptr(rax, rax); // return 0 1825 __ xorptr(rax, rax); // return 0
1762 __ leave(); // required for proper stackwalking of RuntimeStub frame 1826 __ leave(); // required for proper stackwalking of RuntimeStub frame
1788 const char *name, bool dest_uninitialized = false) { 1852 const char *name, bool dest_uninitialized = false) {
1789 __ align(CodeEntryAlignment); 1853 __ align(CodeEntryAlignment);
1790 StubCodeMark mark(this, "StubRoutines", name); 1854 StubCodeMark mark(this, "StubRoutines", name);
1791 address start = __ pc(); 1855 address start = __ pc();
1792 1856
1793 Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 1857 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1794 const Register from = rdi; // source array address 1858 const Register from = rdi; // source array address
1795 const Register to = rsi; // destination array address 1859 const Register to = rsi; // destination array address
1796 const Register count = rdx; // elements count 1860 const Register count = rdx; // elements count
1797 const Register dword_count = rcx; 1861 const Register dword_count = rcx;
1798 const Register qword_count = count; 1862 const Register qword_count = count;
1824 1888
1825 // Copy from low to high addresses. Use 'to' as scratch. 1889 // Copy from low to high addresses. Use 'to' as scratch.
1826 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1890 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1827 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1891 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1828 __ negptr(qword_count); 1892 __ negptr(qword_count);
1829 __ jmp(L_copy_32_bytes); 1893 __ jmp(L_copy_bytes);
1830 1894
1831 // Copy trailing qwords 1895 // Copy trailing qwords
1832 __ BIND(L_copy_8_bytes); 1896 __ BIND(L_copy_8_bytes);
1833 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1897 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1834 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1898 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1851 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 1915 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1852 __ xorptr(rax, rax); // return 0 1916 __ xorptr(rax, rax); // return 0
1853 __ leave(); // required for proper stackwalking of RuntimeStub frame 1917 __ leave(); // required for proper stackwalking of RuntimeStub frame
1854 __ ret(0); 1918 __ ret(0);
1855 1919
1856 // Copy 32-bytes chunks 1920 // Copy in multi-bytes chunks
1857 copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 1921 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1858 __ jmp(L_copy_4_bytes); 1922 __ jmp(L_copy_4_bytes);
1859 1923
1860 return start; 1924 return start;
1861 } 1925 }
1862 1926
1880 bool dest_uninitialized = false) { 1944 bool dest_uninitialized = false) {
1881 __ align(CodeEntryAlignment); 1945 __ align(CodeEntryAlignment);
1882 StubCodeMark mark(this, "StubRoutines", name); 1946 StubCodeMark mark(this, "StubRoutines", name);
1883 address start = __ pc(); 1947 address start = __ pc();
1884 1948
1885 Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit; 1949 Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1886 const Register from = rdi; // source array address 1950 const Register from = rdi; // source array address
1887 const Register to = rsi; // destination array address 1951 const Register to = rsi; // destination array address
1888 const Register count = rdx; // elements count 1952 const Register count = rdx; // elements count
1889 const Register dword_count = rcx; 1953 const Register dword_count = rcx;
1890 const Register qword_count = count; 1954 const Register qword_count = count;
1914 1978
1915 // Copy from high to low addresses. Use 'to' as scratch. 1979 // Copy from high to low addresses. Use 'to' as scratch.
1916 1980
1917 // Check for and copy trailing dword 1981 // Check for and copy trailing dword
1918 __ testl(dword_count, 1); 1982 __ testl(dword_count, 1);
1919 __ jcc(Assembler::zero, L_copy_32_bytes); 1983 __ jcc(Assembler::zero, L_copy_bytes);
1920 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 1984 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1921 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 1985 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1922 __ jmp(L_copy_32_bytes); 1986 __ jmp(L_copy_bytes);
1923 1987
1924 // Copy trailing qwords 1988 // Copy trailing qwords
1925 __ BIND(L_copy_8_bytes); 1989 __ BIND(L_copy_8_bytes);
1926 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1990 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1927 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1991 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1935 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 1999 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1936 __ xorptr(rax, rax); // return 0 2000 __ xorptr(rax, rax); // return 0
1937 __ leave(); // required for proper stackwalking of RuntimeStub frame 2001 __ leave(); // required for proper stackwalking of RuntimeStub frame
1938 __ ret(0); 2002 __ ret(0);
1939 2003
1940 // Copy in 32-bytes chunks 2004 // Copy in multi-bytes chunks
1941 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 2005 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1942 2006
1943 __ bind(L_exit); 2007 __ bind(L_exit);
1944 if (is_oop) { 2008 if (is_oop) {
1945 Register end_to = rdx; 2009 Register end_to = rdx;
1946 __ leaq(end_to, Address(to, dword_count, Address::times_4, -4)); 2010 __ leaq(end_to, Address(to, dword_count, Address::times_4, -4));
1974 const char *name, bool dest_uninitialized = false) { 2038 const char *name, bool dest_uninitialized = false) {
1975 __ align(CodeEntryAlignment); 2039 __ align(CodeEntryAlignment);
1976 StubCodeMark mark(this, "StubRoutines", name); 2040 StubCodeMark mark(this, "StubRoutines", name);
1977 address start = __ pc(); 2041 address start = __ pc();
1978 2042
1979 Label L_copy_32_bytes, L_copy_8_bytes, L_exit; 2043 Label L_copy_bytes, L_copy_8_bytes, L_exit;
1980 const Register from = rdi; // source array address 2044 const Register from = rdi; // source array address
1981 const Register to = rsi; // destination array address 2045 const Register to = rsi; // destination array address
1982 const Register qword_count = rdx; // elements count 2046 const Register qword_count = rdx; // elements count
1983 const Register end_from = from; // source array end address 2047 const Register end_from = from; // source array end address
1984 const Register end_to = rcx; // destination array end address 2048 const Register end_to = rcx; // destination array end address
2006 2070
2007 // Copy from low to high addresses. Use 'to' as scratch. 2071 // Copy from low to high addresses. Use 'to' as scratch.
2008 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2072 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2009 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2073 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2010 __ negptr(qword_count); 2074 __ negptr(qword_count);
2011 __ jmp(L_copy_32_bytes); 2075 __ jmp(L_copy_bytes);
2012 2076
2013 // Copy trailing qwords 2077 // Copy trailing qwords
2014 __ BIND(L_copy_8_bytes); 2078 __ BIND(L_copy_8_bytes);
2015 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 2079 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2016 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 2080 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2025 __ xorptr(rax, rax); // return 0 2089 __ xorptr(rax, rax); // return 0
2026 __ leave(); // required for proper stackwalking of RuntimeStub frame 2090 __ leave(); // required for proper stackwalking of RuntimeStub frame
2027 __ ret(0); 2091 __ ret(0);
2028 } 2092 }
2029 2093
2030 // Copy 64-byte chunks 2094 // Copy in multi-bytes chunks
2031 copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 2095 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2032 2096
2033 if (is_oop) { 2097 if (is_oop) {
2034 __ BIND(L_exit); 2098 __ BIND(L_exit);
2035 gen_write_ref_array_post_barrier(saved_to, end_to, rax); 2099 gen_write_ref_array_post_barrier(saved_to, end_to, rax);
2036 } 2100 }
2063 const char *name, bool dest_uninitialized = false) { 2127 const char *name, bool dest_uninitialized = false) {
2064 __ align(CodeEntryAlignment); 2128 __ align(CodeEntryAlignment);
2065 StubCodeMark mark(this, "StubRoutines", name); 2129 StubCodeMark mark(this, "StubRoutines", name);
2066 address start = __ pc(); 2130 address start = __ pc();
2067 2131
2068 Label L_copy_32_bytes, L_copy_8_bytes, L_exit; 2132 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2069 const Register from = rdi; // source array address 2133 const Register from = rdi; // source array address
2070 const Register to = rsi; // destination array address 2134 const Register to = rsi; // destination array address
2071 const Register qword_count = rdx; // elements count 2135 const Register qword_count = rdx; // elements count
2072 const Register saved_count = rcx; 2136 const Register saved_count = rcx;
2073 2137
2089 __ movptr(saved_count, qword_count); 2153 __ movptr(saved_count, qword_count);
2090 // No registers are destroyed by this call 2154 // No registers are destroyed by this call
2091 gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized); 2155 gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2092 } 2156 }
2093 2157
2094 __ jmp(L_copy_32_bytes); 2158 __ jmp(L_copy_bytes);
2095 2159
2096 // Copy trailing qwords 2160 // Copy trailing qwords
2097 __ BIND(L_copy_8_bytes); 2161 __ BIND(L_copy_8_bytes);
2098 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2162 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2099 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2163 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2108 __ xorptr(rax, rax); // return 0 2172 __ xorptr(rax, rax); // return 0
2109 __ leave(); // required for proper stackwalking of RuntimeStub frame 2173 __ leave(); // required for proper stackwalking of RuntimeStub frame
2110 __ ret(0); 2174 __ ret(0);
2111 } 2175 }
2112 2176
2113 // Copy in 32-bytes chunks 2177 // Copy in multi-bytes chunks
2114 copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); 2178 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2115 2179
2116 if (is_oop) { 2180 if (is_oop) {
2117 __ BIND(L_exit); 2181 __ BIND(L_exit);
2118 __ lea(rcx, Address(to, saved_count, Address::times_8, -8)); 2182 __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
2119 gen_write_ref_array_post_barrier(to, rcx, rax); 2183 gen_write_ref_array_post_barrier(to, rcx, rax);