Mercurial > hg > graal-compiler
comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 1365:6476042f815c
6940701: Don't align loops in stubs for Niagara sparc
Summary: Don't align loops in stubs for Niagara sparc since NOPs are expensive.
Reviewed-by: twisti, never
author | kvn |
---|---|
date | Wed, 07 Apr 2010 09:37:47 -0700 |
parents | 0dc88ad3244e |
children | c640000b7cc1 |
comparison
equal
deleted
inserted
replaced
1364:0dc88ad3244e | 1365:6476042f815c |
---|---|
1146 // | 1146 // |
1147 __ deccc(count, count_dec); // Pre-decrement 'count' | 1147 __ deccc(count, count_dec); // Pre-decrement 'count' |
1148 __ andn(from, 7, from); // Align address | 1148 __ andn(from, 7, from); // Align address |
1149 __ ldx(from, 0, O3); | 1149 __ ldx(from, 0, O3); |
1150 __ inc(from, 8); | 1150 __ inc(from, 8); |
1151 __ align(16); | 1151 __ align(OptoLoopAlignment); |
1152 __ BIND(L_loop); | 1152 __ BIND(L_loop); |
1153 __ ldx(from, 0, O4); | 1153 __ ldx(from, 0, O4); |
1154 __ deccc(count, count_dec); // Can we do next iteration after this one? | 1154 __ deccc(count, count_dec); // Can we do next iteration after this one? |
1155 __ ldx(from, 8, G4); | 1155 __ ldx(from, 8, G4); |
1156 __ inc(to, 16); | 1156 __ inc(to, 16); |
1218 // Load 2 aligned 8-bytes chunks and use one from previous iteration | 1218 // Load 2 aligned 8-bytes chunks and use one from previous iteration |
1219 // to form 2 aligned 8-bytes chunks to store. | 1219 // to form 2 aligned 8-bytes chunks to store. |
1220 // | 1220 // |
1221 __ andn(end_from, 7, end_from); // Align address | 1221 __ andn(end_from, 7, end_from); // Align address |
1222 __ ldx(end_from, 0, O3); | 1222 __ ldx(end_from, 0, O3); |
1223 __ align(16); | 1223 __ align(OptoLoopAlignment); |
1224 __ BIND(L_loop); | 1224 __ BIND(L_loop); |
1225 __ ldx(end_from, -8, O4); | 1225 __ ldx(end_from, -8, O4); |
1226 __ deccc(count, count_dec); // Can we do next iteration after this one? | 1226 __ deccc(count, count_dec); // Can we do next iteration after this one? |
1227 __ ldx(end_from, -16, G4); | 1227 __ ldx(end_from, -16, G4); |
1228 __ dec(end_to, 16); | 1228 __ dec(end_to, 16); |
1347 | 1347 |
1348 // copy tailing bytes | 1348 // copy tailing bytes |
1349 __ BIND(L_copy_byte); | 1349 __ BIND(L_copy_byte); |
1350 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); | 1350 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); |
1351 __ delayed()->nop(); | 1351 __ delayed()->nop(); |
1352 __ align(16); | 1352 __ align(OptoLoopAlignment); |
1353 __ BIND(L_copy_byte_loop); | 1353 __ BIND(L_copy_byte_loop); |
1354 __ ldub(from, offset, O3); | 1354 __ ldub(from, offset, O3); |
1355 __ deccc(count); | 1355 __ deccc(count); |
1356 __ stb(O3, to, offset); | 1356 __ stb(O3, to, offset); |
1357 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop); | 1357 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop); |
1443 | 1443 |
1444 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16, | 1444 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16, |
1445 L_aligned_copy, L_copy_byte); | 1445 L_aligned_copy, L_copy_byte); |
1446 } | 1446 } |
1447 // copy 4 elements (16 bytes) at a time | 1447 // copy 4 elements (16 bytes) at a time |
1448 __ align(16); | 1448 __ align(OptoLoopAlignment); |
1449 __ BIND(L_aligned_copy); | 1449 __ BIND(L_aligned_copy); |
1450 __ dec(end_from, 16); | 1450 __ dec(end_from, 16); |
1451 __ ldx(end_from, 8, O3); | 1451 __ ldx(end_from, 8, O3); |
1452 __ ldx(end_from, 0, O4); | 1452 __ ldx(end_from, 0, O4); |
1453 __ dec(end_to, 16); | 1453 __ dec(end_to, 16); |
1459 | 1459 |
1460 // copy 1 element (2 bytes) at a time | 1460 // copy 1 element (2 bytes) at a time |
1461 __ BIND(L_copy_byte); | 1461 __ BIND(L_copy_byte); |
1462 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); | 1462 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); |
1463 __ delayed()->nop(); | 1463 __ delayed()->nop(); |
1464 __ align(16); | 1464 __ align(OptoLoopAlignment); |
1465 __ BIND(L_copy_byte_loop); | 1465 __ BIND(L_copy_byte_loop); |
1466 __ dec(end_from); | 1466 __ dec(end_from); |
1467 __ dec(end_to); | 1467 __ dec(end_to); |
1468 __ ldub(end_from, 0, O4); | 1468 __ ldub(end_from, 0, O4); |
1469 __ deccc(count); | 1469 __ deccc(count); |
1575 | 1575 |
1576 // copy 1 element at a time | 1576 // copy 1 element at a time |
1577 __ BIND(L_copy_2_bytes); | 1577 __ BIND(L_copy_2_bytes); |
1578 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); | 1578 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); |
1579 __ delayed()->nop(); | 1579 __ delayed()->nop(); |
1580 __ align(16); | 1580 __ align(OptoLoopAlignment); |
1581 __ BIND(L_copy_2_bytes_loop); | 1581 __ BIND(L_copy_2_bytes_loop); |
1582 __ lduh(from, offset, O3); | 1582 __ lduh(from, offset, O3); |
1583 __ deccc(count); | 1583 __ deccc(count); |
1584 __ sth(O3, to, offset); | 1584 __ sth(O3, to, offset); |
1585 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); | 1585 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); |
1682 | 1682 |
1683 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, | 1683 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, |
1684 L_aligned_copy, L_copy_2_bytes); | 1684 L_aligned_copy, L_copy_2_bytes); |
1685 } | 1685 } |
1686 // copy 4 elements (16 bytes) at a time | 1686 // copy 4 elements (16 bytes) at a time |
1687 __ align(16); | 1687 __ align(OptoLoopAlignment); |
1688 __ BIND(L_aligned_copy); | 1688 __ BIND(L_aligned_copy); |
1689 __ dec(end_from, 16); | 1689 __ dec(end_from, 16); |
1690 __ ldx(end_from, 8, O3); | 1690 __ ldx(end_from, 8, O3); |
1691 __ ldx(end_from, 0, O4); | 1691 __ ldx(end_from, 0, O4); |
1692 __ dec(end_to, 16); | 1692 __ dec(end_to, 16); |
1779 // code is more optimal. | 1779 // code is more optimal. |
1780 | 1780 |
1781 // copy with shift 4 elements (16 bytes) at a time | 1781 // copy with shift 4 elements (16 bytes) at a time |
1782 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 | 1782 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 |
1783 | 1783 |
1784 __ align(16); | 1784 __ align(OptoLoopAlignment); |
1785 __ BIND(L_copy_16_bytes); | 1785 __ BIND(L_copy_16_bytes); |
1786 __ ldx(from, 4, O4); | 1786 __ ldx(from, 4, O4); |
1787 __ deccc(count, 4); // Can we do next iteration after this one? | 1787 __ deccc(count, 4); // Can we do next iteration after this one? |
1788 __ ldx(from, 12, G4); | 1788 __ ldx(from, 12, G4); |
1789 __ inc(to, 16); | 1789 __ inc(to, 16); |
1905 // | 1905 // |
1906 // Load 2 aligned 8-bytes chunks and use one from previous iteration | 1906 // Load 2 aligned 8-bytes chunks and use one from previous iteration |
1907 // to form 2 aligned 8-bytes chunks to store. | 1907 // to form 2 aligned 8-bytes chunks to store. |
1908 // | 1908 // |
1909 __ ldx(end_from, -4, O3); | 1909 __ ldx(end_from, -4, O3); |
1910 __ align(16); | 1910 __ align(OptoLoopAlignment); |
1911 __ BIND(L_copy_16_bytes); | 1911 __ BIND(L_copy_16_bytes); |
1912 __ ldx(end_from, -12, O4); | 1912 __ ldx(end_from, -12, O4); |
1913 __ deccc(count, 4); | 1913 __ deccc(count, 4); |
1914 __ ldx(end_from, -20, O5); | 1914 __ ldx(end_from, -20, O5); |
1915 __ dec(end_to, 16); | 1915 __ dec(end_to, 16); |
1927 | 1927 |
1928 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); | 1928 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); |
1929 __ delayed()->inc(count, 4); | 1929 __ delayed()->inc(count, 4); |
1930 | 1930 |
1931 // copy 4 elements (16 bytes) at a time | 1931 // copy 4 elements (16 bytes) at a time |
1932 __ align(16); | 1932 __ align(OptoLoopAlignment); |
1933 __ BIND(L_aligned_copy); | 1933 __ BIND(L_aligned_copy); |
1934 __ dec(end_from, 16); | 1934 __ dec(end_from, 16); |
1935 __ ldx(end_from, 8, O3); | 1935 __ ldx(end_from, 8, O3); |
1936 __ ldx(end_from, 0, O4); | 1936 __ ldx(end_from, 0, O4); |
1937 __ dec(end_to, 16); | 1937 __ dec(end_to, 16); |
2043 __ delayed()->mov(to, to64); | 2043 __ delayed()->mov(to, to64); |
2044 // Now we can use O4(offset0), O5(offset8) as temps | 2044 // Now we can use O4(offset0), O5(offset8) as temps |
2045 __ mov(O3, count); | 2045 __ mov(O3, count); |
2046 __ mov(from, from64); | 2046 __ mov(from, from64); |
2047 | 2047 |
2048 __ align(16); | 2048 __ align(OptoLoopAlignment); |
2049 __ BIND(L_copy_64_bytes); | 2049 __ BIND(L_copy_64_bytes); |
2050 for( int off = 0; off < 64; off += 16 ) { | 2050 for( int off = 0; off < 64; off += 16 ) { |
2051 __ ldx(from64, off+0, O4); | 2051 __ ldx(from64, off+0, O4); |
2052 __ ldx(from64, off+8, O5); | 2052 __ ldx(from64, off+8, O5); |
2053 __ stx(O4, to64, off+0); | 2053 __ stx(O4, to64, off+0); |
2063 __ inccc(count, 6); | 2063 __ inccc(count, 6); |
2064 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); | 2064 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); |
2065 __ delayed()->add(offset0, 8, offset8); | 2065 __ delayed()->add(offset0, 8, offset8); |
2066 | 2066 |
2067 // Copy by 16 bytes chunks | 2067 // Copy by 16 bytes chunks |
2068 __ align(16); | 2068 __ align(OptoLoopAlignment); |
2069 __ BIND(L_copy_16_bytes); | 2069 __ BIND(L_copy_16_bytes); |
2070 __ ldx(from, offset0, O3); | 2070 __ ldx(from, offset0, O3); |
2071 __ ldx(from, offset8, G3); | 2071 __ ldx(from, offset8, G3); |
2072 __ deccc(count, 2); | 2072 __ deccc(count, 2); |
2073 __ stx(O3, to, offset0); | 2073 __ stx(O3, to, offset0); |
2137 | 2137 |
2138 __ subcc(count, 1, count); | 2138 __ subcc(count, 1, count); |
2139 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); | 2139 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); |
2140 __ delayed()->sllx(count, LogBytesPerLong, offset8); | 2140 __ delayed()->sllx(count, LogBytesPerLong, offset8); |
2141 __ sub(offset8, 8, offset0); | 2141 __ sub(offset8, 8, offset0); |
2142 __ align(16); | 2142 __ align(OptoLoopAlignment); |
2143 __ BIND(L_copy_16_bytes); | 2143 __ BIND(L_copy_16_bytes); |
2144 __ ldx(from, offset8, O2); | 2144 __ ldx(from, offset8, O2); |
2145 __ ldx(from, offset0, O3); | 2145 __ ldx(from, offset0, O3); |
2146 __ stx(O2, to, offset8); | 2146 __ stx(O2, to, offset8); |
2147 __ deccc(offset8, 16); // use offset8 as counter | 2147 __ deccc(offset8, 16); // use offset8 as counter |
2403 // (Loop is rotated; its entry is load_element.) | 2403 // (Loop is rotated; its entry is load_element.) |
2404 // Loop variables: | 2404 // Loop variables: |
2405 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays | 2405 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays |
2406 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* | 2406 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* |
2407 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super | 2407 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super |
2408 __ align(16); | 2408 __ align(OptoLoopAlignment); |
2409 | 2409 |
2410 __ BIND(store_element); | 2410 __ BIND(store_element); |
2411 __ deccc(G1_remain); // decrement the count | 2411 __ deccc(G1_remain); // decrement the count |
2412 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop | 2412 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop |
2413 __ inc(O5_offset, heapOopSize); // step to next offset | 2413 __ inc(O5_offset, heapOopSize); // step to next offset |