comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 1365:6476042f815c

6940701: Don't align loops in stubs for Niagara sparc. Summary: Don't align loops in stubs for Niagara SPARC, since NOPs are expensive. Reviewed-by: twisti, never
author kvn
date Wed, 07 Apr 2010 09:37:47 -0700
parents 0dc88ad3244e
children c640000b7cc1
comparison
equal deleted inserted replaced
1364:0dc88ad3244e 1365:6476042f815c
1146 // 1146 //
1147 __ deccc(count, count_dec); // Pre-decrement 'count' 1147 __ deccc(count, count_dec); // Pre-decrement 'count'
1148 __ andn(from, 7, from); // Align address 1148 __ andn(from, 7, from); // Align address
1149 __ ldx(from, 0, O3); 1149 __ ldx(from, 0, O3);
1150 __ inc(from, 8); 1150 __ inc(from, 8);
1151 __ align(16); 1151 __ align(OptoLoopAlignment);
1152 __ BIND(L_loop); 1152 __ BIND(L_loop);
1153 __ ldx(from, 0, O4); 1153 __ ldx(from, 0, O4);
1154 __ deccc(count, count_dec); // Can we do next iteration after this one? 1154 __ deccc(count, count_dec); // Can we do next iteration after this one?
1155 __ ldx(from, 8, G4); 1155 __ ldx(from, 8, G4);
1156 __ inc(to, 16); 1156 __ inc(to, 16);
1218 // Load 2 aligned 8-bytes chunks and use one from previous iteration 1218 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1219 // to form 2 aligned 8-bytes chunks to store. 1219 // to form 2 aligned 8-bytes chunks to store.
1220 // 1220 //
1221 __ andn(end_from, 7, end_from); // Align address 1221 __ andn(end_from, 7, end_from); // Align address
1222 __ ldx(end_from, 0, O3); 1222 __ ldx(end_from, 0, O3);
1223 __ align(16); 1223 __ align(OptoLoopAlignment);
1224 __ BIND(L_loop); 1224 __ BIND(L_loop);
1225 __ ldx(end_from, -8, O4); 1225 __ ldx(end_from, -8, O4);
1226 __ deccc(count, count_dec); // Can we do next iteration after this one? 1226 __ deccc(count, count_dec); // Can we do next iteration after this one?
1227 __ ldx(end_from, -16, G4); 1227 __ ldx(end_from, -16, G4);
1228 __ dec(end_to, 16); 1228 __ dec(end_to, 16);
1347 1347
1348 // copy trailing bytes 1348 // copy trailing bytes
1349 __ BIND(L_copy_byte); 1349 __ BIND(L_copy_byte);
1350 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 1350 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1351 __ delayed()->nop(); 1351 __ delayed()->nop();
1352 __ align(16); 1352 __ align(OptoLoopAlignment);
1353 __ BIND(L_copy_byte_loop); 1353 __ BIND(L_copy_byte_loop);
1354 __ ldub(from, offset, O3); 1354 __ ldub(from, offset, O3);
1355 __ deccc(count); 1355 __ deccc(count);
1356 __ stb(O3, to, offset); 1356 __ stb(O3, to, offset);
1357 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop); 1357 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1443 1443
1444 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16, 1444 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1445 L_aligned_copy, L_copy_byte); 1445 L_aligned_copy, L_copy_byte);
1446 } 1446 }
1447 // copy 4 elements (16 bytes) at a time 1447 // copy 4 elements (16 bytes) at a time
1448 __ align(16); 1448 __ align(OptoLoopAlignment);
1449 __ BIND(L_aligned_copy); 1449 __ BIND(L_aligned_copy);
1450 __ dec(end_from, 16); 1450 __ dec(end_from, 16);
1451 __ ldx(end_from, 8, O3); 1451 __ ldx(end_from, 8, O3);
1452 __ ldx(end_from, 0, O4); 1452 __ ldx(end_from, 0, O4);
1453 __ dec(end_to, 16); 1453 __ dec(end_to, 16);
1459 1459
1460 // copy 1 element (1 byte) at a time 1460 // copy 1 element (1 byte) at a time
1461 __ BIND(L_copy_byte); 1461 __ BIND(L_copy_byte);
1462 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 1462 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1463 __ delayed()->nop(); 1463 __ delayed()->nop();
1464 __ align(16); 1464 __ align(OptoLoopAlignment);
1465 __ BIND(L_copy_byte_loop); 1465 __ BIND(L_copy_byte_loop);
1466 __ dec(end_from); 1466 __ dec(end_from);
1467 __ dec(end_to); 1467 __ dec(end_to);
1468 __ ldub(end_from, 0, O4); 1468 __ ldub(end_from, 0, O4);
1469 __ deccc(count); 1469 __ deccc(count);
1575 1575
1576 // copy 1 element at a time 1576 // copy 1 element at a time
1577 __ BIND(L_copy_2_bytes); 1577 __ BIND(L_copy_2_bytes);
1578 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 1578 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1579 __ delayed()->nop(); 1579 __ delayed()->nop();
1580 __ align(16); 1580 __ align(OptoLoopAlignment);
1581 __ BIND(L_copy_2_bytes_loop); 1581 __ BIND(L_copy_2_bytes_loop);
1582 __ lduh(from, offset, O3); 1582 __ lduh(from, offset, O3);
1583 __ deccc(count); 1583 __ deccc(count);
1584 __ sth(O3, to, offset); 1584 __ sth(O3, to, offset);
1585 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); 1585 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1682 1682
1683 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 1683 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1684 L_aligned_copy, L_copy_2_bytes); 1684 L_aligned_copy, L_copy_2_bytes);
1685 } 1685 }
1686 // copy 4 elements (16 bytes) at a time 1686 // copy 4 elements (16 bytes) at a time
1687 __ align(16); 1687 __ align(OptoLoopAlignment);
1688 __ BIND(L_aligned_copy); 1688 __ BIND(L_aligned_copy);
1689 __ dec(end_from, 16); 1689 __ dec(end_from, 16);
1690 __ ldx(end_from, 8, O3); 1690 __ ldx(end_from, 8, O3);
1691 __ ldx(end_from, 0, O4); 1691 __ ldx(end_from, 0, O4);
1692 __ dec(end_to, 16); 1692 __ dec(end_to, 16);
1779 // code is more optimal. 1779 // code is more optimal.
1780 1780
1781 // copy with shift 4 elements (16 bytes) at a time 1781 // copy with shift 4 elements (16 bytes) at a time
1782 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4 1782 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4
1783 1783
1784 __ align(16); 1784 __ align(OptoLoopAlignment);
1785 __ BIND(L_copy_16_bytes); 1785 __ BIND(L_copy_16_bytes);
1786 __ ldx(from, 4, O4); 1786 __ ldx(from, 4, O4);
1787 __ deccc(count, 4); // Can we do next iteration after this one? 1787 __ deccc(count, 4); // Can we do next iteration after this one?
1788 __ ldx(from, 12, G4); 1788 __ ldx(from, 12, G4);
1789 __ inc(to, 16); 1789 __ inc(to, 16);
1905 // 1905 //
1906 // Load 2 aligned 8-bytes chunks and use one from previous iteration 1906 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1907 // to form 2 aligned 8-bytes chunks to store. 1907 // to form 2 aligned 8-bytes chunks to store.
1908 // 1908 //
1909 __ ldx(end_from, -4, O3); 1909 __ ldx(end_from, -4, O3);
1910 __ align(16); 1910 __ align(OptoLoopAlignment);
1911 __ BIND(L_copy_16_bytes); 1911 __ BIND(L_copy_16_bytes);
1912 __ ldx(end_from, -12, O4); 1912 __ ldx(end_from, -12, O4);
1913 __ deccc(count, 4); 1913 __ deccc(count, 4);
1914 __ ldx(end_from, -20, O5); 1914 __ ldx(end_from, -20, O5);
1915 __ dec(end_to, 16); 1915 __ dec(end_to, 16);
1927 1927
1928 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1928 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1929 __ delayed()->inc(count, 4); 1929 __ delayed()->inc(count, 4);
1930 1930
1931 // copy 4 elements (16 bytes) at a time 1931 // copy 4 elements (16 bytes) at a time
1932 __ align(16); 1932 __ align(OptoLoopAlignment);
1933 __ BIND(L_aligned_copy); 1933 __ BIND(L_aligned_copy);
1934 __ dec(end_from, 16); 1934 __ dec(end_from, 16);
1935 __ ldx(end_from, 8, O3); 1935 __ ldx(end_from, 8, O3);
1936 __ ldx(end_from, 0, O4); 1936 __ ldx(end_from, 0, O4);
1937 __ dec(end_to, 16); 1937 __ dec(end_to, 16);
2043 __ delayed()->mov(to, to64); 2043 __ delayed()->mov(to, to64);
2044 // Now we can use O4(offset0), O5(offset8) as temps 2044 // Now we can use O4(offset0), O5(offset8) as temps
2045 __ mov(O3, count); 2045 __ mov(O3, count);
2046 __ mov(from, from64); 2046 __ mov(from, from64);
2047 2047
2048 __ align(16); 2048 __ align(OptoLoopAlignment);
2049 __ BIND(L_copy_64_bytes); 2049 __ BIND(L_copy_64_bytes);
2050 for( int off = 0; off < 64; off += 16 ) { 2050 for( int off = 0; off < 64; off += 16 ) {
2051 __ ldx(from64, off+0, O4); 2051 __ ldx(from64, off+0, O4);
2052 __ ldx(from64, off+8, O5); 2052 __ ldx(from64, off+8, O5);
2053 __ stx(O4, to64, off+0); 2053 __ stx(O4, to64, off+0);
2063 __ inccc(count, 6); 2063 __ inccc(count, 6);
2064 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2064 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2065 __ delayed()->add(offset0, 8, offset8); 2065 __ delayed()->add(offset0, 8, offset8);
2066 2066
2067 // Copy by 16 bytes chunks 2067 // Copy by 16 bytes chunks
2068 __ align(16); 2068 __ align(OptoLoopAlignment);
2069 __ BIND(L_copy_16_bytes); 2069 __ BIND(L_copy_16_bytes);
2070 __ ldx(from, offset0, O3); 2070 __ ldx(from, offset0, O3);
2071 __ ldx(from, offset8, G3); 2071 __ ldx(from, offset8, G3);
2072 __ deccc(count, 2); 2072 __ deccc(count, 2);
2073 __ stx(O3, to, offset0); 2073 __ stx(O3, to, offset0);
2137 2137
2138 __ subcc(count, 1, count); 2138 __ subcc(count, 1, count);
2139 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2139 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2140 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2140 __ delayed()->sllx(count, LogBytesPerLong, offset8);
2141 __ sub(offset8, 8, offset0); 2141 __ sub(offset8, 8, offset0);
2142 __ align(16); 2142 __ align(OptoLoopAlignment);
2143 __ BIND(L_copy_16_bytes); 2143 __ BIND(L_copy_16_bytes);
2144 __ ldx(from, offset8, O2); 2144 __ ldx(from, offset8, O2);
2145 __ ldx(from, offset0, O3); 2145 __ ldx(from, offset0, O3);
2146 __ stx(O2, to, offset8); 2146 __ stx(O2, to, offset8);
2147 __ deccc(offset8, 16); // use offset8 as counter 2147 __ deccc(offset8, 16); // use offset8 as counter
2403 // (Loop is rotated; its entry is load_element.) 2403 // (Loop is rotated; its entry is load_element.)
2404 // Loop variables: 2404 // Loop variables:
2405 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2405 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2406 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 2406 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2407 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2407 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2408 __ align(16); 2408 __ align(OptoLoopAlignment);
2409 2409
2410 __ BIND(store_element); 2410 __ BIND(store_element);
2411 __ deccc(G1_remain); // decrement the count 2411 __ deccc(G1_remain); // decrement the count
2412 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2412 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2413 __ inc(O5_offset, heapOopSize); // step to next offset 2413 __ inc(O5_offset, heapOopSize); // step to next offset