graal-compiler: src/cpu/sparc/vm/stubGenerator

comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 1365:6476042f815c

6940701: Don't align loops in stubs for Niagara sparc. Summary: Don't align loops in stubs for Niagara sparc, since NOPs are expensive. Reviewed-by: twisti, never

author	kvn
date	Wed, 07 Apr 2010 09:37:47 -0700
parents	0dc88ad3244e
children	c640000b7cc1

comparison

equal deleted inserted replaced

-:0dc88ad3244e
+:6476042f815c
 //
 __ deccc(count, count_dec); // Pre-decrement 'count'
 __ andn(from, 7, from);     // Align address
 __ ldx(from, 0, O3);
 __ inc(from, 8);
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_loop);
 __ ldx(from, 0, O4);
 __ deccc(count, count_dec); // Can we do next iteration after this one?
 __ ldx(from, 8, G4);
 __ inc(to, 16);
 // Load 2 aligned 8-byte chunks and use one from the previous iteration
 // to form 2 aligned 8-byte chunks to store.
 //
 __ andn(end_from, 7, end_from);     // Align address
 __ ldx(end_from, 0, O3);
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_loop);
 __ ldx(end_from, -8, O4);
 __ deccc(count, count_dec); // Can we do next iteration after this one?
 __ ldx(end_from, -16, G4);
 __ dec(end_to, 16);
 // copy trailing bytes
 __ BIND(L_copy_byte);
 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
 __ delayed()->nop();
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_byte_loop);
 __ ldub(from, offset, O3);
 __ deccc(count);
 __ stb(O3, to, offset);
 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
 L_aligned_copy, L_copy_byte);
 }
 // copy 4 elements (16 bytes) at a time
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_aligned_copy);
 __ dec(end_from, 16);
 __ ldx(end_from, 8, O3);
 __ ldx(end_from, 0, O4);
 __ dec(end_to, 16);
 // copy 1 element (1 byte) at a time
 __ BIND(L_copy_byte);
 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
 __ delayed()->nop();
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_byte_loop);
 __ dec(end_from);
 __ dec(end_to);
 __ ldub(end_from, 0, O4);
 __ deccc(count);
 // copy 1 element at a time
 __ BIND(L_copy_2_bytes);
 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
 __ delayed()->nop();
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_2_bytes_loop);
 __ lduh(from, offset, O3);
 __ deccc(count);
 __ sth(O3, to, offset);
 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
 L_aligned_copy, L_copy_2_bytes);
 }
 // copy 4 elements (16 bytes) at a time
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_aligned_copy);
 __ dec(end_from, 16);
 __ ldx(end_from, 8, O3);
 __ ldx(end_from, 0, O4);
 __ dec(end_to, 16);
 // code is more optimal.
 // copy with shift 4 elements (16 bytes) at a time
 __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_16_bytes);
 __ ldx(from, 4, O4);
 __ deccc(count, 4); // Can we do next iteration after this one?
 __ ldx(from, 12, G4);
 __ inc(to, 16);
 //
 // Load 2 aligned 8-bytes chunks and use one from previous iteration
 // to form 2 aligned 8-bytes chunks to store.
 //
 __ ldx(end_from, -4, O3);
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_16_bytes);
 __ ldx(end_from, -12, O4);
 __ deccc(count, 4);
 __ ldx(end_from, -20, O5);
 __ dec(end_to, 16);
 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
 __ delayed()->inc(count, 4);
 // copy 4 elements (16 bytes) at a time
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_aligned_copy);
 __ dec(end_from, 16);
 __ ldx(end_from, 8, O3);
 __ ldx(end_from, 0, O4);
 __ dec(end_to, 16);
 __ delayed()->mov(to,   to64);
 // Now we can use O4(offset0), O5(offset8) as temps
 __ mov(O3, count);
 __ mov(from, from64);
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_64_bytes);
 for( int off = 0; off < 64; off += 16 ) {
 __ ldx(from64,  off+0, O4);
 __ ldx(from64,  off+8, O5);
 __ stx(O4, to64,  off+0);
 __ inccc(count, 6);
 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
 __ delayed()->add(offset0, 8, offset8);
 // Copy by 16 bytes chunks
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_16_bytes);
 __ ldx(from, offset0, O3);
 __ ldx(from, offset8, G3);
 __ deccc(count, 2);
 __ stx(O3, to, offset0);
 __ subcc(count, 1, count);
 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
 __ delayed()->sllx(count, LogBytesPerLong, offset8);
 __ sub(offset8, 8, offset0);
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(L_copy_16_bytes);
 __ ldx(from, offset8, O2);
 __ ldx(from, offset0, O3);
 __ stx(O2, to, offset8);
 __ deccc(offset8, 16);      // use offset8 as counter
 // (Loop is rotated; its entry is load_element.)
 // Loop variables:
 //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
 //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
 //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
-__ align(16);
+__ align(OptoLoopAlignment);
 __ BIND(store_element);
 __ deccc(G1_remain);                // decrement the count
 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
 __ inc(O5_offset, heapOopSize);     // step to next offset

Mercurial > hg > graal-compiler

comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 1365:6476042f815c