# HG changeset patch
# User kvn
# Date 1270658267 25200
# Node ID 6476042f815c711bca349683fdeeb23e104c03f6
# Parent  0dc88ad3244ecd71f16171e429b0797657843a7f
6940701: Don't align loops in stubs for Niagara sparc
Summary: Don't align loops in stubs for Niagara sparc since NOPs are expensive.
Reviewed-by: twisti, never

diff -r 0dc88ad3244e -r 6476042f815c src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
--- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp  Wed Apr 07 09:37:47 2010 -0700
@@ -2849,7 +2849,7 @@
 
 void LIR_Assembler::align_backward_branch_target() {
-  __ align(16);
+  __ align(OptoLoopAlignment);
 }
 
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/sparc/vm/c2_globals_sparc.hpp
--- a/src/cpu/sparc/vm/c2_globals_sparc.hpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp  Wed Apr 07 09:37:47 2010 -0700
@@ -60,9 +60,6 @@
 define_pd_global(intx, INTPRESSURE, 48);  // large register set
 define_pd_global(intx, InteriorEntryAlignment, 16);  // = CodeEntryAlignment
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
-// The default setting 16/16 seems to work best.
-// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
-define_pd_global(intx, OptoLoopAlignment, 16);  // = 4*wordSize
 define_pd_global(intx, RegisterCostAreaRatio, 12000);
 define_pd_global(bool, UseTLAB, true);
 define_pd_global(bool, ResizeTLAB, true);
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/sparc/vm/globals_sparc.hpp
--- a/src/cpu/sparc/vm/globals_sparc.hpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/sparc/vm/globals_sparc.hpp  Wed Apr 07 09:37:47 2010 -0700
@@ -40,6 +40,9 @@
 define_pd_global(bool, UncommonNullCast, true);  // Uncommon-trap NULLs past to check cast
 define_pd_global(intx, CodeEntryAlignment, 32);
+// The default setting 16/16 seems to work best.
+// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
+define_pd_global(intx, OptoLoopAlignment, 16);  // = 4*wordSize
 define_pd_global(intx, InlineFrequencyCount, 50);  // we can use more inlining on the SPARC
 define_pd_global(intx, InlineSmallCode, 1500);
 #ifdef _LP64
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/sparc/vm/stubGenerator_sparc.cpp
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp  Wed Apr 07 09:37:47 2010 -0700
@@ -1148,7 +1148,7 @@
     __ andn(from, 7, from);     // Align address
     __ ldx(from, 0, O3);
     __ inc(from, 8);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_loop);
     __ ldx(from, 0, O4);
     __ deccc(count, count_dec); // Can we do next iteration after this one?
@@ -1220,7 +1220,7 @@
     //
     __ andn(end_from, 7, end_from);     // Align address
     __ ldx(end_from, 0, O3);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_loop);
     __ ldx(end_from, -8, O4);
     __ deccc(count, count_dec); // Can we do next iteration after this one?
@@ -1349,7 +1349,7 @@
   __ BIND(L_copy_byte);
     __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
     __ delayed()->nop();
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_byte_loop);
     __ ldub(from, offset, O3);
     __ deccc(count);
@@ -1445,7 +1445,7 @@
                                         L_aligned_copy, L_copy_byte);
     }
     // copy 4 elements (16 bytes) at a time
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_aligned_copy);
     __ dec(end_from, 16);
     __ ldx(end_from, 8, O3);
@@ -1461,7 +1461,7 @@
   __ BIND(L_copy_byte);
     __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
     __ delayed()->nop();
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_byte_loop);
     __ dec(end_from);
     __ dec(end_to);
@@ -1577,7 +1577,7 @@
   __ BIND(L_copy_2_bytes);
     __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
     __ delayed()->nop();
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_2_bytes_loop);
     __ lduh(from, offset, O3);
     __ deccc(count);
@@ -1684,7 +1684,7 @@
                                         L_aligned_copy, L_copy_2_bytes);
     }
     // copy 4 elements (16 bytes) at a time
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_aligned_copy);
     __ dec(end_from, 16);
     __ ldx(end_from, 8, O3);
@@ -1781,7 +1781,7 @@
     // copy with shift 4 elements (16 bytes) at a time
     __ dec(count, 4);   // The cmp at the beginning guaranty count >= 4
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_16_bytes);
     __ ldx(from, 4, O4);
     __ deccc(count, 4); // Can we do next iteration after this one?
@@ -1907,7 +1907,7 @@
     // to form 2 aligned 8-bytes chunks to store.
     //
     __ ldx(end_from, -4, O3);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_16_bytes);
     __ ldx(end_from, -12, O4);
     __ deccc(count, 4);
@@ -1929,7 +1929,7 @@
     __ delayed()->inc(count, 4);
 
     // copy 4 elements (16 bytes) at a time
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_aligned_copy);
     __ dec(end_from, 16);
     __ ldx(end_from, 8, O3);
@@ -2045,7 +2045,7 @@
       __ mov(O3, count);
       __ mov(from, from64);
 
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_64_bytes);
       for( int off = 0; off < 64; off += 16 ) {
         __ ldx(from64, off+0, O4);
@@ -2065,7 +2065,7 @@
     __ delayed()->add(offset0, 8, offset8);
 
     // Copy by 16 bytes chunks
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_16_bytes);
     __ ldx(from, offset0, O3);
     __ ldx(from, offset8, G3);
@@ -2139,7 +2139,7 @@
     __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
     __ delayed()->sllx(count, LogBytesPerLong, offset8);
     __ sub(offset8, 8, offset0);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_16_bytes);
     __ ldx(from, offset8, O2);
     __ ldx(from, offset0, O3);
@@ -2405,7 +2405,7 @@
     // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
     // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
     // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(store_element);
     __ deccc(G1_remain);                // decrement the count
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/sparc/vm/vm_version_sparc.cpp
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp  Wed Apr 07 09:37:47 2010 -0700
@@ -86,14 +86,14 @@
     if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) {
       FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
     }
-    if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
-      FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
-    }
     if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
       // Use smaller prefetch distance on N2
       FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
     }
 #endif
+    if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
+      FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
+    }
   }
 
   // Use hardware population count instruction if available.
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/x86/vm/c2_globals_x86.hpp
--- a/src/cpu/x86/vm/c2_globals_x86.hpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/x86/vm/c2_globals_x86.hpp  Wed Apr 07 09:37:47 2010 -0700
@@ -80,7 +80,6 @@
 // Ergonomics related flags
 define_pd_global(uint64_t,MaxRAM, 4ULL*G);
 #endif // AMD64
-define_pd_global(intx, OptoLoopAlignment, 16);
 define_pd_global(intx, RegisterCostAreaRatio, 16000);
 
 // Peephole and CISC spilling both break the graph, and so makes the
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/x86/vm/globals_x86.hpp
--- a/src/cpu/x86/vm/globals_x86.hpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/x86/vm/globals_x86.hpp  Wed Apr 07 09:37:47 2010 -0700
@@ -45,6 +45,7 @@
 #else
 define_pd_global(intx, CodeEntryAlignment, 16);
 #endif // COMPILER2
+define_pd_global(intx, OptoLoopAlignment, 16);
 define_pd_global(intx, InlineFrequencyCount, 100);
 define_pd_global(intx, InlineSmallCode, 1000);
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/x86/vm/stubGenerator_x86_32.cpp
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp  Wed Apr 07 09:37:47 2010 -0700
@@ -812,7 +812,7 @@
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_64_bytes_loop);
 
     if(UseUnalignedLoadStores) {
@@ -874,7 +874,7 @@
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_64_bytes_loop);
     __ movq(mmx0, Address(from, 0));
     __ movq(mmx1, Address(from, 8));
@@ -1144,7 +1144,7 @@
     __ movl(Address(to, count, sf, 0), rdx);
     __ jmpb(L_copy_8_bytes);
 
-    __ align(16);
+    __ align(OptoLoopAlignment);
     // Move 8 bytes
   __ BIND(L_copy_8_bytes_loop);
     if (UseXMMForArrayCopy) {
@@ -1235,7 +1235,7 @@
       }
     } else {
       __ jmpb(L_copy_8_bytes);
-      __ align(16);
+      __ align(OptoLoopAlignment);
     __ BIND(L_copy_8_bytes_loop);
       __ fild_d(Address(from, 0));
       __ fistp_d(Address(from, to_from, Address::times_1));
@@ -1282,7 +1282,7 @@
 
     __ jmpb(L_copy_8_bytes);
 
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
       if (UseXMMForArrayCopy) {
@@ -1454,7 +1454,7 @@
     // Loop control:
     //   for (count = -count; count != 0; count++)
     // Base pointers src, dst are biased by 8*count,to last element.
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_store_element);
     __ movptr(to_element_addr, elem);     // store the oop
diff -r 0dc88ad3244e -r 6476042f815c src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp  Wed Apr 07 09:37:47 2010 -0700
@@ -871,9 +871,8 @@
   }
 
   address generate_fp_mask(const char *stub_name, int64_t mask) {
+    __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", stub_name);
-
-    __ align(16);
     address start = __ pc();
 
     __ emit_data64( mask, relocInfo::none );
@@ -1268,7 +1267,7 @@
                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_loop);
     if(UseUnalignedLoadStores) {
       __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1309,7 +1308,7 @@
                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_loop);
     if(UseUnalignedLoadStores) {
      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
@@ -2229,7 +2228,7 @@
     // Loop control:
     //   for (count = -count; count != 0; count++)
     // Base pointers src, dst are biased by 8*(count-1),to last element.
-    __ align(16);
+    __ align(OptoLoopAlignment);
   __ BIND(L_store_element);
     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
diff -r 0dc88ad3244e -r 6476042f815c src/share/vm/opto/c2_globals.hpp
--- a/src/share/vm/opto/c2_globals.hpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/share/vm/opto/c2_globals.hpp  Wed Apr 07 09:37:47 2010 -0700
@@ -52,9 +52,6 @@
           "Code alignment for interior entry points "                       \
           "in generated code (in bytes)")                                   \
                                                                             \
-  product_pd(intx, OptoLoopAlignment,                                       \
-          "Align inner loops to zero relative to this modulus")             \
-                                                                            \
   product(intx, MaxLoopPad, (OptoLoopAlignment-1),                          \
           "Align a loop if padding size in bytes is less or equal to this value") \
                                                                             \
diff -r 0dc88ad3244e -r 6476042f815c src/share/vm/runtime/globals.hpp
--- a/src/share/vm/runtime/globals.hpp  Tue Apr 06 15:18:10 2010 -0700
+++ b/src/share/vm/runtime/globals.hpp  Wed Apr 07 09:37:47 2010 -0700
@@ -3110,6 +3110,9 @@
   develop_pd(intx, CodeEntryAlignment,                                      \
           "Code entry alignment for generated code (in bytes)")             \
                                                                             \
+  product_pd(intx, OptoLoopAlignment,                                       \
+          "Align inner loops to zero relative to this modulus")             \
+                                                                            \
   product_pd(uintx, InitialCodeCacheSize,                                   \
           "Initial code cache size (in bytes)")                             \
                                                                             \
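
Note on the mechanism (illustrative sketch, not part of the changeset):
__ align(n) pads the instruction stream with NOPs until the current code
offset is a multiple of n, so each hard-coded align(16) ahead of a stub loop
can emit padding NOPs that an in-order Niagara pipeline must still issue.
A minimal sketch of that padding loop, modeled loosely on the per-CPU
MacroAssembler::align() implementations (the free-standing form and helper
names here are assumptions for illustration, not the exact HotSpot code):

  // Sketch only: emit one NOP at a time until the requested alignment holds.
  // offset() stands for the assembler's current position in the code buffer.
  void align(int modulus) {
    while (offset() % modulus != 0) {
      nop();  // each padding NOP occupies an issue slot when executed
    }
  }

Since every SPARC instruction is 4 bytes wide, align(OptoLoopAlignment) with
the Niagara default of 4 never emits padding, while the old align(16) could
emit up to three NOPs in front of each loop head.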