# HG changeset patch # User jrose # Date 1271188897 25200 # Node ID fc3cd2277dc77b41ae89ca16d33201f0bd80aa0e # Parent e4c77b879561123ffe0b68830bd43e1fd6928e3e# Parent 213fbcf54799881724d1a6f8b43bffc4e2c18dd4 Merge diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp --- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -1728,9 +1728,13 @@ ShouldNotReachHere(); } } else if (code == lir_cmp_l2i) { +#ifdef _LP64 + __ lcmp(left->as_register_lo(), right->as_register_lo(), dst->as_register()); +#else __ lcmp(left->as_register_hi(), left->as_register_lo(), right->as_register_hi(), right->as_register_lo(), dst->as_register()); +#endif } else { ShouldNotReachHere(); } @@ -2849,7 +2853,7 @@ void LIR_Assembler::align_backward_branch_target() { - __ align(16); + __ align(OptoLoopAlignment); } diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/sparc/vm/c2_globals_sparc.hpp --- a/src/cpu/sparc/vm/c2_globals_sparc.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -60,9 +60,6 @@ define_pd_global(intx, INTPRESSURE, 48); // large register set define_pd_global(intx, InteriorEntryAlignment, 16); // = CodeEntryAlignment define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K)); -// The default setting 16/16 seems to work best. -// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.) -define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize define_pd_global(intx, RegisterCostAreaRatio, 12000); define_pd_global(bool, UseTLAB, true); define_pd_global(bool, ResizeTLAB, true); diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/sparc/vm/globals_sparc.hpp --- a/src/cpu/sparc/vm/globals_sparc.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/sparc/vm/globals_sparc.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -40,6 +40,9 @@ define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs past to check cast define_pd_global(intx, CodeEntryAlignment, 32); +// The default setting 16/16 seems to work best. +// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.) +define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize define_pd_global(intx, InlineFrequencyCount, 50); // we can use more inlining on the SPARC define_pd_global(intx, InlineSmallCode, 1500); #ifdef _LP64 diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/sparc/vm/sparc.ad --- a/src/cpu/sparc/vm/sparc.ad Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/sparc/vm/sparc.ad Tue Apr 13 13:01:37 2010 -0700 @@ -471,6 +471,9 @@ source %{ #define __ _masm. +// Block initializing store +#define ASI_BLK_INIT_QUAD_LDD_P 0xE2 + // tertiary op of a LoadP or StoreP encoding #define REGP_OP true @@ -6147,6 +6150,7 @@ %} instruct prefetchw( memory mem ) %{ + predicate(AllocatePrefetchStyle != 3 ); match( PrefetchWrite mem ); ins_cost(MEMORY_REF_COST); @@ -6156,6 +6160,23 @@ ins_pipe(iload_mem); %} +// Use BIS instruction to prefetch. +instruct prefetchw_bis( memory mem ) %{ + predicate(AllocatePrefetchStyle == 3); + match( PrefetchWrite mem ); + ins_cost(MEMORY_REF_COST); + + format %{ "STXA G0,$mem\t! // Block initializing store" %} + ins_encode %{ + Register base = as_Register($mem$$base); + int disp = $mem$$disp; + if (disp != 0) { + __ add(base, AllocatePrefetchStepSize, base); + } + __ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P); + %} + ins_pipe(istore_mem_reg); +%} //----------Store Instructions------------------------------------------------- // Store Byte diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -1148,7 +1148,7 @@ __ andn(from, 7, from); // Align address __ ldx(from, 0, O3); __ inc(from, 8); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_loop); __ ldx(from, 0, O4); __ deccc(count, count_dec); // Can we do next iteration after this one? @@ -1220,7 +1220,7 @@ // __ andn(end_from, 7, end_from); // Align address __ ldx(end_from, 0, O3); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_loop); __ ldx(end_from, -8, O4); __ deccc(count, count_dec); // Can we do next iteration after this one? @@ -1349,7 +1349,7 @@ __ BIND(L_copy_byte); __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); __ delayed()->nop(); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_byte_loop); __ ldub(from, offset, O3); __ deccc(count); @@ -1445,7 +1445,7 @@ L_aligned_copy, L_copy_byte); } // copy 4 elements (16 bytes) at a time - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_aligned_copy); __ dec(end_from, 16); __ ldx(end_from, 8, O3); @@ -1461,7 +1461,7 @@ __ BIND(L_copy_byte); __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); __ delayed()->nop(); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_byte_loop); __ dec(end_from); __ dec(end_to); @@ -1577,7 +1577,7 @@ __ BIND(L_copy_2_bytes); __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); __ delayed()->nop(); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_2_bytes_loop); __ lduh(from, offset, O3); __ deccc(count); @@ -1684,7 +1684,7 @@ L_aligned_copy, L_copy_2_bytes); } // copy 4 elements (16 bytes) at a time - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_aligned_copy); __ dec(end_from, 16); __ ldx(end_from, 8, O3); @@ -1781,7 +1781,7 @@ // copy with shift 4 elements (16 bytes) at a time __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_16_bytes); __ ldx(from, 4, O4); __ deccc(count, 4); // Can we do next iteration after this one? @@ -1907,7 +1907,7 @@ // to form 2 aligned 8-bytes chunks to store. // __ ldx(end_from, -4, O3); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_16_bytes); __ ldx(end_from, -12, O4); __ deccc(count, 4); @@ -1929,7 +1929,7 @@ __ delayed()->inc(count, 4); // copy 4 elements (16 bytes) at a time - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_aligned_copy); __ dec(end_from, 16); __ ldx(end_from, 8, O3); @@ -2000,6 +2000,27 @@ // to: O1 // count: O2 treated as signed // + // count -= 2; + // if ( count >= 0 ) { // >= 2 elements + // if ( count > 6) { // >= 8 elements + // count -= 6; // original count - 8 + // do { + // copy_8_elements; + // count -= 8; + // } while ( count >= 0 ); + // count += 6; + // } + // if ( count >= 0 ) { // >= 2 elements + // do { + // copy_2_elements; + // } while ( (count=count-2) >= 0 ); + // } + // } + // count += 2; + // if ( count != 0 ) { // 1 element left + // copy_1_element; + // } + // void generate_disjoint_long_copy_core(bool aligned) { Label L_copy_8_bytes, L_copy_16_bytes, L_exit; const Register from = O0; // source array address @@ -2012,7 +2033,39 @@ __ mov(G0, offset0); // offset from start of arrays (0) __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); __ delayed()->add(offset0, 8, offset8); - __ align(16); + + // Copy by 64 bytes chunks + Label L_copy_64_bytes; + const Register from64 = O3; // source address + const Register to64 = G3; // destination address + __ subcc(count, 6, O3); + __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); + __ delayed()->mov(to, to64); + // Now we can use O4(offset0), O5(offset8) as temps + __ mov(O3, count); + __ mov(from, from64); + + __ align(OptoLoopAlignment); + __ BIND(L_copy_64_bytes); + for( int off = 0; off < 64; off += 16 ) { + __ ldx(from64, off+0, O4); + __ ldx(from64, off+8, O5); + __ stx(O4, to64, off+0); + __ stx(O5, to64, off+8); + } + __ deccc(count, 8); + __ inc(from64, 64); + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes); + __ delayed()->inc(to64, 64); + + // Restore O4(offset0), O5(offset8) + __ sub(from64, from, offset0); + __ inccc(count, 6); + __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); + __ delayed()->add(offset0, 8, offset8); + + // Copy by 16 bytes chunks + __ align(OptoLoopAlignment); __ BIND(L_copy_16_bytes); __ ldx(from, offset0, O3); __ ldx(from, offset8, G3); @@ -2023,6 +2076,7 @@ __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); __ delayed()->inc(offset8, 16); + // Copy last 8 bytes __ BIND(L_copy_8_bytes); __ inccc(count, 2); __ brx(Assembler::zero, true, Assembler::pn, L_exit ); @@ -2085,7 +2139,7 @@ __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); __ delayed()->sllx(count, LogBytesPerLong, offset8); __ sub(offset8, 8, offset0); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_16_bytes); __ ldx(from, offset8, O2); __ ldx(from, offset0, O3); @@ -2351,7 +2405,7 @@ // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* // G3, G4, G5 --- current oop, oop.klass, oop.klass.super - __ align(16); + __ align(OptoLoopAlignment); __ BIND(store_element); __ deccc(G1_remain); // decrement the count diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -86,14 +86,24 @@ if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) { FLAG_SET_DEFAULT(InteriorEntryAlignment, 4); } + if (is_niagara1_plus()) { + if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { + // Use BIS instruction for allocation prefetch. + FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3); + if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { + // Use smaller prefetch distance on N2 with BIS + FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64); + } + } + if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { + // Use different prefetch distance without BIS + FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); + } + } +#endif if (FLAG_IS_DEFAULT(OptoLoopAlignment)) { FLAG_SET_DEFAULT(OptoLoopAlignment, 4); } - if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { - // Use smaller prefetch distance on N2 - FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); - } -#endif } // Use hardware population count instruction if available. diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -3365,6 +3365,13 @@ #else // LP64 +void Assembler::set_byte_if_not_zero(Register dst) { + int enc = prefix_and_encode(dst->encoding(), true); + emit_byte(0x0F); + emit_byte(0x95); + emit_byte(0xE0 | enc); +} + // 64bit only pieces of the assembler // This should only be used by 64bit instructions that can use rip-relative // it cannot be used by instructions that want an immediate value. diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/c1_LIRAssembler_x86.cpp --- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -2690,19 +2690,14 @@ } else { assert(code == lir_cmp_l2i, "check"); #ifdef _LP64 - Register dest = dst->as_register(); - __ xorptr(dest, dest); - Label high, done; - __ cmpptr(left->as_register_lo(), right->as_register_lo()); - __ jcc(Assembler::equal, done); - __ jcc(Assembler::greater, high); - __ decrement(dest); - __ jmp(done); - __ bind(high); - __ increment(dest); - - __ bind(done); - + Label done; + Register dest = dst->as_register(); + __ cmpptr(left->as_register_lo(), right->as_register_lo()); + __ movl(dest, -1); + __ jccb(Assembler::less, done); + __ set_byte_if_not_zero(dest); + __ movzbl(dest, dest); + __ bind(done); #else __ lcmp2int(left->as_register_hi(), left->as_register_lo(), diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/c1_Runtime1_x86.cpp --- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -781,7 +781,7 @@ // Restore SP from BP if the exception PC is a MethodHandle call site. NOT_LP64(__ get_thread(thread);) - __ cmpl(Address(thread, JavaThread::is_method_handle_exception_offset()), 0); + __ cmpl(Address(thread, JavaThread::is_method_handle_return_offset()), 0); __ cmovptr(Assembler::notEqual, rsp, rbp); // continue at exception handler (return address removed) diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/c2_globals_x86.hpp --- a/src/cpu/x86/vm/c2_globals_x86.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/c2_globals_x86.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -80,7 +80,6 @@ // Ergonomics related flags define_pd_global(uint64_t,MaxRAM, 4ULL*G); #endif // AMD64 -define_pd_global(intx, OptoLoopAlignment, 16); define_pd_global(intx, RegisterCostAreaRatio, 16000); // Peephole and CISC spilling both break the graph, and so makes the diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/globals_x86.hpp --- a/src/cpu/x86/vm/globals_x86.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/globals_x86.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -45,6 +45,7 @@ #else define_pd_global(intx, CodeEntryAlignment, 16); #endif // COMPILER2 +define_pd_global(intx, OptoLoopAlignment, 16); define_pd_global(intx, InlineFrequencyCount, 100); define_pd_global(intx, InlineSmallCode, 1000); diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/runtime_x86_32.cpp --- a/src/cpu/x86/vm/runtime_x86_32.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/runtime_x86_32.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -115,8 +115,8 @@ // rax: exception handler for given - // Restore SP from BP if the exception PC is a MethodHandle call. - __ cmpl(Address(rcx, JavaThread::is_method_handle_exception_offset()), 0); + // Restore SP from BP if the exception PC is a MethodHandle call site. + __ cmpl(Address(rcx, JavaThread::is_method_handle_return_offset()), 0); __ cmovptr(Assembler::notEqual, rsp, rbp); // We have a handler in rax, (could be deopt blob) diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/sharedRuntime_x86_64.cpp --- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -3328,8 +3328,8 @@ // rax: exception handler - // Restore SP from BP if the exception PC is a MethodHandle call. - __ cmpl(Address(r15_thread, JavaThread::is_method_handle_exception_offset()), 0); + // Restore SP from BP if the exception PC is a MethodHandle call site. + __ cmpl(Address(r15_thread, JavaThread::is_method_handle_return_offset()), 0); __ cmovptr(Assembler::notEqual, rsp, rbp); // We have a handler in rax (could be deopt blob). diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -430,7 +430,7 @@ __ verify_oop(exception_oop); // Restore SP from BP if the exception PC is a MethodHandle call site. - __ cmpl(Address(thread, JavaThread::is_method_handle_exception_offset()), 0); + __ cmpl(Address(thread, JavaThread::is_method_handle_return_offset()), 0); __ cmovptr(Assembler::notEqual, rsp, rbp); // continue at exception handler (return address removed) @@ -812,7 +812,7 @@ Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; // Copy 64-byte chunks __ jmpb(L_copy_64_bytes); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_64_bytes_loop); if(UseUnalignedLoadStores) { @@ -874,7 +874,7 @@ Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; // Copy 64-byte chunks __ jmpb(L_copy_64_bytes); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_64_bytes_loop); __ movq(mmx0, Address(from, 0)); __ movq(mmx1, Address(from, 8)); @@ -1144,7 +1144,7 @@ __ movl(Address(to, count, sf, 0), rdx); __ jmpb(L_copy_8_bytes); - __ align(16); + __ align(OptoLoopAlignment); // Move 8 bytes __ BIND(L_copy_8_bytes_loop); if (UseXMMForArrayCopy) { @@ -1235,7 +1235,7 @@ } } else { __ jmpb(L_copy_8_bytes); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_8_bytes_loop); __ fild_d(Address(from, 0)); __ fistp_d(Address(from, to_from, Address::times_1)); @@ -1282,7 +1282,7 @@ __ jmpb(L_copy_8_bytes); - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_copy_8_bytes_loop); if (VM_Version::supports_mmx()) { if (UseXMMForArrayCopy) { @@ -1454,7 +1454,7 @@ // Loop control: // for (count = -count; count != 0; count++) // Base pointers src, dst are biased by 8*count,to last element. - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_store_element); __ movptr(to_element_addr, elem); // store the oop diff -r e4c77b879561 -r fc3cd2277dc7 src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -871,9 +871,8 @@ } address generate_fp_mask(const char *stub_name, int64_t mask) { + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", stub_name); - - __ align(16); address start = __ pc(); __ emit_data64( mask, relocInfo::none ); @@ -1268,7 +1267,7 @@ Label& L_copy_32_bytes, Label& L_copy_8_bytes) { DEBUG_ONLY(__ stop("enter at entry label, not here")); Label L_loop; - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_loop); if(UseUnalignedLoadStores) { __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); @@ -1309,7 +1308,7 @@ Label& L_copy_32_bytes, Label& L_copy_8_bytes) { DEBUG_ONLY(__ stop("enter at entry label, not here")); Label L_loop; - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_loop); if(UseUnalignedLoadStores) { __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); @@ -2229,7 +2228,7 @@ // Loop control: // for (count = -count; count != 0; count++) // Base pointers src, dst are biased by 8*(count-1),to last element. - __ align(16); + __ align(OptoLoopAlignment); __ BIND(L_store_element); __ store_heap_oop(to_element_addr, rax_oop); // store the oop diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/c1/c1_LinearScan.cpp --- a/src/share/vm/c1/c1_LinearScan.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/c1/c1_LinearScan.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -2608,12 +2608,17 @@ } else if (opr->is_double_xmm()) { assert(opr->fpu_regnrLo() == opr->fpu_regnrHi(), "assumed in calculation"); VMReg rname_first = opr->as_xmm_double_reg()->as_VMReg(); +# ifdef _LP64 + first = new LocationValue(Location::new_reg_loc(Location::dbl, rname_first)); + second = &_int_0_scope_value; +# else first = new LocationValue(Location::new_reg_loc(Location::normal, rname_first)); // %%% This is probably a waste but we'll keep things as they were for now if (true) { VMReg rname_second = rname_first->next(); second = new LocationValue(Location::new_reg_loc(Location::normal, rname_second)); } +# endif #endif } else if (opr->is_double_fpu()) { @@ -2639,13 +2644,17 @@ #endif VMReg rname_first = frame_map()->fpu_regname(opr->fpu_regnrHi()); - +#ifdef _LP64 + first = new LocationValue(Location::new_reg_loc(Location::dbl, rname_first)); + second = &_int_0_scope_value; +#else first = new LocationValue(Location::new_reg_loc(Location::normal, rname_first)); // %%% This is probably a waste but we'll keep things as they were for now if (true) { VMReg rname_second = rname_first->next(); second = new LocationValue(Location::new_reg_loc(Location::normal, rname_second)); } +#endif } else { ShouldNotReachHere(); diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/classfile/classFileParser.cpp --- a/src/share/vm/classfile/classFileParser.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/classfile/classFileParser.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -2956,8 +2956,8 @@ #endif bool compact_fields = CompactFields; int allocation_style = FieldsAllocationStyle; - if( allocation_style < 0 || allocation_style > 1 ) { // Out of range? - assert(false, "0 <= FieldsAllocationStyle <= 1"); + if( allocation_style < 0 || allocation_style > 2 ) { // Out of range? + assert(false, "0 <= FieldsAllocationStyle <= 2"); allocation_style = 1; // Optimistic } @@ -2993,6 +2993,25 @@ } else if( allocation_style == 1 ) { // Fields order: longs/doubles, ints, shorts/chars, bytes, oops next_nonstatic_double_offset = next_nonstatic_field_offset; + } else if( allocation_style == 2 ) { + // Fields allocation: oops fields in super and sub classes are together. + if( nonstatic_field_size > 0 && super_klass() != NULL && + super_klass->nonstatic_oop_map_size() > 0 ) { + int map_size = super_klass->nonstatic_oop_map_size(); + OopMapBlock* first_map = super_klass->start_of_nonstatic_oop_maps(); + OopMapBlock* last_map = first_map + map_size - 1; + int next_offset = last_map->offset() + (last_map->count() * heapOopSize); + if (next_offset == next_nonstatic_field_offset) { + allocation_style = 0; // allocate oops first + next_nonstatic_oop_offset = next_nonstatic_field_offset; + next_nonstatic_double_offset = next_nonstatic_oop_offset + + (nonstatic_oop_count * heapOopSize); + } + } + if( allocation_style == 2 ) { + allocation_style = 1; // allocate oops last + next_nonstatic_double_offset = next_nonstatic_field_offset; + } } else { ShouldNotReachHere(); } diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/code/codeCache.cpp --- a/src/share/vm/code/codeCache.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/code/codeCache.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -1,5 +1,5 @@ /* - * Copyright 1997-2007 Sun Microsystems, Inc. All Rights Reserved. + * Copyright 1997-2010 Sun Microsystems, Inc. All Rights Reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -284,9 +284,11 @@ cur->print_on(tty, is_live ? "scavenge root" : "dead scavenge root"); tty->cr(); } #endif //PRODUCT - if (is_live) + if (is_live) { // Perform cur->oops_do(f), maybe just once per nmethod. f->do_code_blob(cur); + cur->fix_oop_relocations(); + } } // Check for stray marks. diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/memory/threadLocalAllocBuffer.hpp --- a/src/share/vm/memory/threadLocalAllocBuffer.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -111,7 +111,22 @@ // Allocate size HeapWords. The memory is NOT initialized to zero. inline HeapWord* allocate(size_t size); - static size_t alignment_reserve() { return align_object_size(typeArrayOopDesc::header_size(T_INT)); } + + // Reserve space at the end of TLAB + static size_t end_reserve() { + int reserve_size = typeArrayOopDesc::header_size(T_INT); + if (AllocatePrefetchStyle == 3) { + // BIS is used to prefetch - we need a space for it. + // +1 for rounding up to next cache line +1 to be safe + int lines = AllocatePrefetchLines + 2; + int step_size = AllocatePrefetchStepSize; + int distance = AllocatePrefetchDistance; + int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize; + reserve_size = MAX2(reserve_size, prefetch_end); + } + return reserve_size; + } + static size_t alignment_reserve() { return align_object_size(end_reserve()); } static size_t alignment_reserve_in_bytes() { return alignment_reserve() * HeapWordSize; } // Return tlab size or remaining space in eden such that the diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/opto/c2_globals.hpp --- a/src/share/vm/opto/c2_globals.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/opto/c2_globals.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -52,9 +52,6 @@ "Code alignment for interior entry points " \ "in generated code (in bytes)") \ \ - product_pd(intx, OptoLoopAlignment, \ - "Align inner loops to zero relative to this modulus") \ - \ product(intx, MaxLoopPad, (OptoLoopAlignment-1), \ "Align a loop if padding size in bytes is less or equal to this value") \ \ diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/opto/macro.cpp --- a/src/share/vm/opto/macro.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/opto/macro.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -1487,11 +1487,11 @@ Node*& contended_phi_rawmem, Node* old_eden_top, Node* new_eden_top, Node* length) { + enum { fall_in_path = 1, pf_path = 2 }; if( UseTLAB && AllocatePrefetchStyle == 2 ) { // Generate prefetch allocation with watermark check. // As an allocation hits the watermark, we will prefetch starting // at a "distance" away from watermark. - enum { fall_in_path = 1, pf_path = 2 }; Node *pf_region = new (C, 3) RegionNode(3); Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY, @@ -1570,6 +1570,45 @@ needgc_false = pf_region; contended_phi_rawmem = pf_phi_rawmem; i_o = pf_phi_abio; + } else if( UseTLAB && AllocatePrefetchStyle == 3 ) { + // Insert a prefetch for each allocation only on the fast-path + Node *pf_region = new (C, 3) RegionNode(3); + Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY, + TypeRawPtr::BOTTOM ); + + // Generate several prefetch instructions only for arrays. + uint lines = (length != NULL) ? AllocatePrefetchLines : 1; + uint step_size = AllocatePrefetchStepSize; + uint distance = AllocatePrefetchDistance; + + // Next cache address. + Node *cache_adr = new (C, 4) AddPNode(old_eden_top, old_eden_top, + _igvn.MakeConX(distance)); + transform_later(cache_adr); + cache_adr = new (C, 2) CastP2XNode(needgc_false, cache_adr); + transform_later(cache_adr); + Node* mask = _igvn.MakeConX(~(intptr_t)(step_size-1)); + cache_adr = new (C, 3) AndXNode(cache_adr, mask); + transform_later(cache_adr); + cache_adr = new (C, 2) CastX2PNode(cache_adr); + transform_later(cache_adr); + + // Prefetch + Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr ); + prefetch->set_req(0, needgc_false); + transform_later(prefetch); + contended_phi_rawmem = prefetch; + Node *prefetch_adr; + distance = step_size; + for ( uint i = 1; i < lines; i++ ) { + prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr, + _igvn.MakeConX(distance) ); + transform_later(prefetch_adr); + prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr ); + transform_later(prefetch); + distance += step_size; + contended_phi_rawmem = prefetch; + } } else if( AllocatePrefetchStyle > 0 ) { // Insert a prefetch for each allocation only on the fast-path Node *prefetch_adr; diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/opto/memnode.hpp --- a/src/share/vm/opto/memnode.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/opto/memnode.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -1244,5 +1244,5 @@ virtual int Opcode() const; virtual uint ideal_reg() const { return NotAMachineReg; } virtual uint match_edge(uint idx) const { return idx==2; } - virtual const Type *bottom_type() const { return Type::ABIO; } + virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? Type::MEMORY : Type::ABIO; } }; diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/opto/runtime.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -865,7 +865,7 @@ thread->set_exception_stack_size(0); // Check if the exception PC is a MethodHandle call site. - thread->set_is_method_handle_exception(nm->is_method_handle_return(pc)); + thread->set_is_method_handle_return(nm->is_method_handle_return(pc)); } // Restore correct return pc. Was saved above. diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/runtime/globals.hpp --- a/src/share/vm/runtime/globals.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/runtime/globals.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -1052,7 +1052,8 @@ "Use SSE2 MOVDQU instruction for Arraycopy") \ \ product(intx, FieldsAllocationStyle, 1, \ - "0 - type based with oops first, 1 - with oops last") \ + "0 - type based with oops first, 1 - with oops last, " \ + "2 - oops in super and sub classes are together") \ \ product(bool, CompactFields, true, \ "Allocate nonstatic fields in gaps between previous fields") \ @@ -2707,7 +2708,8 @@ product(intx, AllocatePrefetchStyle, 1, \ "0 = no prefetch, " \ "1 = prefetch instructions for each allocation, " \ - "2 = use TLAB watermark to gate allocation prefetch") \ + "2 = use TLAB watermark to gate allocation prefetch, " \ + "3 = use BIS instruction on Sparc for allocation prefetch") \ \ product(intx, AllocatePrefetchDistance, -1, \ "Distance to prefetch ahead of allocation pointer") \ @@ -3110,6 +3112,9 @@ develop_pd(intx, CodeEntryAlignment, \ "Code entry alignment for generated code (in bytes)") \ \ + product_pd(intx, OptoLoopAlignment, \ + "Align inner loops to zero relative to this modulus") \ + \ product_pd(uintx, InitialCodeCacheSize, \ "Initial code cache size (in bytes)") \ \ diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/runtime/sharedRuntime.cpp --- a/src/share/vm/runtime/sharedRuntime.cpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/runtime/sharedRuntime.cpp Tue Apr 13 13:01:37 2010 -0700 @@ -259,13 +259,16 @@ address SharedRuntime::raw_exception_handler_for_return_address(JavaThread* thread, address return_address) { assert(frame::verify_return_pc(return_address), "must be a return pc"); + // Reset MethodHandle flag. + thread->set_is_method_handle_return(false); + // the fastest case first CodeBlob* blob = CodeCache::find_blob(return_address); if (blob != NULL && blob->is_nmethod()) { nmethod* code = (nmethod*)blob; assert(code != NULL, "nmethod must be present"); // Check if the return address is a MethodHandle call site. - thread->set_is_method_handle_exception(code->is_method_handle_return(return_address)); + thread->set_is_method_handle_return(code->is_method_handle_return(return_address)); // native nmethods don't have exception handlers assert(!code->is_native_method(), "no exception handler"); assert(code->header_begin() != code->exception_begin(), "no exception handler"); @@ -292,7 +295,7 @@ nmethod* code = (nmethod*)blob; assert(code != NULL, "nmethod must be present"); // Check if the return address is a MethodHandle call site. - thread->set_is_method_handle_exception(code->is_method_handle_return(return_address)); + thread->set_is_method_handle_return(code->is_method_handle_return(return_address)); assert(code->header_begin() != code->exception_begin(), "no exception handler"); return code->exception_begin(); } diff -r e4c77b879561 -r fc3cd2277dc7 src/share/vm/runtime/thread.hpp --- a/src/share/vm/runtime/thread.hpp Fri Apr 09 15:01:49 2010 -0700 +++ b/src/share/vm/runtime/thread.hpp Tue Apr 13 13:01:37 2010 -0700 @@ -772,7 +772,7 @@ volatile address _exception_pc; // PC where exception happened volatile address _exception_handler_pc; // PC for handler of exception volatile int _exception_stack_size; // Size of frame where exception happened - volatile int _is_method_handle_exception; // True if the current exception PC is at a MethodHandle call. + volatile int _is_method_handle_return; // true (== 1) if the current exception PC is a MethodHandle call site. // support for compilation bool _is_compiling; // is true if a compilation is active inthis thread (one compilation per thread possible) @@ -1108,13 +1108,13 @@ int exception_stack_size() const { return _exception_stack_size; } address exception_pc() const { return _exception_pc; } address exception_handler_pc() const { return _exception_handler_pc; } - int is_method_handle_exception() const { return _is_method_handle_exception; } + bool is_method_handle_return() const { return _is_method_handle_return == 1; } void set_exception_oop(oop o) { _exception_oop = o; } void set_exception_pc(address a) { _exception_pc = a; } void set_exception_handler_pc(address a) { _exception_handler_pc = a; } void set_exception_stack_size(int size) { _exception_stack_size = size; } - void set_is_method_handle_exception(int value) { _is_method_handle_exception = value; } + void set_is_method_handle_return(bool value) { _is_method_handle_return = value ? 1 : 0; } // Stack overflow support inline size_t stack_available(address cur_sp); @@ -1188,7 +1188,7 @@ static ByteSize exception_pc_offset() { return byte_offset_of(JavaThread, _exception_pc ); } static ByteSize exception_handler_pc_offset() { return byte_offset_of(JavaThread, _exception_handler_pc); } static ByteSize exception_stack_size_offset() { return byte_offset_of(JavaThread, _exception_stack_size); } - static ByteSize is_method_handle_exception_offset() { return byte_offset_of(JavaThread, _is_method_handle_exception); } + static ByteSize is_method_handle_return_offset() { return byte_offset_of(JavaThread, _is_method_handle_return); } static ByteSize stack_guard_state_offset() { return byte_offset_of(JavaThread, _stack_guard_state ); } static ByteSize suspend_flags_offset() { return byte_offset_of(JavaThread, _suspend_flags ); }