# HG changeset patch # User trims # Date 1225333329 25200 # Node ID 49ca90d77f34571b0757ebfcb8a7848ef2696b88 # Parent 7c99a4bb76a161b812acaa65b1765b185c6aebbb# Parent 8fb16f19926688f6db75a9ea0055c40a1fd5482b Merge diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -1575,6 +1575,35 @@ emit_operand(src, dst); } +void Assembler::movdqu(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionMark im(this); + emit_byte(0xF3); + prefix(src, dst); + emit_byte(0x0F); + emit_byte(0x6F); + emit_operand(dst, src); +} + +void Assembler::movdqu(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_byte(0xF3); + int encode = prefixq_and_encode(dst->encoding(), src->encoding()); + emit_byte(0x0F); + emit_byte(0x6F); + emit_byte(0xC0 | encode); +} + +void Assembler::movdqu(Address dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionMark im(this); + emit_byte(0xF3); + prefix(dst, src); + emit_byte(0x0F); + emit_byte(0x7F); + emit_operand(src, dst); +} + // Uses zero extension on 64bit void Assembler::movl(Register dst, int32_t imm32) { diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/assembler_x86.hpp --- a/src/cpu/x86/vm/assembler_x86.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/assembler_x86.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -1055,6 +1055,11 @@ void movdqa(XMMRegister dst, Address src); void movdqa(XMMRegister dst, XMMRegister src); + // Move Unaligned Double Quadword + void movdqu(Address dst, XMMRegister src); + void movdqu(XMMRegister dst, Address src); + void movdqu(XMMRegister dst, XMMRegister src); + void movl(Register dst, int32_t imm32); void movl(Address dst, int32_t imm32); void movl(Register dst, Register src); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -791,6 +791,69 @@ } } + + // Copy 64 bytes chunks + // + // Inputs: + // from - source array address + // to_from - destination array address - from + // qword_count - 8-bytes element count, negative + // + void xmm_copy_forward(Register from, Register to_from, Register qword_count) { + assert( UseSSE >= 2, "supported cpu only" ); + Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; + // Copy 64-byte chunks + __ jmpb(L_copy_64_bytes); + __ align(16); + __ BIND(L_copy_64_bytes_loop); + + if(UseUnalignedLoadStores) { + __ movdqu(xmm0, Address(from, 0)); + __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0); + __ movdqu(xmm1, Address(from, 16)); + __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1); + __ movdqu(xmm2, Address(from, 32)); + __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2); + __ movdqu(xmm3, Address(from, 48)); + __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3); + + } else { + __ movq(xmm0, Address(from, 0)); + __ movq(Address(from, to_from, Address::times_1, 0), xmm0); + __ movq(xmm1, Address(from, 8)); + __ movq(Address(from, to_from, Address::times_1, 8), xmm1); + __ movq(xmm2, Address(from, 16)); + __ movq(Address(from, to_from, Address::times_1, 16), xmm2); + __ movq(xmm3, Address(from, 24)); + __ movq(Address(from, to_from, Address::times_1, 24), xmm3); + __ movq(xmm4, Address(from, 32)); + __ movq(Address(from, to_from, Address::times_1, 32), xmm4); + __ movq(xmm5, Address(from, 40)); + __ movq(Address(from, to_from, Address::times_1, 40), xmm5); + __ movq(xmm6, Address(from, 48)); + __ movq(Address(from, to_from, Address::times_1, 48), xmm6); + __ movq(xmm7, Address(from, 56)); + __ movq(Address(from, to_from, Address::times_1, 56), xmm7); + } + + __ addl(from, 64); + __ BIND(L_copy_64_bytes); + __ subl(qword_count, 8); + __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); + __ addl(qword_count, 8); + __ jccb(Assembler::zero, L_exit); + // + // length is too short, just copy qwords + // + __ BIND(L_copy_8_bytes); + __ movq(xmm0, Address(from, 0)); + __ movq(Address(from, to_from, Address::times_1), xmm0); + __ addl(from, 8); + __ decrement(qword_count); + __ jcc(Assembler::greater, L_copy_8_bytes); + __ BIND(L_exit); + } + // Copy 64 bytes chunks // // Inputs: @@ -799,6 +862,7 @@ // qword_count - 8-bytes element count, negative // void mmx_copy_forward(Register from, Register to_from, Register qword_count) { + assert( VM_Version::supports_mmx(), "supported cpu only" ); Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; // Copy 64-byte chunks __ jmpb(L_copy_64_bytes); @@ -876,7 +940,7 @@ __ subptr(to, from); // to --> to_from __ cmpl(count, 2< to_from if (VM_Version::supports_mmx()) { - mmx_copy_forward(from, to_from, count); + if (UseXMMForArrayCopy) { + xmm_copy_forward(from, to_from, count); + } else { + mmx_copy_forward(from, to_from, count); + } } else { __ jmpb(L_copy_8_bytes); __ align(16); @@ -1196,8 +1277,13 @@ __ align(16); __ BIND(L_copy_8_bytes_loop); if (VM_Version::supports_mmx()) { - __ movq(mmx0, Address(from, count, Address::times_8)); - __ movq(Address(to, count, Address::times_8), mmx0); + if (UseXMMForArrayCopy) { + __ movq(xmm0, Address(from, count, Address::times_8)); + __ movq(Address(to, count, Address::times_8), xmm0); + } else { + __ movq(mmx0, Address(from, count, Address::times_8)); + __ movq(Address(to, count, Address::times_8), mmx0); + } } else { __ fild_d(Address(from, count, Address::times_8)); __ fistp_d(Address(to, count, Address::times_8)); @@ -1206,7 +1292,7 @@ __ decrement(count); __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); - if (VM_Version::supports_mmx()) { + if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) { __ emms(); } inc_copy_counter_np(T_LONG); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -1251,6 +1251,7 @@ } } + // Copy big chunks forward // // Inputs: @@ -1268,14 +1269,22 @@ Label L_loop; __ align(16); __ BIND(L_loop); - __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); - __ movq(Address(end_to, qword_count, Address::times_8, -24), to); - __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); - __ movq(Address(end_to, qword_count, Address::times_8, -16), to); - __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); - __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); - __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); - __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); + if(UseUnalignedLoadStores) { + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); + + } else { + __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); + __ movq(Address(end_to, qword_count, Address::times_8, -24), to); + __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); + __ movq(Address(end_to, qword_count, Address::times_8, -16), to); + __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); + __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); + __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); + __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); + } __ BIND(L_copy_32_bytes); __ addptr(qword_count, 4); __ jcc(Assembler::lessEqual, L_loop); @@ -1301,14 +1310,22 @@ Label L_loop; __ align(16); __ BIND(L_loop); - __ movq(to, Address(from, qword_count, Address::times_8, 24)); - __ movq(Address(dest, qword_count, Address::times_8, 24), to); - __ movq(to, Address(from, qword_count, Address::times_8, 16)); - __ movq(Address(dest, qword_count, Address::times_8, 16), to); - __ movq(to, Address(from, qword_count, Address::times_8, 8)); - __ movq(Address(dest, qword_count, Address::times_8, 8), to); - __ movq(to, Address(from, qword_count, Address::times_8, 0)); - __ movq(Address(dest, qword_count, Address::times_8, 0), to); + if(UseUnalignedLoadStores) { + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); + + } else { + __ movq(to, Address(from, qword_count, Address::times_8, 24)); + __ movq(Address(dest, qword_count, Address::times_8, 24), to); + __ movq(to, Address(from, qword_count, Address::times_8, 16)); + __ movq(Address(dest, qword_count, Address::times_8, 16), to); + __ movq(to, Address(from, qword_count, Address::times_8, 8)); + __ movq(Address(dest, qword_count, Address::times_8, 8), to); + __ movq(to, Address(from, qword_count, Address::times_8, 0)); + __ movq(Address(dest, qword_count, Address::times_8, 0), to); + } __ BIND(L_copy_32_bytes); __ subptr(qword_count, 4); __ jcc(Assembler::greaterEqual, L_loop); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/vm_version_x86_32.cpp --- a/src/cpu/x86/vm/vm_version_x86_32.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/vm_version_x86_32.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -242,9 +242,11 @@ _supports_cx8 = supports_cmpxchg8(); // if the OS doesn't support SSE, we can't use this feature even if the HW does if( !os::supports_sse()) - _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A); - if (UseSSE < 4) - _cpuFeatures &= ~CPU_SSE4; + _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2); + if (UseSSE < 4) { + _cpuFeatures &= ~CPU_SSE4_1; + _cpuFeatures &= ~CPU_SSE4_2; + } if (UseSSE < 3) { _cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSSE3; @@ -261,7 +263,7 @@ } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -272,7 +274,8 @@ (supports_sse2() ? ", sse2" : ""), (supports_sse3() ? ", sse3" : ""), (supports_ssse3()? ", ssse3": ""), - (supports_sse4() ? ", sse4" : ""), + (supports_sse4_1() ? ", sse4.1" : ""), + (supports_sse4_2() ? ", sse4.2" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow2() ? ", 3dnowext" : ""), @@ -285,7 +288,7 @@ // older Pentiums which do not support it. if( UseSSE > 4 ) UseSSE=4; if( UseSSE < 0 ) UseSSE=0; - if( !supports_sse4() ) // Drop to 3 if no SSE4 support + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support UseSSE = MIN2((intx)3,UseSSE); if( !supports_sse3() ) // Drop to 2 if no SSE3 support UseSSE = MIN2((intx)2,UseSSE); @@ -375,6 +378,14 @@ MaxLoopPad = 11; } #endif // COMPILER2 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus + } + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus + } + } } } @@ -413,7 +424,7 @@ #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { - tty->print_cr("Logical CPUs per package: %u", + tty->print_cr("Logical CPUs per core: %u", logical_processors_per_package()); tty->print_cr("UseSSE=%d",UseSSE); tty->print("Allocation: "); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/vm_version_x86_32.hpp --- a/src/cpu/x86/vm/vm_version_x86_32.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/vm_version_x86_32.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -68,9 +68,9 @@ cmpxchg16: 1, : 4, dca : 1, - : 4, - popcnt : 1, - : 8; + sse4_1 : 1, + sse4_2 : 1, + : 11; } bits; }; @@ -177,8 +177,9 @@ CPU_SSE2 = (1 << 7), CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX) CPU_SSSE3= (1 << 9), - CPU_SSE4 = (1 <<10), - CPU_SSE4A= (1 <<11) + CPU_SSE4A= (1 <<10), + CPU_SSE4_1 = (1 << 11), + CPU_SSE4_2 = (1 << 12) } cpuFeatureFlags; // cpuid information block. All info derived from executing cpuid with @@ -240,22 +241,14 @@ static CpuidInfo _cpuid_info; // Extractors and predicates - static bool is_extended_cpu_family() { - const uint32_t Extended_Cpu_Family = 0xf; - return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family; - } static uint32_t extended_cpu_family() { uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family; - if (is_extended_cpu_family()) { - result += _cpuid_info.std_cpuid1_rax.bits.ext_family; - } + result += _cpuid_info.std_cpuid1_rax.bits.ext_family; return result; } static uint32_t extended_cpu_model() { uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model; - if (is_extended_cpu_family()) { - result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; - } + result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; return result; } static uint32_t cpu_stepping() { @@ -293,6 +286,10 @@ result |= CPU_SSSE3; if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0) result |= CPU_SSE4A; + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0) + result |= CPU_SSE4_1; + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0) + result |= CPU_SSE4_2; return result; } @@ -380,7 +377,8 @@ static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } // // AMD features // diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/vm_version_x86_64.cpp --- a/src/cpu/x86/vm/vm_version_x86_64.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/vm_version_x86_64.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -186,8 +186,10 @@ if (!VM_Version::supports_sse2()) { vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported"); } - if (UseSSE < 4) - _cpuFeatures &= ~CPU_SSE4; + if (UseSSE < 4) { + _cpuFeatures &= ~CPU_SSE4_1; + _cpuFeatures &= ~CPU_SSE4_2; + } if (UseSSE < 3) { _cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSSE3; @@ -204,7 +206,7 @@ } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -215,7 +217,8 @@ (supports_sse2() ? ", sse2" : ""), (supports_sse3() ? ", sse3" : ""), (supports_ssse3()? ", ssse3": ""), - (supports_sse4() ? ", sse4" : ""), + (supports_sse4_1() ? ", sse4.1" : ""), + (supports_sse4_2() ? ", sse4.2" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow2() ? ", 3dnowext" : ""), @@ -228,7 +231,7 @@ // older Pentiums which do not support it. if( UseSSE > 4 ) UseSSE=4; if( UseSSE < 0 ) UseSSE=0; - if( !supports_sse4() ) // Drop to 3 if no SSE4 support + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support UseSSE = MIN2((intx)3,UseSSE); if( !supports_sse3() ) // Drop to 2 if no SSE3 support UseSSE = MIN2((intx)2,UseSSE); @@ -314,6 +317,14 @@ MaxLoopPad = 11; } #endif // COMPILER2 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus + } + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus + } + } } } @@ -355,7 +366,7 @@ #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { - tty->print_cr("Logical CPUs per package: %u", + tty->print_cr("Logical CPUs per core: %u", logical_processors_per_package()); tty->print_cr("UseSSE=%d",UseSSE); tty->print("Allocation: "); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/vm_version_x86_64.hpp --- a/src/cpu/x86/vm/vm_version_x86_64.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/vm_version_x86_64.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -68,9 +68,9 @@ cmpxchg16: 1, : 4, dca : 1, - : 4, - popcnt : 1, - : 8; + sse4_1 : 1, + sse4_2 : 1, + : 11; } bits; }; @@ -177,8 +177,9 @@ CPU_SSE2 = (1 << 7), CPU_SSE3 = (1 << 8), CPU_SSSE3= (1 << 9), - CPU_SSE4 = (1 <<10), - CPU_SSE4A= (1 <<11) + CPU_SSE4A= (1 <<10), + CPU_SSE4_1 = (1 << 11), + CPU_SSE4_2 = (1 << 12) } cpuFeatureFlags; // cpuid information block. All info derived from executing cpuid with @@ -240,22 +241,14 @@ static CpuidInfo _cpuid_info; // Extractors and predicates - static bool is_extended_cpu_family() { - const uint32_t Extended_Cpu_Family = 0xf; - return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family; - } static uint32_t extended_cpu_family() { uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; - if (is_extended_cpu_family()) { - result += _cpuid_info.std_cpuid1_eax.bits.ext_family; - } + result += _cpuid_info.std_cpuid1_eax.bits.ext_family; return result; } static uint32_t extended_cpu_model() { uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; - if (is_extended_cpu_family()) { - result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; - } + result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; return result; } static uint32_t cpu_stepping() { @@ -293,6 +286,10 @@ result |= CPU_SSSE3; if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) result |= CPU_SSE4A; + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0) + result |= CPU_SSE4_1; + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0) + result |= CPU_SSE4_2; return result; } @@ -380,7 +377,8 @@ static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } // // AMD features // diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/x86_32.ad --- a/src/cpu/x86/vm/x86_32.ad Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/x86_32.ad Wed Oct 29 19:22:09 2008 -0700 @@ -4810,6 +4810,16 @@ interface(CONST_INTER); %} +// Long Immediate zero +operand immL_M1() %{ + predicate( n->get_long() == -1L ); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + // Long immediate from 0 to 127. // Used for a shorter form of long mul by 10. operand immL_127() %{ @@ -8621,6 +8631,18 @@ ins_pipe( ialu_reg_reg ); %} +// Xor Register with Immediate -1 +instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{ + match(Set dst (XorI dst imm)); + + size(2); + format %{ "NOT $dst" %} + ins_encode %{ + __ notl($dst$$Register); + %} + ins_pipe( ialu_reg ); +%} + // Xor Register with Immediate instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{ match(Set dst (XorI dst src)); @@ -8938,6 +8960,18 @@ ins_pipe( ialu_reg_reg_long ); %} +// Xor Long Register with Immediate -1 +instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{ + match(Set dst (XorL dst imm)); + format %{ "NOT $dst.lo\n\t" + "NOT $dst.hi" %} + ins_encode %{ + __ notl($dst$$Register); + __ notl(HIGH_FROM_LOW($dst$$Register)); + %} + ins_pipe( ialu_reg_long ); +%} + // Xor Long Register with Immediate instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{ match(Set dst (XorL dst src)); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/cpu/x86/vm/x86_64.ad --- a/src/cpu/x86/vm/x86_64.ad Wed Oct 29 19:18:54 2008 -0700 +++ b/src/cpu/x86/vm/x86_64.ad Wed Oct 29 19:22:09 2008 -0700 @@ -9309,6 +9309,17 @@ ins_pipe(ialu_reg_reg); %} +// Xor Register with Immediate -1 +instruct xorI_rReg_im1(rRegI dst, immI_M1 imm) %{ + match(Set dst (XorI dst imm)); + + format %{ "not $dst" %} + ins_encode %{ + __ notl($dst$$Register); + %} + ins_pipe(ialu_reg); +%} + // Xor Register with Immediate instruct xorI_rReg_imm(rRegI dst, immI src, rFlagsReg cr) %{ @@ -9529,6 +9540,17 @@ ins_pipe(ialu_reg_reg); %} +// Xor Register with Immediate -1 +instruct xorL_rReg_im1(rRegL dst, immL_M1 imm) %{ + match(Set dst (XorL dst imm)); + + format %{ "notq $dst" %} + ins_encode %{ + __ notq($dst$$Register); + %} + ins_pipe(ialu_reg); +%} + // Xor Register with Immediate instruct xorL_rReg_imm(rRegL dst, immL32 src, rFlagsReg cr) %{ diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/os/solaris/vm/os_solaris.cpp diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/includeDB_features --- a/src/share/vm/includeDB_features Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/includeDB_features Wed Oct 29 19:22:09 2008 -0700 @@ -99,6 +99,7 @@ heapDumper.cpp reflectionUtils.hpp heapDumper.cpp symbolTable.hpp heapDumper.cpp systemDictionary.hpp +heapDumper.cpp threadService.hpp heapDumper.cpp universe.hpp heapDumper.cpp vframe.hpp heapDumper.cpp vmGCOperations.hpp diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/memory/compactingPermGenGen.hpp --- a/src/share/vm/memory/compactingPermGenGen.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/memory/compactingPermGenGen.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -100,7 +100,7 @@ enum { vtbl_list_size = 16, // number of entries in the shared space vtable list. - num_virtuals = 100 // number of virtual methods in Klass (or + num_virtuals = 200 // number of virtual methods in Klass (or // subclass) objects, or greater. }; diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/memory/dump.cpp --- a/src/share/vm/memory/dump.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/memory/dump.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -818,6 +818,40 @@ // across the space while doing this, as that causes the vtables to be // patched, undoing our useful work. Instead, iterate to make a list, // then use the list to do the fixing. +// +// Our constructed vtables: +// Dump time: +// 1. init_self_patching_vtbl_list: table of pointers to current virtual method addrs +// 2. generate_vtable_methods: create jump table, appended to above vtbl_list +// 3. PatchKlassVtables: for Klass list, patch the vtable entry to point to jump table +// rather than to current vtbl +// Table layout: NOTE FIXED SIZE +// 1. vtbl pointers +// 2. #Klass X #virtual methods per Klass +// 1 entry for each, in the order: +// Klass1:method1 entry, Klass1:method2 entry, ... Klass1:method entry +// Klass2:method1 entry, Klass2:method2 entry, ... Klass2:method entry +// ... +// Klass:method1 entry, Klass:method2 entry, +// ... Klass:method entry +// Sample entry: (Sparc): +// save(sp, -256, sp) +// ba,pt common_code +// mov XXX, %L0 %L0 gets: Klass index <<8 + method index (note: max method index 255) +// +// Restore time: +// 1. initialize_oops: reserve space for table +// 2. init_self_patching_vtbl_list: update pointers to NEW virtual method addrs in text +// +// Execution time: +// First virtual method call for any object of these Klass types: +// 1. object->klass->klass_part +// 2. vtable entry for that klass_part points to the jump table entries +// 3. branches to common_code with %O0/klass_part, %L0: Klass index <<8 + method index +// 4. common_code: +// Get address of new vtbl pointer for this Klass from updated table +// Update new vtbl pointer in the Klass: future virtual calls go direct +// Jump to method, using new vtbl pointer and method index class PatchKlassVtables: public ObjectClosure { private: diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/oops/objArrayKlass.cpp --- a/src/share/vm/oops/objArrayKlass.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/oops/objArrayKlass.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -475,8 +475,8 @@ assert(Universe::is_bootstrapping(), "partial objArray only at startup"); return JVM_ACC_ABSTRACT | JVM_ACC_FINAL | JVM_ACC_PUBLIC; } - // Recurse down the element list - jint element_flags = Klass::cast(element_klass())->compute_modifier_flags(CHECK_0); + // Return the flags of the bottom element type. + jint element_flags = Klass::cast(bottom_klass())->compute_modifier_flags(CHECK_0); return (element_flags & (JVM_ACC_PUBLIC | JVM_ACC_PRIVATE | JVM_ACC_PROTECTED)) | (JVM_ACC_ABSTRACT | JVM_ACC_FINAL); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/addnode.cpp --- a/src/share/vm/opto/addnode.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/addnode.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -156,7 +156,8 @@ if( add1_op == this_op && !con_right ) { Node *a12 = add1->in(2); const Type *t12 = phase->type( a12 ); - if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) ) { + if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) && + !(add1->in(1)->is_Phi() && add1->in(1)->as_Phi()->is_tripcount()) ) { assert(add1->in(1) != this, "dead loop in AddNode::Ideal"); add2 = add1->clone(); add2->set_req(2, in(2)); @@ -173,7 +174,8 @@ if( add2_op == this_op && !con_left ) { Node *a22 = add2->in(2); const Type *t22 = phase->type( a22 ); - if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) ) { + if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) && + !(add2->in(1)->is_Phi() && add2->in(1)->as_Phi()->is_tripcount()) ) { assert(add2->in(1) != this, "dead loop in AddNode::Ideal"); Node *addx = add2->clone(); addx->set_req(1, in(1)); @@ -225,34 +227,63 @@ //============================================================================= //------------------------------Idealize--------------------------------------- Node *AddINode::Ideal(PhaseGVN *phase, bool can_reshape) { - int op1 = in(1)->Opcode(); - int op2 = in(2)->Opcode(); + Node* in1 = in(1); + Node* in2 = in(2); + int op1 = in1->Opcode(); + int op2 = in2->Opcode(); // Fold (con1-x)+con2 into (con1+con2)-x + if ( op1 == Op_AddI && op2 == Op_SubI ) { + // Swap edges to try optimizations below + in1 = in2; + in2 = in(1); + op1 = op2; + op2 = in2->Opcode(); + } if( op1 == Op_SubI ) { - const Type *t_sub1 = phase->type( in(1)->in(1) ); - const Type *t_2 = phase->type( in(2) ); + const Type *t_sub1 = phase->type( in1->in(1) ); + const Type *t_2 = phase->type( in2 ); if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP ) return new (phase->C, 3) SubINode(phase->makecon( add_ring( t_sub1, t_2 ) ), - in(1)->in(2) ); + in1->in(2) ); // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)" if( op2 == Op_SubI ) { // Check for dead cycle: d = (a-b)+(c-d) - assert( in(1)->in(2) != this && in(2)->in(2) != this, + assert( in1->in(2) != this && in2->in(2) != this, "dead loop in AddINode::Ideal" ); Node *sub = new (phase->C, 3) SubINode(NULL, NULL); - sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in(1)->in(1), in(2)->in(1) ) )); - sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in(1)->in(2), in(2)->in(2) ) )); + sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in1->in(1), in2->in(1) ) )); + sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in1->in(2), in2->in(2) ) )); return sub; } + // Convert "(a-b)+(b+c)" into "(a+c)" + if( op2 == Op_AddI && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) AddINode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c+b)" into "(a+c)" + if( op2 == Op_AddI && in1->in(2) == in2->in(2) ) { + assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) AddINode(in1->in(1), in2->in(1)); + } + // Convert "(a-b)+(b-c)" into "(a-c)" + if( op2 == Op_SubI && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) SubINode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c-a)" into "(c-b)" + if( op2 == Op_SubI && in1->in(1) == in2->in(2) ) { + assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) SubINode(in2->in(1), in1->in(2)); + } } // Convert "x+(0-y)" into "(x-y)" - if( op2 == Op_SubI && phase->type(in(2)->in(1)) == TypeInt::ZERO ) - return new (phase->C, 3) SubINode(in(1), in(2)->in(2) ); + if( op2 == Op_SubI && phase->type(in2->in(1)) == TypeInt::ZERO ) + return new (phase->C, 3) SubINode(in1, in2->in(2) ); // Convert "(0-y)+x" into "(x-y)" - if( op1 == Op_SubI && phase->type(in(1)->in(1)) == TypeInt::ZERO ) - return new (phase->C, 3) SubINode( in(2), in(1)->in(2) ); + if( op1 == Op_SubI && phase->type(in1->in(1)) == TypeInt::ZERO ) + return new (phase->C, 3) SubINode( in2, in1->in(2) ); // Convert (x>>>z)+y into (x+(y<>>z for small constant z and y. // Helps with array allocation math constant folding @@ -266,15 +297,15 @@ // Have not observed cases where type information exists to support // positive y and (x <= -(y << z)) if( op1 == Op_URShiftI && op2 == Op_ConI && - in(1)->in(2)->Opcode() == Op_ConI ) { - jint z = phase->type( in(1)->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter - jint y = phase->type( in(2) )->is_int()->get_con(); + in1->in(2)->Opcode() == Op_ConI ) { + jint z = phase->type( in1->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter + jint y = phase->type( in2 )->is_int()->get_con(); if( z < 5 && -5 < y && y < 0 ) { - const Type *t_in11 = phase->type(in(1)->in(1)); + const Type *t_in11 = phase->type(in1->in(1)); if( t_in11 != Type::TOP && (t_in11->is_int()->_lo >= -(y << z)) ) { - Node *a = phase->transform( new (phase->C, 3) AddINode( in(1)->in(1), phase->intcon(y<C, 3) URShiftINode( a, in(1)->in(2) ); + Node *a = phase->transform( new (phase->C, 3) AddINode( in1->in(1), phase->intcon(y<C, 3) URShiftINode( a, in1->in(2) ); } } } @@ -328,39 +359,73 @@ //============================================================================= //------------------------------Idealize--------------------------------------- Node *AddLNode::Ideal(PhaseGVN *phase, bool can_reshape) { - int op1 = in(1)->Opcode(); - int op2 = in(2)->Opcode(); + Node* in1 = in(1); + Node* in2 = in(2); + int op1 = in1->Opcode(); + int op2 = in2->Opcode(); + // Fold (con1-x)+con2 into (con1+con2)-x + if ( op1 == Op_AddL && op2 == Op_SubL ) { + // Swap edges to try optimizations below + in1 = in2; + in2 = in(1); + op1 = op2; + op2 = in2->Opcode(); + } // Fold (con1-x)+con2 into (con1+con2)-x if( op1 == Op_SubL ) { - const Type *t_sub1 = phase->type( in(1)->in(1) ); - const Type *t_2 = phase->type( in(2) ); + const Type *t_sub1 = phase->type( in1->in(1) ); + const Type *t_2 = phase->type( in2 ); if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP ) return new (phase->C, 3) SubLNode(phase->makecon( add_ring( t_sub1, t_2 ) ), - in(1)->in(2) ); + in1->in(2) ); // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)" if( op2 == Op_SubL ) { // Check for dead cycle: d = (a-b)+(c-d) - assert( in(1)->in(2) != this && in(2)->in(2) != this, + assert( in1->in(2) != this && in2->in(2) != this, "dead loop in AddLNode::Ideal" ); Node *sub = new (phase->C, 3) SubLNode(NULL, NULL); - sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(1), in(2)->in(1) ) )); - sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(2), in(2)->in(2) ) )); + sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in1->in(1), in2->in(1) ) )); + sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in1->in(2), in2->in(2) ) )); return sub; } + // Convert "(a-b)+(b+c)" into "(a+c)" + if( op2 == Op_AddL && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) AddLNode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c+b)" into "(a+c)" + if( op2 == Op_AddL && in1->in(2) == in2->in(2) ) { + assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) AddLNode(in1->in(1), in2->in(1)); + } + // Convert "(a-b)+(b-c)" into "(a-c)" + if( op2 == Op_SubL && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) SubLNode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c-a)" into "(c-b)" + if( op2 == Op_SubL && in1->in(1) == in1->in(2) ) { + assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) SubLNode(in2->in(1), in1->in(2)); + } } // Convert "x+(0-y)" into "(x-y)" - if( op2 == Op_SubL && phase->type(in(2)->in(1)) == TypeLong::ZERO ) - return new (phase->C, 3) SubLNode(in(1), in(2)->in(2) ); + if( op2 == Op_SubL && phase->type(in2->in(1)) == TypeLong::ZERO ) + return new (phase->C, 3) SubLNode( in1, in2->in(2) ); + + // Convert "(0-y)+x" into "(x-y)" + if( op1 == Op_SubL && phase->type(in1->in(1)) == TypeInt::ZERO ) + return new (phase->C, 3) SubLNode( in2, in1->in(2) ); // Convert "X+X+X+X+X...+X+Y" into "k*X+Y" or really convert "X+(X+Y)" // into "(X<<1)+Y" and let shift-folding happen. if( op2 == Op_AddL && - in(2)->in(1) == in(1) && + in2->in(1) == in1 && op1 != Op_ConL && 0 ) { - Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in(1),phase->intcon(1))); - return new (phase->C, 3) AddLNode(shift,in(2)->in(2)); + Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in1,phase->intcon(1))); + return new (phase->C, 3) AddLNode(shift,in2->in(2)); } return AddNode::Ideal(phase, can_reshape); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/cfgnode.cpp --- a/src/share/vm/opto/cfgnode.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/cfgnode.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -1817,6 +1817,12 @@ return progress; // Return any progress } +//------------------------------is_tripcount----------------------------------- +bool PhiNode::is_tripcount() const { + return (in(0) != NULL && in(0)->is_CountedLoop() && + in(0)->as_CountedLoop()->phi() == this); +} + //------------------------------out_RegMask------------------------------------ const RegMask &PhiNode::in_RegMask(uint i) const { return i ? out_RegMask() : RegMask::Empty; @@ -1832,9 +1838,7 @@ #ifndef PRODUCT void PhiNode::dump_spec(outputStream *st) const { TypeNode::dump_spec(st); - if (in(0) != NULL && - in(0)->is_CountedLoop() && - in(0)->as_CountedLoop()->phi() == this) { + if (is_tripcount()) { st->print(" #tripcount"); } } diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/cfgnode.hpp --- a/src/share/vm/opto/cfgnode.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/cfgnode.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -162,6 +162,8 @@ return NULL; // not a copy! } + bool is_tripcount() const; + // Determine a unique non-trivial input, if any. // Ignore casts if it helps. Return NULL on failure. Node* unique_input(PhaseTransform *phase); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/divnode.cpp --- a/src/share/vm/opto/divnode.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/divnode.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -110,10 +110,13 @@ } else if( dividend->Opcode() == Op_AndI ) { // An AND mask of sufficient size clears the low bits and // I can avoid rounding. - const TypeInt *andconi = phase->type( dividend->in(2) )->isa_int(); - if( andconi && andconi->is_con(-d) ) { - dividend = dividend->in(1); - needs_rounding = false; + const TypeInt *andconi_t = phase->type( dividend->in(2) )->isa_int(); + if( andconi_t && andconi_t->is_con() ) { + jint andconi = andconi_t->get_con(); + if( andconi < 0 && is_power_of_2(-andconi) && (-andconi) >= d ) { + dividend = dividend->in(1); + needs_rounding = false; + } } } @@ -316,10 +319,13 @@ } else if( dividend->Opcode() == Op_AndL ) { // An AND mask of sufficient size clears the low bits and // I can avoid rounding. - const TypeLong *andconl = phase->type( dividend->in(2) )->isa_long(); - if( andconl && andconl->is_con(-d)) { - dividend = dividend->in(1); - needs_rounding = false; + const TypeLong *andconl_t = phase->type( dividend->in(2) )->isa_long(); + if( andconl_t && andconl_t->is_con() ) { + jlong andconl = andconl_t->get_con(); + if( andconl < 0 && is_power_of_2_long(-andconl) && (-andconl) >= d ) { + dividend = dividend->in(1); + needs_rounding = false; + } } } @@ -704,11 +710,18 @@ if( t2 == TypeD::ONE ) return t1; - // If divisor is a constant and not zero, divide them numbers - if( t1->base() == Type::DoubleCon && - t2->base() == Type::DoubleCon && - t2->getd() != 0.0 ) // could be negative zero - return TypeD::make( t1->getd()/t2->getd() ); +#if defined(IA32) + if (!phase->C->method()->is_strict()) + // Can't trust native compilers to properly fold strict double + // division with round-to-zero on this platform. +#endif + { + // If divisor is a constant and not zero, divide them numbers + if( t1->base() == Type::DoubleCon && + t2->base() == Type::DoubleCon && + t2->getd() != 0.0 ) // could be negative zero + return TypeD::make( t1->getd()/t2->getd() ); + } // If the dividend is a constant zero // Note: if t1 and t2 are zero then result is NaN (JVMS page 213) diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/loopTransform.cpp --- a/src/share/vm/opto/loopTransform.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/loopTransform.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -679,6 +679,10 @@ CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop(); post_head->set_post_loop(main_head); + // Reduce the post-loop trip count. + CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd(); + post_end->_prob = PROB_FAIR; + // Build the main-loop normal exit. IfFalseNode *new_main_exit = new (C, 1) IfFalseNode(main_end); _igvn.register_new_node_with_optimizer( new_main_exit ); @@ -748,6 +752,9 @@ pre_head->set_pre_loop(main_head); Node *pre_incr = old_new[incr->_idx]; + // Reduce the pre-loop trip count. + pre_end->_prob = PROB_FAIR; + // Find the pre-loop normal exit. Node* pre_exit = pre_end->proj_out(false); assert( pre_exit->Opcode() == Op_IfFalse, "" ); @@ -767,8 +774,8 @@ register_new_node( min_cmp , new_pre_exit ); register_new_node( min_bol , new_pre_exit ); - // Build the IfNode - IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_FAIR, COUNT_UNKNOWN ); + // Build the IfNode (assume the main-loop is executed always). + IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN ); _igvn.register_new_node_with_optimizer( min_iff ); set_idom(min_iff, new_pre_exit, dd_main_head); set_loop(min_iff, loop->_parent); @@ -1583,10 +1590,10 @@ //============================================================================= //------------------------------iteration_split_impl--------------------------- -void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) { +bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) { // Check and remove empty loops (spam micro-benchmarks) if( policy_do_remove_empty_loop(phase) ) - return; // Here we removed an empty loop + return true; // Here we removed an empty loop bool should_peel = policy_peeling(phase); // Should we peel? @@ -1596,7 +1603,8 @@ // This removes loop-invariant tests (usually null checks). if( !_head->is_CountedLoop() ) { // Non-counted loop if (PartialPeelLoop && phase->partial_peel(this, old_new)) { - return; + // Partial peel succeeded so terminate this round of loop opts + return false; } if( should_peel ) { // Should we peel? #ifndef PRODUCT @@ -1606,14 +1614,14 @@ } else if( should_unswitch ) { phase->do_unswitching(this, old_new); } - return; + return true; } CountedLoopNode *cl = _head->as_CountedLoop(); - if( !cl->loopexit() ) return; // Ignore various kinds of broken loops + if( !cl->loopexit() ) return true; // Ignore various kinds of broken loops // Do nothing special to pre- and post- loops - if( cl->is_pre_loop() || cl->is_post_loop() ) return; + if( cl->is_pre_loop() || cl->is_post_loop() ) return true; // Compute loop trip count from profile data compute_profile_trip_cnt(phase); @@ -1626,11 +1634,11 @@ // Here we did some unrolling and peeling. Eventually we will // completely unroll this loop and it will no longer be a loop. phase->do_maximally_unroll(this,old_new); - return; + return true; } if (should_unswitch) { phase->do_unswitching(this, old_new); - return; + return true; } } @@ -1691,14 +1699,16 @@ if( should_peel ) // Might want to peel but do nothing else phase->do_peeling(this,old_new); } + return true; } //============================================================================= //------------------------------iteration_split-------------------------------- -void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) { +bool IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) { // Recursively iteration split nested loops - if( _child ) _child->iteration_split( phase, old_new ); + if( _child && !_child->iteration_split( phase, old_new )) + return false; // Clean out prior deadwood DCE_loop_body(); @@ -1720,7 +1730,9 @@ _allow_optimizations && !tail()->is_top() ) { // Also ignore the occasional dead backedge if (!_has_call) { - iteration_split_impl( phase, old_new ); + if (!iteration_split_impl( phase, old_new )) { + return false; + } } else if (policy_unswitching(phase)) { phase->do_unswitching(this, old_new); } @@ -1729,5 +1741,7 @@ // Minor offset re-organization to remove loop-fallout uses of // trip counter. if( _head->is_CountedLoop() ) phase->reorg_offsets( this ); - if( _next ) _next->iteration_split( phase, old_new ); + if( _next && !_next->iteration_split( phase, old_new )) + return false; + return true; } diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/loopnode.hpp --- a/src/share/vm/opto/loopnode.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/loopnode.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -325,12 +325,14 @@ // Returns TRUE if loop tree is structurally changed. bool beautify_loops( PhaseIdealLoop *phase ); - // Perform iteration-splitting on inner loops. Split iterations to avoid - // range checks or one-shot null checks. - void iteration_split( PhaseIdealLoop *phase, Node_List &old_new ); + // Perform iteration-splitting on inner loops. Split iterations to + // avoid range checks or one-shot null checks. Returns false if the + // current round of loop opts should stop. + bool iteration_split( PhaseIdealLoop *phase, Node_List &old_new ); - // Driver for various flavors of iteration splitting - void iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ); + // Driver for various flavors of iteration splitting. Returns false + // if the current round of loop opts should stop. + bool iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ); // Given dominators, try to find loops with calls that must always be // executed (call dominates loop tail). These loops do not need non-call diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/loopopts.cpp --- a/src/share/vm/opto/loopopts.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/loopopts.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -1903,9 +1903,6 @@ // Use in a phi is considered a use in the associated predecessor block use_c = use->in(0)->in(j); } - if (use_c->is_CountedLoop()) { - use_c = use_c->in(LoopNode::EntryControl); - } set_ctrl(n_clone, use_c); assert(!loop->is_member(get_loop(use_c)), "should be outside loop"); get_loop(use_c)->_body.push(n_clone); diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/mulnode.cpp --- a/src/share/vm/opto/mulnode.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/mulnode.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -152,6 +152,14 @@ if( t1 == Type::BOTTOM || t2 == Type::BOTTOM ) return bottom_type(); +#if defined(IA32) + // Can't trust native compilers to properly fold strict double + // multiplication with round-to-zero on this platform. + if (op == Op_MulD && phase->C->method()->is_strict()) { + return TypeD::DOUBLE; + } +#endif + return mul_ring(t1,t2); // Local flavor of type multiplication } @@ -360,7 +368,7 @@ // Compute the product type of two double ranges into this node. const Type *MulDNode::mul_ring(const Type *t0, const Type *t1) const { if( t0 == Type::DOUBLE || t1 == Type::DOUBLE ) return Type::DOUBLE; - // We must be adding 2 double constants. + // We must be multiplying 2 double constants. return TypeD::make( t0->getd() * t1->getd() ); } diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/node.hpp --- a/src/share/vm/opto/node.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/node.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -1320,7 +1320,8 @@ Node *pop() { if( _clock_index >= size() ) _clock_index = 0; Node *b = at(_clock_index); - map( _clock_index++, Node_List::pop()); + map( _clock_index, Node_List::pop()); + if (size() != 0) _clock_index++; // Always start from 0 _in_worklist >>= b->_idx; return b; } diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/postaloc.cpp --- a/src/share/vm/opto/postaloc.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/postaloc.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -34,7 +34,7 @@ #endif } -//------------------------------may_be_copy_of_callee----------------------------- +//---------------------------may_be_copy_of_callee----------------------------- // Check to see if we can possibly be a copy of a callee-save value. bool PhaseChaitin::may_be_copy_of_callee( Node *def ) const { // Short circuit if there are no callee save registers @@ -225,6 +225,20 @@ // Scan all registers to see if this value is around already for( uint reg = 0; reg < (uint)_max_reg; reg++ ) { + if (reg == (uint)nk_reg) { + // Found ourselves so check if there is only one user of this + // copy and keep on searching for a better copy if so. + bool ignore_self = true; + x = n->in(k); + DUIterator_Fast imax, i = x->fast_outs(imax); + Node* first = x->fast_out(i); i++; + while (i < imax && ignore_self) { + Node* use = x->fast_out(i); i++; + if (use != first) ignore_self = false; + } + if (ignore_self) continue; + } + Node *vv = value[reg]; if( !single ) { // Doubles check for aligned-adjacent pair if( (reg&1)==0 ) continue; // Wrong half of a pair diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/opto/subnode.cpp --- a/src/share/vm/opto/subnode.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/opto/subnode.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -206,6 +206,14 @@ if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(2) ) return new (phase->C, 3) SubINode( in1->in(1), in2->in(1) ); + // Convert "(A+X) - (X+B)" into "A - B" + if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(1) ) + return new (phase->C, 3) SubINode( in1->in(1), in2->in(2) ); + + // Convert "(X+A) - (B+X)" into "A - B" + if( op1 == Op_AddI && op2 == Op_AddI && in1->in(1) == in2->in(2) ) + return new (phase->C, 3) SubINode( in1->in(2), in2->in(1) ); + // Convert "A-(B-C)" into (A+C)-B", since add is commutative and generally // nicer to optimize than subtract. if( op2 == Op_SubI && in2->outcnt() == 1) { diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/runtime/globals.hpp --- a/src/share/vm/runtime/globals.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/runtime/globals.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -997,6 +997,12 @@ product(bool, UseXmmI2F, false, \ "Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \ \ + product(bool, UseXMMForArrayCopy, false, \ + "Use SSE2 MOVQ instruction for Arraycopy") \ + \ + product(bool, UseUnalignedLoadStores, false, \ + "Use SSE2 MOVDQU instruction for Arraycopy") \ + \ product(intx, FieldsAllocationStyle, 1, \ "0 - type based with oops first, 1 - with oops last") \ \ @@ -2555,7 +2561,7 @@ develop(intx, MaxRecursiveInlineLevel, 1, \ "maximum number of nested recursive calls that are inlined") \ \ - develop(intx, InlineSmallCode, 1000, \ + product(intx, InlineSmallCode, 1000, \ "Only inline already compiled methods if their code size is " \ "less than this") \ \ diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/services/heapDumper.cpp --- a/src/share/vm/services/heapDumper.cpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/services/heapDumper.cpp Wed Oct 29 19:22:09 2008 -0700 @@ -343,7 +343,8 @@ // Default stack trace ID (used for dummy HPROF_TRACE record) enum { - STACK_TRACE_ID = 1 + STACK_TRACE_ID = 1, + INITIAL_CLASS_COUNT = 200 }; @@ -408,6 +409,7 @@ void write_u8(u8 x); void write_objectID(oop o); void write_classID(Klass* k); + void write_id(u4 x); }; DumpWriter::DumpWriter(const char* path) { @@ -548,6 +550,14 @@ #endif } +void DumpWriter::write_id(u4 x) { +#ifdef _LP64 + write_u8((u8) x); +#else + write_u4(x); +#endif +} + // We use java mirror as the class ID void DumpWriter::write_classID(Klass* k) { write_objectID(k->java_mirror()); @@ -596,6 +606,8 @@ static void dump_object_array(DumpWriter* writer, objArrayOop array); // creates HPROF_GC_PRIM_ARRAY_DUMP record for the given type array static void dump_prim_array(DumpWriter* writer, typeArrayOop array); + // create HPROF_FRAME record for the given method and bci + static void dump_stack_frame(DumpWriter* writer, int frame_serial_num, int class_serial_num, methodOop m, int bci); }; // write a header of the given type @@ -1070,6 +1082,29 @@ } } +// create a HPROF_FRAME record of the given methodOop and bci +void DumperSupport::dump_stack_frame(DumpWriter* writer, + int frame_serial_num, + int class_serial_num, + methodOop m, + int bci) { + int line_number; + if (m->is_native()) { + line_number = -3; // native frame + } else { + line_number = m->line_number_from_bci(bci); + } + + write_header(writer, HPROF_FRAME, 4*oopSize + 2*sizeof(u4)); + writer->write_id(frame_serial_num); // frame serial number + writer->write_objectID(m->name()); // method's name + writer->write_objectID(m->signature()); // method's signature + + assert(Klass::cast(m->method_holder())->oop_is_instance(), "not instanceKlass"); + writer->write_objectID(instanceKlass::cast(m->method_holder())->source_file_name()); // source file name + writer->write_u4(class_serial_num); // class serial number + writer->write_u4((u4) line_number); // line number +} // Support class used to generate HPROF_UTF8 records from the entries in the // SymbolTable. @@ -1104,12 +1139,15 @@ private: DumpWriter* _writer; u4 _thread_serial_num; + int _frame_num; DumpWriter* writer() const { return _writer; } public: JNILocalsDumper(DumpWriter* writer, u4 thread_serial_num) { _writer = writer; _thread_serial_num = thread_serial_num; + _frame_num = -1; // default - empty stack } + void set_frame_number(int n) { _frame_num = n; } void do_oop(oop* obj_p); void do_oop(narrowOop* obj_p) { ShouldNotReachHere(); } }; @@ -1122,7 +1160,7 @@ writer()->write_u1(HPROF_GC_ROOT_JNI_LOCAL); writer()->write_objectID(o); writer()->write_u4(_thread_serial_num); - writer()->write_u4((u4)-1); // empty + writer()->write_u4((u4)_frame_num); } } @@ -1269,6 +1307,9 @@ bool _gc_before_heap_dump; bool _is_segmented_dump; jlong _dump_start; + GrowableArray* _klass_map; + ThreadStackTrace** _stack_traces; + int _num_threads; // accessors DumpWriter* writer() const { return _writer; } @@ -1291,9 +1332,16 @@ static void do_basic_type_array_class_dump(klassOop k); // HPROF_GC_ROOT_THREAD_OBJ records - void do_thread(JavaThread* thread, u4 thread_serial_num); + int do_thread(JavaThread* thread, u4 thread_serial_num); void do_threads(); + void add_class_serial_number(Klass* k, int serial_num) { + _klass_map->at_put_grow(serial_num, k); + } + + // HPROF_TRACE and HPROF_FRAME records + void dump_stack_traces(); + // writes a HPROF_HEAP_DUMP or HPROF_HEAP_DUMP_SEGMENT record void write_dump_header(); @@ -1313,6 +1361,18 @@ _gc_before_heap_dump = gc_before_heap_dump; _is_segmented_dump = false; _dump_start = (jlong)-1; + _klass_map = new (ResourceObj::C_HEAP) GrowableArray(INITIAL_CLASS_COUNT, true); + _stack_traces = NULL; + _num_threads = 0; + } + ~VM_HeapDumper() { + if (_stack_traces != NULL) { + for (int i=0; i < _num_threads; i++) { + delete _stack_traces[i]; + } + FREE_C_HEAP_ARRAY(ThreadStackTrace*, _stack_traces); + } + delete _klass_map; } VMOp_Type type() const { return VMOp_HeapDumper; } @@ -1436,6 +1496,9 @@ Klass* klass = Klass::cast(k); writer->write_classID(klass); + // add the klassOop and class serial number pair + dumper->add_class_serial_number(klass, class_serial_num); + writer->write_u4(STACK_TRACE_ID); // class name ID @@ -1465,15 +1528,15 @@ // Walk the stack of the given thread. // Dumps a HPROF_GC_ROOT_JAVA_FRAME record for each local // Dumps a HPROF_GC_ROOT_JNI_LOCAL record for each JNI local -void VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) { +// +// It returns the number of Java frames in this thread stack +int VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) { JNILocalsDumper blk(writer(), thread_serial_num); oop threadObj = java_thread->threadObj(); assert(threadObj != NULL, "sanity check"); - // JNI locals for the top frame - java_thread->active_handles()->oops_do(&blk); - + int stack_depth = 0; if (java_thread->has_last_Java_frame()) { // vframes are resource allocated @@ -1484,13 +1547,14 @@ RegisterMap reg_map(java_thread); frame f = java_thread->last_frame(); vframe* vf = vframe::new_vframe(&f, ®_map, java_thread); + frame* last_entry_frame = NULL; while (vf != NULL) { + blk.set_frame_number(stack_depth); if (vf->is_java_frame()) { // java frame (interpreted, compiled, ...) javaVFrame *jvf = javaVFrame::cast(vf); - if (!(jvf->method()->is_native())) { StackValueCollection* locals = jvf->locals(); for (int slot=0; slotsize(); slot++) { @@ -1501,44 +1565,61 @@ writer()->write_u1(HPROF_GC_ROOT_JAVA_FRAME); writer()->write_objectID(o); writer()->write_u4(thread_serial_num); - writer()->write_u4((u4)-1); // empty + writer()->write_u4((u4) stack_depth); } } } + } else { + // native frame + if (stack_depth == 0) { + // JNI locals for the top frame. + java_thread->active_handles()->oops_do(&blk); + } else { + if (last_entry_frame != NULL) { + // JNI locals for the entry frame + assert(last_entry_frame->is_entry_frame(), "checking"); + last_entry_frame->entry_frame_call_wrapper()->handles()->oops_do(&blk); + } + } } - } else { + // increment only for Java frames + stack_depth++; + last_entry_frame = NULL; + } else { // externalVFrame - if it's an entry frame then report any JNI locals - // as roots + // as roots when we find the corresponding native javaVFrame frame* fr = vf->frame_pointer(); assert(fr != NULL, "sanity check"); if (fr->is_entry_frame()) { - fr->entry_frame_call_wrapper()->handles()->oops_do(&blk); + last_entry_frame = fr; } } - vf = vf->sender(); } + } else { + // no last java frame but there may be JNI locals + java_thread->active_handles()->oops_do(&blk); } + return stack_depth; } // write a HPROF_GC_ROOT_THREAD_OBJ record for each java thread. Then walk // the stack so that locals and JNI locals are dumped. void VM_HeapDumper::do_threads() { - u4 thread_serial_num = 0; - for (JavaThread* thread = Threads::first(); thread != NULL ; thread = thread->next()) { + for (int i=0; i < _num_threads; i++) { + JavaThread* thread = _stack_traces[i]->thread(); oop threadObj = thread->threadObj(); - if (threadObj != NULL && !thread->is_exiting() && !thread->is_hidden_from_external_view()) { - ++thread_serial_num; - - writer()->write_u1(HPROF_GC_ROOT_THREAD_OBJ); - writer()->write_objectID(threadObj); - writer()->write_u4(thread_serial_num); - writer()->write_u4(STACK_TRACE_ID); - - do_thread(thread, thread_serial_num); - } + u4 thread_serial_num = i+1; + u4 stack_serial_num = thread_serial_num + STACK_TRACE_ID; + writer()->write_u1(HPROF_GC_ROOT_THREAD_OBJ); + writer()->write_objectID(threadObj); + writer()->write_u4(thread_serial_num); // thread number + writer()->write_u4(stack_serial_num); // stack trace serial number + int num_frames = do_thread(thread, thread_serial_num); + assert(num_frames == _stack_traces[i]->get_stack_depth(), + "total number of Java frames not matched"); } } @@ -1547,16 +1628,16 @@ // records: // // HPROF_HEADER -// HPROF_TRACE // [HPROF_UTF8]* // [HPROF_LOAD_CLASS]* +// [[HPROF_FRAME]*|HPROF_TRACE]* // [HPROF_GC_CLASS_DUMP]* // HPROF_HEAP_DUMP // -// The HPROF_TRACE record after the header is "dummy trace" record which does -// not include any frames. Other records which require a stack trace ID will -// specify the trace ID of this record (1). It also means we can run HAT without -// needing the -stack false option. +// The HPROF_TRACE records represent the stack traces where the heap dump +// is generated and a "dummy trace" record which does not include +// any frames. The dummy trace record is used to be referenced as the +// unknown object alloc site. // // The HPROF_HEAP_DUMP record has a length following by sub-records. To allow // the heap dump be generated in a single pass we remember the position of @@ -1592,12 +1673,6 @@ writer()->write_u4(oopSize); writer()->write_u8(os::javaTimeMillis()); - // HPROF_TRACE record without any frames - DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4)); - writer()->write_u4(STACK_TRACE_ID); - writer()->write_u4(0); // thread number - writer()->write_u4(0); // frame count - // HPROF_UTF8 records SymbolTableDumper sym_dumper(writer()); SymbolTable::oops_do(&sym_dumper); @@ -1606,6 +1681,10 @@ SystemDictionary::classes_do(&do_load_class); Universe::basic_type_classes_do(&do_load_class); + // write HPROF_FRAME and HPROF_TRACE records + // this must be called after _klass_map is built when iterating the classes above. + dump_stack_traces(); + // write HPROF_HEAP_DUMP or HPROF_HEAP_DUMP_SEGMENT write_dump_header(); @@ -1646,6 +1725,47 @@ end_of_dump(); } +void VM_HeapDumper::dump_stack_traces() { + // write a HPROF_TRACE record without any frames to be referenced as object alloc sites + DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4)); + writer()->write_u4((u4) STACK_TRACE_ID); + writer()->write_u4(0); // thread number + writer()->write_u4(0); // frame count + + _stack_traces = NEW_C_HEAP_ARRAY(ThreadStackTrace*, Threads::number_of_threads()); + int frame_serial_num = 0; + for (JavaThread* thread = Threads::first(); thread != NULL ; thread = thread->next()) { + oop threadObj = thread->threadObj(); + if (threadObj != NULL && !thread->is_exiting() && !thread->is_hidden_from_external_view()) { + // dump thread stack trace + ThreadStackTrace* stack_trace = new ThreadStackTrace(thread, false); + stack_trace->dump_stack_at_safepoint(-1); + _stack_traces[_num_threads++] = stack_trace; + + // write HPROF_FRAME records for this thread's stack trace + int depth = stack_trace->get_stack_depth(); + int thread_frame_start = frame_serial_num; + for (int j=0; j < depth; j++) { + StackFrameInfo* frame = stack_trace->stack_frame_at(j); + methodOop m = frame->method(); + int class_serial_num = _klass_map->find(Klass::cast(m->method_holder())); + // the class serial number starts from 1 + assert(class_serial_num > 0, "class not found"); + DumperSupport::dump_stack_frame(writer(), ++frame_serial_num, class_serial_num, m, frame->bci()); + } + + // write HPROF_TRACE record for one thread + DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4) + depth*oopSize); + int stack_serial_num = _num_threads + STACK_TRACE_ID; + writer()->write_u4(stack_serial_num); // stack trace serial number + writer()->write_u4((u4) _num_threads); // thread serial number + writer()->write_u4(depth); // frame count + for (int j=1; j <= depth; j++) { + writer()->write_id(thread_frame_start + j); + } + } + } +} // dump the heap to given path. int HeapDumper::dump(const char* path) { diff -r 7c99a4bb76a1 -r 49ca90d77f34 src/share/vm/services/threadService.hpp --- a/src/share/vm/services/threadService.hpp Wed Oct 29 19:18:54 2008 -0700 +++ b/src/share/vm/services/threadService.hpp Wed Oct 29 19:22:09 2008 -0700 @@ -242,6 +242,7 @@ ThreadStackTrace(JavaThread* thread, bool with_locked_monitors); ~ThreadStackTrace(); + JavaThread* thread() { return _thread; } StackFrameInfo* stack_frame_at(int i) { return _frames->at(i); } int get_stack_depth() { return _depth; } diff -r 7c99a4bb76a1 -r 49ca90d77f34 test/compiler/6700047/Test6700047.java --- a/test/compiler/6700047/Test6700047.java Wed Oct 29 19:18:54 2008 -0700 +++ b/test/compiler/6700047/Test6700047.java Wed Oct 29 19:22:09 2008 -0700 @@ -29,6 +29,8 @@ */ public class Test6700047 { + static byte[] dummy = new byte[256]; + public static void main(String[] args) { for (int i = 0; i < 100000; i++) { intToLeftPaddedAsciiBytes(); @@ -53,6 +55,7 @@ if (offset > 0) { for(int j = 0; j < offset; j++) { result++; + dummy[i] = 0; } } return result;