changeset 411:49ca90d77f34 jdk7-b39

Merge
author trims
date Wed, 29 Oct 2008 19:22:09 -0700
parents 7c99a4bb76a1 (current diff) 8fb16f199266 (diff)
children 42ca4002efc2 348be627a148
files
diffstat 29 files changed, 695 insertions(+), 191 deletions(-)
--- a/src/cpu/x86/vm/assembler_x86.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -1575,6 +1575,35 @@
   emit_operand(src, dst);
 }
 
+void Assembler::movdqu(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  emit_byte(0xF3);
+  prefix(src, dst);
+  emit_byte(0x0F);
+  emit_byte(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_byte(0xF3);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_byte(0x0F);
+  emit_byte(0x6F);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::movdqu(Address dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  emit_byte(0xF3);
+  prefix(dst, src);
+  emit_byte(0x0F);
+  emit_byte(0x7F);
+  emit_operand(src, dst);
+}
+
 // Uses zero extension on 64bit
 
 void Assembler::movl(Register dst, int32_t imm32) {
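
Note on the encodings above: they match the Intel forms F3 0F 6F /r (movdqu xmm, xmm/m128) and F3 0F 7F /r (movdqu xmm/m128, xmm). The register-register overload goes through prefixq_and_encode(), which can emit a REX.W prefix; MOVDQU ignores REX.W, so the encoding is one byte longer than necessary but still correct. A minimal standalone sketch of the register-register load form, assuming only xmm0..xmm7 so that no REX prefix is needed (illustrative C++, not HotSpot code):

    #include <cstdint>
    #include <vector>

    // movdqu xmm(dst), xmm(src): F3 0F 6F /r with ModRM mod=11, reg=dst, rm=src
    std::vector<uint8_t> movdqu_rr(int dst, int src) {
        return { 0xF3, 0x0F, 0x6F, uint8_t(0xC0 | (dst << 3) | src) };
    }
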
--- a/src/cpu/x86/vm/assembler_x86.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -1055,6 +1055,11 @@
   void movdqa(XMMRegister dst, Address src);
   void movdqa(XMMRegister dst, XMMRegister src);
 
+  // Move Unaligned Double Quadword
+  void movdqu(Address     dst, XMMRegister src);
+  void movdqu(XMMRegister dst, Address src);
+  void movdqu(XMMRegister dst, XMMRegister src);
+
   void movl(Register dst, int32_t imm32);
   void movl(Address dst, int32_t imm32);
   void movl(Register dst, Register src);
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -791,6 +791,69 @@
     }
   }
 
+
+  // Copy 64-byte chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-byte element count
+  //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
+
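
The to_from register holds the precomputed difference (to - from), so a single addl(from, 64) advances source and destination together: Address(from, to_from, times_1, disp) resolves to from + (to - from) + disp = to + disp. A hedged C++ model of the control flow above, which jumps into L_copy_64_bytes first, peels 64-byte chunks, then finishes the remaining qwords one at a time (illustrative only; assumes disjoint arrays):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // qword_count counts 8-byte units and is non-negative here.
    void copy_forward_sketch(uint8_t* from, uint8_t* to, long qword_count) {
        std::ptrdiff_t to_from = to - from;      // single induction variable
        while (qword_count >= 8) {               // L_copy_64_bytes_loop
            std::memcpy(from + to_from, from, 64);
            from += 64;                          // advances both streams
            qword_count -= 8;
        }
        while (qword_count-- > 0) {              // L_copy_8_bytes tail
            std::memcpy(from + to_from, from, 8);
            from += 8;
        }
    }
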
  // Copy 64-byte chunks
   //
   // Inputs:
@@ -799,6 +862,7 @@
  //   qword_count - 8-byte element count
   //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
@@ -876,7 +940,7 @@
     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at 4 bytes address boundary
       if (t == T_BYTE) {
         // One byte misalignment happens only for byte arrays
@@ -906,20 +970,26 @@
       __ mov(count, rax);      // restore 'count'
       __ jmpb(L_copy_2_bytes); // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4 byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4 byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1);  // 8 bytes chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
    // copy trailing dword
   __ BIND(L_copy_4_bytes);
@@ -1069,13 +1139,20 @@
       __ align(16);
       // Move 8 bytes
     __ BIND(L_copy_8_bytes_loop);
-      __ movq(mmx0, Address(from, count, sf, 0));
-      __ movq(Address(to, count, sf, 0), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, sf, 0));
+        __ movq(Address(to, count, sf, 0), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, sf, 0));
+        __ movq(Address(to, count, sf, 0), mmx0);
+      }
     __ BIND(L_copy_8_bytes);
       __ subl(count, 2<<shift);
       __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
       __ addl(count, 2<<shift);
-      __ emms();
+      if (!UseXMMForArrayCopy) {
+        __ emms();
+      }
     }
   __ BIND(L_copy_4_bytes);
     // copy prefix qword
@@ -1143,7 +1220,11 @@
 
     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
@@ -1196,8 +1277,13 @@
     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
@@ -1206,7 +1292,7 @@
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
 
-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
       __ emms();
     }
     inc_copy_counter_np(T_LONG);
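
A note on the emms() changes above: MMX registers alias the x87 floating-point stack, so every MMX copy path must end with EMMS to reset the FPU tag word before Java floating-point code runs again; XMM registers form a separate register file, which is why the new XMM paths skip it. The dispatch this change introduces for the qword loops, sketched (generator decision only, not the emitted stub):

    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, count);   // SSE2 movq/movdqu, no emms
      } else {
        mmx_copy_forward(from, to_from, count);   // MMX movq, emms afterwards
      }
    } else {
      // fall back to 8-byte copies through the x87 stack (fild_d/fistp_d)
    }
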
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -1251,6 +1251,7 @@
     }
   }
 
+
   // Copy big chunks forward
   //
   // Inputs:
@@ -1268,14 +1269,22 @@
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
-    __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
-    __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
-    __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
-    __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+
+    } else {
+      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
+      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
+      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
+      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    }
   __ BIND(L_copy_32_bytes);
     __ addptr(qword_count, 4);
     __ jcc(Assembler::lessEqual, L_loop);
@@ -1301,14 +1310,22 @@
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
-    __ movq(to, Address(from, qword_count, Address::times_8, 24));
-    __ movq(Address(dest, qword_count, Address::times_8, 24), to);
-    __ movq(to, Address(from, qword_count, Address::times_8, 16));
-    __ movq(Address(dest, qword_count, Address::times_8, 16), to);
-    __ movq(to, Address(from, qword_count, Address::times_8,  8));
-    __ movq(Address(dest, qword_count, Address::times_8,  8), to);
-    __ movq(to, Address(from, qword_count, Address::times_8,  0));
-    __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+
+    } else {
+      __ movq(to, Address(from, qword_count, Address::times_8, 24));
+      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
+      __ movq(to, Address(from, qword_count, Address::times_8, 16));
+      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
+      __ movq(to, Address(from, qword_count, Address::times_8,  8));
+      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
+      __ movq(to, Address(from, qword_count, Address::times_8,  0));
+      __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    }
   __ BIND(L_copy_32_bytes);
     __ subptr(qword_count, 4);
     __ jcc(Assembler::greaterEqual, L_loop);
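
In these 64-bit loops qword_count starts negative and rises toward zero, while end_from/end_to point at the last 8-byte element of each array, so one scaled-index addressing mode serves every iteration with no separate pointer increment. With UseUnalignedLoadStores, two 16-byte movdqu pairs replace four 8-byte movq pairs over the same 32 bytes. A hedged C++ model of the forward loop (illustrative only):

    #include <cstdint>
    #include <cstring>

    // end_from/end_to point at the last 8-byte element; qword_count < 0.
    void copy_32_forward_sketch(uint8_t* end_from, uint8_t* end_to,
                                long qword_count) {
        qword_count += 4;                        // L_copy_32_bytes entry
        while (qword_count <= 0) {               // L_loop
            std::memcpy(end_to   + qword_count * 8 - 24,
                        end_from + qword_count * 8 - 24, 32);
            qword_count += 4;
        }
        // caller handles the remaining 0..3 trailing qwords
    }
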
--- a/src/cpu/x86/vm/vm_version_x86_32.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/vm_version_x86_32.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -242,9 +242,11 @@
   _supports_cx8 = supports_cmpxchg8();
   // if the OS doesn't support SSE, we can't use this feature even if the HW does
   if( !os::supports_sse())
-    _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A);
-  if (UseSSE < 4)
-    _cpuFeatures &= ~CPU_SSE4;
+    _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2);
+  if (UseSSE < 4) {
+    _cpuFeatures &= ~CPU_SSE4_1;
+    _cpuFeatures &= ~CPU_SSE4_2;
+  }
   if (UseSSE < 3) {
     _cpuFeatures &= ~CPU_SSE3;
     _cpuFeatures &= ~CPU_SSSE3;
@@ -261,7 +263,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -272,7 +274,8 @@
                (supports_sse2() ? ", sse2" : ""),
                (supports_sse3() ? ", sse3" : ""),
                (supports_ssse3()? ", ssse3": ""),
-               (supports_sse4() ? ", sse4" : ""),
+               (supports_sse4_1() ? ", sse4.1" : ""),
+               (supports_sse4_2() ? ", sse4.2" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow()   ? ", 3dnow"  : ""),
                (supports_3dnow2()  ? ", 3dnowext" : ""),
@@ -285,7 +288,7 @@
   // older Pentiums which do not support it.
   if( UseSSE > 4 ) UseSSE=4;
   if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4() ) // Drop to 3 if no SSE4 support
+  if( !supports_sse4_1() ) // Drop to 3 if no SSE4.1 support
     UseSSE = MIN2((intx)3,UseSSE);
   if( !supports_sse3() ) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
@@ -375,6 +378,14 @@
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
+      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
+      }
+      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
+        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
     }
   }
 
@@ -413,7 +424,7 @@
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
-    tty->print_cr("Logical CPUs per package: %u",
+    tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
     tty->print_cr("UseSSE=%d",UseSSE);
     tty->print("Allocation: ");
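
The new sse4_1/sse4_2 predicates read CPUID leaf 1: SSE4.1 is reported in ECX bit 19 and SSE4.2 in ECX bit 20, matching the bitfield change in vm_version_x86_32.hpp below. A standalone equivalent of the feature test, assuming GCC/Clang on x86:

    #include <cpuid.h>
    #include <cstdio>

    int main() {
        unsigned eax, ebx, ecx, edx;
        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
            std::printf("sse4.1: %u\n", (ecx >> 19) & 1u);   // CPUID.1:ECX[19]
            std::printf("sse4.2: %u\n", (ecx >> 20) & 1u);   // CPUID.1:ECX[20]
        }
        return 0;
    }
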
--- a/src/cpu/x86/vm/vm_version_x86_32.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/vm_version_x86_32.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -68,9 +68,9 @@
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
-                        : 4,
-               popcnt   : 1,
-                        : 8;
+               sse4_1   : 1,
+               sse4_2   : 1,
+                        : 11;
     } bits;
   };
 
@@ -177,8 +177,9 @@
      CPU_SSE2 = (1 << 7),
      CPU_SSE3 = (1 << 8), // sse3  comes from cpuid 1 (ECX)
      CPU_SSSE3= (1 << 9),
-     CPU_SSE4 = (1 <<10),
-     CPU_SSE4A= (1 <<11)
+     CPU_SSE4A= (1 <<10),
+     CPU_SSE4_1 = (1 << 11),
+     CPU_SSE4_2 = (1 << 12)
    } cpuFeatureFlags;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -240,22 +241,14 @@
   static CpuidInfo _cpuid_info;
 
   // Extractors and predicates
-  static bool is_extended_cpu_family() {
-    const uint32_t Extended_Cpu_Family = 0xf;
-    return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family;
-  }
   static uint32_t extended_cpu_family() {
     uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family;
-    if (is_extended_cpu_family()) {
-      result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
-    }
+    result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
     return result;
   }
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model;
-    if (is_extended_cpu_family()) {
-      result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
-    }
+    result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
     return result;
   }
   static uint32_t cpu_stepping() {
@@ -293,6 +286,10 @@
       result |= CPU_SSSE3;
     if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0)
       result |= CPU_SSE4A;
+    if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0)
+      result |= CPU_SSE4_1;
+    if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0)
+      result |= CPU_SSE4_2;
     return result;
   }
 
@@ -380,7 +377,8 @@
   static bool supports_sse2()     { return (_cpuFeatures & CPU_SSE2) != 0; }
   static bool supports_sse3()     { return (_cpuFeatures & CPU_SSE3) != 0; }
   static bool supports_ssse3()    { return (_cpuFeatures & CPU_SSSE3)!= 0; }
-  static bool supports_sse4()     { return (_cpuFeatures & CPU_SSE4) != 0; }
+  static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
+  static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   //
   // AMD features
   //
--- a/src/cpu/x86/vm/vm_version_x86_64.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/vm_version_x86_64.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -186,8 +186,10 @@
   if (!VM_Version::supports_sse2()) {
     vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported");
   }
-  if (UseSSE < 4)
-    _cpuFeatures &= ~CPU_SSE4;
+  if (UseSSE < 4) {
+    _cpuFeatures &= ~CPU_SSE4_1;
+    _cpuFeatures &= ~CPU_SSE4_2;
+  }
   if (UseSSE < 3) {
     _cpuFeatures &= ~CPU_SSE3;
     _cpuFeatures &= ~CPU_SSSE3;
@@ -204,7 +206,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -215,7 +217,8 @@
                (supports_sse2() ? ", sse2" : ""),
                (supports_sse3() ? ", sse3" : ""),
                (supports_ssse3()? ", ssse3": ""),
-               (supports_sse4() ? ", sse4" : ""),
+               (supports_sse4_1() ? ", sse4.1" : ""),
+               (supports_sse4_2() ? ", sse4.2" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow()   ? ", 3dnow"  : ""),
                (supports_3dnow2()  ? ", 3dnowext" : ""),
@@ -228,7 +231,7 @@
   // older Pentiums which do not support it.
   if( UseSSE > 4 ) UseSSE=4;
   if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4() ) // Drop to 3 if no SSE4 support
+  if( !supports_sse4_1() ) // Drop to 3 if no SSE4.1 support
     UseSSE = MIN2((intx)3,UseSSE);
   if( !supports_sse3() ) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
@@ -314,6 +317,14 @@
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
+      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
+      }
+      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
+        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
     }
   }
 
@@ -355,7 +366,7 @@
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
-    tty->print_cr("Logical CPUs per package: %u",
+    tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
     tty->print_cr("UseSSE=%d",UseSSE);
     tty->print("Allocation: ");
--- a/src/cpu/x86/vm/vm_version_x86_64.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/vm_version_x86_64.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -68,9 +68,9 @@
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
-                        : 4,
-               popcnt   : 1,
-                        : 8;
+               sse4_1   : 1,
+               sse4_2   : 1,
+                        : 11;
     } bits;
   };
 
@@ -177,8 +177,9 @@
      CPU_SSE2 = (1 << 7),
      CPU_SSE3 = (1 << 8),
      CPU_SSSE3= (1 << 9),
-     CPU_SSE4 = (1 <<10),
-     CPU_SSE4A= (1 <<11)
+     CPU_SSE4A= (1 <<10),
+     CPU_SSE4_1 = (1 << 11),
+     CPU_SSE4_2 = (1 << 12)
    } cpuFeatureFlags;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -240,22 +241,14 @@
   static CpuidInfo _cpuid_info;
 
   // Extractors and predicates
-  static bool is_extended_cpu_family() {
-    const uint32_t Extended_Cpu_Family = 0xf;
-    return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family;
-  }
   static uint32_t extended_cpu_family() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family;
-    if (is_extended_cpu_family()) {
-      result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
-    }
+    result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
     return result;
   }
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
-    if (is_extended_cpu_family()) {
-      result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
-    }
+    result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
     return result;
   }
   static uint32_t cpu_stepping() {
@@ -293,6 +286,10 @@
       result |= CPU_SSSE3;
     if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
       result |= CPU_SSE4A;
+    if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0)
+      result |= CPU_SSE4_1;
+    if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0)
+      result |= CPU_SSE4_2;
     return result;
   }
 
@@ -380,7 +377,8 @@
   static bool supports_sse2()     { return (_cpuFeatures & CPU_SSE2) != 0; }
   static bool supports_sse3()     { return (_cpuFeatures & CPU_SSE3) != 0; }
   static bool supports_ssse3()    { return (_cpuFeatures & CPU_SSSE3)!= 0; }
-  static bool supports_sse4()     { return (_cpuFeatures & CPU_SSE4) != 0; }
+  static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
+  static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   //
   // AMD features
   //
--- a/src/cpu/x86/vm/x86_32.ad	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Wed Oct 29 19:22:09 2008 -0700
@@ -4810,6 +4810,16 @@
   interface(CONST_INTER);
 %}
 
+// Long Immediate -1
+operand immL_M1() %{
+  predicate( n->get_long() == -1L );
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Long immediate from 0 to 127.
 // Used for a shorter form of long mul by 10.
 operand immL_127() %{
@@ -8621,6 +8631,18 @@
   ins_pipe( ialu_reg_reg );
 %}
 
+// Xor Register with Immediate -1
+instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));  
+
+  size(2);
+  format %{ "NOT    $dst" %}  
+  ins_encode %{
+     __ notl($dst$$Register);
+  %}
+  ins_pipe( ialu_reg );
+%}
+
 // Xor Register with Immediate
 instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
@@ -8938,6 +8960,18 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
+// Xor Long Register with Immediate -1
+instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));  
+  format %{ "NOT    $dst.lo\n\t"
+            "NOT    $dst.hi" %}
+  ins_encode %{
+     __ notl($dst$$Register);
+     __ notl(HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
 // Xor Long Register with Immediate
 instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
   match(Set dst (XorL dst src));
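
The new im1 patterns rely on the identity x ^ -1 == ~x: encoding the xor as NOT is shorter than an xor-with-immediate and, unlike XOR, NOT leaves EFLAGS untouched, which is why these instructs declare no eFlagsReg effect. The same patterns appear for 64-bit registers in x86_64.ad below (notq). A quick self-contained check of the identity in plain C++:

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t vals[] = { 0, -1, INT64_C(0x123456789abcdef0) };
        for (int64_t x : vals) {
            assert((x ^ -1) == ~x);                        // notq form
            assert((int32_t(x) ^ -1) == ~int32_t(x));      // notl form
        }
        return 0;
    }
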
--- a/src/cpu/x86/vm/x86_64.ad	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Wed Oct 29 19:22:09 2008 -0700
@@ -9309,6 +9309,17 @@
   ins_pipe(ialu_reg_reg);
 %}
 
+// Xor Register with Immediate -1
+instruct xorI_rReg_im1(rRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));  
+
+  format %{ "not    $dst" %}  
+  ins_encode %{
+     __ notl($dst$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
 // Xor Register with Immediate
 instruct xorI_rReg_imm(rRegI dst, immI src, rFlagsReg cr)
 %{
@@ -9529,6 +9540,17 @@
   ins_pipe(ialu_reg_reg);
 %}
 
+// Xor Register with Immediate -1
+instruct xorL_rReg_im1(rRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));  
+
+  format %{ "notq   $dst" %}  
+  ins_encode %{
+     __ notq($dst$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
 // Xor Register with Immediate
 instruct xorL_rReg_imm(rRegL dst, immL32 src, rFlagsReg cr)
 %{
--- a/src/share/vm/includeDB_features	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/includeDB_features	Wed Oct 29 19:22:09 2008 -0700
@@ -99,6 +99,7 @@
 heapDumper.cpp                          reflectionUtils.hpp
 heapDumper.cpp                          symbolTable.hpp
 heapDumper.cpp                          systemDictionary.hpp
+heapDumper.cpp                          threadService.hpp
 heapDumper.cpp                          universe.hpp
 heapDumper.cpp                          vframe.hpp
 heapDumper.cpp                          vmGCOperations.hpp
--- a/src/share/vm/memory/compactingPermGenGen.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/memory/compactingPermGenGen.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -100,7 +100,7 @@
 
   enum {
     vtbl_list_size = 16, // number of entries in the shared space vtable list.
-    num_virtuals = 100   // number of virtual methods in Klass (or
+    num_virtuals = 200   // number of virtual methods in Klass (or
                          // subclass) objects, or greater.
   };
 
--- a/src/share/vm/memory/dump.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/memory/dump.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -818,6 +818,40 @@
 // across the space while doing this, as that causes the vtables to be
 // patched, undoing our useful work.  Instead, iterate to make a list,
 // then use the list to do the fixing.
+//
+// Our constructed vtables:
+// Dump time:
+//  1. init_self_patching_vtbl_list: table of pointers to current virtual method addrs
+//  2. generate_vtable_methods: create jump table, appended to above vtbl_list
+//  3. PatchKlassVtables: for Klass list, patch the vtable entry to point to jump table
+//     rather than to current vtbl
+// Table layout: NOTE FIXED SIZE
+//   1. vtbl pointers
+//   2. #Klass X #virtual methods per Klass
+//   1 entry for each, in the order:
+//   Klass1:method1 entry, Klass1:method2 entry, ... Klass1:method<num_virtuals> entry
+//   Klass2:method1 entry, Klass2:method2 entry, ... Klass2:method<num_virtuals> entry
+//   ...
+//   Klass<vtbl_list_size>:method1 entry, Klass<vtbl_list_size>:method2 entry,
+//       ... Klass<vtbl_list_size>:method<num_virtuals> entry
+//  Sample entry (SPARC):
+//   save(sp, -256, sp)
+//   ba,pt common_code
+//   mov XXX, %L0       %L0 gets: Klass index <<8 + method index (note: max method index 255)
+//
+// Restore time:
+//   1. initialize_oops: reserve space for table
+//   2. init_self_patching_vtbl_list: update pointers to NEW virtual method addrs in text
+//
+// Execution time:
+//   First virtual method call for any object of these Klass types:
+//   1. object->klass->klass_part
+//   2. vtable entry for that klass_part points to the jump table entries
+//   3. branches to common_code with %O0/klass_part, %L0: Klass index <<8 + method index
+//   4. common_code:
+//      Get address of new vtbl pointer for this Klass from updated table
+//      Update new vtbl pointer in the Klass: future virtual calls go direct
+//      Jump to method, using new vtbl pointer and method index
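
Using the constants this comment refers to (vtbl_list_size = 16 in compactingPermGenGen.hpp, num_virtuals raised to 200 in this change), the fixed-size table holds one trampoline per (Klass, virtual method) pair, each loading a packed selector; the shift by 8 is what caps the method index at 255. A hedged sketch of the arithmetic:

    #include <cstdio>

    const int vtbl_list_size = 16;    // shared-space vtable list entries
    const int num_virtuals   = 200;   // >= virtual methods per Klass subclass

    int main() {
        int entries  = vtbl_list_size * num_virtuals;   // 3200 trampolines
        int selector = (3 << 8) + 17;   // Klass index 3, method index 17
        std::printf("%d entries, sample selector 0x%x\n", entries, selector);
        return 0;
    }
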
 
 class PatchKlassVtables: public ObjectClosure {
 private:
--- a/src/share/vm/oops/objArrayKlass.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/oops/objArrayKlass.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -475,8 +475,8 @@
     assert(Universe::is_bootstrapping(), "partial objArray only at startup");
     return JVM_ACC_ABSTRACT | JVM_ACC_FINAL | JVM_ACC_PUBLIC;
   }
-  // Recurse down the element list
-  jint element_flags = Klass::cast(element_klass())->compute_modifier_flags(CHECK_0);
+  // Return the flags of the bottom element type.
+  jint element_flags = Klass::cast(bottom_klass())->compute_modifier_flags(CHECK_0);
 
   return (element_flags & (JVM_ACC_PUBLIC | JVM_ACC_PRIVATE | JVM_ACC_PROTECTED))
                         | (JVM_ACC_ABSTRACT | JVM_ACC_FINAL);
--- a/src/share/vm/opto/addnode.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/addnode.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -156,7 +156,8 @@
   if( add1_op == this_op && !con_right ) {
     Node *a12 = add1->in(2);
     const Type *t12 = phase->type( a12 );
-    if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) ) {
+    if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) &&
+       !(add1->in(1)->is_Phi() && add1->in(1)->as_Phi()->is_tripcount()) ) {
       assert(add1->in(1) != this, "dead loop in AddNode::Ideal");
       add2 = add1->clone();
       add2->set_req(2, in(2));
@@ -173,7 +174,8 @@
   if( add2_op == this_op && !con_left ) {
     Node *a22 = add2->in(2);
     const Type *t22 = phase->type( a22 );
-    if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) ) {
+    if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) &&
+       !(add2->in(1)->is_Phi() && add2->in(1)->as_Phi()->is_tripcount()) ) {
       assert(add2->in(1) != this, "dead loop in AddNode::Ideal");
       Node *addx = add2->clone();
       addx->set_req(1, in(1));
@@ -225,34 +227,63 @@
 //=============================================================================
 //------------------------------Idealize---------------------------------------
 Node *AddINode::Ideal(PhaseGVN *phase, bool can_reshape) {
-  int op1 = in(1)->Opcode();
-  int op2 = in(2)->Opcode();
+  Node* in1 = in(1);
+  Node* in2 = in(2);
+  int op1 = in1->Opcode();
+  int op2 = in2->Opcode();
   // Fold (con1-x)+con2 into (con1+con2)-x
+  if ( op1 == Op_AddI && op2 == Op_SubI ) {
+    // Swap edges to try optimizations below
+    in1 = in2;
+    in2 = in(1);
+    op1 = op2;
+    op2 = in2->Opcode();
+  }
   if( op1 == Op_SubI ) {
-    const Type *t_sub1 = phase->type( in(1)->in(1) );
-    const Type *t_2    = phase->type( in(2)        );
+    const Type *t_sub1 = phase->type( in1->in(1) );
+    const Type *t_2    = phase->type( in2        );
     if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP )
       return new (phase->C, 3) SubINode(phase->makecon( add_ring( t_sub1, t_2 ) ),
-                              in(1)->in(2) );
+                              in1->in(2) );
     // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)"
     if( op2 == Op_SubI ) {
       // Check for dead cycle: d = (a-b)+(c-d)
-      assert( in(1)->in(2) != this && in(2)->in(2) != this,
+      assert( in1->in(2) != this && in2->in(2) != this,
               "dead loop in AddINode::Ideal" );
       Node *sub  = new (phase->C, 3) SubINode(NULL, NULL);
-      sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in(1)->in(1), in(2)->in(1) ) ));
-      sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in(1)->in(2), in(2)->in(2) ) ));
+      sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in1->in(1), in2->in(1) ) ));
+      sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in1->in(2), in2->in(2) ) ));
       return sub;
     }
+    // Convert "(a-b)+(b+c)" into "(a+c)"
+    if( op2 == Op_AddI && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) AddINode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c+b)" into "(a+c)"
+    if( op2 == Op_AddI && in1->in(2) == in2->in(2) ) {
+      assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) AddINode(in1->in(1), in2->in(1));
+    }
+    // Convert "(a-b)+(b-c)" into "(a-c)"
+    if( op2 == Op_SubI && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) SubINode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c-a)" into "(c-b)"
+    if( op2 == Op_SubI && in1->in(1) == in2->in(2) ) {
+      assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) SubINode(in2->in(1), in1->in(2));
+    }
   }
 
   // Convert "x+(0-y)" into "(x-y)"
-  if( op2 == Op_SubI && phase->type(in(2)->in(1)) == TypeInt::ZERO )
-    return new (phase->C, 3) SubINode(in(1), in(2)->in(2) );
+  if( op2 == Op_SubI && phase->type(in2->in(1)) == TypeInt::ZERO )
+    return new (phase->C, 3) SubINode(in1, in2->in(2) );
 
   // Convert "(0-y)+x" into "(x-y)"
-  if( op1 == Op_SubI && phase->type(in(1)->in(1)) == TypeInt::ZERO )
-    return new (phase->C, 3) SubINode( in(2), in(1)->in(2) );
+  if( op1 == Op_SubI && phase->type(in1->in(1)) == TypeInt::ZERO )
+    return new (phase->C, 3) SubINode( in2, in1->in(2) );
 
   // Convert (x>>>z)+y into (x+(y<<z))>>>z for small constant z and y.
   // Helps with array allocation math constant folding
@@ -266,15 +297,15 @@
   // Have not observed cases where type information exists to support
   // positive y and (x <= -(y << z))
   if( op1 == Op_URShiftI && op2 == Op_ConI &&
-      in(1)->in(2)->Opcode() == Op_ConI ) {
-    jint z = phase->type( in(1)->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter
-    jint y = phase->type( in(2) )->is_int()->get_con();
+      in1->in(2)->Opcode() == Op_ConI ) {
+    jint z = phase->type( in1->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter
+    jint y = phase->type( in2 )->is_int()->get_con();
 
     if( z < 5 && -5 < y && y < 0 ) {
-      const Type *t_in11 = phase->type(in(1)->in(1));
+      const Type *t_in11 = phase->type(in1->in(1));
       if( t_in11 != Type::TOP && (t_in11->is_int()->_lo >= -(y << z)) ) {
-        Node *a = phase->transform( new (phase->C, 3) AddINode( in(1)->in(1), phase->intcon(y<<z) ) );
-        return new (phase->C, 3) URShiftINode( a, in(1)->in(2) );
+        Node *a = phase->transform( new (phase->C, 3) AddINode( in1->in(1), phase->intcon(y<<z) ) );
+        return new (phase->C, 3) URShiftINode( a, in1->in(2) );
       }
     }
   }
@@ -328,39 +359,73 @@
 //=============================================================================
 //------------------------------Idealize---------------------------------------
 Node *AddLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
-  int op1 = in(1)->Opcode();
-  int op2 = in(2)->Opcode();
+  Node* in1 = in(1);
+  Node* in2 = in(2);
+  int op1 = in1->Opcode();
+  int op2 = in2->Opcode();
+  // Fold (con1-x)+con2 into (con1+con2)-x
+  if ( op1 == Op_AddL && op2 == Op_SubL ) {
+    // Swap edges to try optimizations below
+    in1 = in2;
+    in2 = in(1);
+    op1 = op2;
+    op2 = in2->Opcode();
+  }
   // Fold (con1-x)+con2 into (con1+con2)-x
   if( op1 == Op_SubL ) {
-    const Type *t_sub1 = phase->type( in(1)->in(1) );
-    const Type *t_2    = phase->type( in(2)        );
+    const Type *t_sub1 = phase->type( in1->in(1) );
+    const Type *t_2    = phase->type( in2        );
     if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP )
       return new (phase->C, 3) SubLNode(phase->makecon( add_ring( t_sub1, t_2 ) ),
-                              in(1)->in(2) );
+                              in1->in(2) );
     // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)"
     if( op2 == Op_SubL ) {
       // Check for dead cycle: d = (a-b)+(c-d)
-      assert( in(1)->in(2) != this && in(2)->in(2) != this,
+      assert( in1->in(2) != this && in2->in(2) != this,
               "dead loop in AddLNode::Ideal" );
       Node *sub  = new (phase->C, 3) SubLNode(NULL, NULL);
-      sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(1), in(2)->in(1) ) ));
-      sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(2), in(2)->in(2) ) ));
+      sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in1->in(1), in2->in(1) ) ));
+      sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in1->in(2), in2->in(2) ) ));
       return sub;
     }
+    // Convert "(a-b)+(b+c)" into "(a+c)"
+    if( op2 == Op_AddL && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) AddLNode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c+b)" into "(a+c)"
+    if( op2 == Op_AddL && in1->in(2) == in2->in(2) ) {
+      assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) AddLNode(in1->in(1), in2->in(1));
+    }
+    // Convert "(a-b)+(b-c)" into "(a-c)"
+    if( op2 == Op_SubL && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) SubLNode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c-a)" into "(c-b)"
+    if( op2 == Op_SubL && in1->in(1) == in2->in(2) ) {
+      assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) SubLNode(in2->in(1), in1->in(2));
+    }
   }
 
   // Convert "x+(0-y)" into "(x-y)"
-  if( op2 == Op_SubL && phase->type(in(2)->in(1)) == TypeLong::ZERO )
-    return new (phase->C, 3) SubLNode(in(1), in(2)->in(2) );
+  if( op2 == Op_SubL && phase->type(in2->in(1)) == TypeLong::ZERO )
+    return new (phase->C, 3) SubLNode( in1, in2->in(2) );
+
+  // Convert "(0-y)+x" into "(x-y)"
+  if( op1 == Op_SubL && phase->type(in1->in(1)) == TypeLong::ZERO )
+    return new (phase->C, 3) SubLNode( in2, in1->in(2) );
 
   // Convert "X+X+X+X+X...+X+Y" into "k*X+Y" or really convert "X+(X+Y)"
   // into "(X<<1)+Y" and let shift-folding happen.
   if( op2 == Op_AddL &&
-      in(2)->in(1) == in(1) &&
+      in2->in(1) == in1 &&
       op1 != Op_ConL &&
       0 ) {
-    Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in(1),phase->intcon(1)));
-    return new (phase->C, 3) AddLNode(shift,in(2)->in(2));
+    Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in1,phase->intcon(1)));
+    return new (phase->C, 3) AddLNode(shift,in2->in(2));
   }
 
   return AddNode::Ideal(phase, can_reshape);
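
All four reassociations added above, and their long counterparts, hold in wrapping two's-complement arithmetic, which is what Java int/long IR uses, so there is no overflow caveat. Also note the is_tripcount() guards added near the top of this file: reassociation is deliberately skipped when an operand is a counted-loop trip-count phi, so induction-variable recognition is not disturbed. A brute-force spot check in plain C++ (small operands, so no signed-overflow UB):

    #include <cassert>

    int main() {
        for (int a = -3; a <= 3; a++)
          for (int b = -3; b <= 3; b++)
            for (int c = -3; c <= 3; c++) {
              assert((a - b) + (b + c) == a + c);   // (a-b)+(b+c) -> (a+c)
              assert((a - b) + (c + b) == a + c);   // (a-b)+(c+b) -> (a+c)
              assert((a - b) + (b - c) == a - c);   // (a-b)+(b-c) -> (a-c)
              assert((a - b) + (c - a) == c - b);   // (a-b)+(c-a) -> (c-b)
            }
        return 0;
    }
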
--- a/src/share/vm/opto/cfgnode.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/cfgnode.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -1817,6 +1817,12 @@
   return progress;              // Return any progress
 }
 
+//------------------------------is_tripcount-----------------------------------
+bool PhiNode::is_tripcount() const {
+  return (in(0) != NULL && in(0)->is_CountedLoop() &&
+          in(0)->as_CountedLoop()->phi() == this);
+}
+
 //------------------------------out_RegMask------------------------------------
 const RegMask &PhiNode::in_RegMask(uint i) const {
   return i ? out_RegMask() : RegMask::Empty;
@@ -1832,9 +1838,7 @@
 #ifndef PRODUCT
 void PhiNode::dump_spec(outputStream *st) const {
   TypeNode::dump_spec(st);
-  if (in(0) != NULL &&
-      in(0)->is_CountedLoop() &&
-      in(0)->as_CountedLoop()->phi() == this) {
+  if (is_tripcount()) {
     st->print(" #tripcount");
   }
 }
--- a/src/share/vm/opto/cfgnode.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/cfgnode.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -162,6 +162,8 @@
     return NULL;  // not a copy!
   }
 
+  bool is_tripcount() const;
+
   // Determine a unique non-trivial input, if any.
   // Ignore casts if it helps.  Return NULL on failure.
   Node* unique_input(PhaseTransform *phase);
--- a/src/share/vm/opto/divnode.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/divnode.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -110,10 +110,13 @@
     } else if( dividend->Opcode() == Op_AndI ) {
       // An AND mask of sufficient size clears the low bits and
       // I can avoid rounding.
-      const TypeInt *andconi = phase->type( dividend->in(2) )->isa_int();
-      if( andconi && andconi->is_con(-d) ) {
-        dividend = dividend->in(1);
-        needs_rounding = false;
+      const TypeInt *andconi_t = phase->type( dividend->in(2) )->isa_int();
+      if( andconi_t && andconi_t->is_con() ) {
+        jint andconi = andconi_t->get_con();
+        if( andconi < 0 && is_power_of_2(-andconi) && (-andconi) >= d ) {
+          dividend = dividend->in(1);
+          needs_rounding = false;
+        }
       }
     }
 
@@ -316,10 +319,13 @@
     } else if( dividend->Opcode() == Op_AndL ) {
       // An AND mask of sufficient size clears the low bits and
       // I can avoid rounding.
-      const TypeLong *andconl = phase->type( dividend->in(2) )->isa_long();
-      if( andconl && andconl->is_con(-d)) {
-        dividend = dividend->in(1);
-        needs_rounding = false;
+      const TypeLong *andconl_t = phase->type( dividend->in(2) )->isa_long();
+      if( andconl_t && andconl_t->is_con() ) {
+        jlong andconl = andconl_t->get_con();
+        if( andconl < 0 && is_power_of_2_long(-andconl) && (-andconl) >= d ) {
+          dividend = dividend->in(1);
+          needs_rounding = false;
+        }
       }
     }
 
@@ -704,11 +710,18 @@
   if( t2 == TypeD::ONE )
     return t1;
 
-  // If divisor is a constant and not zero, divide them numbers
-  if( t1->base() == Type::DoubleCon &&
-      t2->base() == Type::DoubleCon &&
-      t2->getd() != 0.0 ) // could be negative zero
-    return TypeD::make( t1->getd()/t2->getd() );
+#if defined(IA32)
+  if (!phase->C->method()->is_strict())
+    // Can't trust native compilers to properly fold strict double
+    // division with round-to-zero on this platform.
+#endif
+    {
+      // If divisor is a constant and not zero, divide the numbers
+      if( t1->base() == Type::DoubleCon &&
+          t2->base() == Type::DoubleCon &&
+          t2->getd() != 0.0 ) // could be negative zero
+        return TypeD::make( t1->getd()/t2->getd() );
+    }
 
   // If the dividend is a constant zero
   // Note: if t1 and t2 are zero then result is NaN (JVMS page 213)
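
A worked example of the relaxed mask test above, for (x & -8) / 4: the old code accepted only a mask of exactly -d, while the new code accepts any negative power-of-two mask clearing at least as many low bits as the divisor; here -andconi = 8 >= d = 4, so the signed-division rounding fix-up can be skipped. A self-contained check (assumes arithmetic right shift for negative ints, as on mainstream compilers):

    #include <cassert>

    int main() {
        const int d = 4, mask = -8;    // -mask is a power of two and >= d
        for (int x = -64; x <= 64; x++) {
            int dividend = x & mask;   // already a multiple of d
            assert(dividend / d == (dividend >> 2));  // shift needs no fix-up
        }
        return 0;
    }
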
--- a/src/share/vm/opto/loopTransform.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/loopTransform.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -679,6 +679,10 @@
   CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
   post_head->set_post_loop(main_head);
 
+  // Reduce the post-loop trip count.
+  CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd();
+  post_end->_prob = PROB_FAIR;
+
   // Build the main-loop normal exit.
   IfFalseNode *new_main_exit = new (C, 1) IfFalseNode(main_end);
   _igvn.register_new_node_with_optimizer( new_main_exit );
@@ -748,6 +752,9 @@
   pre_head->set_pre_loop(main_head);
   Node *pre_incr = old_new[incr->_idx];
 
+  // Reduce the pre-loop trip count.
+  pre_end->_prob = PROB_FAIR;
+
   // Find the pre-loop normal exit.
   Node* pre_exit = pre_end->proj_out(false);
   assert( pre_exit->Opcode() == Op_IfFalse, "" );
@@ -767,8 +774,8 @@
   register_new_node( min_cmp , new_pre_exit );
   register_new_node( min_bol , new_pre_exit );
 
-  // Build the IfNode
-  IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_FAIR, COUNT_UNKNOWN );
+  // Build the IfNode (assume the main-loop is always executed).
+  IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN );
   _igvn.register_new_node_with_optimizer( min_iff );
   set_idom(min_iff, new_pre_exit, dd_main_head);
   set_loop(min_iff, loop->_parent);
@@ -1583,10 +1590,10 @@
 
 //=============================================================================
 //------------------------------iteration_split_impl---------------------------
-void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) {
+bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) {
   // Check and remove empty loops (spam micro-benchmarks)
   if( policy_do_remove_empty_loop(phase) )
-    return;                     // Here we removed an empty loop
+    return true;                     // Here we removed an empty loop
 
   bool should_peel = policy_peeling(phase); // Should we peel?
 
@@ -1596,7 +1603,8 @@
   // This removes loop-invariant tests (usually null checks).
   if( !_head->is_CountedLoop() ) { // Non-counted loop
     if (PartialPeelLoop && phase->partial_peel(this, old_new)) {
-      return;
+      // Partial peel succeeded so terminate this round of loop opts
+      return false;
     }
     if( should_peel ) {            // Should we peel?
 #ifndef PRODUCT
@@ -1606,14 +1614,14 @@
     } else if( should_unswitch ) {
       phase->do_unswitching(this, old_new);
     }
-    return;
+    return true;
   }
   CountedLoopNode *cl = _head->as_CountedLoop();
 
-  if( !cl->loopexit() ) return; // Ignore various kinds of broken loops
+  if( !cl->loopexit() ) return true; // Ignore various kinds of broken loops
 
   // Do nothing special to pre- and post- loops
-  if( cl->is_pre_loop() || cl->is_post_loop() ) return;
+  if( cl->is_pre_loop() || cl->is_post_loop() ) return true;
 
   // Compute loop trip count from profile data
   compute_profile_trip_cnt(phase);
@@ -1626,11 +1634,11 @@
       // Here we did some unrolling and peeling.  Eventually we will
       // completely unroll this loop and it will no longer be a loop.
       phase->do_maximally_unroll(this,old_new);
-      return;
+      return true;
     }
     if (should_unswitch) {
       phase->do_unswitching(this, old_new);
-      return;
+      return true;
     }
   }
 
@@ -1691,14 +1699,16 @@
     if( should_peel )           // Might want to peel but do nothing else
       phase->do_peeling(this,old_new);
   }
+  return true;
 }
 
 
 //=============================================================================
 //------------------------------iteration_split--------------------------------
-void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) {
+bool IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) {
   // Recursively iteration split nested loops
-  if( _child ) _child->iteration_split( phase, old_new );
+  if( _child && !_child->iteration_split( phase, old_new ))
+    return false;
 
   // Clean out prior deadwood
   DCE_loop_body();
@@ -1720,7 +1730,9 @@
       _allow_optimizations &&
       !tail()->is_top() ) {     // Also ignore the occasional dead backedge
     if (!_has_call) {
-      iteration_split_impl( phase, old_new );
+      if (!iteration_split_impl( phase, old_new )) {
+        return false;
+      }
     } else if (policy_unswitching(phase)) {
       phase->do_unswitching(this, old_new);
     }
@@ -1729,5 +1741,7 @@
   // Minor offset re-organization to remove loop-fallout uses of
   // trip counter.
   if( _head->is_CountedLoop() ) phase->reorg_offsets( this );
-  if( _next ) _next->iteration_split( phase, old_new );
+  if( _next && !_next->iteration_split( phase, old_new ))
+    return false;
+  return true;
 }
--- a/src/share/vm/opto/loopnode.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/loopnode.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -325,12 +325,14 @@
   // Returns TRUE if loop tree is structurally changed.
   bool beautify_loops( PhaseIdealLoop *phase );
 
-  // Perform iteration-splitting on inner loops.  Split iterations to avoid
-  // range checks or one-shot null checks.
-  void iteration_split( PhaseIdealLoop *phase, Node_List &old_new );
+  // Perform iteration-splitting on inner loops.  Split iterations to
+  // avoid range checks or one-shot null checks.  Returns false if the
+  // current round of loop opts should stop.
+  bool iteration_split( PhaseIdealLoop *phase, Node_List &old_new );
 
-  // Driver for various flavors of iteration splitting
-  void iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new );
+  // Driver for various flavors of iteration splitting.  Returns false
+  // if the current round of loop opts should stop.
+  bool iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new );
 
   // Given dominators, try to find loops with calls that must always be
   // executed (call dominates loop tail).  These loops do not need non-call
--- a/src/share/vm/opto/loopopts.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/loopopts.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -1903,9 +1903,6 @@
       // Use in a phi is considered a use in the associated predecessor block
       use_c = use->in(0)->in(j);
     }
-    if (use_c->is_CountedLoop()) {
-      use_c = use_c->in(LoopNode::EntryControl);
-    }
     set_ctrl(n_clone, use_c);
     assert(!loop->is_member(get_loop(use_c)), "should be outside loop");
     get_loop(use_c)->_body.push(n_clone);
--- a/src/share/vm/opto/mulnode.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/mulnode.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -152,6 +152,14 @@
   if( t1 == Type::BOTTOM || t2 == Type::BOTTOM )
     return bottom_type();
 
+#if defined(IA32)
+  // Can't trust native compilers to properly fold strict double
+  // multiplication with round-to-zero on this platform.
+  if (op == Op_MulD && phase->C->method()->is_strict()) {
+    return TypeD::DOUBLE;
+  }
+#endif
+
   return mul_ring(t1,t2);            // Local flavor of type multiplication
 }
 
@@ -360,7 +368,7 @@
 // Compute the product type of two double ranges into this node.
 const Type *MulDNode::mul_ring(const Type *t0, const Type *t1) const {
   if( t0 == Type::DOUBLE || t1 == Type::DOUBLE ) return Type::DOUBLE;
-  // We must be adding 2 double constants.
+  // We must be multiplying 2 double constants.
   return TypeD::make( t0->getd() * t1->getd() );
 }
 
--- a/src/share/vm/opto/node.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/node.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -1320,7 +1320,8 @@
   Node *pop() {
     if( _clock_index >= size() ) _clock_index = 0;
     Node *b = at(_clock_index);
-    map( _clock_index++, Node_List::pop());
+    map( _clock_index, Node_List::pop());
+    if (size() != 0) _clock_index++; // Always start from 0
     _in_worklist >>= b->_idx;
     return b;
   }
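
The pop() above implements a round-robin (clock) scan: the element at _clock_index is handed out and the list's last element is moved into the vacated slot. The fix advances the index only while elements remain, so a worklist that drains completely restarts its scan at slot 0 rather than at a stale offset. A hedged C++ analogue (HotSpot's Node_List differs in detail):

    #include <cstddef>
    #include <vector>

    // Precondition: v is non-empty.
    template <class T>
    T clock_pop(std::vector<T>& v, std::size_t& clock_index) {
        if (clock_index >= v.size()) clock_index = 0;
        T b = v[clock_index];
        v[clock_index] = v.back();      // backfill from the end (may self-assign)
        v.pop_back();
        if (!v.empty()) clock_index++;  // mirror of the fix above
        return b;
    }
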
--- a/src/share/vm/opto/postaloc.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/postaloc.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -34,7 +34,7 @@
 #endif
 }
 
-//------------------------------may_be_copy_of_callee-----------------------------
+//---------------------------may_be_copy_of_callee-----------------------------
 // Check to see if we can possibly be a copy of a callee-save value.
 bool PhaseChaitin::may_be_copy_of_callee( Node *def ) const {
   // Short circuit if there are no callee save registers
@@ -225,6 +225,20 @@
 
   // Scan all registers to see if this value is around already
   for( uint reg = 0; reg < (uint)_max_reg; reg++ ) {
+    if (reg == (uint)nk_reg) {
+      // Found ourselves so check if there is only one user of this
+      // copy and keep on searching for a better copy if so.
+      bool ignore_self = true;
+      x = n->in(k);
+      DUIterator_Fast imax, i = x->fast_outs(imax);
+      Node* first = x->fast_out(i); i++;
+      while (i < imax && ignore_self) {
+        Node* use = x->fast_out(i); i++;
+        if (use != first) ignore_self = false;
+      }
+      if (ignore_self) continue;
+    }
+
     Node *vv = value[reg];
     if( !single ) {             // Doubles check for aligned-adjacent pair
       if( (reg&1)==0 ) continue;  // Wrong half of a pair
--- a/src/share/vm/opto/subnode.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/opto/subnode.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -206,6 +206,14 @@
   if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(2) )
     return new (phase->C, 3) SubINode( in1->in(1), in2->in(1) );
 
+  // Convert "(A+X) - (X+B)" into "A - B"
+  if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(1) )
+    return new (phase->C, 3) SubINode( in1->in(1), in2->in(2) );
+
+  // Convert "(X+A) - (B+X)" into "A - B"
+  if( op1 == Op_AddI && op2 == Op_AddI && in1->in(1) == in2->in(2) )
+    return new (phase->C, 3) SubINode( in1->in(2), in2->in(1) );
+
   // Convert "A-(B-C)" into (A+C)-B", since add is commutative and generally
   // nicer to optimize than subtract.
   if( op2 == Op_SubI && in2->outcnt() == 1) {
--- a/src/share/vm/runtime/globals.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/runtime/globals.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -997,6 +997,12 @@
   product(bool, UseXmmI2F, false,                                           \
           "Use SSE2 CVTDQ2PS instruction to convert Integer to Float")      \
                                                                             \
+  product(bool, UseXMMForArrayCopy, false,                                  \
+          "Use SSE2 MOVQ instruction for Arraycopy")                        \
+                                                                            \
+  product(bool, UseUnalignedLoadStores, false,                              \
+          "Use SSE2 MOVDQU instruction for Arraycopy")                      \
+                                                                            \
   product(intx, FieldsAllocationStyle, 1,                                   \
           "0 - type based with oops first, 1 - with oops last")             \
                                                                             \
@@ -2555,7 +2561,7 @@
   develop(intx, MaxRecursiveInlineLevel, 1,                                 \
           "maximum number of nested recursive calls that are inlined")      \
                                                                             \
-  develop(intx, InlineSmallCode, 1000,                                      \
+  product(intx, InlineSmallCode, 1000,                                      \
           "Only inline already compiled methods if their code size is "     \
           "less than this")                                                 \
                                                                             \
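
Both new flags are product flags defaulting to false; the vm_version changes above enable them automatically on newer Intel CPUs, and they can also be set explicitly, e.g.:

    java -XX:+UseXMMForArrayCopy -XX:+UseUnalignedLoadStores ...

The InlineSmallCode change in the hunk just above merely promotes an existing develop-only switch to a product flag; its default of 1000 is unchanged.
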
--- a/src/share/vm/services/heapDumper.cpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/services/heapDumper.cpp	Wed Oct 29 19:22:09 2008 -0700
@@ -343,7 +343,8 @@
 
 // Default stack trace ID (used for dummy HPROF_TRACE record)
 enum {
-  STACK_TRACE_ID = 1
+  STACK_TRACE_ID = 1,
+  INITIAL_CLASS_COUNT = 200
 };
 
 
@@ -408,6 +409,7 @@
   void write_u8(u8 x);
   void write_objectID(oop o);
   void write_classID(Klass* k);
+  void write_id(u4 x);
 };
 
 DumpWriter::DumpWriter(const char* path) {
@@ -548,6 +550,14 @@
 #endif
 }
 
+void DumpWriter::write_id(u4 x) {
+#ifdef _LP64
+  write_u8((u8) x);
+#else
+  write_u4(x);
+#endif
+}
+
 // We use java mirror as the class ID
 void DumpWriter::write_classID(Klass* k) {
   write_objectID(k->java_mirror());
@@ -596,6 +606,8 @@
   static void dump_object_array(DumpWriter* writer, objArrayOop array);
   // creates HPROF_GC_PRIM_ARRAY_DUMP record for the given type array
   static void dump_prim_array(DumpWriter* writer, typeArrayOop array);
+  // create HPROF_FRAME record for the given method and bci
+  static void dump_stack_frame(DumpWriter* writer, int frame_serial_num, int class_serial_num, methodOop m, int bci);
 };
 
 // write a header of the given type
@@ -1070,6 +1082,29 @@
   }
 }
 
+// create a HPROF_FRAME record for the given methodOop and bci
+void DumperSupport::dump_stack_frame(DumpWriter* writer,
+                                     int frame_serial_num,
+                                     int class_serial_num,
+                                     methodOop m,
+                                     int bci) {
+  int line_number;
+  if (m->is_native()) {
+    line_number = -3;  // native frame
+  } else {
+    line_number = m->line_number_from_bci(bci);
+  }
+
+  write_header(writer, HPROF_FRAME, 4*oopSize + 2*sizeof(u4));
+  writer->write_id(frame_serial_num);               // frame serial number
+  writer->write_objectID(m->name());                // method's name
+  writer->write_objectID(m->signature());           // method's signature
+
+  assert(Klass::cast(m->method_holder())->oop_is_instance(), "not instanceKlass");
+  writer->write_objectID(instanceKlass::cast(m->method_holder())->source_file_name());  // source file name
+  writer->write_u4(class_serial_num);               // class serial number
+  writer->write_u4((u4) line_number);               // line number
+}
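As a size check on the header above: the body is four IDs plus two u4 fields, so 4*oopSize + 2*sizeof(u4) works out to 4*8 + 2*4 = 40 bytes on LP64 and 4*4 + 2*4 = 24 bytes on a 32-bit VM, matching the writes that follow the header (the frame serial number goes through write_id and is therefore ID-sized).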
 
 // Support class used to generate HPROF_UTF8 records from the entries in the
 // SymbolTable.
@@ -1104,12 +1139,15 @@
  private:
   DumpWriter* _writer;
   u4 _thread_serial_num;
+  int _frame_num;
   DumpWriter* writer() const                { return _writer; }
  public:
   JNILocalsDumper(DumpWriter* writer, u4 thread_serial_num) {
     _writer = writer;
     _thread_serial_num = thread_serial_num;
+    _frame_num = -1;  // default: empty stack
   }
+  void set_frame_number(int n) { _frame_num = n; }
   void do_oop(oop* obj_p);
   void do_oop(narrowOop* obj_p) { ShouldNotReachHere(); }
 };
@@ -1122,7 +1160,7 @@
     writer()->write_u1(HPROF_GC_ROOT_JNI_LOCAL);
     writer()->write_objectID(o);
     writer()->write_u4(_thread_serial_num);
-    writer()->write_u4((u4)-1); // empty
+    writer()->write_u4((u4)_frame_num);  // frame number (-1 if no Java frames)
   }
 }
 
@@ -1269,6 +1307,9 @@
   bool _gc_before_heap_dump;
   bool _is_segmented_dump;
   jlong _dump_start;
+  GrowableArray<Klass*>* _klass_map;
+  ThreadStackTrace** _stack_traces;
+  int _num_threads;
 
   // accessors
   DumpWriter* writer() const                    { return _writer; }
@@ -1291,9 +1332,16 @@
   static void do_basic_type_array_class_dump(klassOop k);
 
   // HPROF_GC_ROOT_THREAD_OBJ records
-  void do_thread(JavaThread* thread, u4 thread_serial_num);
+  int do_thread(JavaThread* thread, u4 thread_serial_num);
   void do_threads();
 
+  void add_class_serial_number(Klass* k, int serial_num) {
+    _klass_map->at_put_grow(serial_num, k);
+  }
+
+  // HPROF_TRACE and HPROF_FRAME records
+  void dump_stack_traces();
+
   // writes a HPROF_HEAP_DUMP or HPROF_HEAP_DUMP_SEGMENT record
   void write_dump_header();
 
@@ -1313,6 +1361,18 @@
     _gc_before_heap_dump = gc_before_heap_dump;
     _is_segmented_dump = false;
     _dump_start = (jlong)-1;
+    _klass_map = new (ResourceObj::C_HEAP) GrowableArray<Klass*>(INITIAL_CLASS_COUNT, true);
+    _stack_traces = NULL;
+    _num_threads = 0;
+  }
+  ~VM_HeapDumper() {
+    if (_stack_traces != NULL) {
+      for (int i=0; i < _num_threads; i++) {
+        delete _stack_traces[i];
+      }
+      FREE_C_HEAP_ARRAY(ThreadStackTrace*, _stack_traces);
+    }
+    delete _klass_map;
   }
 
   VMOp_Type type() const { return VMOp_HeapDumper; }
@@ -1436,6 +1496,9 @@
     Klass* klass = Klass::cast(k);
     writer->write_classID(klass);
 
+    // add the klassOop and class serial number pair
+    dumper->add_class_serial_number(klass, class_serial_num);
+
     writer->write_u4(STACK_TRACE_ID);
 
     // class name ID
@@ -1465,15 +1528,15 @@
 // Walk the stack of the given thread.
 // Dumps a HPROF_GC_ROOT_JAVA_FRAME record for each local
 // Dumps a HPROF_GC_ROOT_JNI_LOCAL record for each JNI local
-void VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) {
+//
+// Returns the number of Java frames on this thread's stack
+int VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) {
   JNILocalsDumper blk(writer(), thread_serial_num);
 
   oop threadObj = java_thread->threadObj();
   assert(threadObj != NULL, "sanity check");
 
-  // JNI locals for the top frame
-  java_thread->active_handles()->oops_do(&blk);
-
+  int stack_depth = 0;
   if (java_thread->has_last_Java_frame()) {
 
     // vframes are resource allocated
@@ -1484,13 +1547,14 @@
     RegisterMap reg_map(java_thread);
     frame f = java_thread->last_frame();
     vframe* vf = vframe::new_vframe(&f, &reg_map, java_thread);
+    frame* last_entry_frame = NULL;
 
     while (vf != NULL) {
+      blk.set_frame_number(stack_depth);
       if (vf->is_java_frame()) {
 
         // java frame (interpreted, compiled, ...)
         javaVFrame *jvf = javaVFrame::cast(vf);
-
         if (!(jvf->method()->is_native())) {
           StackValueCollection* locals = jvf->locals();
           for (int slot=0; slot<locals->size(); slot++) {
@@ -1501,44 +1565,61 @@
                 writer()->write_u1(HPROF_GC_ROOT_JAVA_FRAME);
                 writer()->write_objectID(o);
                 writer()->write_u4(thread_serial_num);
-                writer()->write_u4((u4)-1); // empty
+                writer()->write_u4((u4) stack_depth);
               }
             }
           }
+        } else {
+          // native frame
+          if (stack_depth == 0) {
+            // JNI locals for the top frame.
+            java_thread->active_handles()->oops_do(&blk);
+          } else {
+            if (last_entry_frame != NULL) {
+              // JNI locals for the entry frame
+              assert(last_entry_frame->is_entry_frame(), "checking");
+              last_entry_frame->entry_frame_call_wrapper()->handles()->oops_do(&blk);
+            }
+          }
         }
-      } else {
+        // increment only for Java frames
+        stack_depth++;
+        last_entry_frame = NULL;
 
+      } else {
         // externalVFrame - if it's an entry frame then report any JNI locals
-        // as roots
+        // as roots when we find the corresponding native javaVFrame
         frame* fr = vf->frame_pointer();
         assert(fr != NULL, "sanity check");
         if (fr->is_entry_frame()) {
-          fr->entry_frame_call_wrapper()->handles()->oops_do(&blk);
+          last_entry_frame = fr;
         }
       }
-
       vf = vf->sender();
     }
+  } else {
+    // no last Java frame, but there may still be JNI locals
+    java_thread->active_handles()->oops_do(&blk);
   }
+  return stack_depth;
 }
 
 
 // write a HPROF_GC_ROOT_THREAD_OBJ record for each java thread. Then walk
 // the stack so that locals and JNI locals are dumped.
 void VM_HeapDumper::do_threads() {
-  u4 thread_serial_num = 0;
-  for (JavaThread* thread = Threads::first(); thread != NULL ; thread = thread->next()) {
+  for (int i=0; i < _num_threads; i++) {
+    JavaThread* thread = _stack_traces[i]->thread();
     oop threadObj = thread->threadObj();
-    if (threadObj != NULL && !thread->is_exiting() && !thread->is_hidden_from_external_view()) {
-      ++thread_serial_num;
-
-      writer()->write_u1(HPROF_GC_ROOT_THREAD_OBJ);
-      writer()->write_objectID(threadObj);
-      writer()->write_u4(thread_serial_num);
-      writer()->write_u4(STACK_TRACE_ID);
-
-      do_thread(thread, thread_serial_num);
-    }
+    u4 thread_serial_num = i+1;
+    u4 stack_serial_num = thread_serial_num + STACK_TRACE_ID;
+    writer()->write_u1(HPROF_GC_ROOT_THREAD_OBJ);
+    writer()->write_objectID(threadObj);
+    writer()->write_u4(thread_serial_num);  // thread number
+    writer()->write_u4(stack_serial_num);   // stack trace serial number
+    int num_frames = do_thread(thread, thread_serial_num);
+    assert(num_frames == _stack_traces[i]->get_stack_depth(),
+           "total number of Java frames not matched");
   }
 }
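A worked example of the numbering, given STACK_TRACE_ID == 1: the thread at index 0 gets thread_serial_num 1 and stack_serial_num 2, index 1 gets 2 and 3, and so on. Serial number 1 stays reserved for the dummy trace written in dump_stack_traces() below.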
 
@@ -1547,16 +1628,16 @@
 // records:
 //
 //  HPROF_HEADER
-//  HPROF_TRACE
 //  [HPROF_UTF8]*
 //  [HPROF_LOAD_CLASS]*
+//  [[HPROF_FRAME]* HPROF_TRACE]*
 //  [HPROF_GC_CLASS_DUMP]*
 //  HPROF_HEAP_DUMP
 //
-// The HPROF_TRACE record after the header is "dummy trace" record which does
-// not include any frames. Other records which require a stack trace ID will
-// specify the trace ID of this record (1). It also means we can run HAT without
-// needing the -stack false option.
+// The HPROF_TRACE records represent the stack traces of the Java threads
+// at the time the heap dump is generated, plus a "dummy trace" record
+// which does not include any frames. The dummy trace record is referenced
+// by sub-records whose object alloc site is unknown.
 //
 // The HPROF_HEAP_DUMP record has a length followed by sub-records. To allow
 // the heap dump to be generated in a single pass we remember the position of
@@ -1592,12 +1673,6 @@
   writer()->write_u4(oopSize);
   writer()->write_u8(os::javaTimeMillis());
 
-  // HPROF_TRACE record without any frames
-  DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4));
-  writer()->write_u4(STACK_TRACE_ID);
-  writer()->write_u4(0);                    // thread number
-  writer()->write_u4(0);                    // frame count
-
   // HPROF_UTF8 records
   SymbolTableDumper sym_dumper(writer());
   SymbolTable::oops_do(&sym_dumper);
@@ -1606,6 +1681,10 @@
   SystemDictionary::classes_do(&do_load_class);
   Universe::basic_type_classes_do(&do_load_class);
 
+  // write HPROF_FRAME and HPROF_TRACE records
+  // this must be called after _klass_map has been built by the class iteration above.
+  dump_stack_traces();
+
   // write HPROF_HEAP_DUMP or HPROF_HEAP_DUMP_SEGMENT
   write_dump_header();
 
@@ -1646,6 +1725,47 @@
   end_of_dump();
 }
 
+void VM_HeapDumper::dump_stack_traces() {
+  // write the dummy HPROF_TRACE record (no frames); it is referenced for unknown object alloc sites
+  DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4));
+  writer()->write_u4((u4) STACK_TRACE_ID);
+  writer()->write_u4(0);                    // thread number
+  writer()->write_u4(0);                    // frame count
+
+  _stack_traces = NEW_C_HEAP_ARRAY(ThreadStackTrace*, Threads::number_of_threads());
+  int frame_serial_num = 0;
+  for (JavaThread* thread = Threads::first(); thread != NULL ; thread = thread->next()) {
+    oop threadObj = thread->threadObj();
+    if (threadObj != NULL && !thread->is_exiting() && !thread->is_hidden_from_external_view()) {
+      // dump thread stack trace
+      ThreadStackTrace* stack_trace = new ThreadStackTrace(thread, false);
+      stack_trace->dump_stack_at_safepoint(-1);   // -1 means the entire stack
+      _stack_traces[_num_threads++] = stack_trace;
+
+      // write HPROF_FRAME records for this thread's stack trace
+      int depth = stack_trace->get_stack_depth();
+      int thread_frame_start = frame_serial_num;
+      for (int j=0; j < depth; j++) {
+        StackFrameInfo* frame = stack_trace->stack_frame_at(j);
+        methodOop m = frame->method();
+        int class_serial_num = _klass_map->find(Klass::cast(m->method_holder()));
+        // the class serial number starts from 1
+        assert(class_serial_num > 0, "class not found");
+        DumperSupport::dump_stack_frame(writer(), ++frame_serial_num, class_serial_num, m, frame->bci());
+      }
+
+      // write HPROF_TRACE record for one thread
+      DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4) + depth*oopSize);
+      int stack_serial_num = _num_threads + STACK_TRACE_ID;
+      writer()->write_u4(stack_serial_num);      // stack trace serial number
+      writer()->write_u4((u4) _num_threads);     // thread serial number
+      writer()->write_u4(depth);                 // frame count
+      for (int j=1; j <= depth; j++) {
+        writer()->write_id(thread_frame_start + j);
+      }
+    }
+  }
+}
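Note that frame serial numbers come out 1-based and contiguous across all dumped threads: if the first thread has three frames they receive serial numbers 1 through 3, the next thread's frames start at 4, and each HPROF_TRACE record references its own slice, thread_frame_start + 1 through thread_frame_start + depth.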
 
 // dump the heap to given path.
 int HeapDumper::dump(const char* path) {
--- a/src/share/vm/services/threadService.hpp	Wed Oct 29 19:18:54 2008 -0700
+++ b/src/share/vm/services/threadService.hpp	Wed Oct 29 19:22:09 2008 -0700
@@ -242,6 +242,7 @@
   ThreadStackTrace(JavaThread* thread, bool with_locked_monitors);
   ~ThreadStackTrace();
 
+  JavaThread*     thread()              { return _thread; }
   StackFrameInfo* stack_frame_at(int i) { return _frames->at(i); }
   int             get_stack_depth()     { return _depth; }
 
--- a/test/compiler/6700047/Test6700047.java	Wed Oct 29 19:18:54 2008 -0700
+++ b/test/compiler/6700047/Test6700047.java	Wed Oct 29 19:22:09 2008 -0700
@@ -29,6 +29,8 @@
  */
 
 public class Test6700047 {
+    static byte[] dummy = new byte[256];
+
     public static void main(String[] args) {
         for (int i = 0; i < 100000; i++) {
             intToLeftPaddedAsciiBytes();
@@ -53,6 +55,7 @@
         if (offset > 0) {
             for(int j = 0; j < offset; j++) {
                 result++;
+                dummy[i] = 0;
             }
         }
         return result;