diff src/cpu/x86/vm/assembler_x86.cpp @ 1930:2d26b0046e0d

Merge.
author Thomas Wuerthinger <wuerthinger@ssw.jku.at>
date Tue, 30 Nov 2010 14:53:30 +0100
parents 3483ec571caf 2fe998383789
children 06f017f7daa7
line wrap: on
line diff
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Nov 29 18:32:30 2010 +0100
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Nov 30 14:53:30 2010 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright 1997-2009 Sun Microsystems, Inc.  All Rights Reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -16,9 +16,9 @@
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
  *
  */
 
@@ -1275,6 +1275,12 @@
   emit_byte(0xF8 | encode);
 }
 
+void Assembler::divl(Register src) { // Unsigned
+  int encode = prefix_and_encode(src->encoding());
+  emit_byte(0xF7);
+  emit_byte(0xF0 | encode);
+}
+
 void Assembler::imull(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_byte(0x0F);
@@ -1288,7 +1294,7 @@
   if (is8bit(value)) {
     emit_byte(0x6B);
     emit_byte(0xC0 | encode);
-    emit_byte(value);
+    emit_byte(value & 0xFF);
   } else {
     emit_byte(0x69);
     emit_byte(0xC0 | encode);
@@ -3903,7 +3909,7 @@
   if (is8bit(value)) {
     emit_byte(0x6B);
     emit_byte(0xC0 | encode);
-    emit_byte(value);
+    emit_byte(value & 0xFF);
   } else {
     emit_byte(0x69);
     emit_byte(0xC0 | encode);
@@ -4993,19 +4999,22 @@
       ttyLocker ttyl;
       tty->print_cr("eip = 0x%08x", eip);
 #ifndef PRODUCT
-      tty->cr();
-      findpc(eip);
-      tty->cr();
+      if ((WizardMode || Verbose) && PrintMiscellaneous) {
+        tty->cr();
+        findpc(eip);
+        tty->cr();
+      }
 #endif
-      tty->print_cr("rax, = 0x%08x", rax);
-      tty->print_cr("rbx, = 0x%08x", rbx);
+      tty->print_cr("rax = 0x%08x", rax);
+      tty->print_cr("rbx = 0x%08x", rbx);
       tty->print_cr("rcx = 0x%08x", rcx);
       tty->print_cr("rdx = 0x%08x", rdx);
       tty->print_cr("rdi = 0x%08x", rdi);
       tty->print_cr("rsi = 0x%08x", rsi);
-      tty->print_cr("rbp, = 0x%08x", rbp);
+      tty->print_cr("rbp = 0x%08x", rbp);
       tty->print_cr("rsp = 0x%08x", rsp);
       BREAKPOINT;
+      assert(false, "start up GDB");
     }
   } else {
     ttyLocker ttyl;
@@ -6494,24 +6503,19 @@
 }
 
 void MacroAssembler::load_sized_value(Register dst, Address src,
-                                      int size_in_bytes, bool is_signed) {
-  switch (size_in_bytes ^ (is_signed ? -1 : 0)) {
+                                      size_t size_in_bytes, bool is_signed) {
+  switch (size_in_bytes) {
 #ifndef _LP64
   // For case 8, caller is responsible for manually loading
   // the second word into another register.
-  case ~8:  // fall through:
-  case  8:  movl(                dst, src ); break;
+  case  8: movl(dst, src); break;
 #else
-  case ~8:  // fall through:
-  case  8:  movq(                dst, src ); break;
+  case  8: movq(dst, src); break;
 #endif
-  case ~4:  // fall through:
-  case  4:  movl(                dst, src ); break;
-  case ~2:  load_signed_short(   dst, src ); break;
-  case  2:  load_unsigned_short( dst, src ); break;
-  case ~1:  load_signed_byte(    dst, src ); break;
-  case  1:  load_unsigned_byte(  dst, src ); break;
-  default:  ShouldNotReachHere();
+  case  4: movl(dst, src); break;
+  case  2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
+  case  1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
+  default: ShouldNotReachHere();
   }
 }
 
@@ -7158,7 +7162,7 @@
   subptr(t1, typeArrayOopDesc::header_size(T_INT));
   addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
   shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
-  movptr(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
+  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
   // set klass to intArrayKlass
   // dubious reloc why not an oop reloc?
   movptr(t1, ExternalAddress((address) Universe::intArrayKlassObj_addr()));
@@ -7575,21 +7579,27 @@
 
   // Scan RCX words at [RDI] for an occurrence of RAX.
   // Set NZ/Z based on last compare.
+  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
+  // not change flags (only scas instruction which is repeated sets flags).
+  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
 #ifdef _LP64
   // This part is tricky, as values in supers array could be 32 or 64 bit wide
   // and we store values in objArrays always encoded, thus we need to encode
   // the value of rax before repne.  Note that rax is dead after the repne.
   if (UseCompressedOops) {
-    encode_heap_oop_not_null(rax);
+    encode_heap_oop_not_null(rax); // Changes flags.
     // The superclass is never null; it would be a basic system error if a null
     // pointer were to sneak in here.  Note that we have already loaded the
     // Klass::super_check_offset from the super_klass in the fast path,
     // so if there is a null in that register, we are already in the afterlife.
+    testl(rax,rax); // Set Z = 0
     repne_scanl();
   } else
 #endif // _LP64
+  {
+    testptr(rax,rax); // Set Z = 0
     repne_scan();
-
+  }
   // Unspill the temp. registers:
   if (pushed_rdi)  pop(rdi);
   if (pushed_rcx)  pop(rcx);
@@ -7650,6 +7660,9 @@
   // Pass register number to verify_oop_subroutine
   char* b = new char[strlen(s) + 50];
   sprintf(b, "verify_oop: %s: %s", reg->name(), s);
+#ifdef _LP64
+  push(rscratch1);                    // save r10, trashed by movptr()
+#endif
   push(rax);                          // save rax,
   push(reg);                          // pass register argument
   ExternalAddress buffer((address) b);
@@ -7660,6 +7673,7 @@
   // call indirectly to solve generation ordering problem
   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
   call(rax);
+  // Caller pops the arguments (oop, message) and restores rax, r10
 }
 
 
@@ -7674,11 +7688,19 @@
   movptr(tmp, ExternalAddress((address) delayed_value_addr));
 
 #ifdef ASSERT
-  Label L;
-  testptr(tmp, tmp);
-  jccb(Assembler::notZero, L);
-  hlt();
-  bind(L);
+  { Label L;
+    testptr(tmp, tmp);
+    if (WizardMode) {
+      jcc(Assembler::notZero, L);
+      char* buf = new char[40];
+      sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
+      stop(buf);
+    } else {
+      jccb(Assembler::notZero, L);
+      hlt();
+    }
+    bind(L);
+  }
 #endif
 
   if (offset != 0)
@@ -7695,9 +7717,14 @@
 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                               Register temp_reg,
                                               Label& wrong_method_type) {
-  if (UseCompressedOops)  unimplemented();  // field accesses must decode
+  Address type_addr(mh_reg, delayed_value(java_dyn_MethodHandle::type_offset_in_bytes, temp_reg));
   // compare method type against that of the receiver
-  cmpptr(mtype_reg, Address(mh_reg, delayed_value(java_dyn_MethodHandle::type_offset_in_bytes, temp_reg)));
+  if (UseCompressedOops) {
+    load_heap_oop(temp_reg, type_addr);
+    cmpptr(mtype_reg, temp_reg);
+  } else {
+    cmpptr(mtype_reg, type_addr);
+  }
   jcc(Assembler::notEqual, wrong_method_type);
 }
 
@@ -7708,15 +7735,15 @@
 // method handle's MethodType.  This macro hides the distinction.
 void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg,
                                                 Register temp_reg) {
-  if (UseCompressedOops)  unimplemented();  // field accesses must decode
+  assert_different_registers(vmslots_reg, mh_reg, temp_reg);
   // load mh.type.form.vmslots
   if (java_dyn_MethodHandle::vmslots_offset_in_bytes() != 0) {
     // hoist vmslots into every mh to avoid dependent load chain
     movl(vmslots_reg, Address(mh_reg, delayed_value(java_dyn_MethodHandle::vmslots_offset_in_bytes, temp_reg)));
   } else {
     Register temp2_reg = vmslots_reg;
-    movptr(temp2_reg, Address(mh_reg,    delayed_value(java_dyn_MethodHandle::type_offset_in_bytes, temp_reg)));
-    movptr(temp2_reg, Address(temp2_reg, delayed_value(java_dyn_MethodType::form_offset_in_bytes, temp_reg)));
+    load_heap_oop(temp2_reg, Address(mh_reg,    delayed_value(java_dyn_MethodHandle::type_offset_in_bytes, temp_reg)));
+    load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_dyn_MethodType::form_offset_in_bytes, temp_reg)));
     movl(vmslots_reg, Address(temp2_reg, delayed_value(java_dyn_MethodTypeForm::vmslots_offset_in_bytes, temp_reg)));
   }
 }
@@ -7730,9 +7757,8 @@
   assert(mh_reg == rcx, "caller must put MH object in rcx");
   assert_different_registers(mh_reg, temp_reg);
 
-  if (UseCompressedOops)  unimplemented();  // field accesses must decode
-
   // pick out the interpreted side of the handler
+  // NOTE: vmentry is not an oop!
   movptr(temp_reg, Address(mh_reg, delayed_value(java_dyn_MethodHandle::vmentry_offset_in_bytes, temp_reg)));
 
   // off we go...
@@ -7746,7 +7772,7 @@
 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                          int extra_slot_offset) {
   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
-  int stackElementSize = Interpreter::stackElementSize();
+  int stackElementSize = Interpreter::stackElementSize;
   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 #ifdef ASSERT
   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
@@ -7773,6 +7799,9 @@
   char* b = new char[strlen(s) + 50];
   sprintf(b, "verify_oop_addr: %s", s);
 
+#ifdef _LP64
+  push(rscratch1);                    // save r10, trashed by movptr()
+#endif
   push(rax);                          // save rax,
   // addr may contain rsp so we will have to adjust it based on the push
   // we just did
@@ -7795,7 +7824,7 @@
   // call indirectly to solve generation ordering problem
   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
   call(rax);
-  // Caller pops the arguments and restores rax, from the stack
+  // Caller pops the arguments (addr, message) and restores rax, r10.
 }
 
 void MacroAssembler::verify_tlab() {
@@ -7977,7 +8006,7 @@
       case 2: return "special";
       case 3: return "empty";
     }
-    ShouldNotReachHere()
+    ShouldNotReachHere();
     return NULL;
   }
 
@@ -8191,9 +8220,14 @@
     assert (Universe::heap() != NULL, "java heap should be initialized");
     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
     if (Universe::narrow_oop_shift() != 0) {
-      assert(Address::times_8 == LogMinObjAlignmentInBytes &&
-             Address::times_8 == Universe::narrow_oop_shift(), "decode alg wrong");
-      movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+      if (LogMinObjAlignmentInBytes == Address::times_8) {
+        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      } else {
+        // OK to use shift since we don't need to preserve flags.
+        shlq(dst, LogMinObjAlignmentInBytes);
+        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      }
     } else {
       movq(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
     }
@@ -8215,6 +8249,40 @@
     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
 }
 
+void MacroAssembler::load_heap_oop(Register dst, Address src) {
+#ifdef _LP64
+  if (UseCompressedOops) {
+    movl(dst, src);
+    decode_heap_oop(dst);
+  } else
+#endif
+    movptr(dst, src);
+}
+
+void MacroAssembler::store_heap_oop(Address dst, Register src) {
+#ifdef _LP64
+  if (UseCompressedOops) {
+    assert(!dst.uses(src), "not enough registers");
+    encode_heap_oop(src);
+    movl(dst, src);
+  } else
+#endif
+    movptr(dst, src);
+}
+
+// Used for storing NULLs.
+void MacroAssembler::store_heap_oop_null(Address dst) {
+#ifdef _LP64
+  if (UseCompressedOops) {
+    movl(dst, (int32_t)NULL_WORD);
+  } else {
+    movslq(dst, (int32_t)NULL_WORD);
+  }
+#else
+  movl(dst, (int32_t)NULL_WORD);
+#endif
+}
+
 #ifdef _LP64
 void MacroAssembler::store_klass_gap(Register dst, Register src) {
   if (UseCompressedOops) {
@@ -8223,58 +8291,35 @@
   }
 }
 
-void MacroAssembler::load_heap_oop(Register dst, Address src) {
-  if (UseCompressedOops) {
-    movl(dst, src);
-    decode_heap_oop(dst);
-  } else {
-    movq(dst, src);
-  }
-}
-
-void MacroAssembler::store_heap_oop(Address dst, Register src) {
-  if (UseCompressedOops) {
-    assert(!dst.uses(src), "not enough registers");
-    encode_heap_oop(src);
-    movl(dst, src);
-  } else {
-    movq(dst, src);
-  }
-}
-
-// Used for storing NULLs.
-void MacroAssembler::store_heap_oop_null(Address dst) {
-  if (UseCompressedOops) {
-    movl(dst, (int32_t)NULL_WORD);
-  } else {
-    movslq(dst, (int32_t)NULL_WORD);
-  }
-}
+#ifdef ASSERT
+void MacroAssembler::verify_heapbase(const char* msg) {
+  assert (UseCompressedOops, "should be compressed");
+  assert (Universe::heap() != NULL, "java heap should be initialized");
+  if (CheckCompressedOops) {
+    Label ok;
+    push(rscratch1); // cmpptr trashes rscratch1
+    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
+    jcc(Assembler::equal, ok);
+    stop(msg);
+    bind(ok);
+    pop(rscratch1);
+  }
+}
+#endif
 
 // Algorithm must match oop.inline.hpp encode_heap_oop.
 void MacroAssembler::encode_heap_oop(Register r) {
-  assert (UseCompressedOops, "should be compressed");
-  assert (Universe::heap() != NULL, "java heap should be initialized");
+#ifdef ASSERT
+  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
+#endif
+  verify_oop(r, "broken oop in encode_heap_oop");
   if (Universe::narrow_oop_base() == NULL) {
-    verify_oop(r, "broken oop in encode_heap_oop");
     if (Universe::narrow_oop_shift() != 0) {
       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
       shrq(r, LogMinObjAlignmentInBytes);
     }
     return;
   }
-#ifdef ASSERT
-  if (CheckCompressedOops) {
-    Label ok;
-    push(rscratch1); // cmpptr trashes rscratch1
-    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
-    jcc(Assembler::equal, ok);
-    stop("MacroAssembler::encode_heap_oop: heap base corrupted?");
-    bind(ok);
-    pop(rscratch1);
-  }
-#endif
-  verify_oop(r, "broken oop in encode_heap_oop");
   testq(r, r);
   cmovq(Assembler::equal, r, r12_heapbase);
   subq(r, r12_heapbase);
@@ -8282,9 +8327,8 @@
 }
 
 void MacroAssembler::encode_heap_oop_not_null(Register r) {
-  assert (UseCompressedOops, "should be compressed");
-  assert (Universe::heap() != NULL, "java heap should be initialized");
 #ifdef ASSERT
+  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
   if (CheckCompressedOops) {
     Label ok;
     testq(r, r);
@@ -8304,9 +8348,8 @@
 }
 
 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
-  assert (UseCompressedOops, "should be compressed");
-  assert (Universe::heap() != NULL, "java heap should be initialized");
 #ifdef ASSERT
+  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
   if (CheckCompressedOops) {
     Label ok;
     testq(src, src);
@@ -8329,72 +8372,67 @@
 }
 
 void  MacroAssembler::decode_heap_oop(Register r) {
-  assert (UseCompressedOops, "should be compressed");
-  assert (Universe::heap() != NULL, "java heap should be initialized");
+#ifdef ASSERT
+  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
+#endif
   if (Universe::narrow_oop_base() == NULL) {
     if (Universe::narrow_oop_shift() != 0) {
       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
       shlq(r, LogMinObjAlignmentInBytes);
     }
-    verify_oop(r, "broken oop in decode_heap_oop");
-    return;
-  }
-#ifdef ASSERT
-  if (CheckCompressedOops) {
-    Label ok;
-    push(rscratch1);
-    cmpptr(r12_heapbase,
-           ExternalAddress((address)Universe::narrow_oop_base_addr()));
-    jcc(Assembler::equal, ok);
-    stop("MacroAssembler::decode_heap_oop: heap base corrupted?");
-    bind(ok);
-    pop(rscratch1);
-  }
-#endif
-
-  Label done;
-  shlq(r, LogMinObjAlignmentInBytes);
-  jccb(Assembler::equal, done);
-  addq(r, r12_heapbase);
-#if 0
-   // alternate decoding probably a wash.
-   testq(r, r);
-   jccb(Assembler::equal, done);
-   leaq(r, Address(r12_heapbase, r, Address::times_8, 0));
-#endif
-  bind(done);
+  } else {
+    Label done;
+    shlq(r, LogMinObjAlignmentInBytes);
+    jccb(Assembler::equal, done);
+    addq(r, r12_heapbase);
+    bind(done);
+  }
   verify_oop(r, "broken oop in decode_heap_oop");
 }
 
 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
+  // Note: it will change flags
   assert (UseCompressedOops, "should only be used for compressed headers");
   assert (Universe::heap() != NULL, "java heap should be initialized");
   // Cannot assert, unverified entry point counts instructions (see .ad file)
   // vtableStubs also counts instructions in pd_code_size_limit.
   // Also do not verify_oop as this is called by verify_oop.
   if (Universe::narrow_oop_shift() != 0) {
-    assert (Address::times_8 == LogMinObjAlignmentInBytes &&
-            Address::times_8 == Universe::narrow_oop_shift(), "decode alg wrong");
-    // Don't use Shift since it modifies flags.
-    leaq(r, Address(r12_heapbase, r, Address::times_8, 0));
+    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+    shlq(r, LogMinObjAlignmentInBytes);
+    if (Universe::narrow_oop_base() != NULL) {
+      addq(r, r12_heapbase);
+    }
   } else {
     assert (Universe::narrow_oop_base() == NULL, "sanity");
   }
 }
 
 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
+  // Note: it will change flags
   assert (UseCompressedOops, "should only be used for compressed headers");
   assert (Universe::heap() != NULL, "java heap should be initialized");
   // Cannot assert, unverified entry point counts instructions (see .ad file)
   // vtableStubs also counts instructions in pd_code_size_limit.
   // Also do not verify_oop as this is called by verify_oop.
   if (Universe::narrow_oop_shift() != 0) {
-    assert (Address::times_8 == LogMinObjAlignmentInBytes &&
-            Address::times_8 == Universe::narrow_oop_shift(), "decode alg wrong");
-    leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
-  } else if (dst != src) {
+    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+    if (LogMinObjAlignmentInBytes == Address::times_8) {
+      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
+    } else {
+      if (dst != src) {
+        movq(dst, src);
+      }
+      shlq(dst, LogMinObjAlignmentInBytes);
+      if (Universe::narrow_oop_base() != NULL) {
+        addq(dst, r12_heapbase);
+      }
+    }
+  } else {
     assert (Universe::narrow_oop_base() == NULL, "sanity");
-    movq(dst, src);
+    if (dst != src) {
+      movq(dst, src);
+    }
   }
 }
 
@@ -8757,6 +8795,186 @@
   bind(DONE);
 }
 
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) block_comment(str)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+void MacroAssembler::generate_fill(BasicType t, bool aligned,
+                                   Register to, Register value, Register count,
+                                   Register rtmp, XMMRegister xtmp) {
+  assert_different_registers(to, value, count, rtmp);
+  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
+  Label L_fill_2_bytes, L_fill_4_bytes;
+
+  int shift = -1;
+  switch (t) {
+    case T_BYTE:
+      shift = 2;
+      break;
+    case T_SHORT:
+      shift = 1;
+      break;
+    case T_INT:
+      shift = 0;
+      break;
+    default: ShouldNotReachHere();
+  }
+
+  if (t == T_BYTE) {
+    andl(value, 0xff);
+    movl(rtmp, value);
+    shll(rtmp, 8);
+    orl(value, rtmp);
+  }
+  if (t == T_SHORT) {
+    andl(value, 0xffff);
+  }
+  if (t == T_BYTE || t == T_SHORT) {
+    movl(rtmp, value);
+    shll(rtmp, 16);
+    orl(value, rtmp);
+  }
+
+  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
+  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
+  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
+    // align source address at 4 bytes address boundary
+    if (t == T_BYTE) {
+      // One byte misalignment happens only for byte arrays
+      testptr(to, 1);
+      jccb(Assembler::zero, L_skip_align1);
+      movb(Address(to, 0), value);
+      increment(to);
+      decrement(count);
+      BIND(L_skip_align1);
+    }
+    // Two bytes misalignment happens only for byte and short (char) arrays
+    testptr(to, 2);
+    jccb(Assembler::zero, L_skip_align2);
+    movw(Address(to, 0), value);
+    addptr(to, 2);
+    subl(count, 1<<(shift-1));
+    BIND(L_skip_align2);
+  }
+  if (UseSSE < 2) {
+    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
+    // Fill 32-byte chunks
+    subl(count, 8 << shift);
+    jcc(Assembler::less, L_check_fill_8_bytes);
+    align(16);
+
+    BIND(L_fill_32_bytes_loop);
+
+    for (int i = 0; i < 32; i += 4) {
+      movl(Address(to, i), value);
+    }
+
+    addptr(to, 32);
+    subl(count, 8 << shift);
+    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
+    BIND(L_check_fill_8_bytes);
+    addl(count, 8 << shift);
+    jccb(Assembler::zero, L_exit);
+    jmpb(L_fill_8_bytes);
+
+    //
+    // length is too short, just fill qwords
+    //
+    BIND(L_fill_8_bytes_loop);
+    movl(Address(to, 0), value);
+    movl(Address(to, 4), value);
+    addptr(to, 8);
+    BIND(L_fill_8_bytes);
+    subl(count, 1 << (shift + 1));
+    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
+    // fall through to fill 4 bytes
+  } else {
+    Label L_fill_32_bytes;
+    if (!UseUnalignedLoadStores) {
+      // align to 8 bytes, we know we are 4 byte aligned to start
+      testptr(to, 4);
+      jccb(Assembler::zero, L_fill_32_bytes);
+      movl(Address(to, 0), value);
+      addptr(to, 4);
+      subl(count, 1<<shift);
+    }
+    BIND(L_fill_32_bytes);
+    {
+      assert( UseSSE >= 2, "supported cpu only" );
+      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
+      // Fill 32-byte chunks
+      movdl(xtmp, value);
+      pshufd(xtmp, xtmp, 0);
+
+      subl(count, 8 << shift);
+      jcc(Assembler::less, L_check_fill_8_bytes);
+      align(16);
+
+      BIND(L_fill_32_bytes_loop);
+
+      if (UseUnalignedLoadStores) {
+        movdqu(Address(to, 0), xtmp);
+        movdqu(Address(to, 16), xtmp);
+      } else {
+        movq(Address(to, 0), xtmp);
+        movq(Address(to, 8), xtmp);
+        movq(Address(to, 16), xtmp);
+        movq(Address(to, 24), xtmp);
+      }
+
+      addptr(to, 32);
+      subl(count, 8 << shift);
+      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
+      BIND(L_check_fill_8_bytes);
+      addl(count, 8 << shift);
+      jccb(Assembler::zero, L_exit);
+      jmpb(L_fill_8_bytes);
+
+      //
+      // length is too short, just fill qwords
+      //
+      BIND(L_fill_8_bytes_loop);
+      movq(Address(to, 0), xtmp);
+      addptr(to, 8);
+      BIND(L_fill_8_bytes);
+      subl(count, 1 << (shift + 1));
+      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
+    }
+  }
+  // fill trailing 4 bytes
+  BIND(L_fill_4_bytes);
+  testl(count, 1<<shift);
+  jccb(Assembler::zero, L_fill_2_bytes);
+  movl(Address(to, 0), value);
+  if (t == T_BYTE || t == T_SHORT) {
+    addptr(to, 4);
+    BIND(L_fill_2_bytes);
+    // fill trailing 2 bytes
+    testl(count, 1<<(shift-1));
+    jccb(Assembler::zero, L_fill_byte);
+    movw(Address(to, 0), value);
+    if (t == T_BYTE) {
+      addptr(to, 2);
+      BIND(L_fill_byte);
+      // fill trailing byte
+      testl(count, 1);
+      jccb(Assembler::zero, L_exit);
+      movb(Address(to, 0), value);
+    } else {
+      BIND(L_fill_byte);
+    }
+  } else {
+    BIND(L_fill_2_bytes);
+  }
+  BIND(L_exit);
+}
+#undef BIND
+#undef BLOCK_COMMENT
+
+
 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
   switch (cond) {
     // Note some conditions are synonyms for others