diff src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 4137:04b9a2566eec

Merge with hsx23/hotspot.
author Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
date Sat, 17 Dec 2011 21:40:27 +0100
parents a92cdbac8b9e
children 33df1aeaebbf
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Sat Dec 17 20:50:09 2011 +0100
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Sat Dec 17 21:40:27 2011 +0100
@@ -150,8 +150,7 @@
     { const Register t = G3_scratch;
       Label L;
       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
-      __ br_null(t, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_null_short(t, Assembler::pt, L);
       __ stop("StubRoutines::call_stub: entered with pending exception");
       __ bind(L);
     }
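This conversion is the pattern for most of the changeset: a branch followed by an explicitly nop-filled delay slot collapses into a single `_short` macro call. A minimal sketch of what such a wrapper could look like, assuming MacroAssembler-style helpers (the real definition may differ in its asserts and encoding):

    // Hypothetical illustration of the br_null_short contract: emit the
    // branch and fill the delay slot internally, so call sites need not
    // spell out the nop.
    void MacroAssembler::br_null_short(Register s1, Predict p, Label& L) {
      br_null(s1, false, p, L);  // annul bit false, as at the old call sites
      delayed()->nop();          // delay slot filled here
    }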
@@ -207,8 +206,7 @@
       Label exit;
       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
       __ add( FP, STACK_BIAS, dst );
-      __ tst(cnt);
-      __ br(Assembler::zero, false, Assembler::pn, exit);
+      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
 
       // copy parameters if any
@@ -282,20 +280,20 @@
       __ delayed()->restore();
 
       __ BIND(is_object);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->st_ptr(O0, addr, G0);
 
       __ BIND(is_float);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
 
       __ BIND(is_double);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
 
       __ BIND(is_long);
 #ifdef _LP64
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->st_long(O0, addr, G0);      // store entire long
 #else
 #if defined(COMPILER2)
@@ -307,11 +305,11 @@
  // do this here. Unfortunately if we did a rethrow we'd see a machepilog node
  // first, which would move g1 -> O0/O1 and destroy the exception we were throwing.
 
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->stx(G1, addr, G0);  // store entire long
 #else
       __ st(O1, addr, BytesPerInt);
-      __ ba(false, exit);
+      __ ba(exit);
       __ delayed()->st(O0, addr, G0);
 #endif /* COMPILER2 */
 #endif /* _LP64 */
@@ -382,8 +380,7 @@
     // make sure that this code is only executed if there is a pending exception
     { Label L;
       __ ld_ptr(exception_addr, Gtemp);
-      __ br_notnull(Gtemp, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_notnull_short(Gtemp, Assembler::pt, L);
       __ stop("StubRoutines::forward exception: no pending exception (1)");
       __ bind(L);
     }
@@ -406,8 +403,7 @@
 #ifdef ASSERT
     // make sure exception is set
     { Label L;
-      __ br_notnull(Oexception, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ br_notnull_short(Oexception, Assembler::pt, L);
       __ stop("StubRoutines::forward exception: no pending exception (2)");
       __ bind(L);
     }
@@ -466,11 +462,6 @@
 
     int frame_complete = __ offset();
 
-    if (restore_saved_exception_pc) {
-      __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
-      __ sub(I7, frame::pc_return_offset, I7);
-    }
-
     // Note that we always have a runtime stub frame on the top of stack by this point
     Register last_java_sp = SP;
     // 64-bit last_java_sp is biased!
@@ -501,8 +492,7 @@
     Address exception_addr(G2_thread, Thread::pending_exception_offset());
     Register scratch_reg = Gtemp;
     __ ld_ptr(exception_addr, scratch_reg);
-    __ br_notnull(scratch_reg, false, Assembler::pt, L);
-    __ delayed()->nop();
+    __ br_notnull_short(scratch_reg, Assembler::pt, L);
     __ should_not_reach_here();
     __ bind(L);
 #endif // ASSERT
@@ -614,9 +604,7 @@
     __ mov(G0,yield_reg);
 
     __ BIND(retry);
-    __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
-    __ br(Assembler::less, false, Assembler::pt, dontyield);
-    __ delayed()->nop();
+    __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);
 
    // This code can only be called from inside the VM; this
    // stub is only invoked from Atomic::add().  We do not
@@ -676,9 +664,7 @@
       // try to replace O2 with O3
       __ cas_under_lock(O1, O2, O3,
       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
-      __ cmp(O2, O3);
-      __ br(Assembler::notEqual, false, Assembler::pn, retry);
-      __ delayed()->nop();
+      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 
       __ retl(false);
       __ delayed()->mov(O2, O0);  // report previous value to caller
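The retry loop above is the compare-and-swap idiom: attempt the swap and branch back while the compared value no longer matches. A self-contained C++ model of the same control flow, with std::atomic standing in for the cas instruction (a sketch, not HotSpot code):

    #include <atomic>

    // Model of the cas/cmp/br retry loop: retry until the CAS succeeds,
    // then report the previous value (the stub returns it in O0).
    int atomic_add_return_old(std::atomic<int>& cell, int add_value) {
      int old_val = cell.load();
      // compare_exchange_weak refreshes old_val on failure, mirroring the
      // reload at the top of the stub's retry loop.
      while (!cell.compare_exchange_weak(old_val, old_val + add_value)) {
        // O2 != O3 in the stub corresponds to a failed exchange here
      }
      return old_val;
    }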
@@ -798,11 +784,9 @@
       __ BIND(retry);
 
       __ lduw(O1, 0, O2);
-      __ add(O0,   O2, O3);
-      __ cas(O1,   O2, O3);
-      __ cmp(      O2, O3);
-      __ br(Assembler::notEqual, false, Assembler::pn, retry);
-      __ delayed()->nop();
+      __ add(O0, O2, O3);
+      __ cas(O1, O2, O3);
+      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
       __ retl(false);
       __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
     } else {
@@ -1135,6 +1119,126 @@
     }
   }
 
+  //
+  // Generate main code for disjoint arraycopy
+  //
+  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
+                                              Label& L_loop, bool use_prefetch, bool use_bis);
+
+  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
+                          int iter_size, CopyLoopFunc copy_loop_func) {
+    Label L_copy;
+
+    assert(log2_elem_size <= 3, "the following code should be changed");
+    int count_dec = 16>>log2_elem_size;
+
+    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
+    assert(prefetch_dist < 4096, "invalid value");
+    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
+    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
+
+    if (UseBlockCopy) {
+      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
+
+      // 64-byte tail + bytes copied in one loop iteration
+      int tail_size = 64 + iter_size;
+      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
+      // Use BIS copy only for big arrays since it requires membar.
+      __ set(block_copy_count, O4);
+      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
+      // This code is for disjoint source and destination:
+      //   to <= from || to >= from+count
+      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
+      __ sub(from, to, O4);
+      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
+      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
+
+      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
+      // BIS should not be used to copy the tail (64 bytes + iter_size),
+      // to avoid zeroing the values that follow.
+      __ sub(count, (tail_size>>log2_elem_size), count); // count remains >= 0
+
+      if (prefetch_count > 0) { // rounded up to one iteration count
+        // Do prefetching only if copy size is bigger
+        // than prefetch distance.
+        __ set(prefetch_count, O4);
+        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
+        __ sub(count, prefetch_count, count);
+
+        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
+        __ add(count, prefetch_count, count); // restore count
+
+      } // prefetch_count > 0
+
+      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
+      __ add(count, (tail_size>>log2_elem_size), count); // restore count
+
+      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
+      // BIS needs membar.
+      __ membar(Assembler::StoreLoad);
+      // Copy tail
+      __ ba_short(L_copy);
+
+      __ BIND(L_skip_block_copy);
+    } // UseBlockCopy
+
+    if (prefetch_count > 0) { // rounded up to one iteration count
+      // Do prefetching only if copy size is bigger
+      // than prefetch distance.
+      __ set(prefetch_count, O4);
+      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
+      __ sub(count, prefetch_count, count);
+
+      Label L_copy_prefetch;
+      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
+      __ add(count, prefetch_count, count); // restore count
+
+    } // prefetch_count > 0
+
+    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
+  }
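The gating arithmetic above is plain integer rounding: the larger of the two prefetch distances is rounded up to a whole loop iteration, then converted to an element count. A standalone sketch of that computation (the distances in the usage comment are illustrative, not product defaults):

    #include <algorithm>
    #include <cassert>

    // Mirrors the prefetch_dist/prefetch_count computation in
    // disjoint_copy_core; iter_size must be a power of two.
    int prefetch_elem_count(int src_dist, int dst_dist,
                            int iter_size, int log2_elem_size) {
      int prefetch_dist = std::max(src_dist, dst_dist);
      assert(prefetch_dist < 4096 && "invalid value");
      prefetch_dist = (prefetch_dist + (iter_size - 1)) & (-iter_size);
      return prefetch_dist >> log2_elem_size;  // distance in elements
    }
    // e.g. src=512, dst=512, iter_size=64, log2_elem_size=3
    //   -> rounds to 512 bytes, i.e. 64 long elements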
+
+
+
+  //
+  // Helper methods for copy_16_bytes_forward_with_shift()
+  //
+  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
+                                Label& L_loop, bool use_prefetch, bool use_bis) {
+
+    const Register left_shift  = G1; // left  shift bit counter
+    const Register right_shift = G5; // right shift bit counter
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loop);
+    if (use_prefetch) {
+      if (ArraycopySrcPrefetchDistance > 0) {
+        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
+      }
+      if (ArraycopyDstPrefetchDistance > 0) {
+        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
+      }
+    }
+    __ ldx(from, 0, O4);
+    __ ldx(from, 8, G4);
+    __ inc(to, 16);
+    __ inc(from, 16);
+    __ deccc(count, count_dec); // Can we do next iteration after this one?
+    __ srlx(O4, right_shift, G3);
+    __ bset(G3, O3);
+    __ sllx(O4, left_shift,  O4);
+    __ srlx(G4, right_shift, G3);
+    __ bset(G3, O4);
+    if (use_bis) {
+      __ stxa(O3, to, -16);
+      __ stxa(O4, to, -8);
+    } else {
+      __ stx(O3, to, -16);
+      __ stx(O4, to, -8);
+    }
+    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
+    __ delayed()->sllx(G4, left_shift,  O3);
+  }
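Each iteration merges two freshly loaded aligned 8-byte words with the bits carried over from the previous load, yielding two aligned stores. A scalar C++ model of the bit surgery (a sketch; 0 < left_shift < 64 holds because this path runs only for misaligned sources, and left_shift + right_shift == 64):

    #include <cstdint>

    // 'carry' models O3: the previously loaded word, already shifted left.
    static inline void copy_16_misaligned(const uint64_t*& from, uint64_t*& to,
                                          uint64_t& carry,
                                          unsigned left_shift, unsigned right_shift) {
      uint64_t a = from[0];                           // ldx(from, 0, O4)
      uint64_t b = from[1];                           // ldx(from, 8, G4)
      from += 2;
      to[0] = carry | (a >> right_shift);             // bset(G3, O3); stx(O3, to, -16)
      to[1] = (a << left_shift) | (b >> right_shift); // stx(O4, to, -8)
      to += 2;
      carry = b << left_shift;                        // delay slot: sllx(G4, left_shift, O3)
    }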
 
   // Copy big chunks forward with shift
   //
@@ -1146,64 +1250,51 @@
   //   L_copy_bytes - copy exit label
   //
   void copy_16_bytes_forward_with_shift(Register from, Register to,
-                     Register count, int count_dec, Label& L_copy_bytes) {
-    Label L_loop, L_aligned_copy, L_copy_last_bytes;
+                     Register count, int log2_elem_size, Label& L_copy_bytes) {
+    Label L_aligned_copy, L_copy_last_bytes;
+    assert(log2_elem_size <= 3, "the following code should be changed");
+    int count_dec = 16>>log2_elem_size;
 
    // if both arrays have the same alignment mod 8, do an 8-byte aligned copy
-      __ andcc(from, 7, G1); // misaligned bytes
-      __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
-      __ delayed()->nop();
+    __ andcc(from, 7, G1); // misaligned bytes
+    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
+    __ delayed()->nop();
 
     const Register left_shift  = G1; // left  shift bit counter
     const Register right_shift = G5; // right shift bit counter
 
-      __ sll(G1, LogBitsPerByte, left_shift);
-      __ mov(64, right_shift);
-      __ sub(right_shift, left_shift, right_shift);
+    __ sll(G1, LogBitsPerByte, left_shift);
+    __ mov(64, right_shift);
+    __ sub(right_shift, left_shift, right_shift);
 
     //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
     //
-      __ deccc(count, count_dec); // Pre-decrement 'count'
-      __ andn(from, 7, from);     // Align address
-      __ ldx(from, 0, O3);
-      __ inc(from, 8);
-      __ align(OptoLoopAlignment);
-    __ BIND(L_loop);
-      __ ldx(from, 0, O4);
-      __ deccc(count, count_dec); // Can we do next iteration after this one?
-      __ ldx(from, 8, G4);
-      __ inc(to, 16);
-      __ inc(from, 16);
-      __ sllx(O3, left_shift,  O3);
-      __ srlx(O4, right_shift, G3);
-      __ bset(G3, O3);
-      __ stx(O3, to, -16);
-      __ sllx(O4, left_shift,  O4);
-      __ srlx(G4, right_shift, G3);
-      __ bset(G3, O4);
-      __ stx(O4, to, -8);
-      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
-      __ delayed()->mov(G4, O3);
-
-      __ inccc(count, count_dec>>1 ); // + 8 bytes
-      __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
-      __ delayed()->inc(count, count_dec>>1); // restore 'count'
-
-      // copy 8 bytes, part of them already loaded in O3
-      __ ldx(from, 0, O4);
-      __ inc(to, 8);
-      __ inc(from, 8);
-      __ sllx(O3, left_shift,  O3);
-      __ srlx(O4, right_shift, G3);
-      __ bset(O3, G3);
-      __ stx(G3, to, -8);
+    __ dec(count, count_dec);   // Pre-decrement 'count'
+    __ andn(from, 7, from);     // Align address
+    __ ldx(from, 0, O3);
+    __ inc(from, 8);
+    __ sllx(O3, left_shift,  O3);
+
+    disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
+
+    __ inccc(count, count_dec>>1 ); // + 8 bytes
+    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
+    __ delayed()->inc(count, count_dec>>1); // restore 'count'
+
+    // copy 8 bytes, part of them already loaded in O3
+    __ ldx(from, 0, O4);
+    __ inc(to, 8);
+    __ inc(from, 8);
+    __ srlx(O4, right_shift, G3);
+    __ bset(O3, G3);
+    __ stx(G3, to, -8);
 
     __ BIND(L_copy_last_bytes);
-      __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
-      __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
-      __ delayed()->sub(from, right_shift, from);       // restore address
+    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
+    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
+    __ delayed()->sub(from, right_shift, from);       // restore address
 
     __ BIND(L_aligned_copy);
   }
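With the loop body factored out into copy_16_bytes_shift_loop, the helper now derives its per-iteration decrement from the element size instead of taking it as a parameter; the call sites below pass log2_elem_size (0 for bytes, 1 for shorts). A one-line check of the derivation:

    // count_dec = 16 >> log2_elem_size: 16 byte elements or 8 short
    // elements per 16-byte iteration.
    static_assert((16 >> 0) == 16 && (16 >> 1) == 8, "count_dec derivation");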
@@ -1359,7 +1450,7 @@
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift completes.
 
-      copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
+      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
     }
 
    // Both arrays are 8-byte aligned; copy 16 bytes at a time
@@ -1370,8 +1461,7 @@
 
    // copy trailing bytes
     __ BIND(L_copy_byte);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       __ align(OptoLoopAlignment);
     __ BIND(L_copy_byte_loop);
       __ ldub(from, offset, O3);
@@ -1482,8 +1572,7 @@
 
     // copy 1 element (2 bytes) at a time
     __ BIND(L_copy_byte);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       __ align(OptoLoopAlignment);
     __ BIND(L_copy_byte_loop);
       __ dec(end_from);
@@ -1589,7 +1678,7 @@
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift completes.
 
-      copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
+      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
     }
 
    // Both arrays are 8-byte aligned; copy 16 bytes at a time
@@ -1600,8 +1689,7 @@
 
     // copy 1 element at a time
     __ BIND(L_copy_2_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       __ align(OptoLoopAlignment);
     __ BIND(L_copy_2_bytes_loop);
       __ lduh(from, offset, O3);
@@ -1946,8 +2034,7 @@
 
     // copy 1 element (2 bytes) at a time
     __ BIND(L_copy_2_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
     __ BIND(L_copy_2_bytes_loop);
       __ dec(end_from, 2);
       __ dec(end_to, 2);
@@ -1965,6 +2052,45 @@
   }
 
   //
+  // Helper methods for generate_disjoint_int_copy_core()
+  //
+  void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
+                          Label& L_loop, bool use_prefetch, bool use_bis) {
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loop);
+    if (use_prefetch) {
+      if (ArraycopySrcPrefetchDistance > 0) {
+        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
+      }
+      if (ArraycopyDstPrefetchDistance > 0) {
+        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
+      }
+    }
+    __ ldx(from, 4, O4);
+    __ ldx(from, 12, G4);
+    __ inc(to, 16);
+    __ inc(from, 16);
+    __ deccc(count, 4); // Can we do next iteration after this one?
+
+    __ srlx(O4, 32, G3);
+    __ bset(G3, O3);
+    __ sllx(O4, 32, O4);
+    __ srlx(G4, 32, G3);
+    __ bset(G3, O4);
+    if (use_bis) {
+      __ stxa(O3, to, -16);
+      __ stxa(O4, to, -8);
+    } else {
+      __ stx(O3, to, -16);
+      __ stx(O4, to, -8);
+    }
+    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
+    __ delayed()->sllx(G4, 32,  O3);
+
+  }
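This is the shift-merge loop again, specialized for int copy: the source sits exactly 4 bytes off 8-byte alignment, so both shift counts are the constant 32. A scalar model of one iteration, analogous to the byte/short sketch earlier (the register mapping in the comments is approximate):

    #include <cstdint>

    // 'carry' models O3, holding the half left over from the previous
    // 8-byte load, already shifted into position.
    static inline void copy_16_ints_off4(const uint64_t*& from, uint64_t*& to,
                                         uint64_t& carry) {
      uint64_t a = from[0];            // ldx(from, 4, O4) in the stub
      uint64_t b = from[1];            // ldx(from, 12, G4)
      from += 2;
      to[0] = carry | (a >> 32);       // bset(G3, O3); stx(O3, to, -16)
      to[1] = (a << 32) | (b >> 32);   // stx(O4, to, -8)
      to += 2;
      carry = b << 32;                 // delay slot: sllx(G4, 32, O3)
    }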
+
+  //
   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
   //  If "aligned" is true, the "from" and "to" addresses are assumed
   //  to be heapword aligned.
@@ -1977,7 +2103,7 @@
   void generate_disjoint_int_copy_core(bool aligned) {
 
     Label L_skip_alignment, L_aligned_copy;
-    Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
+    Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
 
     const Register from      = O0;   // source array address
     const Register to        = O1;   // destination array address
@@ -2028,30 +2154,16 @@
 
     // copy with shift 4 elements (16 bytes) at a time
      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
-
-      __ align(OptoLoopAlignment);
-    __ BIND(L_copy_16_bytes);
-      __ ldx(from, 4, O4);
-      __ deccc(count, 4); // Can we do next iteration after this one?
-      __ ldx(from, 12, G4);
-      __ inc(to, 16);
-      __ inc(from, 16);
-      __ sllx(O3, 32, O3);
-      __ srlx(O4, 32, G3);
-      __ bset(G3, O3);
-      __ stx(O3, to, -16);
-      __ sllx(O4, 32, O4);
-      __ srlx(G4, 32, G3);
-      __ bset(G3, O4);
-      __ stx(O4, to, -8);
-      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
-      __ delayed()->mov(G4, O3);
+      __ sllx(O3, 32,  O3);
+
+      disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
 
       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
       __ delayed()->inc(count, 4); // restore 'count'
 
     __ BIND(L_aligned_copy);
-    }
+    } // !aligned
+
     // copy 4 elements (16 bytes) at a time
       __ and3(count, 1, G4); // Save
       __ srl(count, 1, count);
@@ -2060,8 +2172,7 @@
 
     // copy 1 element at a time
     __ BIND(L_copy_4_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
     __ BIND(L_copy_4_bytes_loop);
       __ ld(from, offset, O3);
       __ deccc(count);
@@ -2193,8 +2304,7 @@
 
     // copy 1 element (4 bytes) at a time
     __ BIND(L_copy_4_bytes);
-      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
-      __ delayed()->nop();
+      __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
     __ BIND(L_copy_4_bytes_loop);
       __ dec(end_from, 4);
       __ dec(end_to, 4);
@@ -2240,6 +2350,38 @@
   }
 
   //
+  // Helper methods for generate_disjoint_long_copy_core()
+  //
+  void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
+                          Label& L_loop, bool use_prefetch, bool use_bis) {
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loop);
+    for (int off = 0; off < 64; off += 16) {
+      if (use_prefetch && (off & 31) == 0) {
+        if (ArraycopySrcPrefetchDistance > 0) {
+          __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
+        }
+        if (ArraycopyDstPrefetchDistance > 0) {
+          __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
+        }
+      }
+      __ ldx(from,  off+0, O4);
+      __ ldx(from,  off+8, O5);
+      if (use_bis) {
+        __ stxa(O4, to,  off+0);
+        __ stxa(O5, to,  off+8);
+      } else {
+        __ stx(O4, to,  off+0);
+        __ stx(O5, to,  off+8);
+      }
+    }
+    __ deccc(count, 8);
+    __ inc(from, 64);
+    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
+    __ delayed()->inc(to, 64);
+  }
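Long copy needs no shifting, since both addresses are 8-byte aligned; the loop is simply unrolled to 64 bytes per iteration, with prefetches issued every 32 bytes when enabled. A scalar model of one iteration:

    #include <cstdint>

    // Four 16-byte load/store pairs, then both pointers advance 64 bytes;
    // the real loop additionally prefetches at off == 0 and off == 32.
    static inline void copy_64_aligned(const uint64_t*& from, uint64_t*& to) {
      for (int off = 0; off < 8; off += 2) {
        to[off + 0] = from[off + 0];   // ldx/stx pair
        to[off + 1] = from[off + 1];   // ldx/stx pair
      }
      from += 8;
      to   += 8;
    }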
+
+  //
   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
   //  "aligned" is ignored, because we must make the stronger
   //  assumption that both addresses are always 64-bit aligned.
@@ -2278,38 +2420,28 @@
     const Register offset0 = O4;  // element offset
     const Register offset8 = O5;  // next element offset
 
-      __ deccc(count, 2);
-      __ mov(G0, offset0);   // offset from start of arrays (0)
-      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
-      __ delayed()->add(offset0, 8, offset8);
+    __ deccc(count, 2);
+    __ mov(G0, offset0);   // offset from start of arrays (0)
+    __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
+    __ delayed()->add(offset0, 8, offset8);
 
    // Copy in 64-byte chunks
-    Label L_copy_64_bytes;
+
     const Register from64 = O3;  // source address
     const Register to64   = G3;  // destination address
-      __ subcc(count, 6, O3);
-      __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
-      __ delayed()->mov(to,   to64);
-      // Now we can use O4(offset0), O5(offset8) as temps
-      __ mov(O3, count);
-      __ mov(from, from64);
-
-      __ align(OptoLoopAlignment);
-    __ BIND(L_copy_64_bytes);
-      for( int off = 0; off < 64; off += 16 ) {
-        __ ldx(from64,  off+0, O4);
-        __ ldx(from64,  off+8, O5);
-        __ stx(O4, to64,  off+0);
-        __ stx(O5, to64,  off+8);
-      }
-      __ deccc(count, 8);
-      __ inc(from64, 64);
-      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
-      __ delayed()->inc(to64, 64);
+    __ subcc(count, 6, O3);
+    __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
+    __ delayed()->mov(to,   to64);
+    // Now we can use O4(offset0), O5(offset8) as temps
+    __ mov(O3, count);
+    // count >= 0 (original count - 8)
+    __ mov(from, from64);
+
+    disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
 
       // Restore O4(offset0), O5(offset8)
       __ sub(from64, from, offset0);
-      __ inccc(count, 6);
+      __ inccc(count, 6); // restore count
       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
       __ delayed()->add(offset0, 8, offset8);
 
@@ -2576,7 +2708,7 @@
                                      super_klass->after_save(),
                                      L0, L1, L2, L4,
                                      NULL, &L_pop_to_miss);
-    __ ba(false, L_success);
+    __ ba(L_success);
     __ delayed()->restore();
 
     __ bind(L_pop_to_miss);
@@ -2673,8 +2805,7 @@
     // ======== loop entry is here ========
     __ BIND(load_element);
     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
-    __ br_null(G3_oop, true, Assembler::pt, store_element);
-    __ delayed()->nop();
+    __ br_null_short(G3_oop, Assembler::pt, store_element);
 
     __ load_klass(G3_oop, G4_klass); // query the object klass
 
@@ -2896,8 +3027,7 @@
     //  assert(src->klass() != NULL);
     BLOCK_COMMENT("assert klasses not null");
     { Label L_a, L_b;
-      __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
-      __ delayed()->nop();
+      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
       __ bind(L_a);
       __ stop("broken null klass");
       __ bind(L_b);
@@ -2937,9 +3067,7 @@
     }
 
     //  if (src->klass() != dst->klass()) return -1;
-    __ cmp(G3_src_klass, G4_dst_klass);
-    __ brx(Assembler::notEqual, false, Assembler::pn, L_failed);
-    __ delayed()->nop();
+    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
 
     //  if (!src->is_Array()) return -1;
     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
@@ -3007,9 +3135,7 @@
     __ delayed()->signx(length, count); // length
 #ifdef ASSERT
     { Label L;
-      __ cmp(G3_elsize, LogBytesPerLong);
-      __ br(Assembler::equal, false, Assembler::pt, L);
-      __ delayed()->nop();
+      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
       __ stop("must be long copy, but elsize is wrong");
       __ bind(L);
     }
@@ -3092,6 +3218,34 @@
     return start;
   }
 
+  //
+  //  Generate stub for heap zeroing.
+  //  "to" address is aligned to jlong (8 bytes).
+  //
+  // Arguments for generated stub:
+  //      to:    O0
+  //      count: O1 treated as signed (count of HeapWords)
+  //             count could be 0
+  //
+  address generate_zero_aligned_words(const char* name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    const Register to    = O0;   // destination array address
+    const Register count = O1;   // HeapWords count
+    const Register temp  = O2;   // scratch
+
+    Label Ldone;
+    __ sllx(count, LogHeapWordSize, count); // to bytes count
+    // Use BIS for zeroing
+    __ bis_zeroing(to, count, temp, Ldone);
+    __ bind(Ldone);
+    __ retl();
+    __ delayed()->nop();
+    return start;
+}
+
   void generate_arraycopy_stubs() {
     address entry;
     address entry_jbyte_arraycopy;
@@ -3218,6 +3372,10 @@
     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
+
+    if (UseBlockZeroing) {
+      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
+    }
   }
 
   void generate_initial() {
@@ -3266,12 +3424,10 @@
     // UseZeroBaseCompressedOops which is defined after heap initialization.
     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
     // These entry points require SharedInfo::stack0 to be set up in non-core builds
-    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
-    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
-    StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
-    StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
-    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
+    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
+    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
+    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
 
     StubRoutines::_handler_for_unsafe_access_entry =
       generate_handler_for_unsafe_access();