# HG changeset patch
# User amurillo
# Date 1399615631 25200
# Node ID 28bbbecff5f08c1e343fc0c40923c05d86b7cf82
# Parent 7dd67cb4f225e2437717e6cb863247b1e6efca26
# Parent 63c5920a038d73df37c6ae3101937010f7e9659a
Merge

diff -r 7dd67cb4f225 -r 28bbbecff5f0 .hgtags
--- a/.hgtags Wed May 07 10:58:47 2014 -0700
+++ b/.hgtags Thu May 08 23:07:11 2014 -0700
@@ -462,3 +462,4 @@
 3c291bc2aa7c58efb1219701f38c41731609e595 hs25.20-b12
 18ae0dac7620474547aa1721bc3fd748af07b8b5 jdk8u20-b12
 47951595af60460a479b8574622375bfbf5c8ed2 jdk8u20-b13
+798f5b02be897151fdad44d695446088b1cca6b1 hs25.20-b13
diff -r 7dd67cb4f225 -r 28bbbecff5f0 make/hotspot_version
--- a/make/hotspot_version Wed May 07 10:58:47 2014 -0700
+++ b/make/hotspot_version Thu May 08 23:07:11 2014 -0700
@@ -35,7 +35,7 @@

 HS_MAJOR_VER=25
 HS_MINOR_VER=20
-HS_BUILD_NUMBER=12
+HS_BUILD_NUMBER=14

 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/cppInterpreter_ppc.cpp
--- a/src/cpu/ppc/vm/cppInterpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700
+++ b/src/cpu/ppc/vm/cppInterpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2012, 2013 SAP AG. All rights reserved.
@@ -403,7 +404,7 @@
   BLOCK_COMMENT("compute_interpreter_state {");

   // access_flags = method->access_flags();
-  // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size");
+  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
   __ lwa(access_flags, method_(access_flags));

   // parameter_count = method->constMethod->size_of_parameters();
@@ -1055,7 +1056,7 @@
   assert(access_flags->is_nonvolatile(), "access_flags must be in a non-volatile register");

   // Type check.
-  // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size");
+  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
   __ lwz(access_flags, method_(access_flags));

   // We don't want to reload R19_method and access_flags after calls
@@ -1838,7 +1839,7 @@
   // Interpreter state fields.
   const Register msg = R24_tmp4;

-  // MethodOop fields.
+  // Method fields.
   const Register parameter_count = R25_tmp5;
   const Register result_index = R26_tmp6;

@@ -2023,7 +2024,7 @@
   __ add(R17_tos, R17_tos, parameter_count);

   // Result stub address array index
-  // TODO: PPC port: assert(4 == methodOopDesc::sz_result_index(), "unexpected field size");
+  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
   __ lwa(result_index, method_(result_index));

   __ li(msg, BytecodeInterpreter::method_resume);
@@ -2709,7 +2710,7 @@
   __ ld(R3_ARG1, state_(_result._osr._osr_buf));
   __ mtctr(R12_scratch2);

-  // Load method oop, gc may move it during execution of osr'd method.
+  // Load method, gc may move it during execution of osr'd method.
   __ ld(R22_tmp2, state_(_method));
   // Load message 'call_method'.
   __ li(R23_tmp3, BytecodeInterpreter::call_method);
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/frame_ppc.inline.hpp
--- a/src/cpu/ppc/vm/frame_ppc.inline.hpp Wed May 07 10:58:47 2014 -0700
+++ b/src/cpu/ppc/vm/frame_ppc.inline.hpp Thu May 08 23:07:11 2014 -0700
@@ -26,6 +26,8 @@
 #ifndef CPU_PPC_VM_FRAME_PPC_INLINE_HPP
 #define CPU_PPC_VM_FRAME_PPC_INLINE_HPP

+#include "code/codeCache.hpp"
+
 // Inline functions for ppc64 frames:

 // Find codeblob and set deopt_state.
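The cppInterpreter hunks above replace the old methodOopDesc::sz_access_flags() helper with sizeof(AccessFlags) in their TODO asserts: after the PermGen removal, access_flags lives in the Method* metadata and is read with a single 4-byte lwa/lwz, so the assert pins the field width those loads depend on. A minimal stand-alone sketch of that invariant; the AccessFlags definition here is a simplified stand-in, not HotSpot's:

    #include <cstdint>

    // Simplified stand-in for HotSpot's AccessFlags: a single 32-bit flag word.
    struct AccessFlags {
      uint32_t _flags;
    };

    // The PPC interpreter reads method->_access_flags with a 4-byte lwa/lwz,
    // so the field must be exactly 4 bytes wide; this is the check the
    // "TODO: PPC port: assert(4 == sizeof(AccessFlags), ...)" comments intend.
    static_assert(sizeof(AccessFlags) == 4, "unexpected field size");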
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/interp_masm_ppc_64.hpp --- a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp Thu May 08 23:07:11 2014 -0700 @@ -26,7 +26,7 @@ #ifndef CPU_PPC_VM_INTERP_MASM_PPC_64_HPP #define CPU_PPC_VM_INTERP_MASM_PPC_64_HPP -#include "assembler_ppc.inline.hpp" +#include "asm/macroAssembler.hpp" #include "interpreter/invocationCounter.hpp" // This file specializes the assembler with interpreter-specific macros. diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/interpreterRT_ppc.cpp --- a/src/cpu/ppc/vm/interpreterRT_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/interpreterRT_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -24,6 +24,7 @@ */ #include "precompiled.hpp" +#include "asm/assembler.inline.hpp" #include "interpreter/interpreter.hpp" #include "interpreter/interpreterRuntime.hpp" #include "memory/allocation.inline.hpp" diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/interpreter_ppc.cpp --- a/src/cpu/ppc/vm/interpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/interpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -139,32 +139,16 @@ // Signature is in R3_RET. Signature is callee saved. __ mr(signature, R3_RET); - // Reload method, it may have moved. -#ifdef CC_INTERP - __ ld(R19_method, state_(_method)); -#else - __ ld(R19_method, 0, target_sp); - __ ld(R19_method, _ijava_state_neg(method), R19_method); -#endif - // Get the result handler. __ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::get_result_handler), R16_thread, R19_method); - // Reload method, it may have moved. -#ifdef CC_INTERP - __ ld(R19_method, state_(_method)); -#else - __ ld(R19_method, 0, target_sp); - __ ld(R19_method, _ijava_state_neg(method), R19_method); -#endif - { Label L; // test if static // _access_flags._flags must be at offset 0. // TODO PPC port: requires change in shared code. //assert(in_bytes(AccessFlags::flags_offset()) == 0, - // "MethodOopDesc._access_flags == MethodOopDesc._access_flags._flags"); + // "MethodDesc._access_flags == MethodDesc._access_flags._flags"); // _access_flags must be a 32 bit value. assert(sizeof(AccessFlags) == 4, "wrong size"); __ lwa(R11_scratch1/*access_flags*/, method_(access_flags)); diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/jniFastGetField_ppc.cpp --- a/src/cpu/ppc/vm/jniFastGetField_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/jniFastGetField_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -32,7 +32,7 @@ address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) { - // we don't have fast jni accessors. + // We don't have fast jni accessors. return (address) -1; } @@ -57,12 +57,12 @@ } address JNI_FastGetField::generate_fast_get_long_field() { - // we don't have fast jni accessors. + // We don't have fast jni accessors. return (address) -1; } address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) { - // e don't have fast jni accessors. + // We don't have fast jni accessors. return (address) -1; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/ppc.ad --- a/src/cpu/ppc/vm/ppc.ad Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/ppc.ad Thu May 08 23:07:11 2014 -0700 @@ -898,7 +898,7 @@ // To keep related declarations/definitions/uses close together, // we switch between source %{ }% and source_hpp %{ }% freely as needed. - // Returns true if Node n is followed by a MemBar node that + // Returns true if Node n is followed by a MemBar node that // will do an acquire. 
If so, this node must not do the acquire // operation. bool followed_by_acquire(const Node *n); @@ -908,7 +908,7 @@ // Optimize load-acquire. // -// Check if acquire is unnecessary due to following operation that does +// Check if acquire is unnecessary due to following operation that does // acquire anyways. // Walk the pattern: // @@ -919,12 +919,12 @@ // Proj(ctrl) Proj(mem) // | | // MemBarRelease/Volatile -// +// bool followed_by_acquire(const Node *load) { assert(load->is_Load(), "So far implemented only for loads."); // Find MemBarAcquire. - const Node *mba = NULL; + const Node *mba = NULL; for (DUIterator_Fast imax, i = load->fast_outs(imax); i < imax; i++) { const Node *out = load->fast_out(i); if (out->Opcode() == Op_MemBarAcquire) { @@ -937,7 +937,7 @@ // Find following MemBar node. // - // The following node must be reachable by control AND memory + // The following node must be reachable by control AND memory // edge to assure no other operations are in between the two nodes. // // So first get the Proj node, mem_proj, to use it to iterate forward. @@ -1135,6 +1135,7 @@ public: + // Emit call stub, compiled java to interpreter. static void emit_trampoline_stub(MacroAssembler &_masm, int destination_toc_offset, int insts_call_instruction_offset); // Size of call trampoline stub. @@ -2752,7 +2753,7 @@ // inputs for new nodes m1->add_req(NULL, n_toc); m2->add_req(NULL, m1); - + // operands for new nodes m1->_opnds[0] = new (C) iRegPdstOper(); // dst m1->_opnds[1] = op_src; // src @@ -2760,29 +2761,29 @@ m2->_opnds[0] = new (C) iRegPdstOper(); // dst m2->_opnds[1] = op_src; // src m2->_opnds[2] = new (C) iRegLdstOper(); // base - + // Initialize ins_attrib TOC fields. m1->_const_toc_offset = -1; m2->_const_toc_offset_hi_node = m1; - + // Register allocation for new nodes. ra_->set_pair(m1->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); - + nodes->push(m1); nodes->push(m2); assert(m2->bottom_type()->isa_ptr(), "must be ptr"); } else { loadConPNode *m2 = new (C) loadConPNode(); - + // inputs for new nodes m2->add_req(NULL, n_toc); - + // operands for new nodes m2->_opnds[0] = new (C) iRegPdstOper(); // dst m2->_opnds[1] = op_src; // src m2->_opnds[2] = new (C) iRegPdstOper(); // toc - + // Register allocation for new nodes. 
ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); @@ -2974,17 +2975,17 @@ n_sub_base->_opnds[1] = op_crx; n_sub_base->_opnds[2] = op_src; n_sub_base->_bottom_type = _bottom_type; - + n_shift->add_req(n_region, n_sub_base); n_shift->_opnds[0] = op_dst; n_shift->_opnds[1] = op_dst; n_shift->_bottom_type = _bottom_type; - + ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); ra_->set_pair(n_sub_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_move->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); - + nodes->push(n_move); nodes->push(n_compare); nodes->push(n_sub_base); @@ -3061,20 +3062,20 @@ } else { // before Power 7 cond_add_baseNode *n_add_base = new (C) cond_add_baseNode(); - + n_add_base->add_req(n_region, n_compare, n_shift); n_add_base->_opnds[0] = op_dst; n_add_base->_opnds[1] = op_crx; n_add_base->_opnds[2] = op_dst; n_add_base->_bottom_type = _bottom_type; - + assert(ra_->is_oop(this) == true, "A decodeN node must produce an oop!"); ra_->set_oop(n_add_base, true); - + ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); ra_->set_pair(n_add_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); - + nodes->push(n_compare); nodes->push(n_shift); nodes->push(n_add_base); @@ -3631,11 +3632,11 @@ // Req... for (uint i = 0; i < req(); ++i) { // The expanded node does not need toc any more. - // Add the inline cache constant here instead. This expresses the + // Add the inline cache constant here instead. This expresses the // register of the inline cache must be live at the call. // Else we would have to adapt JVMState by -1. if (i == mach_constant_base_node_input()) { - call->add_req(loadConLNodes_IC._last); + call->add_req(loadConLNodes_IC._last); } else { call->add_req(in(i)); } @@ -3663,6 +3664,8 @@ %} // Compound version of call dynamic + // Toc is only passed so that it can be used in ins_encode statement. + // In the code we have to use $constanttablebase. enc_class enc_java_dynamic_call(method meth, iRegLdst toc) %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); MacroAssembler _masm(&cbuf); @@ -3670,14 +3673,17 @@ Register Rtoc = (ra_) ? $constanttablebase : R2_TOC; #if 0 + int vtable_index = this->_vtable_index; if (_vtable_index < 0) { // Must be invalid_vtable_index, not nonvirtual_vtable_index. assert(_vtable_index == Method::invalid_vtable_index, "correct sentinel value"); Register ic_reg = as_Register(Matcher::inline_cache_reg_encode()); - AddressLiteral meta = __ allocate_metadata_address((Metadata *)Universe::non_oop_word()); - + + // Virtual call relocation will point to ic load. address virtual_call_meta_addr = __ pc(); - __ load_const_from_method_toc(ic_reg, meta, Rtoc); + // Load a clear inline cache. + AddressLiteral empty_ic((address) Universe::non_oop_word()); + __ load_const_from_method_toc(ic_reg, empty_ic, Rtoc); // CALL to fixup routine. Fixup routine uses ScopeDesc info // to determine who we intended to call. __ relocate(virtual_call_Relocation::spec(virtual_call_meta_addr)); @@ -3710,7 +3716,6 @@ "Fix constant in ret_addr_offset()"); } #endif - guarantee(0, "Fix handling of toc edge: messes up derived/base pairs."); Unimplemented(); // ret_addr_offset not yet fixed. Depends on compressed oops (load klass!). 
%} @@ -5436,7 +5441,7 @@ ins_pipe(pipe_class_memory); %} -// Match loading integer and casting it to unsigned int in +// Match loading integer and casting it to unsigned int in // long register. // LoadI + ConvI2L + AndL 0xffffffff. instruct loadUI2L(iRegLdst dst, memory mem, immL_32bits mask) %{ @@ -6078,7 +6083,7 @@ ins_pipe(pipe_class_default); %} -// This needs a match rule so that build_oop_map knows this is +// This needs a match rule so that build_oop_map knows this is // not a narrow oop. instruct loadConNKlass_lo(iRegNdst dst, immNKlass_NM src1, iRegNsrc src2) %{ match(Set dst src1); @@ -6702,7 +6707,7 @@ size(4); ins_encode %{ // This is a Power7 instruction for which no machine description exists. - // TODO: PPC port $archOpcode(ppc64Opcode_compound); + // TODO: PPC port $archOpcode(ppc64Opcode_compound); __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register); %} ins_pipe(pipe_class_default); @@ -6847,7 +6852,7 @@ size(4); ins_encode %{ // This is a Power7 instruction for which no machine description exists. - // TODO: PPC port $archOpcode(ppc64Opcode_compound); + // TODO: PPC port $archOpcode(ppc64Opcode_compound); __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register); %} ins_pipe(pipe_class_default); @@ -7064,7 +7069,7 @@ n1->_bottom_type = _bottom_type; decodeNKlass_shiftNode *n2 = new (C) decodeNKlass_shiftNode(); - n2->add_req(n_region, n2); + n2->add_req(n_region, n1); n2->_opnds[0] = op_dst; n2->_opnds[1] = op_dst; n2->_bottom_type = _bottom_type; @@ -7199,7 +7204,7 @@ // inline_unsafe_load_store). // // Add this node again if we found a good solution for inline_unsafe_load_store(). -// Don't forget to look at the implementation of post_store_load_barrier again, +// Don't forget to look at the implementation of post_store_load_barrier again, // we did other fixes in that method. //instruct unnecessary_membar_volatile() %{ // match(MemBarVolatile); @@ -7237,7 +7242,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7283,7 +7288,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7329,7 +7334,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7376,7 +7381,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7522,8 +7527,8 @@ ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'. 
- __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, - MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), + __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, + MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, true); %} ins_pipe(pipe_class_default); @@ -7929,7 +7934,23 @@ // Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for // positive longs and 0xF...F for negative ones. -instruct signmask64I_regI(iRegIdst dst, iRegIsrc src) %{ +instruct signmask64I_regL(iRegIdst dst, iRegLsrc src) %{ + // no match-rule, false predicate + effect(DEF dst, USE src); + predicate(false); + + format %{ "SRADI $dst, $src, #63" %} + size(4); + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_sradi); + __ sradi($dst$$Register, $src$$Register, 0x3f); + %} + ins_pipe(pipe_class_default); +%} + +// Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for +// positive longs and 0xF...F for negative ones. +instruct signmask64L_regL(iRegLdst dst, iRegLsrc src) %{ // no match-rule, false predicate effect(DEF dst, USE src); predicate(false); @@ -8893,7 +8914,7 @@ size(4); ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_rlwinm); - __ rlwinm($dst$$Register, $src1$$Register, 0, + __ rlwinm($dst$$Register, $src1$$Register, 0, (31-log2_long((jlong) $src2$$constant)) & 0x1f, (31-log2_long((jlong) $src2$$constant)) & 0x1f); %} ins_pipe(pipe_class_default); @@ -9619,14 +9640,14 @@ ins_cost(DEFAULT_COST*4); expand %{ - iRegIdst src1s; - iRegIdst src2s; - iRegIdst diff; - sxtI_reg(src1s, src1); // ensure proper sign extention - sxtI_reg(src2s, src2); // ensure proper sign extention - subI_reg_reg(diff, src1s, src2s); + iRegLdst src1s; + iRegLdst src2s; + iRegLdst diff; + convI2L_reg(src1s, src1); // Ensure proper sign extension. + convI2L_reg(src2s, src2); // Ensure proper sign extension. + subL_reg_reg(diff, src1s, src2s); // Need to consider >=33 bit result, therefore we need signmaskL. - signmask64I_regI(dst, diff); + signmask64I_regL(dst, diff); %} %} @@ -10863,7 +10884,7 @@ format %{ "PartialSubtypeCheck $result = ($subklass instanceOf $superklass) tmp: $tmp_klass, $tmp_arrayptr" %} ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); - __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, + __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, $tmp_klass$$Register, NULL, $result$$Register); %} ins_pipe(pipe_class_default); @@ -11178,18 +11199,18 @@ ins_cost(DEFAULT_COST*6); expand %{ - iRegIdst src1s; - iRegIdst src2s; - iRegIdst diff; - iRegIdst sm; - iRegIdst doz; // difference or zero - sxtI_reg(src1s, src1); // Ensure proper sign extention. - sxtI_reg(src2s, src2); // Ensure proper sign extention. - subI_reg_reg(diff, src2s, src1s); + iRegLdst src1s; + iRegLdst src2s; + iRegLdst diff; + iRegLdst sm; + iRegLdst doz; // difference or zero + convI2L_reg(src1s, src1); // Ensure proper sign extension. + convI2L_reg(src2s, src2); // Ensure proper sign extension. + subL_reg_reg(diff, src2s, src1s); // Need to consider >=33 bit result, therefore we need signmaskL. 
- signmask64I_regI(sm, diff); - andI_reg_reg(doz, diff, sm); // <=0 - addI_reg_reg(dst, doz, src1s); + signmask64L_regL(sm, diff); + andL_reg_reg(doz, diff, sm); // <=0 + addI_regL_regL(dst, doz, src1s); %} %} @@ -11198,19 +11219,18 @@ ins_cost(DEFAULT_COST*6); expand %{ - immI_minus1 m1 %{ -1 %} - iRegIdst src1s; - iRegIdst src2s; - iRegIdst diff; - iRegIdst sm; - iRegIdst doz; // difference or zero - sxtI_reg(src1s, src1); // Ensure proper sign extention. - sxtI_reg(src2s, src2); // Ensure proper sign extention. - subI_reg_reg(diff, src2s, src1s); + iRegLdst src1s; + iRegLdst src2s; + iRegLdst diff; + iRegLdst sm; + iRegLdst doz; // difference or zero + convI2L_reg(src1s, src1); // Ensure proper sign extension. + convI2L_reg(src2s, src2); // Ensure proper sign extension. + subL_reg_reg(diff, src2s, src1s); // Need to consider >=33 bit result, therefore we need signmaskL. - signmask64I_regI(sm, diff); - andcI_reg_reg(doz, sm, m1, diff); // >=0 - addI_reg_reg(dst, doz, src1s); + signmask64L_regL(sm, diff); + andcL_reg_reg(doz, diff, sm); // >=0 + addI_regL_regL(dst, doz, src1s); %} %} diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/templateInterpreter_ppc.cpp --- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -81,24 +81,18 @@ #if 0 // Call special ClassCastException constructor taking object to cast // and target class as arguments. -address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler(const char* name) { +address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler() { address entry = __ pc(); - // Target class oop is in register R6_ARG4 by convention! - // Expression stack must be empty before entering the VM if an // exception happened. __ empty_expression_stack(); - // Setup parameters. + // Thread will be loaded to R3_ARG1. - __ load_const_optimized(R4_ARG2, (address) name); - __ mr(R5_ARG3, R17_tos); - // R6_ARG4 contains specified class. - __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose)); -#ifdef ASSERT + // Target class oop is in register R5_ARG3 by convention! + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose, R17_tos, R5_ARG3)); // Above call must not return here since exception pending. - __ should_not_reach_here(); -#endif + DEBUG_ONLY(__ should_not_reach_here();) return entry; } #endif @@ -1535,14 +1529,32 @@ __ stw(R0, in_bytes(JavaThread::popframe_condition_offset()), R16_thread); // Get out of the current method and re-execute the call that called us. - __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ return_pc, R11_scratch1, R12_scratch2); + __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ noreg, R11_scratch1, R12_scratch2); __ restore_interpreter_state(R11_scratch1); __ ld(R12_scratch2, _ijava_state_neg(top_frame_sp), R11_scratch1); __ resize_frame_absolute(R12_scratch2, R11_scratch1, R0); - __ mtlr(return_pc); if (ProfileInterpreter) { __ set_method_data_pointer_for_bcp(); } +#if INCLUDE_JVMTI + Label L_done; + + __ lbz(R11_scratch1, 0, R14_bcp); + __ cmpwi(CCR0, R11_scratch1, Bytecodes::_invokestatic); + __ bne(CCR0, L_done); + + // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call. + // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL. 
+ __ ld(R4_ARG2, 0, R18_locals); + __ call_VM(R11_scratch1, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null), + R4_ARG2, R19_method, R14_bcp); + + __ cmpdi(CCR0, R11_scratch1, 0); + __ beq(CCR0, L_done); + + __ std(R11_scratch1, wordSize, R15_esp); + __ bind(L_done); +#endif // INCLUDE_JVMTI __ dispatch_next(vtos); } // end of JVMTI PopFrame support diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/templateTable_ppc_64.cpp --- a/src/cpu/ppc/vm/templateTable_ppc_64.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/templateTable_ppc_64.cpp Thu May 08 23:07:11 2014 -0700 @@ -64,7 +64,7 @@ assert_different_registers(Rtmp1, Rtmp2, Rtmp3, Rval, Rbase); switch (barrier) { -#ifndef SERIALGC +#if INCLUDE_ALL_GCS case BarrierSet::G1SATBCT: case BarrierSet::G1SATBCTLogging: { @@ -104,7 +104,7 @@ __ bind(Ldone); } break; -#endif // SERIALGC +#endif // INCLUDE_ALL_GCS case BarrierSet::CardTableModRef: case BarrierSet::CardTableExtension: { @@ -259,17 +259,17 @@ switch (value) { default: ShouldNotReachHere(); case 0: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true); __ lfs(F15_ftos, simm16_offset, R11_scratch1); break; } case 1: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true); __ lfs(F15_ftos, simm16_offset, R11_scratch1); break; } case 2: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0, true); __ lfs(F15_ftos, simm16_offset, R11_scratch1); break; } @@ -282,12 +282,12 @@ static double one = 1.0; switch (value) { case 0: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true); __ lfd(F15_ftos, simm16_offset, R11_scratch1); break; } case 1: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true); __ lfd(F15_ftos, simm16_offset, R11_scratch1); break; } @@ -3728,9 +3728,9 @@ transition(atos, atos); Label Ldone, Lis_null, Lquicked, Lresolved; - Register Roffset = R5_ARG3, + Register Roffset = R6_ARG4, RobjKlass = R4_ARG2, - RspecifiedKlass = R6_ARG4, // Generate_ClassCastException_verbose_handler will expect this register. + RspecifiedKlass = R5_ARG3, // Generate_ClassCastException_verbose_handler will read value from this register. Rcpool = R11_scratch1, Rtags = R12_scratch2; diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/sparc/vm/assembler_sparc.hpp --- a/src/cpu/sparc/vm/assembler_sparc.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -123,8 +123,13 @@ fpop2_op3 = 0x35, impdep1_op3 = 0x36, aes3_op3 = 0x36, + alignaddr_op3 = 0x36, + faligndata_op3 = 0x36, flog3_op3 = 0x36, + edge_op3 = 0x36, + fsrc_op3 = 0x36, impdep2_op3 = 0x37, + stpartialf_op3 = 0x37, jmpl_op3 = 0x38, rett_op3 = 0x39, trap_op3 = 0x3a, @@ -175,17 +180,23 @@ enum opfs { // selected opfs + edge8n_opf = 0x01, + fmovs_opf = 0x01, fmovd_opf = 0x02, fnegs_opf = 0x05, fnegd_opf = 0x06, + alignaddr_opf = 0x18, + fadds_opf = 0x41, faddd_opf = 0x42, fsubs_opf = 0x45, fsubd_opf = 0x46, + faligndata_opf = 0x48, + fmuls_opf = 0x49, fmuld_opf = 0x4a, fdivs_opf = 0x4d, @@ -348,6 +359,8 @@ ASI_PRIMARY = 0x80, ASI_PRIMARY_NOFAULT = 0x82, ASI_PRIMARY_LITTLE = 0x88, + // 8x8-bit partial store + ASI_PST8_PRIMARY = 0xC0, // Block initializing store ASI_ST_BLKINIT_PRIMARY = 0xE2, // Most-Recently-Used (MRU) BIS variant @@ -585,6 +598,9 @@ // instruction only in VIS1 static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); } + // instruction only in VIS2 + static void vis2_only() { assert( VM_Version::has_vis2(), "This instruction only works on SPARC with VIS2"); } + // instruction only in VIS3 static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); } @@ -1164,6 +1180,20 @@ inline void wrfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); } + // VIS1 instructions + + void alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); } + + void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); } + + void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); } + + void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); } + + // VIS2 instructions + + void edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); } + // VIS3 instructions void movstosw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -3305,9 +3305,12 @@ } address generate_aescrypt_encryptBlock() { + // required since we read expanded key 'int' array starting first element without alignment considerations + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, + "the following code assumes that first element of an int array is aligned to 8 bytes"); __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); - Label L_doLast128bit, L_storeOutput; + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; address start = __ pc(); Register from = O0; // source byte array Register to = O1; // destination byte array @@ -3317,15 +3320,33 @@ // read expanded key length __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); - // load input into F54-F56; F30-F31 used as temp - __ ldf(FloatRegisterImpl::S, from, 0, F30); - __ ldf(FloatRegisterImpl::S, from, 4, F31); - __ fmov(FloatRegisterImpl::D, F30, F54); - __ ldf(FloatRegisterImpl::S, from, 8, F30); - __ ldf(FloatRegisterImpl::S, from, 12, F31); - __ fmov(FloatRegisterImpl::D, F30, F56); - - // load expanded key + // Method to address arbitrary alignment for load instructions: + // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary + // If zero/aligned then continue with double FP load instructions + // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata + // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address + // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address + // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs + + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); + __ delayed()->alignaddr(from, G0, from); + + // aligned case: load input into F54-F56 + __ ldf(FloatRegisterImpl::D, from, 0, F54); + __ ldf(FloatRegisterImpl::D, from, 8, F56); + __ ba_short(L_load_expanded_key); + + __ BIND(L_load_misaligned_input); + __ ldf(FloatRegisterImpl::D, from, 0, F54); + __ ldf(FloatRegisterImpl::D, from, 8, F56); + __ ldf(FloatRegisterImpl::D, from, 16, F58); + __ faligndata(F54, F56, F54); + __ faligndata(F56, F58, F56); + + __ BIND(L_load_expanded_key); + // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed for ( int i = 0; i <= 38; i += 2 ) { __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); } @@ -3365,8 +3386,7 @@ __ ldf(FloatRegisterImpl::D, key, 232, F50); __ aes_eround01(F52, F54, F56, F58); //round 13 __ aes_eround23(F46, F54, F56, F60); - __ br(Assembler::always, false, Assembler::pt, L_storeOutput); - __ delayed()->nop(); + __ ba_short(L_storeOutput); __ BIND(L_doLast128bit); __ ldf(FloatRegisterImpl::D, key, 160, F48); @@ -3377,23 +3397,62 @@ __ aes_eround01_l(F48, F58, F60, F54); //last round __ aes_eround23_l(F50, F58, F60, F56); - // store output into the destination array, F0-F1 used as temp - __ fmov(FloatRegisterImpl::D, F54, F0); - __ stf(FloatRegisterImpl::S, F0, to, 0); - __ stf(FloatRegisterImpl::S, F1, to, 4); - __ 
fmov(FloatRegisterImpl::D, F56, F0); - __ stf(FloatRegisterImpl::S, F0, to, 8); + // Method to address arbitrary alignment for store instructions: + // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary + // If zero/aligned then continue with double FP store instructions + // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) + // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 + // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case + // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. + // Set GSR.align to (8-n) using alignaddr + // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf + // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address + // Store (partial) the original first (8-n) bytes starting at the original 'dest' address + // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address + // We need to execute this process for both the 8-byte result values + + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, O5); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); + __ delayed()->edge8n(to, G0, O3); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F54, to, 0); __ retl(); - __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); + __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); + + __ BIND(L_store_misaligned_output); + __ add(to, 8, O4); + __ mov(8, O2); + __ sub(O2, O5, O2); + __ alignaddr(O2, G0, O2); + __ faligndata(F54, F54, F54); + __ faligndata(F56, F56, F56); + __ and3(to, -8, to); + __ and3(O4, -8, O4); + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); + __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); + __ add(to, 8, to); + __ add(O4, 8, O4); + __ orn(G0, O3, O3); + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); + __ retl(); + __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); return start; } address generate_aescrypt_decryptBlock() { + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, + "the following code assumes that first element of an int array is aligned to 8 bytes"); + // required since we read original key 'byte' array as well in the decryption stubs + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, + "the following code assumes that first element of a byte array is aligned to 8 bytes"); __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); address start = __ pc(); - Label L_expand192bit, L_expand256bit, L_common_transform; + Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; + Label L_256bit_transform, L_common_transform, L_store_misaligned_output; Register from = O0; // source byte array Register to = O1; // destination byte array Register key = O2; // expanded key array @@ -3403,15 +3462,29 @@ // read expanded key array length __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); - // load input into F52-F54; F30,F31 used as temp - __ 
ldf(FloatRegisterImpl::S, from, 0, F30); - __ ldf(FloatRegisterImpl::S, from, 4, F31); - __ fmov(FloatRegisterImpl::D, F30, F52); - __ ldf(FloatRegisterImpl::S, from, 8, F30); - __ ldf(FloatRegisterImpl::S, from, 12, F31); - __ fmov(FloatRegisterImpl::D, F30, F54); - + // save 'from' since we may need to recheck alignment in case of 256-bit decryption + __ mov(from, G1); + + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); + __ delayed()->alignaddr(from, G0, from); + + // aligned case: load input into F52-F54 + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ba_short(L_load_original_key); + + __ BIND(L_load_misaligned_input); + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ldf(FloatRegisterImpl::D, from, 16, F56); + __ faligndata(F52, F54, F52); + __ faligndata(F54, F56, F54); + + __ BIND(L_load_original_key); // load original key from SunJCE expanded decryption key + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed for ( int i = 0; i <= 3; i++ ) { __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); } @@ -3432,8 +3505,7 @@ // perform 128-bit key specific inverse cipher transformation __ fxor(FloatRegisterImpl::D, F42, F54, F54); __ fxor(FloatRegisterImpl::D, F40, F52, F52); - __ br(Assembler::always, false, Assembler::pt, L_common_transform); - __ delayed()->nop(); + __ ba_short(L_common_transform); __ BIND(L_expand192bit); @@ -3457,8 +3529,7 @@ __ aes_dround01(F44, F52, F54, F56); __ aes_dround23(F42, F56, F58, F54); __ aes_dround01(F40, F56, F58, F52); - __ br(Assembler::always, false, Assembler::pt, L_common_transform); - __ delayed()->nop(); + __ ba_short(L_common_transform); __ BIND(L_expand256bit); @@ -3478,14 +3549,31 @@ __ aes_kexpand2(F50, F56, F58); for ( int i = 0; i <= 6; i += 2 ) { - __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); + __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); } - // load input into F52-F54 + // reload original 'from' address + __ mov(G1, from); + + // re-check 8-byte alignment + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); + __ delayed()->alignaddr(from, G0, from); + + // aligned case: load input into F52-F54 __ ldf(FloatRegisterImpl::D, from, 0, F52); __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ba_short(L_256bit_transform); + + __ BIND(L_reload_misaligned_input); + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ldf(FloatRegisterImpl::D, from, 16, F56); + __ faligndata(F52, F54, F52); + __ faligndata(F54, F56, F54); // perform 256-bit key specific inverse cipher transformation + __ BIND(L_256bit_transform); __ fxor(FloatRegisterImpl::D, F0, F54, F54); __ fxor(FloatRegisterImpl::D, F2, F52, F52); __ aes_dround23(F4, F52, F54, F58); @@ -3515,43 +3603,71 @@ } } - // store output to destination array, F0-F1 used as temp - __ fmov(FloatRegisterImpl::D, F52, F0); - __ stf(FloatRegisterImpl::S, F0, to, 0); - __ stf(FloatRegisterImpl::S, F1, to, 4); - __ fmov(FloatRegisterImpl::D, F54, F0); - __ stf(FloatRegisterImpl::S, F0, to, 8); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, O5); + __ 
br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
+    __ delayed()->edge8n(to, G0, O3);
+
+    // aligned case: store output into the destination array
+    __ stf(FloatRegisterImpl::D, F52, to, 0);
     __ retl();
-    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
+    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
+
+    __ BIND(L_store_misaligned_output);
+    __ add(to, 8, O4);
+    __ mov(8, O2);
+    __ sub(O2, O5, O2);
+    __ alignaddr(O2, G0, O2);
+    __ faligndata(F52, F52, F52);
+    __ faligndata(F54, F54, F54);
+    __ and3(to, -8, to);
+    __ and3(O4, -8, O4);
+    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(O4, 8, O4);
+    __ orn(G0, O3, O3);
+    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
+    __ retl();
+    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);

     return start;
   }

   address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
+           "the following code assumes that first element of an int array is aligned to 8 bytes");
+    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
+           "the following code assumes that first element of a byte array is aligned to 8 bytes");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
-    Label L_cbcenc128, L_cbcenc192, L_cbcenc256;
+    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
+    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
+    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
+    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
     address start = __ pc();
-    Register from = O0; // source byte array
-    Register to = O1; // destination byte array
-    Register key = O2; // expanded key array
-    Register rvec = O3; // init vector
-    const Register len_reg = O4; // cipher length
-    const Register keylen = O5; // reg for storing expanded key array length
-
-    // save cipher len to return in the end
-    __ mov(len_reg, L1);
+    Register from = I0; // source byte array
+    Register to = I1; // destination byte array
+    Register key = I2; // expanded key array
+    Register rvec = I3; // init vector
+    const Register len_reg = I4; // cipher length
+    const Register keylen = I5; // reg for storing expanded key array length
+
+    // save cipher len before save_frame, to return in the end
+    __ mov(O4, L0);
+    __ save_frame(0);

     // read expanded key length
     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);

-    // load init vector
+    // load initial vector, 8-byte alignment is guaranteed
     __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
     __ ldf(FloatRegisterImpl::D, rvec, 8, F62);

+    // load key, 8-byte alignment is guaranteed
     __ ldx(key,0,G1);
-    __ ldx(key,8,G2);
-
-    // start loading expanded key
+    __ ldx(key,8,G5);
+
+    // start loading expanded key, 8-byte alignment is guaranteed
     for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
     }
@@ -3571,15 +3687,35 @@
     }

     // 256-bit original key size
-    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256);
-    __ delayed()->nop();
+    __ ba_short(L_cbcenc256);

     __ align(OptoLoopAlignment);
     __ BIND(L_cbcenc128);
+    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(from, 7, G0);
+    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
+    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
+
+    // aligned case: load input into G3 and G4
     __ ldx(from,0,G3);
     __ ldx(from,8,G4);
+    __ ba_short(L_128bit_transform);
+
+    __ BIND(L_load_misaligned_input_128bit);
+    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
+    __ alignaddr(from, G0, from);
+    __ ldf(FloatRegisterImpl::D, from, 0, F48);
+    __ ldf(FloatRegisterImpl::D, from, 8, F50);
+    __ ldf(FloatRegisterImpl::D, from, 16, F52);
+    __ faligndata(F48, F50, F48);
+    __ faligndata(F50, F52, F50);
+    __ movdtox(F48, G3);
+    __ movdtox(F50, G4);
+    __ mov(L1, from);
+
+    __ BIND(L_128bit_transform);
     __ xor3(G1,G3,G3);
-    __ xor3(G2,G4,G4);
+    __ xor3(G5,G4,G4);
     __ movxtod(G3,F56);
     __ movxtod(G4,F58);
     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
@@ -3598,24 +3734,81 @@
       }
     }

+    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(to, 7, L1);
+    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
+    __ delayed()->edge8n(to, G0, L2);
+
+    // aligned case: store output into the destination array
     __ stf(FloatRegisterImpl::D, F60, to, 0);
     __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ ba_short(L_check_loop_end_128bit);
+
+    __ BIND(L_store_misaligned_output_128bit);
+    __ add(to, 8, L3);
+    __ mov(8, L4);
+    __ sub(L4, L1, L4);
+    __ alignaddr(L4, G0, L4);
+    // save cipher text before circular right shift
+    // as it needs to be stored as iv for next block (see code before next retl)
+    __ movdtox(F60, L6);
+    __ movdtox(F62, L7);
+    __ faligndata(F60, F60, F60);
+    __ faligndata(F62, F62, F62);
+    __ mov(to, L5);
+    __ and3(to, -8, to);
+    __ and3(L3, -8, L3);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(L3, 8, L3);
+    __ orn(G0, L2, L2);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ mov(L5, to);
+    __ movxtod(L6, F60);
+    __ movxtod(L7, F62);
+
+    __ BIND(L_check_loop_end_128bit);
     __ add(from, 16, from);
     __ add(to, 16, to);
     __ subcc(len_reg, 16, len_reg);
     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
     __ delayed()->nop();
+    // re-init initial vector for next block, 8-byte alignment is guaranteed
     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ restore();
     __ retl();
-    __ delayed()->mov(L1, O0);
+    __ delayed()->mov(L0, O0);

     __ align(OptoLoopAlignment);
     __ BIND(L_cbcenc192);
+    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(from, 7, G0);
+    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
+    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
+
+    // aligned case: load input into G3 and G4
     __ ldx(from,0,G3);
     __ ldx(from,8,G4);
+    __ ba_short(L_192bit_transform);
+
+    __ BIND(L_load_misaligned_input_192bit);
+    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
+    __ alignaddr(from, G0, from);
+    __ ldf(FloatRegisterImpl::D, from, 0, F48);
+    __ ldf(FloatRegisterImpl::D, from, 8, F50);
+    __ ldf(FloatRegisterImpl::D, from, 16, F52);
+    __ faligndata(F48, F50, F48);
+    __ faligndata(F50, F52, F50);
+    __ movdtox(F48, G3);
+    __ movdtox(F50, G4);
+    __ mov(L1, from);
+
+    __ BIND(L_192bit_transform);
     __ xor3(G1,G3,G3);
-    __ xor3(G2,G4,G4);
+    __ xor3(G5,G4,G4);
     __ movxtod(G3,F56);
     __ movxtod(G4,F58);
     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
@@ -3634,24 +3827,81 @@
       }
     }

+    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(to, 7, L1);
+    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
+    __ delayed()->edge8n(to, G0, L2);
+
+    // aligned case: store output into the destination array
     __ stf(FloatRegisterImpl::D, F60, to, 0);
     __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ ba_short(L_check_loop_end_192bit);
+
+    __ BIND(L_store_misaligned_output_192bit);
+    __ add(to, 8, L3);
+    __ mov(8, L4);
+    __ sub(L4, L1, L4);
+    __ alignaddr(L4, G0, L4);
+    __ movdtox(F60, L6);
+    __ movdtox(F62, L7);
+    __ faligndata(F60, F60, F60);
+    __ faligndata(F62, F62, F62);
+    __ mov(to, L5);
+    __ and3(to, -8, to);
+    __ and3(L3, -8, L3);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(L3, 8, L3);
+    __ orn(G0, L2, L2);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ mov(L5, to);
+    __ movxtod(L6, F60);
+    __ movxtod(L7, F62);
+
+    __ BIND(L_check_loop_end_192bit);
     __ add(from, 16, from);
     __ subcc(len_reg, 16, len_reg);
     __ add(to, 16, to);
     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
     __ delayed()->nop();
+    // re-init initial vector for next block, 8-byte alignment is guaranteed
     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ restore();
     __ retl();
-    __ delayed()->mov(L1, O0);
+    __ delayed()->mov(L0, O0);

     __ align(OptoLoopAlignment);
     __ BIND(L_cbcenc256);
+    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(from, 7, G0);
+    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
+    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
+
+    // aligned case: load input into G3 and G4
     __ ldx(from,0,G3);
     __ ldx(from,8,G4);
+    __ ba_short(L_256bit_transform);
+
+    __ BIND(L_load_misaligned_input_256bit);
+    // cannot clobber F48, F50 and F52. F56, F58 can be used though
+    __ alignaddr(from, G0, from);
+    __ movdtox(F60, L2); // save F60 before overwriting
+    __ ldf(FloatRegisterImpl::D, from, 0, F56);
+    __ ldf(FloatRegisterImpl::D, from, 8, F58);
+    __ ldf(FloatRegisterImpl::D, from, 16, F60);
+    __ faligndata(F56, F58, F56);
+    __ faligndata(F58, F60, F58);
+    __ movdtox(F56, G3);
+    __ movdtox(F58, G4);
+    __ mov(L1, from);
+    __ movxtod(L2, F60);
+
+    __ BIND(L_256bit_transform);
     __ xor3(G1,G3,G3);
-    __ xor3(G2,G4,G4);
+    __ xor3(G5,G4,G4);
     __ movxtod(G3,F56);
     __ movxtod(G4,F58);
     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
@@ -3670,26 +3920,69 @@
       }
     }

+    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(to, 7, L1);
+    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
+    __ delayed()->edge8n(to, G0, L2);
+
+    // aligned case: store output into the destination array
     __ stf(FloatRegisterImpl::D, F60, to, 0);
     __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ ba_short(L_check_loop_end_256bit);
+
+    __ BIND(L_store_misaligned_output_256bit);
+    __ add(to, 8, L3);
+    __ mov(8, L4);
+    __ sub(L4, L1, L4);
+    __ alignaddr(L4, G0, L4);
+    __ movdtox(F60, L6);
+    __ movdtox(F62, L7);
+    __ faligndata(F60, F60, F60);
+    __ faligndata(F62, F62, F62);
+    __ mov(to, L5);
+    __ and3(to, -8, to);
+    __ and3(L3, -8, L3);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(L3, 8, L3);
+    __ orn(G0, L2, L2);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ mov(L5, to);
+    __ movxtod(L6, F60);
+    __ movxtod(L7, F62);
+
+    __ BIND(L_check_loop_end_256bit);
     __ add(from, 16, from);
     __ subcc(len_reg, 16, len_reg);
     __ add(to, 16, to);
     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
     __ delayed()->nop();
+    // re-init initial vector for next block, 8-byte alignment is guaranteed
     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ restore();
     __ retl();
-    __ delayed()->mov(L1, O0);
+    __ delayed()->mov(L0, O0);

     return start;
   }

   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
+           "the following code assumes that first element of an int array is aligned to 8 bytes");
+    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
+           "the following code assumes that first element of a byte array is aligned to 8 bytes");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
     Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
+    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
+    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
+    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
+    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
+    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
     address start = __ pc();
     Register from = I0; // source byte array
     Register to = I1; // destination byte array
     Register key = I2; // expanded key array
@@
-3704,11 +3997,12 @@ __ save_frame(0); //args are read from I* registers since we save the frame in the beginning // load original key from SunJCE expanded decryption key + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed for ( int i = 0; i <= 3; i++ ) { __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); } - // load initial vector + // load initial vector, 8-byte alignment is guaranteed __ ldx(rvec,0,L0); __ ldx(rvec,8,L1); @@ -3733,11 +4027,10 @@ __ movdtox(F42,L3); __ and3(len_reg, 16, L4); - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); - __ delayed()->nop(); - - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); - __ delayed()->nop(); + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); + __ nop(); + + __ ba_short(L_dec_first_block_start); __ BIND(L_expand192bit); // load rest of the 192-bit key @@ -3758,11 +4051,10 @@ __ movdtox(F50,L3); __ and3(len_reg, 16, L4); - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); - __ delayed()->nop(); - - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); - __ delayed()->nop(); + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); + __ nop(); + + __ ba_short(L_dec_first_block_start); __ BIND(L_expand256bit); // load rest of the 256-bit key @@ -3785,12 +4077,32 @@ __ movdtox(F58,L3); __ and3(len_reg, 16, L4); - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); - __ delayed()->nop(); + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); __ BIND(L_dec_first_block_start); + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into L4 and L5 __ ldx(from,0,L4); __ ldx(from,8,L5); + __ ba_short(L_transform_first_block); + + __ BIND(L_load_misaligned_input_first_block); + __ alignaddr(from, G0, from); + // F58, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F58); + __ ldf(FloatRegisterImpl::D, from, 8, F60); + __ ldf(FloatRegisterImpl::D, from, 16, F62); + __ faligndata(F58, F60, F58); + __ faligndata(F60, F62, F60); + __ movdtox(F58, L4); + __ movdtox(F60, L5); + __ mov(G1, from); + + __ BIND(L_transform_first_block); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -3833,9 +4145,36 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array __ stf(FloatRegisterImpl::D, F60, to, 0); __ stf(FloatRegisterImpl::D, F62, to, 8); - + __ ba_short(L_check_decrypt_end); + + __ BIND(L_store_misaligned_output_first_block); + __ add(to, 8, G3); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F60, F60, F60); + __ faligndata(F62, F62, F62); + __ mov(to, G1); + __ and3(to, -8, to); + __ and3(G3, -8, G3); + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); + __ add(to, 8, to); + __ add(G3, 8, G3); + __ orn(G0, G2, G2); + __ stpartialf(to, G2, F60, 
Assembler::ASI_PST8_PRIMARY); + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_end); __ add(from, 16, from); __ add(to, 16, to); __ subcc(len_reg, 16, len_reg); @@ -3852,17 +4191,44 @@ __ BIND(L_dec_next2_blocks128); __ nop(); - // F40:F42 used for first 16-bytes + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into G4, G5, L4 and L5 __ ldx(from,0,G4); __ ldx(from,8,G5); + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ ba_short(L_transform_next2_blocks128); + + __ BIND(L_load_misaligned_next2_blocks128); + __ alignaddr(from, G0, from); + // F40, F42, F58, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F40); + __ ldf(FloatRegisterImpl::D, from, 8, F42); + __ ldf(FloatRegisterImpl::D, from, 16, F60); + __ ldf(FloatRegisterImpl::D, from, 24, F62); + __ ldf(FloatRegisterImpl::D, from, 32, F58); + __ faligndata(F40, F42, F40); + __ faligndata(F42, F60, F42); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F58, F62); + __ movdtox(F40, G4); + __ movdtox(F42, G5); + __ movdtox(F60, L4); + __ movdtox(F62, L5); + __ mov(G1, from); + + __ BIND(L_transform_next2_blocks128); + // F40:F42 used for first 16-bytes __ xor3(L2,G4,G1); __ movxtod(G1,F40); __ xor3(L3,G5,G1); __ movxtod(G1,F42); // F60:F62 used for next 16-bytes - __ ldx(from,16,L4); - __ ldx(from,24,L5); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -3891,9 +4257,6 @@ __ fxor(FloatRegisterImpl::D, F46, F40, F40); __ fxor(FloatRegisterImpl::D, F44, F42, F42); - __ stf(FloatRegisterImpl::D, F40, to, 0); - __ stf(FloatRegisterImpl::D, F42, to, 8); - __ movxtod(G4,F56); __ movxtod(G5,F58); __ mov(L4,L0); @@ -3901,32 +4264,93 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // For mis-aligned store of 32 bytes of result we can do: + // Circular right-shift all 4 FP registers so that 'head' and 'tail' + // parts that need to be stored starting at mis-aligned address are in a FP reg + // the other 3 FP regs can thus be stored using regular store + // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts + + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F40, to, 0); + __ stf(FloatRegisterImpl::D, F42, to, 8); __ stf(FloatRegisterImpl::D, F60, to, 16); __ stf(FloatRegisterImpl::D, F62, to, 24); - + __ ba_short(L_check_decrypt_loop_end128); + + __ BIND(L_store_misaligned_output_next2_blocks128); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F40, F42, F56); // F56 can be clobbered + __ faligndata(F42, F60, F42); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F40, F40); + __ mov(to, G1); + __ and3(to, -8, to); + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); + __ stf(FloatRegisterImpl::D, F56, to, 8); + __ stf(FloatRegisterImpl::D, F42, to, 16); + __ stf(FloatRegisterImpl::D, F60, to, 24); + __ add(to, 32, to); + __ orn(G0, G2, G2); + __ 
stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_loop_end128); __ add(from, 32, from); __ add(to, 32, to); __ subcc(len_reg, 32, len_reg); __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); __ delayed()->nop(); - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); - __ delayed()->nop(); + __ ba_short(L_cbcdec_end); __ align(OptoLoopAlignment); __ BIND(L_dec_next2_blocks192); __ nop(); - // F48:F50 used for first 16-bytes + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into G4, G5, L4 and L5 __ ldx(from,0,G4); __ ldx(from,8,G5); + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ ba_short(L_transform_next2_blocks192); + + __ BIND(L_load_misaligned_next2_blocks192); + __ alignaddr(from, G0, from); + // F48, F50, F52, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F48); + __ ldf(FloatRegisterImpl::D, from, 8, F50); + __ ldf(FloatRegisterImpl::D, from, 16, F60); + __ ldf(FloatRegisterImpl::D, from, 24, F62); + __ ldf(FloatRegisterImpl::D, from, 32, F52); + __ faligndata(F48, F50, F48); + __ faligndata(F50, F60, F50); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F52, F62); + __ movdtox(F48, G4); + __ movdtox(F50, G5); + __ movdtox(F60, L4); + __ movdtox(F62, L5); + __ mov(G1, from); + + __ BIND(L_transform_next2_blocks192); + // F48:F50 used for first 16-bytes __ xor3(L2,G4,G1); __ movxtod(G1,F48); __ xor3(L3,G5,G1); __ movxtod(G1,F50); // F60:F62 used for next 16-bytes - __ ldx(from,16,L4); - __ ldx(from,24,L5); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -3955,9 +4379,6 @@ __ fxor(FloatRegisterImpl::D, F54, F48, F48); __ fxor(FloatRegisterImpl::D, F52, F50, F50); - __ stf(FloatRegisterImpl::D, F48, to, 0); - __ stf(FloatRegisterImpl::D, F50, to, 8); - __ movxtod(G4,F56); __ movxtod(G5,F58); __ mov(L4,L0); @@ -3965,32 +4386,87 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F48, to, 0); + __ stf(FloatRegisterImpl::D, F50, to, 8); __ stf(FloatRegisterImpl::D, F60, to, 16); __ stf(FloatRegisterImpl::D, F62, to, 24); - + __ ba_short(L_check_decrypt_loop_end192); + + __ BIND(L_store_misaligned_output_next2_blocks192); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F48, F50, F56); // F56 can be clobbered + __ faligndata(F50, F60, F50); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F48, F48); + __ mov(to, G1); + __ and3(to, -8, to); + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); + __ stf(FloatRegisterImpl::D, F56, to, 8); + __ stf(FloatRegisterImpl::D, F50, to, 16); + __ stf(FloatRegisterImpl::D, F60, to, 24); + __ add(to, 32, to); + __ orn(G0, G2, G2); + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_loop_end192); __ add(from, 32, from); __ add(to, 32, to); __ subcc(len_reg, 32, len_reg); 
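All of the misaligned load paths above follow the same VIS idiom: alignaddr rounds 'from' down to an 8-byte boundary and latches the low three address bits in %gsr.align, one extra doubleword is loaded past the end, and faligndata extracts each realigned doubleword from a pair of neighbouring aligned ones. A rough C++ model of that idiom, assuming big-endian byte order as on SPARC (names are illustrative, not HotSpot code):

    #include <cstdint>

    // Model of faligndata: select bytes [align, align+7] of the 16-byte
    // concatenation a:b, where align is what alignaddr latched (addr & 7).
    static inline uint64_t faligndata_model(uint64_t a, uint64_t b, unsigned align) {
      return align == 0 ? a : (a << (8 * align)) | (b >> (8 * (8 - align)));
    }

    // Load 16 misaligned bytes the way the stub does: three aligned loads,
    // then two realignments (assumes uint64_t loads are big-endian).
    static void load16_misaligned(const uint8_t* from, uint64_t out[2]) {
      unsigned align = (uintptr_t)from & 7;
      const uint64_t* p = (const uint64_t*)((uintptr_t)from & ~(uintptr_t)7);
      out[0] = faligndata_model(p[0], p[1], align);
      out[1] = faligndata_model(p[1], p[2], align);
    }

The store side is the mirror image: edge8n computes a byte mask covering the partial leading doubleword, stpartialf stores only the bytes selected by that mask, and orn inverts the mask for the trailing edge.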
__ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); __ delayed()->nop(); - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); - __ delayed()->nop(); + __ ba_short(L_cbcdec_end); __ align(OptoLoopAlignment); __ BIND(L_dec_next2_blocks256); __ nop(); - // F0:F2 used for first 16-bytes + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into G4, G5, L4 and L5 __ ldx(from,0,G4); __ ldx(from,8,G5); + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ ba_short(L_transform_next2_blocks256); + + __ BIND(L_load_misaligned_next2_blocks256); + __ alignaddr(from, G0, from); + // F0, F2, F4, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F0); + __ ldf(FloatRegisterImpl::D, from, 8, F2); + __ ldf(FloatRegisterImpl::D, from, 16, F60); + __ ldf(FloatRegisterImpl::D, from, 24, F62); + __ ldf(FloatRegisterImpl::D, from, 32, F4); + __ faligndata(F0, F2, F0); + __ faligndata(F2, F60, F2); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F4, F62); + __ movdtox(F0, G4); + __ movdtox(F2, G5); + __ movdtox(F60, L4); + __ movdtox(F62, L5); + __ mov(G1, from); + + __ BIND(L_transform_next2_blocks256); + // F0:F2 used for first 16-bytes __ xor3(L2,G4,G1); __ movxtod(G1,F0); __ xor3(L3,G5,G1); __ movxtod(G1,F2); // F60:F62 used for next 16-bytes - __ ldx(from,16,L4); - __ ldx(from,24,L5); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -4043,9 +4519,6 @@ __ fxor(FloatRegisterImpl::D, F6, F0, F0); __ fxor(FloatRegisterImpl::D, F4, F2, F2); - __ stf(FloatRegisterImpl::D, F0, to, 0); - __ stf(FloatRegisterImpl::D, F2, to, 8); - __ movxtod(G4,F56); __ movxtod(G5,F58); __ mov(L4,L0); @@ -4053,9 +4526,38 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F0, to, 0); + __ stf(FloatRegisterImpl::D, F2, to, 8); __ stf(FloatRegisterImpl::D, F60, to, 16); __ stf(FloatRegisterImpl::D, F62, to, 24); - + __ ba_short(L_check_decrypt_loop_end256); + + __ BIND(L_store_misaligned_output_next2_blocks256); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F0, F2, F56); // F56 can be clobbered + __ faligndata(F2, F60, F2); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F0, F0); + __ mov(to, G1); + __ and3(to, -8, to); + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); + __ stf(FloatRegisterImpl::D, F56, to, 8); + __ stf(FloatRegisterImpl::D, F2, to, 16); + __ stf(FloatRegisterImpl::D, F60, to, 24); + __ add(to, 32, to); + __ orn(G0, G2, G2); + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_loop_end256); __ add(from, 32, from); __ add(to, 32, to); __ subcc(len_reg, 32, len_reg); @@ -4063,6 +4565,7 @@ __ delayed()->nop(); __ BIND(L_cbcdec_end); + // re-init initial vector for next block, 8-byte alignment is guaranteed __ stx(L0, rvec, 0); __ stx(L1, rvec, 8); __ restore(); diff -r 7dd67cb4f225 -r 
28bbbecff5f0 src/cpu/sparc/vm/stubRoutines_sparc.hpp --- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp Thu May 08 23:07:11 2014 -0700 @@ -41,7 +41,7 @@ enum /* platform_dependent_constants */ { // %%%%%%%% May be able to shrink this a lot code_size1 = 20000, // simply increase if too small (assembler will crash if too small) - code_size2 = 20000 // simply increase if too small (assembler will crash if too small) + code_size2 = 22000 // simply increase if too small (assembler will crash if too small) }; class Sparc { diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -266,9 +266,9 @@ if (!has_vis1()) // Drop to 0 if no VIS1 support UseVIS = 0; - // T2 and above should have support for AES instructions + // SPARC T4 and above should have support for AES instructions if (has_aes()) { - if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1 + if (UseVIS > 2) { // AES intrinsics use MOVxTOd/MOVdTOx which are VIS3 if (FLAG_IS_DEFAULT(UseAES)) { FLAG_SET_DEFAULT(UseAES, true); } @@ -282,7 +282,7 @@ } } else { if (UseAES || UseAESIntrinsics) { - warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled."); + warning("SPARC AES intrinsics require VIS3 instruction support. Intrinsics will be disabled."); if (UseAES) { FLAG_SET_DEFAULT(UseAES, false); } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Thu May 08 23:07:11 2014 -0700 @@ -1766,7 +1766,7 @@ // Move Unaligned 256bit Vector void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { - assert(UseAVX, ""); + assert(UseAVX > 0, ""); bool vector256 = true; int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256); emit_int8(0x6F); @@ -1774,7 +1774,7 @@ } void Assembler::vmovdqu(XMMRegister dst, Address src) { - assert(UseAVX, ""); + assert(UseAVX > 0, ""); InstructionMark im(this); bool vector256 = true; vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256); @@ -1783,7 +1783,7 @@ } void Assembler::vmovdqu(Address dst, XMMRegister src) { - assert(UseAVX, ""); + assert(UseAVX > 0, ""); InstructionMark im(this); bool vector256 = true; // swap src<->dst for encoding diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/x86/vm/vm_version_x86.cpp --- a/src/cpu/x86/vm/vm_version_x86.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/x86/vm/vm_version_x86.cpp Thu May 08 23:07:11 2014 -0700 @@ -263,6 +263,10 @@ // and check upper YMM bits after it. 
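The vm_version_x86 hunk just below has to emit AVX instructions (the ymm probe) before UseAVX and UseSSE have been ergonomically initialized, so it temporarily forces both flags on for the probe and restores the saved values afterwards; hence the saved_useavx/saved_usesse pair. The same save/restore idea expressed as a scope guard, sketched under the assumption that intx is HotSpot's pointer-sized flag type (hypothetical helper, not part of the patch):

    #include <cstdint>

    using intx = intptr_t;  // assumption: HotSpot's integer flag type

    // Scoped override: force a flag for the duration of a block, restore on exit.
    class ScopedFlagOverride {
      intx& _flag;
      intx  _saved;
     public:
      ScopedFlagOverride(intx& flag, intx value) : _flag(flag), _saved(flag) { _flag = value; }
      ~ScopedFlagOverride() { _flag = _saved; }
    };

    // Illustrative use:
    //   { ScopedFlagOverride avx(UseAVX, 1), sse(UseSSE, 2);
    //     /* generate the ymm save/check probe */ }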
// VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts + intx saved_useavx = UseAVX; + intx saved_usesse = UseSSE; + UseAVX = 1; + UseSSE = 2; // load value into all 32 bytes of ymm7 register __ movl(rcx, VM_Version::ymm_test_value()); @@ -292,6 +296,8 @@ #endif VM_Version::clean_cpuFeatures(); + UseAVX = saved_useavx; + UseSSE = saved_usesse; // // cpuid(0x7) Structured Extended Features diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp --- a/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp Thu May 08 23:07:11 2014 -0700 @@ -53,41 +53,41 @@ inline jlong Atomic::load(volatile jlong* src) { return *src; } -/* - machine barrier instructions: - - - sync two-way memory barrier, aka fence - - lwsync orders Store|Store, - Load|Store, - Load|Load, - but not Store|Load - - eieio orders memory accesses for device memory (only) - - isync invalidates speculatively executed instructions - From the POWER ISA 2.06 documentation: - "[...] an isync instruction prevents the execution of - instructions following the isync until instructions - preceding the isync have completed, [...]" - From IBM's AIX assembler reference: - "The isync [...] instructions causes the processor to - refetch any instructions that might have been fetched - prior to the isync instruction. The instruction isync - causes the processor to wait for all previous instructions - to complete. Then any instructions already fetched are - discarded and instruction processing continues in the - environment established by the previous instructions." - - semantic barrier instructions: - (as defined in orderAccess.hpp) - - - release orders Store|Store, (maps to lwsync) - Load|Store - - acquire orders Load|Store, (maps to lwsync) - Load|Load - - fence orders Store|Store, (maps to sync) - Load|Store, - Load|Load, - Store|Load -*/ +// +// machine barrier instructions: +// +// - sync two-way memory barrier, aka fence +// - lwsync orders Store|Store, +// Load|Store, +// Load|Load, +// but not Store|Load +// - eieio orders memory accesses for device memory (only) +// - isync invalidates speculatively executed instructions +// From the POWER ISA 2.06 documentation: +// "[...] an isync instruction prevents the execution of +// instructions following the isync until instructions +// preceding the isync have completed, [...]" +// From IBM's AIX assembler reference: +// "The isync [...] instructions causes the processor to +// refetch any instructions that might have been fetched +// prior to the isync instruction. The instruction isync +// causes the processor to wait for all previous instructions +// to complete. Then any instructions already fetched are +// discarded and instruction processing continues in the +// environment established by the previous instructions." 
+// +// semantic barrier instructions: +// (as defined in orderAccess.hpp) +// +// - release orders Store|Store, (maps to lwsync) +// Load|Store +// - acquire orders Load|Store, (maps to lwsync) +// Load|Load +// - fence orders Store|Store, (maps to sync) +// Load|Store, +// Load|Load, +// Store|Load +// #define strasm_sync "\n sync \n" #define strasm_lwsync "\n lwsync \n" diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/ci/ciReplay.cpp --- a/src/share/vm/ci/ciReplay.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/ci/ciReplay.cpp Thu May 08 23:07:11 2014 -0700 @@ -376,11 +376,15 @@ int c = getc(_stream); while(c != EOF) { c = get_line(c); - process_command(CHECK); + process_command(THREAD); if (had_error()) { tty->print_cr("Error while parsing line %d: %s\n", line_no, _error_message); - tty->print_cr("%s", _buffer); - return; + if (ReplayIgnoreInitErrors) { + CLEAR_PENDING_EXCEPTION; + _error_message = NULL; + } else { + return; + } } line_no++; } @@ -565,10 +569,14 @@ void process_ciMethodData(TRAPS) { Method* method = parse_method(CHECK); if (had_error()) return; - /* jsut copied from Method, to build interpret data*/ + /* just copied from Method, to build interpret data*/ if (InstanceRefKlass::owns_pending_list_lock((JavaThread*)THREAD)) { return; } + // To be properly initialized, some profiling in the MDO needs the + // method to be rewritten (number of arguments at a call for + // instance) + method->method_holder()->link_class(CHECK); // methodOopDesc::build_interpreter_method_data(method, CHECK); { // Grab a lock here to prevent multiple diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/classfile/vmSymbols.hpp --- a/src/share/vm/classfile/vmSymbols.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/classfile/vmSymbols.hpp Thu May 08 23:07:11 2014 -0700 @@ -774,7 +774,7 @@ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ - /* support for com.sum.crypto.provider.AESCrypt and some of its callers */ \ + /* support for com.sun.crypto.provider.AESCrypt and some of its callers */ \ do_class(com_sun_crypto_provider_aescrypt, "com/sun/crypto/provider/AESCrypt") \ do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/code/nmethod.cpp --- a/src/share/vm/code/nmethod.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/code/nmethod.cpp Thu May 08 23:07:11 2014 -0700 @@ -771,7 +771,11 @@ _hotness_counter = NMethodSweeper::hotness_counter_reset_val(); code_buffer->copy_values_to(this); - debug_only(verify_scavenge_root_oops()); + if (ScavengeRootsInCode && detect_scavenge_root_oops()) { + CodeCache::add_scavenge_root_nmethod(this); + Universe::heap()->register_nmethod(this); + } + DEBUG_ONLY(verify_scavenge_root_oops();) CodeCache::commit(this); } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/oops/klass.cpp --- a/src/share/vm/oops/klass.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/oops/klass.cpp Thu May 08 23:07:11 2014 -0700 @@ -496,6 +496,7 @@ } void Klass::restore_unshareable_info(TRAPS) { + TRACE_INIT_ID(this); // If an exception happened during CDS restore, some of these fields may already be // set. We leave the class on the CLD list, even if incomplete so that we don't // modify the CLD list outside a safepoint. 
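In the ciReplay hunk above, changing process_command(CHECK) to process_command(THREAD) changes who deals with a pending exception: CHECK makes the caller return immediately, while passing THREAD leaves the exception pending so the parse loop can consult ReplayIgnoreInitErrors and CLEAR_PENDING_EXCEPTION itself. The approximate shape of the macros, simplified for orientation (the real definitions live in utilities/exceptions.hpp):

    // #define TRAPS  Thread* THREAD
    // #define CHECK  THREAD); if (HAS_PENDING_EXCEPTION) return; (void)(0
    //
    // so f(CHECK) expands to roughly:
    //   f(THREAD); if (HAS_PENDING_EXCEPTION) return;
    // while f(THREAD) defers the pending-exception check to the calling code.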
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/compile.cpp --- a/src/share/vm/opto/compile.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/compile.cpp Thu May 08 23:07:11 2014 -0700 @@ -693,6 +693,7 @@ #endif set_print_inlining(PrintInlining || method()->has_option("PrintInlining") NOT_PRODUCT( || PrintOptoInlining)); set_print_intrinsics(PrintIntrinsics || method()->has_option("PrintIntrinsics")); + set_has_irreducible_loop(true); // conservative until build_loop_tree() reset it if (ProfileTraps RTM_OPT_ONLY( || UseRTMLocking )) { // Make sure the method being compiled gets its own MDO, @@ -977,6 +978,8 @@ set_print_assembly(PrintFrameConverterAssembly); set_parsed_irreducible_loop(false); #endif + set_has_irreducible_loop(false); // no loops + CompileWrapper cw(this); Init(/*AliasLevel=*/ 0); init_tf((*generator)()); @@ -1147,7 +1150,7 @@ if( start->is_Start() ) return start->as_Start(); } - ShouldNotReachHere(); + fatal("Did not find Start node!"); return NULL; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/compile.hpp --- a/src/share/vm/opto/compile.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/compile.hpp Thu May 08 23:07:11 2014 -0700 @@ -319,6 +319,7 @@ bool _trace_opto_output; bool _parsed_irreducible_loop; // True if ciTypeFlow detected irreducible loops during parsing #endif + bool _has_irreducible_loop; // Found irreducible loops // JSR 292 bool _has_method_handle_invokes; // True if this method has MethodHandle invokes. RTMState _rtm_state; // State of Restricted Transactional Memory usage @@ -605,6 +606,8 @@ void set_parsed_irreducible_loop(bool z) { _parsed_irreducible_loop = z; } int _in_dump_cnt; // Required for dumping ir nodes. #endif + bool has_irreducible_loop() const { return _has_irreducible_loop; } + void set_has_irreducible_loop(bool z) { _has_irreducible_loop = z; } // JSR 292 bool has_method_handle_invokes() const { return _has_method_handle_invokes; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/loopnode.cpp --- a/src/share/vm/opto/loopnode.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/loopnode.cpp Thu May 08 23:07:11 2014 -0700 @@ -266,9 +266,9 @@ // Counted loop head must be a good RegionNode with only 3 not NULL // control input edges: Self, Entry, LoopBack. - if (x->in(LoopNode::Self) == NULL || x->req() != 3) + if (x->in(LoopNode::Self) == NULL || x->req() != 3 || loop->_irreducible) { return false; - + } Node *init_control = x->in(LoopNode::EntryControl); Node *back_control = x->in(LoopNode::LoopBackControl); if (init_control == NULL || back_control == NULL) // Partially dead @@ -1522,11 +1522,11 @@ // If I have one hot backedge, peel off myself loop. // I better be the outermost loop. - if( _head->req() > 3 ) { + if (_head->req() > 3 && !_irreducible) { split_outer_loop( phase ); result = true; - } else if( !_head->is_Loop() && !_irreducible ) { + } else if (!_head->is_Loop() && !_irreducible) { // Make a new LoopNode to replace the old loop head Node *l = new (phase->C) LoopNode( _head->in(1), _head->in(2) ); l = igvn.register_new_node_with_optimizer(l, _head); @@ -2938,6 +2938,7 @@ return pre_order; } } + C->set_has_irreducible_loop(_has_irreducible_loops); } // This Node might be a decision point for loops. 
It is only if diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/memnode.cpp --- a/src/share/vm/opto/memnode.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/memnode.cpp Thu May 08 23:07:11 2014 -0700 @@ -306,33 +306,16 @@ int alias_idx = phase->C->get_alias_index(t_adr->is_ptr()); } -#ifdef ASSERT Node* base = NULL; - if (address->is_AddP()) + if (address->is_AddP()) { base = address->in(AddPNode::Base); + } if (base != NULL && phase->type(base)->higher_equal(TypePtr::NULL_PTR) && !t_adr->isa_rawptr()) { // Note: raw address has TOP base and top->higher_equal(TypePtr::NULL_PTR) is true. - Compile* C = phase->C; - tty->cr(); - tty->print_cr("===== NULL+offs not RAW address ====="); - if (C->is_dead_node(this->_idx)) tty->print_cr("'this' is dead"); - if ((ctl != NULL) && C->is_dead_node(ctl->_idx)) tty->print_cr("'ctl' is dead"); - if (C->is_dead_node(mem->_idx)) tty->print_cr("'mem' is dead"); - if (C->is_dead_node(address->_idx)) tty->print_cr("'address' is dead"); - if (C->is_dead_node(base->_idx)) tty->print_cr("'base' is dead"); - tty->cr(); - base->dump(1); - tty->cr(); - this->dump(2); - tty->print("this->adr_type(): "); adr_type()->dump(); tty->cr(); - tty->print("phase->type(address): "); t_adr->dump(); tty->cr(); - tty->print("phase->type(base): "); phase->type(address)->dump(); tty->cr(); - tty->cr(); + // Skip this node optimization if its address has TOP base. + return NodeSentinel; // caller will return NULL } - assert(base == NULL || t_adr->isa_rawptr() || - !phase->type(base)->higher_equal(TypePtr::NULL_PTR), "NULL+offs not RAW address?"); -#endif // Avoid independent memory operations Node* old_mem = mem; diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/node.cpp --- a/src/share/vm/opto/node.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/node.cpp Thu May 08 23:07:11 2014 -0700 @@ -27,6 +27,7 @@ #include "memory/allocation.inline.hpp" #include "opto/cfgnode.hpp" #include "opto/connode.hpp" +#include "opto/loopnode.hpp" #include "opto/machnode.hpp" #include "opto/matcher.hpp" #include "opto/node.hpp" @@ -1255,6 +1256,7 @@ Node *top = igvn->C->top(); nstack.push(dead); + bool has_irreducible_loop = igvn->C->has_irreducible_loop(); while (nstack.size() > 0) { dead = nstack.pop(); @@ -1269,13 +1271,31 @@ assert (!use->is_Con(), "Control for Con node should be Root node."); use->set_req(0, top); // Cut dead edge to prevent processing nstack.push(use); // the dead node again. + } else if (!has_irreducible_loop && // Backedge could be alive in irreducible loop + use->is_Loop() && !use->is_Root() && // Don't kill Root (RootNode extends LoopNode) + use->in(LoopNode::EntryControl) == dead) { // Dead loop if its entry is dead + use->set_req(LoopNode::EntryControl, top); // Cut dead edge to prevent processing + use->set_req(0, top); // Cut self edge + nstack.push(use); } else { // Else found a not-dead user + // Dead if all inputs are top or null + bool dead_use = !use->is_Root(); // Keep empty graph alive for (uint j = 1; j < use->req(); j++) { - if (use->in(j) == dead) { // Turn all dead inputs into TOP + Node* in = use->in(j); + if (in == dead) { // Turn all dead inputs into TOP use->set_req(j, top); + } else if (in != NULL && !in->is_top()) { + dead_use = false; } } - igvn->_worklist.push(use); + if (dead_use) { + if (use->is_Region()) { + use->set_req(0, top); // Cut self edge + } + nstack.push(use); + } else { + igvn->_worklist.push(use); + } } // Refresh the iterator, since any number of kills might have happened. 
k = dead->last_outs(kmin); diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/runtime.cpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -870,7 +870,7 @@ return TypeFunc::make(domain, range); } -// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { // create input type (domain) int num_args = 5; diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/advancedThresholdPolicy.cpp --- a/src/share/vm/runtime/advancedThresholdPolicy.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/advancedThresholdPolicy.cpp Thu May 08 23:07:11 2014 -0700 @@ -53,7 +53,8 @@ } set_c1_count(MAX2(count / 3, 1)); - set_c2_count(MAX2(count - count / 3, 1)); + set_c2_count(MAX2(count - c1_count(), 1)); + FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count()); // Some inlining tuning #ifdef X86 diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/arguments.cpp --- a/src/share/vm/runtime/arguments.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/arguments.cpp Thu May 08 23:07:11 2014 -0700 @@ -2383,6 +2383,10 @@ status &= verify_interval(NmethodSweepFraction, 1, ReservedCodeCacheSize/K, "NmethodSweepFraction"); status &= verify_interval(NmethodSweepActivity, 0, 2000, "NmethodSweepActivity"); + if (!FLAG_IS_DEFAULT(CICompilerCount) && !FLAG_IS_DEFAULT(CICompilerCountPerCPU) && CICompilerCountPerCPU) { + warning("The VM option CICompilerCountPerCPU overrides CICompilerCount."); + } + return status; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/compilationPolicy.cpp --- a/src/share/vm/runtime/compilationPolicy.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/compilationPolicy.cpp Thu May 08 23:07:11 2014 -0700 @@ -182,6 +182,7 @@ // max(log2(8)-1,1) = 2 compiler threads on an 8-way machine. // May help big-app startup time. _compiler_count = MAX2(log2_intptr(os::active_processor_count())-1,1); + FLAG_SET_ERGO(intx, CICompilerCount, _compiler_count); } else { _compiler_count = CICompilerCount; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/sharedRuntime.cpp --- a/src/share/vm/runtime/sharedRuntime.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/sharedRuntime.cpp Thu May 08 23:07:11 2014 -0700 @@ -2690,19 +2690,20 @@ JRT_END #ifdef HAVE_DTRACE_H -// Create a dtrace nmethod for this method. The wrapper converts the -// java compiled calling convention to the native convention, makes a dummy call -// (actually nops for the size of the call instruction, which become a trap if -// probe is enabled). The returns to the caller. Since this all looks like a -// leaf no thread transition is needed. - +/** + * Create a dtrace nmethod for this method. The wrapper converts the + * Java-compiled calling convention to the native convention, makes a dummy call + * (actually nops for the size of the call instruction, which become a trap if + * probe is enabled), and finally returns to the caller. 
Since this all looks like a + * leaf, no thread transition is needed. + */ nmethod *AdapterHandlerLibrary::create_dtrace_nmethod(methodHandle method) { ResourceMark rm; nmethod* nm = NULL; if (PrintCompilation) { ttyLocker ttyl; - tty->print("--- n%s "); + tty->print("--- n "); method->print_short_name(tty); if (method->is_static()) { tty->print(" (static)"); diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/simpleThresholdPolicy.cpp --- a/src/share/vm/runtime/simpleThresholdPolicy.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/simpleThresholdPolicy.cpp Thu May 08 23:07:11 2014 -0700 @@ -142,7 +142,8 @@ count = MAX2(log2_intptr(os::active_processor_count()), 1) * 3 / 2; } set_c1_count(MAX2(count / 3, 1)); - set_c2_count(MAX2(count - count / 3, 1)); + set_c2_count(MAX2(count - c1_count(), 1)); + FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count()); } void SimpleThresholdPolicy::set_carry_if_necessary(InvocationCounter *counter) { @@ -191,6 +192,10 @@ thread->is_interp_only_mode()) { return NULL; } + if (CompileTheWorld || ReplayCompiles) { + // Don't trigger other compiles in testing mode + return NULL; + } nmethod *osr_nm = NULL; handle_counter_overflow(method()); diff -r 7dd67cb4f225 -r 28bbbecff5f0 test/compiler/7184394/TestAESBase.java --- a/test/compiler/7184394/TestAESBase.java Wed May 07 10:58:47 2014 -0700 +++ b/test/compiler/7184394/TestAESBase.java Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -40,9 +40,20 @@ int msgSize = Integer.getInteger("msgSize", 646); boolean checkOutput = Boolean.getBoolean("checkOutput"); boolean noReinit = Boolean.getBoolean("noReinit"); + boolean testingMisalignment; + private static final int ALIGN = 8; + int encInputOffset = Integer.getInteger("encInputOffset", 0) % ALIGN; + int encOutputOffset = Integer.getInteger("encOutputOffset", 0) % ALIGN; + int decOutputOffset = Integer.getInteger("decOutputOffset", 0) % ALIGN; + int lastChunkSize = Integer.getInteger("lastChunkSize", 32); int keySize = Integer.getInteger("keySize", 128); + int inputLength; + int encodeLength; + int decodeLength; + int decodeMsgSize; String algorithm = System.getProperty("algorithm", "AES"); String mode = System.getProperty("mode", "CBC"); + String paddingStr = System.getProperty("paddingStr", "PKCS5Padding"); byte[] input; byte[] encode; byte[] expectedEncode; @@ -51,7 +62,6 @@ Random random = new Random(0); Cipher cipher; Cipher dCipher; - String paddingStr = "PKCS5Padding"; AlgorithmParameters algParams; SecretKey key; @@ -67,7 +77,10 @@ public void prepare() { try { - System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput); + System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", paddingStr=" + paddingStr + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput + ", encInputOffset=" + encInputOffset + ", encOutputOffset=" + encOutputOffset + ", decOutputOffset=" + decOutputOffset + ", lastChunkSize=" +lastChunkSize ); + + if (encInputOffset % ALIGN != 0 || encOutputOffset % ALIGN != 0 || decOutputOffset % ALIGN !=0 ) + testingMisalignment = true; int keyLenBytes = 
(keySize == 0 ? 16 : keySize/8); byte keyBytes[] = new byte[keyLenBytes]; @@ -81,10 +94,6 @@ System.out.println("Algorithm: " + key.getAlgorithm() + "(" + key.getEncoded().length * 8 + "bit)"); } - input = new byte[msgSize]; - for (int i=0; i 0 ? Integer.valueOf(args[0]) : 1000000); + int warmupIters = (args.length > 1 ? Integer.valueOf(args[1]) : 20000); System.out.println(iters + " iterations"); TestAESEncode etest = new TestAESEncode(); etest.prepare(); - // warm-up for 20K iterations + // warm-up System.out.println("Starting encryption warm-up"); - for (int i=0; i<20000; i++) { + for (int i=0; i