# HG changeset patch
# User kvn
# Date 1348523166 25200
# Node ID c92f43386117fafbfd76b8100a563cfb5b197540
# Parent  04ed664b7e30a8cc0d007acda3a0856653effc1e# Parent  f7c1f489db55c60491ec010f76d5b3d49776c4d6
Merge

diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/assembler_sparc.cpp
--- a/src/cpu/sparc/vm/assembler_sparc.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -725,24 +725,6 @@
 }
 
 
-// Convert to C varargs format
-void MacroAssembler::set_varargs( Argument inArg, Register d ) {
-  // spill register-resident args to their memory slots
-  // (SPARC calling convention requires callers to have already preallocated these)
-  // Note that the inArg might in fact be an outgoing argument,
-  // if a leaf routine or stub does some tricky argument shuffling.
-  // This routine must work even though one of the saved arguments
-  // is in the d register (e.g., set_varargs(Argument(0, false), O0)).
-  for (Argument savePtr = inArg;
-       savePtr.is_register();
-       savePtr = savePtr.successor()) {
-    st_ptr(savePtr.as_register(), savePtr.address_in_frame());
-  }
-  // return the address of the first memory slot
-  Address a = inArg.address_in_frame();
-  add(a.base(), a.disp(), d);
-}
-
 // Conditional breakpoint (for assertion checks in assembly code)
 void MacroAssembler::breakpoint_trap(Condition c, CC cc) {
   trap(c, cc, G0, ST_RESERVED_FOR_USER_0);
@@ -2943,6 +2925,20 @@
   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
          "caller must use same register for non-constant itable index as for method");
 
+  Label L_no_such_interface_restore;
+  bool did_save = false;
+  if (scan_temp == noreg || sethi_temp == noreg) {
+    Register recv_2 = recv_klass->is_global() ? recv_klass : L0;
+    Register intf_2 = intf_klass->is_global() ? intf_klass : L1;
+    assert(method_result->is_global(), "must be able to return value");
+    scan_temp  = L2;
+    sethi_temp = L3;
+    save_frame_and_mov(0, recv_klass, recv_2, intf_klass, intf_2);
+    recv_klass = recv_2;
+    intf_klass = intf_2;
+    did_save = true;
+  }
+
   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
   int scan_step   = itableOffsetEntry::size() * wordSize;
@@ -2981,7 +2977,7 @@
   //     result = (klass + scan->offset() + itable_index);
   //   }
   // }
-  Label search, found_method;
+  Label L_search, L_found_method;
 
   for (int peel = 1; peel >= 0; peel--) {
     // %%%% Could load both offset and interface in one ldx, if they were
@@ -2991,23 +2987,23 @@
     // Check that this entry is non-null.  A null entry means that
     // the receiver class doesn't implement the interface, and wasn't the
     // same as when the caller was compiled.
-    bpr(Assembler::rc_z, false, Assembler::pn, method_result, L_no_such_interface);
+    bpr(Assembler::rc_z, false, Assembler::pn, method_result, did_save ? L_no_such_interface_restore : L_no_such_interface);
     delayed()->cmp(method_result, intf_klass);
 
     if (peel) {
-      brx(Assembler::equal,    false, Assembler::pt, found_method);
+      brx(Assembler::equal,    false, Assembler::pt, L_found_method);
     } else {
-      brx(Assembler::notEqual, false, Assembler::pn, search);
+      brx(Assembler::notEqual, false, Assembler::pn, L_search);
       // (invert the test to fall through to found_method...)
     }
     delayed()->add(scan_temp, scan_step, scan_temp);
 
     if (!peel)  break;
 
-    bind(search);
+    bind(L_search);
   }
 
-  bind(found_method);
+  bind(L_found_method);
 
   // Got a hit.
   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
@@ -3015,6 +3011,18 @@
   ito_offset -= scan_step;
   lduw(scan_temp, ito_offset, scan_temp);
   ld_ptr(recv_klass, scan_temp, method_result);
+
+  if (did_save) {
+    Label L_done;
+    ba(L_done);
+    delayed()->restore();
+
+    bind(L_no_such_interface_restore);
+    ba(L_no_such_interface);
+    delayed()->restore();
+
+    bind(L_done);
+  }
 }
 
 
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/assembler_sparc.hpp
--- a/src/cpu/sparc/vm/assembler_sparc.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -2428,9 +2428,6 @@
   static void test();
 #endif
 
-  // convert an incoming arglist to varargs format; put the pointer in d
-  void set_varargs( Argument a, Register d );
-
   int total_frame_size_in_bytes(int extraWords);
 
   // used when extraWords known statically
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/assembler_sparc.inline.hpp
--- a/src/cpu/sparc/vm/assembler_sparc.inline.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/assembler_sparc.inline.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -347,7 +347,11 @@
 inline void Assembler::swap(    Register s1, Register s2, Register d) { v9_dep();  emit_long( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | rs2(s2) ); }
 inline void Assembler::swap(    Register s1, int simm13a, Register d) { v9_dep();  emit_data( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
 
-inline void Assembler::swap(    Address& a, Register d, int offset ) { relocate(a.rspec(offset)); swap(  a.base(), a.disp() + offset, d ); }
+inline void Assembler::swap(    Address& a, Register d, int offset ) {
+  relocate(a.rspec(offset));
+  if (a.has_index()) { assert(offset == 0, ""); swap( a.base(), a.index(), d         ); }
+  else               {                          swap( a.base(), a.disp() + offset, d ); }
+}
 
 
 // Use the right loads/stores for the platform
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
--- a/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -1315,7 +1315,13 @@
 
 Address LIR_Assembler::as_Address(LIR_Address* addr) {
   Register reg = addr->base()->as_register();
-  return Address(reg, addr->disp());
+  LIR_Opr index = addr->index();
+  if (index->is_illegal()) {
+    return Address(reg, addr->disp());
+  } else {
+    assert (addr->disp() == 0, "unsupported address mode");
+    return Address(reg, index->as_pointer_register());
+  }
 }
 
 
@@ -3438,7 +3444,28 @@
   }
 }
 
-
-
+void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) {
+  LIR_Address* addr = src->as_address_ptr();
+
+  assert(data == dest, "swap uses only 2 operands");
+  assert (code == lir_xchg, "no xadd on sparc");
+
+  if (data->type() == T_INT) {
+    __ swap(as_Address(addr), data->as_register());
+  } else if (data->is_oop()) {
+    Register obj = data->as_register();
+    Register narrow = tmp->as_register();
+#ifdef _LP64
+    assert(UseCompressedOops, "swap is 32bit only");
+    __ encode_heap_oop(obj, narrow);
+    __ swap(as_Address(addr), narrow);
+    __ decode_heap_oop(narrow, obj);
+#else
+    __ swap(as_Address(addr), obj);
+#endif
+  } else {
+    ShouldNotReachHere();
+  }
+}
 
 #undef __
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
--- a/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -1204,3 +1204,58 @@
     __ load(addr, dst);
   }
 }
+
+void LIRGenerator::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
+  BasicType type = x->basic_type();
+  LIRItem src(x->object(), this);
+  LIRItem off(x->offset(), this);
+  LIRItem value(x->value(), this);
+
+  src.load_item();
+  value.load_item();
+  off.load_nonconstant();
+
+  LIR_Opr dst = rlock_result(x, type);
+  LIR_Opr data = value.result();
+  bool is_obj = (type == T_ARRAY || type == T_OBJECT);
+  LIR_Opr offset = off.result();
+
+  if (data != dst) {
+    __ move(data, dst);
+    data = dst;
+  }
+
+  assert (!x->is_add() && (type == T_INT || (is_obj LP64_ONLY(&& UseCompressedOops))), "unexpected type");
+  LIR_Address* addr;
+  if (offset->is_constant()) {
+
+#ifdef _LP64
+    jlong l = offset->as_jlong();
+    assert((jlong)((jint)l) == l, "offset too large for constant");
+    jint c = (jint)l;
+#else
+    jint c = offset->as_jint();
+#endif
+    addr = new LIR_Address(src.result(), c, type);
+  } else {
+    addr = new LIR_Address(src.result(), offset, type);
+  }
+
+  LIR_Opr tmp = LIR_OprFact::illegalOpr;
+  LIR_Opr ptr = LIR_OprFact::illegalOpr;
+
+  if (is_obj) {
+    // Do the pre-write barrier, if any.
+    // barriers on sparc don't work with a base + index address
+    tmp = FrameMap::G3_opr;
+    ptr = new_pointer_register();
+    __ add(src.result(), off.result(), ptr);
+    pre_barrier(ptr, LIR_OprFact::illegalOpr /* pre_val */,
+                true /* do_load */, false /* patch */, NULL);
+  }
+  __ xchg(LIR_OprFact::address(addr), data, dst, tmp);
+  if (is_obj) {
+    // Seems to be a precise address
+    post_barrier(ptr, data);
+  }
+}
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/methodHandles_sparc.cpp
--- a/src/cpu/sparc/vm/methodHandles_sparc.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/methodHandles_sparc.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -121,6 +121,7 @@
 void MethodHandles::jump_from_method_handle(MacroAssembler* _masm, Register method, Register target, Register temp,
                                             bool for_compiler_entry) {
   assert(method == G5_method, "interpreter calling convention");
+  assert_different_registers(method, target, temp);
 
   if (!for_compiler_entry && JvmtiExport::can_post_interpreter_events()) {
     Label run_compiled_code;
@@ -153,19 +154,19 @@
   BLOCK_COMMENT("jump_to_lambda_form {");
   // This is the initial entry point of a lazy method handle.
   // After type checking, it picks up the invoker from the LambdaForm.
-  assert_different_registers(recv, method_temp, temp2, temp3);
+  assert_different_registers(recv, method_temp, temp2);  // temp3 is only passed on
   assert(method_temp == G5_method, "required register for loading method");
 
   //NOT_PRODUCT({ FlagSetting fs(TraceMethodHandles, true); trace_method_handle(_masm, "LZMH"); });
 
   // Load the invoker, as MH -> MH.form -> LF.vmentry
   __ verify_oop(recv);
-  __ load_heap_oop(Address(recv,        NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())),       method_temp);
+  __ load_heap_oop(Address(recv,        NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())),   method_temp);
   __ verify_oop(method_temp);
-  __ load_heap_oop(Address(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())), method_temp);
+  __ load_heap_oop(Address(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())),  method_temp);
   __ verify_oop(method_temp);
   // the following assumes that a Method* is normally compressed in the vmtarget field:
-  __ ld_ptr(Address(method_temp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())),     method_temp);
+  __ ld_ptr(       Address(method_temp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())), method_temp);
 
   if (VerifyMethodHandles && !for_compiler_entry) {
     // make sure recv is already on stack
@@ -303,25 +304,25 @@
                                                     Register member_reg,
                                                     bool for_compiler_entry) {
   assert(is_signature_polymorphic(iid), "expected invoke iid");
-  // temps used in this code are not used in *either* compiled or interpreted calling sequences
   Register temp1 = (for_compiler_entry ? G1_scratch : O1);
-  Register temp2 = (for_compiler_entry ? G4_scratch : O4);
-  Register temp3 = G3_scratch;
-  Register temp4 = (for_compiler_entry ? noreg      : O2);
+  Register temp2 = (for_compiler_entry ? G3_scratch : O2);
+  Register temp3 = (for_compiler_entry ? G4_scratch : O3);
+  Register temp4 = (for_compiler_entry ? noreg      : O4);
   if (for_compiler_entry) {
     assert(receiver_reg == (iid == vmIntrinsics::_linkToStatic ? noreg : O0), "only valid assignment");
-    assert_different_registers(temp1,      O0, O1, O2, O3, O4, O5);
-    assert_different_registers(temp2,      O0, O1, O2, O3, O4, O5);
-    assert_different_registers(temp3,      O0, O1, O2, O3, O4, O5);
-    assert_different_registers(temp4,      O0, O1, O2, O3, O4, O5);
+    assert_different_registers(temp1, O0, O1, O2, O3, O4, O5);
+    assert_different_registers(temp2, O0, O1, O2, O3, O4, O5);
+    assert_different_registers(temp3, O0, O1, O2, O3, O4, O5);
+    assert_different_registers(temp4, O0, O1, O2, O3, O4, O5);
+  } else {
+    assert_different_registers(temp1, temp2, temp3, temp4, O5_savedSP);  // don't trash lastSP
   }
   if (receiver_reg != noreg)  assert_different_registers(temp1, temp2, temp3, temp4, receiver_reg);
   if (member_reg   != noreg)  assert_different_registers(temp1, temp2, temp3, temp4, member_reg);
-  if (!for_compiler_entry)    assert_different_registers(temp1, temp2, temp3, temp4, O5_savedSP);  // don't trash lastSP
 
   if (iid == vmIntrinsics::_invokeBasic) {
     // indirect through MH.form.vmentry.vmtarget
-    jump_to_lambda_form(_masm, receiver_reg, G5_method, temp2, temp3, for_compiler_entry);
+    jump_to_lambda_form(_masm, receiver_reg, G5_method, temp1, temp2, for_compiler_entry);
 
   } else {
     // The method is a member invoker used by direct method handles.
@@ -378,24 +379,22 @@
     //  member_reg - MemberName that was the trailing argument
     //  temp1_recv_klass - klass of stacked receiver, if needed
     //  O5_savedSP - interpreter linkage (if interpreted)
-    //  O0..O7,G1,G4 - compiler arguments (if compiled)
+    //  O0..O5 - compiler arguments (if compiled)
 
-    bool method_is_live = false;
+    Label L_incompatible_class_change_error;
     switch (iid) {
     case vmIntrinsics::_linkToSpecial:
       if (VerifyMethodHandles) {
-        verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
+        verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp2);
       }
       __ ld_ptr(member_vmtarget, G5_method);
-      method_is_live = true;
       break;
 
     case vmIntrinsics::_linkToStatic:
       if (VerifyMethodHandles) {
-        verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
+        verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp2);
       }
       __ ld_ptr(member_vmtarget, G5_method);
-      method_is_live = true;
       break;
 
     case vmIntrinsics::_linkToVirtual:
@@ -404,7 +403,7 @@
       // minus the CP setup and profiling:
 
       if (VerifyMethodHandles) {
-        verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp3);
+        verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp2);
       }
 
       // pick out the vtable index from the MemberName, and then we can discard it:
@@ -423,7 +422,6 @@
 
       // get target Method* & entry point
       __ lookup_virtual_method(temp1_recv_klass, temp2_index, G5_method);
-      method_is_live = true;
       break;
     }
 
@@ -432,13 +430,13 @@
       // same as TemplateTable::invokeinterface
       // (minus the CP setup and profiling, with different argument motion)
       if (VerifyMethodHandles) {
-        verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp3);
+        verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp2);
       }
 
-      Register temp3_intf = temp3;
-      __ load_heap_oop(member_clazz, temp3_intf);
-      load_klass_from_Class(_masm, temp3_intf, temp2, temp4);
-      __ verify_klass_ptr(temp3_intf);
+      Register temp2_intf = temp2;
+      __ load_heap_oop(member_clazz, temp2_intf);
+      load_klass_from_Class(_masm, temp2_intf, temp3, temp4);
+      __ verify_klass_ptr(temp2_intf);
 
       Register G5_index = G5_method;
       __ ld_ptr(member_vmindex, G5_index);
@@ -450,37 +448,34 @@
       }
 
       // given intf, index, and recv klass, dispatch to the implementation method
-      Label L_no_such_interface;
-      Register no_sethi_temp = noreg;
-      __ lookup_interface_method(temp1_recv_klass, temp3_intf,
+      __ lookup_interface_method(temp1_recv_klass, temp2_intf,
                                  // note: next two args must be the same:
                                  G5_index, G5_method,
-                                 temp2, no_sethi_temp,
-                                 L_no_such_interface);
-
-      __ verify_method_ptr(G5_method);
-      jump_from_method_handle(_masm, G5_method, temp2, temp3, for_compiler_entry);
-
-      __ bind(L_no_such_interface);
-      AddressLiteral icce(StubRoutines::throw_IncompatibleClassChangeError_entry());
-      __ jump_to(icce, temp3);
-      __ delayed()->nop();
+                                 temp3, temp4,
+                                 L_incompatible_class_change_error);
       break;
     }
 
     default:
-      fatal(err_msg("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
+      fatal(err_msg_res("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
       break;
     }
 
-    if (method_is_live) {
-      // live at this point:  G5_method, O5_savedSP (if interpreted)
+    // Live at this point:
+    //   G5_method
+    //   O5_savedSP (if interpreted)
 
-      // After figuring out which concrete method to call, jump into it.
-      // Note that this works in the interpreter with no data motion.
-      // But the compiled version will require that rcx_recv be shifted out.
-      __ verify_method_ptr(G5_method);
-      jump_from_method_handle(_masm, G5_method, temp1, temp3, for_compiler_entry);
+    // After figuring out which concrete method to call, jump into it.
+    // Note that this works in the interpreter with no data motion.
+    // But the compiled version will require that rcx_recv be shifted out.
+    __ verify_method_ptr(G5_method);
+    jump_from_method_handle(_masm, G5_method, temp1, temp2, for_compiler_entry);
+
+    if (iid == vmIntrinsics::_linkToInterface) {
+      __ BIND(L_incompatible_class_change_error);
+      AddressLiteral icce(StubRoutines::throw_IncompatibleClassChangeError_entry());
+      __ jump_to(icce, temp1);
+      __ delayed()->nop();
     }
   }
 }
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/sharedRuntime_sparc.cpp
--- a/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -313,6 +313,14 @@
 
 }
 
+// Is vector's size (in bytes) bigger than a size saved by default?
+// 8 bytes FP registers are saved by default on SPARC.
+bool SharedRuntime::is_wide_vector(int size) {
+  // Note, MaxVectorSize == 8 on SPARC.
+  assert(size <= 8, err_msg_res("%d bytes vectors are not supported", size));
+  return size > 8;
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
@@ -364,9 +372,9 @@
 // ---------------------------------------------------------------------------
 // The compiled Java calling convention.  The Java convention always passes
 // 64-bit values in adjacent aligned locations (either registers or stack),
-// floats in float registers and doubles in aligned float pairs.  Values are
-// packed in the registers.  There is no backing varargs store for values in
-// registers.  In the 32-bit build, longs are passed in G1 and G4 (cannot be
+// floats in float registers and doubles in aligned float pairs.  There is
+// no backing varargs store for values in registers.
+// In the 32-bit build, longs are passed on the stack (cannot be
 // passed in I's, because longs in I's get their heads chopped off at
 // interrupt).
 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
@@ -375,76 +383,13 @@
                                            int is_outgoing) {
   assert(F31->as_VMReg()->is_reg(), "overlapping stack/register numbers");
 
-  // Convention is to pack the first 6 int/oop args into the first 6 registers
-  // (I0-I5), extras spill to the stack.  Then pack the first 8 float args
-  // into F0-F7, extras spill to the stack.  Then pad all register sets to
-  // align.  Then put longs and doubles into the same registers as they fit,
-  // else spill to the stack.
   const int int_reg_max = SPARC_ARGS_IN_REGS_NUM;
   const int flt_reg_max = 8;
-  //
-  // Where 32-bit 1-reg longs start being passed
-  // In tiered we must pass on stack because c1 can't use a "pair" in a single reg.
-  // So make it look like we've filled all the G regs that c2 wants to use.
-  Register g_reg = TieredCompilation ? noreg : G1;
-
-  // Count int/oop and float args.  See how many stack slots we'll need and
-  // where the longs & doubles will go.
-  int int_reg_cnt   = 0;
-  int flt_reg_cnt   = 0;
-  // int stk_reg_pairs = frame::register_save_words*(wordSize>>2);
-  // int stk_reg_pairs = SharedRuntime::out_preserve_stack_slots();
-  int stk_reg_pairs = 0;
-  for (int i = 0; i < total_args_passed; i++) {
-    switch (sig_bt[i]) {
-    case T_LONG:                // LP64, longs compete with int args
-      assert(sig_bt[i+1] == T_VOID, "");
-#ifdef _LP64
-      if (int_reg_cnt < int_reg_max)  int_reg_cnt++;
-#endif
-      break;
-    case T_OBJECT:
-    case T_ARRAY:
-    case T_ADDRESS: // Used, e.g., in slow-path locking for the lock's stack address
-      if (int_reg_cnt < int_reg_max)  int_reg_cnt++;
-#ifndef _LP64
-      else                            stk_reg_pairs++;
-#endif
-      break;
-    case T_INT:
-    case T_SHORT:
-    case T_CHAR:
-    case T_BYTE:
-    case T_BOOLEAN:
-      if (int_reg_cnt < int_reg_max)  int_reg_cnt++;
-      else                            stk_reg_pairs++;
-      break;
-    case T_FLOAT:
-      if (flt_reg_cnt < flt_reg_max)  flt_reg_cnt++;
-      else                            stk_reg_pairs++;
-      break;
-    case T_DOUBLE:
-      assert(sig_bt[i+1] == T_VOID, "");
-      break;
-    case T_VOID:
-      break;
-    default:
-      ShouldNotReachHere();
-    }
-  }
-
-  // This is where the longs/doubles start on the stack.
-  stk_reg_pairs = (stk_reg_pairs+1) & ~1; // Round
-
-  int flt_reg_pairs = (flt_reg_cnt+1) & ~1;
-
-  // int stk_reg = frame::register_save_words*(wordSize>>2);
-  // int stk_reg = SharedRuntime::out_preserve_stack_slots();
-  int stk_reg = 0;
+
   int int_reg = 0;
   int flt_reg = 0;
-
-  // Now do the signature layout
+  int slot = 0;
+
   for (int i = 0; i < total_args_passed; i++) {
     switch (sig_bt[i]) {
     case T_INT:
@@ -461,11 +406,14 @@
         Register r = is_outgoing ? as_oRegister(int_reg++) : as_iRegister(int_reg++);
         regs[i].set1(r->as_VMReg());
       } else {
-        regs[i].set1(VMRegImpl::stack2reg(stk_reg++));
+        regs[i].set1(VMRegImpl::stack2reg(slot++));
       }
       break;
 
 #ifdef _LP64
+    case T_LONG:
+      assert(sig_bt[i+1] == T_VOID, "expecting VOID in other half");
+      // fall-through
     case T_OBJECT:
     case T_ARRAY:
     case T_ADDRESS: // Used, e.g., in slow-path locking for the lock's stack address
@@ -473,78 +421,57 @@
         Register r = is_outgoing ? as_oRegister(int_reg++) : as_iRegister(int_reg++);
         regs[i].set2(r->as_VMReg());
       } else {
-        regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
-        stk_reg_pairs += 2;
+        slot = round_to(slot, 2);  // align
+        regs[i].set2(VMRegImpl::stack2reg(slot));
+        slot += 2;
       }
       break;
-#endif // _LP64
-
+#else
     case T_LONG:
       assert(sig_bt[i+1] == T_VOID, "expecting VOID in other half");
-#ifdef _LP64
-        if (int_reg < int_reg_max) {
-          Register r = is_outgoing ? as_oRegister(int_reg++) : as_iRegister(int_reg++);
-          regs[i].set2(r->as_VMReg());
-        } else {
-          regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
-          stk_reg_pairs += 2;
-        }
-#else
-#ifdef COMPILER2
-        // For 32-bit build, can't pass longs in O-regs because they become
-        // I-regs and get trashed.  Use G-regs instead.  G1 and G4 are almost
-        // spare and available.  This convention isn't used by the Sparc ABI or
-        // anywhere else. If we're tiered then we don't use G-regs because c1
-        // can't deal with them as a "pair". (Tiered makes this code think g's are filled)
-        // G0: zero
-        // G1: 1st Long arg
-        // G2: global allocated to TLS
-        // G3: used in inline cache check
-        // G4: 2nd Long arg
-        // G5: used in inline cache check
-        // G6: used by OS
-        // G7: used by OS
-
-        if (g_reg == G1) {
-          regs[i].set2(G1->as_VMReg()); // This long arg in G1
-          g_reg = G4;                  // Where the next arg goes
-        } else if (g_reg == G4) {
-          regs[i].set2(G4->as_VMReg()); // The 2nd long arg in G4
-          g_reg = noreg;               // No more longs in registers
-        } else {
-          regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
-          stk_reg_pairs += 2;
-        }
-#else // COMPILER2
-          regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
-          stk_reg_pairs += 2;
-#endif // COMPILER2
-#endif // _LP64
+      // On 32-bit SPARC put longs always on the stack to keep the pressure off
+      // integer argument registers.  They should be used for oops.
+      slot = round_to(slot, 2);  // align
+      regs[i].set2(VMRegImpl::stack2reg(slot));
+      slot += 2;
+#endif
       break;
 
     case T_FLOAT:
-      if (flt_reg < flt_reg_max) regs[i].set1(as_FloatRegister(flt_reg++)->as_VMReg());
-      else                       regs[i].set1(VMRegImpl::stack2reg(stk_reg++));
+      if (flt_reg < flt_reg_max) {
+        FloatRegister r = as_FloatRegister(flt_reg++);
+        regs[i].set1(r->as_VMReg());
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot++));
+      }
       break;
+
     case T_DOUBLE:
       assert(sig_bt[i+1] == T_VOID, "expecting half");
-      if (flt_reg_pairs + 1 < flt_reg_max) {
-        regs[i].set2(as_FloatRegister(flt_reg_pairs)->as_VMReg());
-        flt_reg_pairs += 2;
+      if (round_to(flt_reg, 2) + 1 < flt_reg_max) {
+        flt_reg = round_to(flt_reg, 2);  // align
+        FloatRegister r = as_FloatRegister(flt_reg);
+        regs[i].set2(r->as_VMReg());
+        flt_reg += 2;
       } else {
-        regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
-        stk_reg_pairs += 2;
+        slot = round_to(slot, 2);  // align
+        regs[i].set2(VMRegImpl::stack2reg(slot));
+        slot += 2;
       }
       break;
-    case T_VOID: regs[i].set_bad();  break; // Halves of longs & doubles
+
+    case T_VOID:
+      regs[i].set_bad();   // Halves of longs & doubles
+      break;
+
     default:
-      ShouldNotReachHere();
+      fatal(err_msg_res("unknown basic type %d", sig_bt[i]));
+      break;
     }
   }
 
   // retun the amount of stack space these arguments will need.
-  return stk_reg_pairs;
-
+  return slot;
 }
 
 // Helper class mostly to avoid passing masm everywhere, and handle
@@ -601,8 +528,7 @@
   Label L;
   __ ld_ptr(G5_method, in_bytes(Method::code_offset()), G3_scratch);
   __ br_null(G3_scratch, false, Assembler::pt, L);
-  // Schedule the branch target address early.
-  __ delayed()->ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
+  __ delayed()->nop();
   // Call into the VM to patch the caller, then jump to compiled callee
   __ save_frame(4);     // Args in compiled layout; do not blow them
 
@@ -645,7 +571,6 @@
   __ ldx(FP, -8 + STACK_BIAS, G1);
   __ ldx(FP, -16 + STACK_BIAS, G4);
   __ mov(L5, G5_method);
-  __ ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
 #endif /* _LP64 */
 
   __ restore();      // Restore args
@@ -726,7 +651,7 @@
                             int comp_args_on_stack, // VMRegStackSlots
                             const BasicType *sig_bt,
                             const VMRegPair *regs,
-                            Label& skip_fixup) {
+                            Label& L_skip_fixup) {
 
   // Before we get into the guts of the C2I adapter, see if we should be here
   // at all.  We've come from compiled code and are attempting to jump to the
@@ -747,7 +672,7 @@
 
   patch_callers_callsite();
 
-  __ bind(skip_fixup);
+  __ bind(L_skip_fixup);
 
   // Since all args are passed on the stack, total_args_passed*wordSize is the
   // space we need.  Add in varargs area needed by the interpreter. Round up
@@ -757,46 +682,18 @@
                  (frame::varargs_offset - frame::register_save_words)*wordSize;
   const int extraspace = round_to(arg_size + varargs_area, 2*wordSize);
 
-  int bias = STACK_BIAS;
+  const int bias = STACK_BIAS;
   const int interp_arg_offset = frame::varargs_offset*wordSize +
                         (total_args_passed-1)*Interpreter::stackElementSize;
 
-  Register base = SP;
-
-#ifdef _LP64
-  // In the 64bit build because of wider slots and STACKBIAS we can run
-  // out of bits in the displacement to do loads and stores.  Use g3 as
-  // temporary displacement.
-  if (!Assembler::is_simm13(extraspace)) {
-    __ set(extraspace, G3_scratch);
-    __ sub(SP, G3_scratch, SP);
-  } else {
-    __ sub(SP, extraspace, SP);
-  }
+  const Register base = SP;
+
+  // Make some extra space on the stack.
+  __ sub(SP, __ ensure_simm13_or_reg(extraspace, G3_scratch), SP);
   set_Rdisp(G3_scratch);
-#else
-  __ sub(SP, extraspace, SP);
-#endif // _LP64
-
-  // First write G1 (if used) to where ever it must go
-  for (int i=0; i<total_args_passed; i++) {
-    const int st_off = interp_arg_offset - (i*Interpreter::stackElementSize) + bias;
-    VMReg r_1 = regs[i].first();
-    VMReg r_2 = regs[i].second();
-    if (r_1 == G1_scratch->as_VMReg()) {
-      if (sig_bt[i] == T_OBJECT || sig_bt[i] == T_ARRAY) {
-        store_c2i_object(G1_scratch, base, st_off);
-      } else if (sig_bt[i] == T_LONG) {
-        assert(!TieredCompilation, "should not use register args for longs");
-        store_c2i_long(G1_scratch, base, st_off, false);
-      } else {
-        store_c2i_int(G1_scratch, base, st_off);
-      }
-    }
-  }
-
-  // Now write the args into the outgoing interpreter space
-  for (int i=0; i<total_args_passed; i++) {
+
+  // Write the args into the outgoing interpreter space.
+  for (int i = 0; i < total_args_passed; i++) {
     const int st_off = interp_arg_offset - (i*Interpreter::stackElementSize) + bias;
     VMReg r_1 = regs[i].first();
     VMReg r_2 = regs[i].second();
@@ -804,23 +701,9 @@
       assert(!r_2->is_valid(), "");
       continue;
     }
-    // Skip G1 if found as we did it first in order to free it up
-    if (r_1 == G1_scratch->as_VMReg()) {
-      continue;
-    }
-#ifdef ASSERT
-    bool G1_forced = false;
-#endif // ASSERT
     if (r_1->is_stack()) {        // Pretend stack targets are loaded into G1
-#ifdef _LP64
-      Register ld_off = Rdisp;
-      __ set(reg2offset(r_1) + extraspace + bias, ld_off);
-#else
-      int ld_off = reg2offset(r_1) + extraspace + bias;
-#endif // _LP64
-#ifdef ASSERT
-      G1_forced = true;
-#endif // ASSERT
+      RegisterOrConstant ld_off = reg2offset(r_1) + extraspace + bias;
+      ld_off = __ ensure_simm13_or_reg(ld_off, Rdisp);
       r_1 = G1_scratch->as_VMReg();// as part of the load/store shuffle
       if (!r_2->is_valid()) __ ld (base, ld_off, G1_scratch);
       else                  __ ldx(base, ld_off, G1_scratch);
@@ -831,11 +714,6 @@
       if (sig_bt[i] == T_OBJECT || sig_bt[i] == T_ARRAY) {
         store_c2i_object(r, base, st_off);
       } else if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
-#ifndef _LP64
-        if (TieredCompilation) {
-          assert(G1_forced || sig_bt[i] != T_LONG, "should not use register args for longs");
-        }
-#endif // _LP64
         store_c2i_long(r, base, st_off, r_2->is_stack());
       } else {
         store_c2i_int(r, base, st_off);
@@ -851,19 +729,12 @@
     }
   }
 
-#ifdef _LP64
-  // Need to reload G3_scratch, used for temporary displacements.
+  // Load the interpreter entry point.
   __ ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
 
   // Pass O5_savedSP as an argument to the interpreter.
   // The interpreter will restore SP to this value before returning.
-  __ set(extraspace, G1);
-  __ add(SP, G1, O5_savedSP);
-#else
-  // Pass O5_savedSP as an argument to the interpreter.
-  // The interpreter will restore SP to this value before returning.
-  __ add(SP, extraspace, O5_savedSP);
-#endif // _LP64
+  __ add(SP, __ ensure_simm13_or_reg(extraspace, G1), O5_savedSP);
 
   __ mov((frame::varargs_offset)*wordSize -
          1*Interpreter::stackElementSize+bias+BytesPerWord, G1);
@@ -971,7 +842,6 @@
 
   // Outputs:
   // G2_thread      - TLS
-  // G1, G4         - Outgoing long args in 32-bit build
   // O0-O5          - Outgoing args in compiled layout
   // O6             - Adjusted or restored SP
   // O7             - Valid return address
@@ -1016,10 +886,10 @@
   // +--------------+ <--- start of outgoing args
   // |  pad, align  |   |
   // +--------------+   |
-  // | ints, floats |   |---Outgoing stack args, packed low.
-  // +--------------+   |   First few args in registers.
-  // :   doubles    :   |
-  // |   longs      |   |
+  // | ints, longs, |   |
+  // |    floats,   |   |---Outgoing stack args.
+  // :    doubles   :   |   First few args in registers.
+  // |              |   |
   // +--------------+ <--- SP' + 16*wordsize
   // |              |
   // :    window    :
@@ -1033,7 +903,6 @@
   // Cut-out for having no stack args.  Since up to 6 args are passed
   // in registers, we will commonly have no stack args.
   if (comp_args_on_stack > 0) {
-
     // Convert VMReg stack slots to words.
     int comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
     // Round up to miminum stack alignment, in wordSize
@@ -1044,13 +913,9 @@
     __ sub(SP, (comp_words_on_stack)*wordSize, SP);
   }
 
-  // Will jump to the compiled code just as if compiled code was doing it.
-  // Pre-load the register-jump target early, to schedule it better.
-  __ ld_ptr(G5_method, in_bytes(Method::from_compiled_offset()), G3);
-
   // Now generate the shuffle code.  Pick up all register args and move the
   // rest through G1_scratch.
-  for (int i=0; i<total_args_passed; i++) {
+  for (int i = 0; i < total_args_passed; i++) {
     if (sig_bt[i] == T_VOID) {
       // Longs and doubles are passed in native word order, but misaligned
       // in the 32-bit build.
@@ -1088,14 +953,13 @@
               next_arg_slot(ld_off) : arg_slot(ld_off);
         __ ldx(Gargs, slot, r);
 #else
-        // Need to load a 64-bit value into G1/G4, but G1/G4 is being used in the
-        // stack shuffle.  Load the first 2 longs into G1/G4 later.
+        fatal("longs should be on stack");
 #endif
       }
     } else {
       assert(r_1->is_FloatRegister(), "");
       if (!r_2->is_valid()) {
-        __ ldf(FloatRegisterImpl::S, Gargs, arg_slot(ld_off), r_1->as_FloatRegister());
+        __ ldf(FloatRegisterImpl::S, Gargs,      arg_slot(ld_off), r_1->as_FloatRegister());
       } else {
 #ifdef _LP64
         // In V9, doubles are given 2 64-bit slots in the interpreter, but the
@@ -1104,11 +968,11 @@
         // spare float register.
         RegisterOrConstant slot = (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) ?
               next_arg_slot(ld_off) : arg_slot(ld_off);
-        __ ldf(FloatRegisterImpl::D, Gargs, slot, r_1->as_FloatRegister());
+        __ ldf(FloatRegisterImpl::D, Gargs,                  slot, r_1->as_FloatRegister());
 #else
         // Need to marshal 64-bit value from misaligned Lesp loads
         __ ldf(FloatRegisterImpl::S, Gargs, next_arg_slot(ld_off), r_1->as_FloatRegister());
-        __ ldf(FloatRegisterImpl::S, Gargs, arg_slot(ld_off), r_2->as_FloatRegister());
+        __ ldf(FloatRegisterImpl::S, Gargs,      arg_slot(ld_off), r_2->as_FloatRegister());
 #endif
       }
     }
@@ -1124,76 +988,35 @@
       else                  __ stf(FloatRegisterImpl::D, r_1->as_FloatRegister(), SP, slot);
     }
   }
-  bool made_space = false;
-#ifndef _LP64
-  // May need to pick up a few long args in G1/G4
-  bool g4_crushed = false;
-  bool g3_crushed = false;
-  for (int i=0; i<total_args_passed; i++) {
-    if (regs[i].first()->is_Register() && regs[i].second()->is_valid()) {
-      // Load in argument order going down
-      int ld_off = (total_args_passed-i)*Interpreter::stackElementSize;
-      // Need to marshal 64-bit value from misaligned Lesp loads
-      Register r = regs[i].first()->as_Register()->after_restore();
-      if (r == G1 || r == G4) {
-        assert(!g4_crushed, "ordering problem");
-        if (r == G4){
-          g4_crushed = true;
-          __ lduw(Gargs, arg_slot(ld_off)     , G3_scratch); // Load lo bits
-          __ ld  (Gargs, next_arg_slot(ld_off), r);          // Load hi bits
-        } else {
-          // better schedule this way
-          __ ld  (Gargs, next_arg_slot(ld_off), r);          // Load hi bits
-          __ lduw(Gargs, arg_slot(ld_off)     , G3_scratch); // Load lo bits
-        }
-        g3_crushed = true;
-        __ sllx(r, 32, r);
-        __ or3(G3_scratch, r, r);
-      } else {
-        assert(r->is_out(), "longs passed in two O registers");
-        __ ld  (Gargs, arg_slot(ld_off)     , r->successor()); // Load lo bits
-        __ ld  (Gargs, next_arg_slot(ld_off), r);              // Load hi bits
-      }
-    }
-  }
-#endif
 
   // Jump to the compiled code just as if compiled code was doing it.
-  //
-#ifndef _LP64
-    if (g3_crushed) {
-      // Rats load was wasted, at least it is in cache...
-      __ ld_ptr(G5_method, Method::from_compiled_offset(), G3);
-    }
-#endif /* _LP64 */
-
-    // 6243940 We might end up in handle_wrong_method if
-    // the callee is deoptimized as we race thru here. If that
-    // happens we don't want to take a safepoint because the
-    // caller frame will look interpreted and arguments are now
-    // "compiled" so it is much better to make this transition
-    // invisible to the stack walking code. Unfortunately if
-    // we try and find the callee by normal means a safepoint
-    // is possible. So we stash the desired callee in the thread
-    // and the vm will find there should this case occur.
-    Address callee_target_addr(G2_thread, JavaThread::callee_target_offset());
-    __ st_ptr(G5_method, callee_target_addr);
-
-    if (StressNonEntrant) {
-      // Open a big window for deopt failure
-      __ save_frame(0);
-      __ mov(G0, L0);
-      Label loop;
-      __ bind(loop);
-      __ sub(L0, 1, L0);
-      __ br_null_short(L0, Assembler::pt, loop);
-
-      __ restore();
-    }
-
-
-    __ jmpl(G3, 0, G0);
-    __ delayed()->nop();
+  __ ld_ptr(G5_method, in_bytes(Method::from_compiled_offset()), G3);
+
+  // 6243940 We might end up in handle_wrong_method if
+  // the callee is deoptimized as we race thru here. If that
+  // happens we don't want to take a safepoint because the
+  // caller frame will look interpreted and arguments are now
+  // "compiled" so it is much better to make this transition
+  // invisible to the stack walking code. Unfortunately if
+  // we try and find the callee by normal means a safepoint
+  // is possible. So we stash the desired callee in the thread
+  // and the vm will find there should this case occur.
+  Address callee_target_addr(G2_thread, JavaThread::callee_target_offset());
+  __ st_ptr(G5_method, callee_target_addr);
+
+  if (StressNonEntrant) {
+    // Open a big window for deopt failure
+    __ save_frame(0);
+    __ mov(G0, L0);
+    Label loop;
+    __ bind(loop);
+    __ sub(L0, 1, L0);
+    __ br_null_short(L0, Assembler::pt, loop);
+    __ restore();
+  }
+
+  __ jmpl(G3, 0, G0);
+  __ delayed()->nop();
 }
 
 // ---------------------------------------------------------------
@@ -1221,28 +1044,17 @@
   // compiled code, which relys solely on SP and not FP, get sick).
 
   address c2i_unverified_entry = __ pc();
-  Label skip_fixup;
+  Label L_skip_fixup;
   {
-#if !defined(_LP64) && defined(COMPILER2)
-    Register R_temp   = L0;   // another scratch register
-#else
-    Register R_temp   = G1;   // another scratch register
-#endif
+    Register R_temp = G1;  // another scratch register
 
     AddressLiteral ic_miss(SharedRuntime::get_ic_miss_stub());
 
     __ verify_oop(O0);
     __ load_klass(O0, G3_scratch);
 
-#if !defined(_LP64) && defined(COMPILER2)
-    __ save(SP, -frame::register_save_words*wordSize, SP);
     __ ld_ptr(G5_method, CompiledICHolder::holder_klass_offset(), R_temp);
     __ cmp(G3_scratch, R_temp);
-    __ restore();
-#else
-    __ ld_ptr(G5_method, CompiledICHolder::holder_klass_offset(), R_temp);
-    __ cmp(G3_scratch, R_temp);
-#endif
 
     Label ok, ok2;
     __ brx(Assembler::equal, false, Assembler::pt, ok);
@@ -1256,8 +1068,8 @@
     // the call site corrected.
     __ ld_ptr(G5_method, in_bytes(Method::code_offset()), G3_scratch);
     __ bind(ok2);
-    __ br_null(G3_scratch, false, Assembler::pt, skip_fixup);
-    __ delayed()->ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
+    __ br_null(G3_scratch, false, Assembler::pt, L_skip_fixup);
+    __ delayed()->nop();
     __ jump_to(ic_miss, G3_scratch);
     __ delayed()->nop();
 
@@ -1265,7 +1077,7 @@
 
   address c2i_entry = __ pc();
 
-  agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
+  agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, L_skip_fixup);
 
   __ flush();
   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
@@ -1985,12 +1797,12 @@
 }
 
 static void verify_oop_args(MacroAssembler* masm,
-                            int total_args_passed,
+                            methodHandle method,
                             const BasicType* sig_bt,
                             const VMRegPair* regs) {
   Register temp_reg = G5_method;  // not part of any compiled calling seq
   if (VerifyOops) {
-    for (int i = 0; i < total_args_passed; i++) {
+    for (int i = 0; i < method->size_of_parameters(); i++) {
       if (sig_bt[i] == T_OBJECT ||
           sig_bt[i] == T_ARRAY) {
         VMReg r = regs[i].first();
@@ -2009,35 +1821,32 @@
 }
 
 static void gen_special_dispatch(MacroAssembler* masm,
-                                 int total_args_passed,
-                                 int comp_args_on_stack,
-                                 vmIntrinsics::ID special_dispatch,
+                                 methodHandle method,
                                  const BasicType* sig_bt,
                                  const VMRegPair* regs) {
-  verify_oop_args(masm, total_args_passed, sig_bt, regs);
+  verify_oop_args(masm, method, sig_bt, regs);
+  vmIntrinsics::ID iid = method->intrinsic_id();
 
   // Now write the args into the outgoing interpreter space
   bool     has_receiver   = false;
   Register receiver_reg   = noreg;
   int      member_arg_pos = -1;
   Register member_reg     = noreg;
-  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(special_dispatch);
+  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
   if (ref_kind != 0) {
-    member_arg_pos = total_args_passed - 1;  // trailing MemberName argument
+    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
     member_reg = G5_method;  // known to be free at this point
     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
-  } else if (special_dispatch == vmIntrinsics::_invokeBasic) {
+  } else if (iid == vmIntrinsics::_invokeBasic) {
     has_receiver = true;
   } else {
-    fatal(err_msg("special_dispatch=%d", special_dispatch));
+    fatal(err_msg_res("unexpected intrinsic id %d", iid));
   }
 
   if (member_reg != noreg) {
     // Load the member_arg into register, if necessary.
-    assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
-    assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
+    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
     VMReg r = regs[member_arg_pos].first();
-    assert(r->is_valid(), "bad member arg");
     if (r->is_stack()) {
       RegisterOrConstant ld_off = reg2offset(r) + STACK_BIAS;
       ld_off = __ ensure_simm13_or_reg(ld_off, member_reg);
@@ -2050,7 +1859,7 @@
 
   if (has_receiver) {
     // Make sure the receiver is loaded into a register.
-    assert(total_args_passed > 0, "oob");
+    assert(method->size_of_parameters() > 0, "oob");
     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
     VMReg r = regs[0].first();
     assert(r->is_valid(), "bad receiver arg");
@@ -2058,7 +1867,7 @@
       // Porting note:  This assumes that compiled calling conventions always
       // pass the receiver oop in a register.  If this is not true on some
       // platform, pick a temp and load the receiver from stack.
-      assert(false, "receiver always in a register");
+      fatal("receiver always in a register");
       receiver_reg = G3_scratch;  // known to be free at this point
       RegisterOrConstant ld_off = reg2offset(r) + STACK_BIAS;
       ld_off = __ ensure_simm13_or_reg(ld_off, member_reg);
@@ -2070,7 +1879,7 @@
   }
 
   // Figure out which address we are really jumping to:
-  MethodHandles::generate_method_handle_dispatch(masm, special_dispatch,
+  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
 }
 
@@ -2103,11 +1912,9 @@
 //    transition back to thread_in_Java
 //    return to caller
 //
-nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
+nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                 methodHandle method,
                                                 int compile_id,
-                                                int total_in_args,
-                                                int comp_args_on_stack, // in VMRegStackSlots
                                                 BasicType* in_sig_bt,
                                                 VMRegPair* in_regs,
                                                 BasicType ret_type) {
@@ -2116,9 +1923,7 @@
     intptr_t start = (intptr_t)__ pc();
     int vep_offset = ((intptr_t)__ pc()) - start;
     gen_special_dispatch(masm,
-                         total_in_args,
-                         comp_args_on_stack,
-                         method->intrinsic_id(),
+                         method,
                          in_sig_bt,
                          in_regs);
     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
@@ -2220,6 +2025,7 @@
   // we convert the java signature to a C signature by inserting
   // the hidden arguments as arg[0] and possibly arg[1] (static method)
 
+  const int total_in_args = method->size_of_parameters();
   int total_c_args = total_in_args;
   int total_save_slots = 6 * VMRegImpl::slots_per_word;
   if (!is_critical_native) {
@@ -3936,7 +3742,7 @@
 // the 64-bit %o's, then do a save, then fixup the caller's SP (our FP).
 // Tricky, tricky, tricky...
 
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
 
   // allocate space for the code
@@ -3954,6 +3760,7 @@
 
   int start = __ offset();
 
+  bool cause_return = (poll_type == POLL_AT_RETURN);
   // If this causes a return before the processing, then do a "restore"
   if (cause_return) {
     __ restore();
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/sparc.ad
--- a/src/cpu/sparc/vm/sparc.ad	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Mon Sep 24 14:46:06 2012 -0700
@@ -1838,6 +1838,12 @@
   case Op_PopCountL:
     if (!UsePopCountInstruction)
       return false;
+  case Op_CompareAndSwapL:
+#ifdef _LP64
+  case Op_CompareAndSwapP:
+#endif
+    if (!VM_Version::supports_cx8())
+      return false;
     break;
   }
 
@@ -7199,6 +7205,7 @@
 // No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
 
 instruct compareAndSwapL_bool(iRegP mem_ptr, iRegL oldval, iRegL newval, iRegI res, o7RegI tmp1, flagsReg ccr ) %{
+  predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
   effect( USE mem_ptr, KILL ccr, KILL tmp1);
   format %{
@@ -7230,6 +7237,9 @@
 %}
 
 instruct compareAndSwapP_bool(iRegP mem_ptr, iRegP oldval, iRegP newval, iRegI res, o7RegI tmp1, flagsReg ccr ) %{
+#ifdef _LP64
+  predicate(VM_Version::supports_cx8());
+#endif
   match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
   effect( USE mem_ptr, KILL ccr, KILL tmp1);
   format %{
@@ -7264,6 +7274,38 @@
   ins_pipe( long_memory_op );
 %}
 
+instruct xchgI( memory mem, iRegI newval) %{
+  match(Set newval (GetAndSetI mem newval));
+  format %{ "SWAP  [$mem],$newval" %}
+  size(4);
+  ins_encode %{
+    __ swap($mem$$Address, $newval$$Register);
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+#ifndef _LP64
+instruct xchgP( memory mem, iRegP newval) %{
+  match(Set newval (GetAndSetP mem newval));
+  format %{ "SWAP  [$mem],$newval" %}
+  size(4);
+  ins_encode %{
+    __ swap($mem$$Address, $newval$$Register);
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+instruct xchgN( memory mem, iRegN newval) %{
+  match(Set newval (GetAndSetN mem newval));
+  format %{ "SWAP  [$mem],$newval" %}
+  size(4);
+  ins_encode %{
+    __ swap($mem$$Address, $newval$$Register);
+  %}
+  ins_pipe( long_memory_op );
+%}
+
 //---------------------
 // Subtraction Instructions
 // Register Subtraction
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/vm_version_sparc.cpp
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -96,6 +96,7 @@
   UseSSE = 0; // Only on x86 and x64
 
   _supports_cx8 = has_v9();
+  _supports_atomic_getset4 = true; // swap instruction
 
   if (is_niagara()) {
     // Indirect branch is the same cost as direct
@@ -338,7 +339,11 @@
 
 unsigned int VM_Version::calc_parallel_worker_threads() {
   unsigned int result;
-  if (is_niagara_plus()) {
+  if (is_M_series()) {
+    // for now, use same gc thread calculation for M-series as for niagara-plus
+    // in future, we may want to tweak parameters for nof_parallel_worker_thread
+    result = nof_parallel_worker_threads(5, 16, 8);
+  } else if (is_niagara_plus()) {
     result = nof_parallel_worker_threads(5, 16, 8);
   } else {
     result = nof_parallel_worker_threads(5, 8, 8);
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/sparc/vm/vm_version_sparc.hpp
--- a/src/cpu/sparc/vm/vm_version_sparc.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -124,6 +124,8 @@
   // Returns true if the platform is in the niagara line (T series)
   // and newer than the niagara1.
   static bool is_niagara_plus()         { return is_T_family(_features) && !is_T1_model(_features); }
+
+  static bool is_M_series()             { return is_M_family(_features); }
   static bool is_T4()                   { return is_T_family(_features) && has_cbcond(); }
 
   // Fujitsu SPARC64
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/assembler_x86.cpp
--- a/src/cpu/x86/vm/assembler_x86.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -3496,6 +3496,33 @@
   emit_byte(0x01);
 }
 
+void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x18);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vextractf128h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x19);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 128 bits
+  emit_byte(0x01);
+}
+
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   bool vector256 = true;
@@ -3507,6 +3534,33 @@
   emit_byte(0x01);
 }
 
+void Assembler::vinserti128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x38);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);
+}
+
+void Assembler::vextracti128h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  emit_byte(0x39);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 128 bits
+  emit_byte(0x01);
+}
+
 void Assembler::vzeroupper() {
   assert(VM_Version::supports_avx(), "");
   (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
@@ -8907,11 +8961,9 @@
   pusha();
 
   // if we are coming from c1, xmm registers may be live
-  if (UseSSE >= 1) {
-    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
-  }
   int off = 0;
   if (UseSSE == 1)  {
+    subptr(rsp, sizeof(jdouble)*8);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
@@ -8921,23 +8973,50 @@
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
   } else if (UseSSE >= 2)  {
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+      // Save upper half of YMM registes
+      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+      vextractf128h(Address(rsp,  0),xmm0);
+      vextractf128h(Address(rsp, 16),xmm1);
+      vextractf128h(Address(rsp, 32),xmm2);
+      vextractf128h(Address(rsp, 48),xmm3);
+      vextractf128h(Address(rsp, 64),xmm4);
+      vextractf128h(Address(rsp, 80),xmm5);
+      vextractf128h(Address(rsp, 96),xmm6);
+      vextractf128h(Address(rsp,112),xmm7);
 #ifdef _LP64
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
-    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+      vextractf128h(Address(rsp,128),xmm8);
+      vextractf128h(Address(rsp,144),xmm9);
+      vextractf128h(Address(rsp,160),xmm10);
+      vextractf128h(Address(rsp,176),xmm11);
+      vextractf128h(Address(rsp,192),xmm12);
+      vextractf128h(Address(rsp,208),xmm13);
+      vextractf128h(Address(rsp,224),xmm14);
+      vextractf128h(Address(rsp,240),xmm15);
+#endif
+    }
+#endif
+    // Save whole 128bit (16 bytes) XMM regiters
+    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    movdqu(Address(rsp,off++*16),xmm0);
+    movdqu(Address(rsp,off++*16),xmm1);
+    movdqu(Address(rsp,off++*16),xmm2);
+    movdqu(Address(rsp,off++*16),xmm3);
+    movdqu(Address(rsp,off++*16),xmm4);
+    movdqu(Address(rsp,off++*16),xmm5);
+    movdqu(Address(rsp,off++*16),xmm6);
+    movdqu(Address(rsp,off++*16),xmm7);
+#ifdef _LP64
+    movdqu(Address(rsp,off++*16),xmm8);
+    movdqu(Address(rsp,off++*16),xmm9);
+    movdqu(Address(rsp,off++*16),xmm10);
+    movdqu(Address(rsp,off++*16),xmm11);
+    movdqu(Address(rsp,off++*16),xmm12);
+    movdqu(Address(rsp,off++*16),xmm13);
+    movdqu(Address(rsp,off++*16),xmm14);
+    movdqu(Address(rsp,off++*16),xmm15);
 #endif
   }
 
@@ -9015,28 +9094,52 @@
     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
     movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    addptr(rsp, sizeof(jdouble)*8);
   } else if (UseSSE >= 2)  {
-    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    // Restore whole 128bit (16 bytes) XMM regiters
+    movdqu(xmm0, Address(rsp,off++*16));
+    movdqu(xmm1, Address(rsp,off++*16));
+    movdqu(xmm2, Address(rsp,off++*16));
+    movdqu(xmm3, Address(rsp,off++*16));
+    movdqu(xmm4, Address(rsp,off++*16));
+    movdqu(xmm5, Address(rsp,off++*16));
+    movdqu(xmm6, Address(rsp,off++*16));
+    movdqu(xmm7, Address(rsp,off++*16));
 #ifdef _LP64
-    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
-    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+    movdqu(xmm8, Address(rsp,off++*16));
+    movdqu(xmm9, Address(rsp,off++*16));
+    movdqu(xmm10, Address(rsp,off++*16));
+    movdqu(xmm11, Address(rsp,off++*16));
+    movdqu(xmm12, Address(rsp,off++*16));
+    movdqu(xmm13, Address(rsp,off++*16));
+    movdqu(xmm14, Address(rsp,off++*16));
+    movdqu(xmm15, Address(rsp,off++*16));
 #endif
-  }
-  if (UseSSE >= 1) {
-    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      // Restore upper half of YMM registes.
+      vinsertf128h(xmm0, Address(rsp,  0));
+      vinsertf128h(xmm1, Address(rsp, 16));
+      vinsertf128h(xmm2, Address(rsp, 32));
+      vinsertf128h(xmm3, Address(rsp, 48));
+      vinsertf128h(xmm4, Address(rsp, 64));
+      vinsertf128h(xmm5, Address(rsp, 80));
+      vinsertf128h(xmm6, Address(rsp, 96));
+      vinsertf128h(xmm7, Address(rsp,112));
+#ifdef _LP64
+      vinsertf128h(xmm8, Address(rsp,128));
+      vinsertf128h(xmm9, Address(rsp,144));
+      vinsertf128h(xmm10, Address(rsp,160));
+      vinsertf128h(xmm11, Address(rsp,176));
+      vinsertf128h(xmm12, Address(rsp,192));
+      vinsertf128h(xmm13, Address(rsp,208));
+      vinsertf128h(xmm14, Address(rsp,224));
+      vinsertf128h(xmm15, Address(rsp,240));
+#endif
+      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    }
+#endif
   }
   popa();
 }
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/assembler_x86.hpp
--- a/src/cpu/x86/vm/assembler_x86.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -1743,6 +1743,12 @@
   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
 
+  // Load/store high 128bit of YMM registers which does not destroy other half.
+  void vinsertf128h(XMMRegister dst, Address src);
+  void vinserti128h(XMMRegister dst, Address src);
+  void vextractf128h(Address dst, XMMRegister src);
+  void vextracti128h(Address dst, XMMRegister src);
+
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
   // to avoid transaction penalty between AVX and SSE states. There is no
   // penalty if legacy SSE instructions are encoded using VEX prefix because
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -3794,5 +3794,49 @@
   // do nothing for now
 }
 
+void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) {
+  assert(data == dest, "xchg/xadd uses only 2 operands");
+
+  if (data->type() == T_INT) {
+    if (code == lir_xadd) {
+      if (os::is_MP()) {
+        __ lock();
+      }
+      __ xaddl(as_Address(src->as_address_ptr()), data->as_register());
+    } else {
+      __ xchgl(data->as_register(), as_Address(src->as_address_ptr()));
+    }
+  } else if (data->is_oop()) {
+    assert (code == lir_xchg, "xadd for oops");
+    Register obj = data->as_register();
+#ifdef _LP64
+    if (UseCompressedOops) {
+      __ encode_heap_oop(obj);
+      __ xchgl(obj, as_Address(src->as_address_ptr()));
+      __ decode_heap_oop(obj);
+    } else {
+      __ xchgptr(obj, as_Address(src->as_address_ptr()));
+    }
+#else
+    __ xchgl(obj, as_Address(src->as_address_ptr()));
+#endif
+  } else if (data->type() == T_LONG) {
+#ifdef _LP64
+    assert(data->as_register_lo() == data->as_register_hi(), "should be a single register");
+    if (code == lir_xadd) {
+      if (os::is_MP()) {
+        __ lock();
+      }
+      __ xaddq(as_Address(src->as_address_ptr()), data->as_register_lo());
+    } else {
+      __ xchgq(data->as_register_lo(), as_Address(src->as_address_ptr()));
+    }
+#else
+    ShouldNotReachHere();
+#endif
+  } else {
+    ShouldNotReachHere();
+  }
+}
 
 #undef __
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
--- a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -753,9 +753,24 @@
   LIR_Opr addr = new_pointer_register();
   LIR_Address* a;
   if(offset.result()->is_constant()) {
+#ifdef _LP64
+    jlong c = offset.result()->as_jlong();
+    if ((jlong)((jint)c) == c) {
+      a = new LIR_Address(obj.result(),
+                          (jint)c,
+                          as_BasicType(type));
+    } else {
+      LIR_Opr tmp = new_register(T_LONG);
+      __ move(offset.result(), tmp);
+      a = new LIR_Address(obj.result(),
+                          tmp,
+                          as_BasicType(type));
+    }
+#else
     a = new LIR_Address(obj.result(),
-                        NOT_LP64(offset.result()->as_constant_ptr()->as_jint()) LP64_ONLY((int)offset.result()->as_constant_ptr()->as_jlong()),
+                        offset.result()->as_jint(),
                         as_BasicType(type));
+#endif
   } else {
     a = new LIR_Address(obj.result(),
                         offset.result(),
@@ -1345,3 +1360,57 @@
     }
   }
 }
+
+void LIRGenerator::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
+  BasicType type = x->basic_type();
+  LIRItem src(x->object(), this);
+  LIRItem off(x->offset(), this);
+  LIRItem value(x->value(), this);
+
+  src.load_item();
+  value.load_item();
+  off.load_nonconstant();
+
+  LIR_Opr dst = rlock_result(x, type);
+  LIR_Opr data = value.result();
+  bool is_obj = (type == T_ARRAY || type == T_OBJECT);
+  LIR_Opr offset = off.result();
+
+  assert (type == T_INT || (!x->is_add() && is_obj) LP64_ONLY( || type == T_LONG ), "unexpected type");
+  LIR_Address* addr;
+  if (offset->is_constant()) {
+#ifdef _LP64
+    jlong c = offset->as_jlong();
+    if ((jlong)((jint)c) == c) {
+      addr = new LIR_Address(src.result(), (jint)c, type);
+    } else {
+      LIR_Opr tmp = new_register(T_LONG);
+      __ move(offset, tmp);
+      addr = new LIR_Address(src.result(), tmp, type);
+    }
+#else
+    addr = new LIR_Address(src.result(), offset->as_jint(), type);
+#endif
+  } else {
+    addr = new LIR_Address(src.result(), offset, type);
+  }
+
+  if (data != dst) {
+    __ move(data, dst);
+    data = dst;
+  }
+  if (x->is_add()) {
+    __ xadd(LIR_OprFact::address(addr), data, dst, LIR_OprFact::illegalOpr);
+  } else {
+    if (is_obj) {
+      // Do the pre-write barrier, if any.
+      pre_barrier(LIR_OprFact::address(addr), LIR_OprFact::illegalOpr /* pre_val */,
+                  true /* do_load */, false /* patch */, NULL);
+    }
+    __ xchg(LIR_OprFact::address(addr), data, dst, LIR_OprFact::illegalOpr);
+    if (is_obj) {
+      // Seems to be a precise address
+      post_barrier(LIR_OprFact::address(addr), data);
+    }
+  }
+}
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/methodHandles_x86.cpp
--- a/src/cpu/x86/vm/methodHandles_x86.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/methodHandles_x86.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -327,10 +327,11 @@
     assert_different_registers(temp3,        rcx, rdx);
   }
 #endif
+  else {
+    assert_different_registers(temp1, temp2, temp3, saved_last_sp_register());  // don't trash lastSP
+  }
   assert_different_registers(temp1, temp2, temp3, receiver_reg);
   assert_different_registers(temp1, temp2, temp3, member_reg);
-  if (!for_compiler_entry)
-    assert_different_registers(temp1, temp2, temp3, saved_last_sp_register());  // don't trash lastSP
 
   if (iid == vmIntrinsics::_invokeBasic) {
     // indirect through MH.form.vmentry.vmtarget
@@ -392,14 +393,13 @@
     //  rsi/r13 - interpreter linkage (if interpreted)
     //  rcx, rdx, rsi, rdi, r8, r8 - compiler arguments (if compiled)
 
-    bool method_is_live = false;
+    Label L_incompatible_class_change_error;
     switch (iid) {
     case vmIntrinsics::_linkToSpecial:
       if (VerifyMethodHandles) {
         verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
       }
       __ movptr(rbx_method, member_vmtarget);
-      method_is_live = true;
       break;
 
     case vmIntrinsics::_linkToStatic:
@@ -407,7 +407,6 @@
         verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
       }
       __ movptr(rbx_method, member_vmtarget);
-      method_is_live = true;
       break;
 
     case vmIntrinsics::_linkToVirtual:
@@ -436,7 +435,6 @@
 
       // get target Method* & entry point
       __ lookup_virtual_method(temp1_recv_klass, temp2_index, rbx_method);
-      method_is_live = true;
       break;
     }
 
@@ -464,35 +462,32 @@
       }
 
       // given intf, index, and recv klass, dispatch to the implementation method
-      Label L_no_such_interface;
       __ lookup_interface_method(temp1_recv_klass, temp3_intf,
                                  // note: next two args must be the same:
                                  rbx_index, rbx_method,
                                  temp2,
-                                 L_no_such_interface);
-
-      __ verify_method_ptr(rbx_method);
-      jump_from_method_handle(_masm, rbx_method, temp2, for_compiler_entry);
-      __ hlt();
-
-      __ bind(L_no_such_interface);
-      __ jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));
+                                 L_incompatible_class_change_error);
       break;
     }
 
     default:
-      fatal(err_msg("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
+      fatal(err_msg_res("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
       break;
     }
 
-    if (method_is_live) {
-      // live at this point:  rbx_method, rsi/r13 (if interpreted)
+    // Live at this point:
+    //   rbx_method
+    //   rsi/r13 (if interpreted)
 
-      // After figuring out which concrete method to call, jump into it.
-      // Note that this works in the interpreter with no data motion.
-      // But the compiled version will require that rcx_recv be shifted out.
-      __ verify_method_ptr(rbx_method);
-      jump_from_method_handle(_masm, rbx_method, temp1, for_compiler_entry);
+    // After figuring out which concrete method to call, jump into it.
+    // Note that this works in the interpreter with no data motion.
+    // But the compiled version will require that rcx_recv be shifted out.
+    __ verify_method_ptr(rbx_method);
+    jump_from_method_handle(_masm, rbx_method, temp1, for_compiler_entry);
+
+    if (iid == vmIntrinsics::_linkToInterface) {
+      __ bind(L_incompatible_class_change_error);
+      __ jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));
     }
   }
 }
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/sharedRuntime_x86_32.cpp
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -46,11 +46,11 @@
 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
 
 class RegisterSaver {
-  enum { FPU_regs_live = 8 /*for the FPU stack*/+8/*eight more for XMM registers*/ };
   // Capture info about frame layout
+#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
   enum layout {
                 fpu_state_off = 0,
-                fpu_state_end = fpu_state_off+FPUStateSizeInWords-1,
+                fpu_state_end = fpu_state_off+FPUStateSizeInWords,
                 st0_off, st0H_off,
                 st1_off, st1H_off,
                 st2_off, st2H_off,
@@ -59,16 +59,16 @@
                 st5_off, st5H_off,
                 st6_off, st6H_off,
                 st7_off, st7H_off,
-
-                xmm0_off, xmm0H_off,
-                xmm1_off, xmm1H_off,
-                xmm2_off, xmm2H_off,
-                xmm3_off, xmm3H_off,
-                xmm4_off, xmm4H_off,
-                xmm5_off, xmm5H_off,
-                xmm6_off, xmm6H_off,
-                xmm7_off, xmm7H_off,
-                flags_off,
+                xmm_off,
+                DEF_XMM_OFFS(0),
+                DEF_XMM_OFFS(1),
+                DEF_XMM_OFFS(2),
+                DEF_XMM_OFFS(3),
+                DEF_XMM_OFFS(4),
+                DEF_XMM_OFFS(5),
+                DEF_XMM_OFFS(6),
+                DEF_XMM_OFFS(7),
+                flags_off = xmm7_off + 16/BytesPerInt + 1, // 16-byte stack alignment fill word
                 rdi_off,
                 rsi_off,
                 ignore_off,  // extra copy of rbp,
@@ -83,13 +83,13 @@
                 rbp_off,
                 return_off,      // slot for return address
                 reg_save_size };
-
+  enum { FPU_regs_live = flags_off - fpu_state_end };
 
   public:
 
   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words,
-                                     int* total_frame_words, bool verify_fpu = true);
-  static void restore_live_registers(MacroAssembler* masm);
+                                     int* total_frame_words, bool verify_fpu = true, bool save_vectors = false);
+  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 
   static int rax_offset() { return rax_off; }
   static int rbx_offset() { return rbx_off; }
@@ -113,9 +113,20 @@
 };
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
-                                           int* total_frame_words, bool verify_fpu) {
-
-  int frame_size_in_bytes =  (reg_save_size + additional_frame_words) * wordSize;
+                                           int* total_frame_words, bool verify_fpu, bool save_vectors) {
+  int vect_words = 0;
+#ifdef COMPILER2
+  if (save_vectors) {
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    // Save upper half of YMM registes
+    vect_words = 8 * 16 / wordSize;
+    additional_frame_words += vect_words;
+  }
+#else
+  assert(!save_vectors, "vectors are generated only by C2");
+#endif
+  int frame_size_in_bytes = (reg_save_size + additional_frame_words) * wordSize;
   int frame_words = frame_size_in_bytes / wordSize;
   *total_frame_words = frame_words;
 
@@ -129,7 +140,7 @@
   __ enter();
   __ pusha();
   __ pushf();
-  __ subptr(rsp,FPU_regs_live*sizeof(jdouble)); // Push FPU registers space
+  __ subptr(rsp,FPU_regs_live*wordSize); // Push FPU registers space
   __ push_FPU_state();          // Save FPU state & init
 
   if (verify_fpu) {
@@ -183,14 +194,28 @@
     __ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
     __ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
   } else if( UseSSE >= 2 ) {
-    __ movdbl(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movdbl(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movdbl(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movdbl(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movdbl(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movdbl(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movdbl(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movdbl(Address(rsp,xmm7_off*wordSize),xmm7);
+    // Save whole 128bit (16 bytes) XMM regiters
+    __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
+    __ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
+    __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
+    __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
+    __ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
+    __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
+    __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
+    __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
+  }
+
+  if (vect_words > 0) {
+    assert(vect_words*wordSize == 128, "");
+    __ subptr(rsp, 128); // Save upper half of YMM registes
+    __ vextractf128h(Address(rsp,  0),xmm0);
+    __ vextractf128h(Address(rsp, 16),xmm1);
+    __ vextractf128h(Address(rsp, 32),xmm2);
+    __ vextractf128h(Address(rsp, 48),xmm3);
+    __ vextractf128h(Address(rsp, 64),xmm4);
+    __ vextractf128h(Address(rsp, 80),xmm5);
+    __ vextractf128h(Address(rsp, 96),xmm6);
+    __ vextractf128h(Address(rsp,112),xmm7);
   }
 
   // Set an oopmap for the call site.  This oopmap will map all
@@ -253,10 +278,20 @@
 
 }
 
-void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
-
+void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   // Recover XMM & FPU state
-  if( UseSSE == 1 ) {
+  int additional_frame_bytes = 0;
+#ifdef COMPILER2
+  if (restore_vectors) {
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    additional_frame_bytes = 128;
+  }
+#else
+  assert(!restore_vectors, "vectors are generated only by C2");
+#endif
+  if (UseSSE == 1) {
+    assert(additional_frame_bytes == 0, "");
     __ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
     __ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
     __ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
@@ -265,18 +300,33 @@
     __ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
     __ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
     __ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
-  } else if( UseSSE >= 2 ) {
-    __ movdbl(xmm0,Address(rsp,xmm0_off*wordSize));
-    __ movdbl(xmm1,Address(rsp,xmm1_off*wordSize));
-    __ movdbl(xmm2,Address(rsp,xmm2_off*wordSize));
-    __ movdbl(xmm3,Address(rsp,xmm3_off*wordSize));
-    __ movdbl(xmm4,Address(rsp,xmm4_off*wordSize));
-    __ movdbl(xmm5,Address(rsp,xmm5_off*wordSize));
-    __ movdbl(xmm6,Address(rsp,xmm6_off*wordSize));
-    __ movdbl(xmm7,Address(rsp,xmm7_off*wordSize));
+  } else if (UseSSE >= 2) {
+#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
+    __ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
+    __ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
+    __ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
+    __ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
+    __ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
+    __ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
+    __ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
+    __ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
+#undef STACK_ADDRESS
+  }
+  if (restore_vectors) {
+    // Restore upper half of YMM registes.
+    assert(additional_frame_bytes == 128, "");
+    __ vinsertf128h(xmm0, Address(rsp,  0));
+    __ vinsertf128h(xmm1, Address(rsp, 16));
+    __ vinsertf128h(xmm2, Address(rsp, 32));
+    __ vinsertf128h(xmm3, Address(rsp, 48));
+    __ vinsertf128h(xmm4, Address(rsp, 64));
+    __ vinsertf128h(xmm5, Address(rsp, 80));
+    __ vinsertf128h(xmm6, Address(rsp, 96));
+    __ vinsertf128h(xmm7, Address(rsp,112));
+    __ addptr(rsp, additional_frame_bytes);
   }
   __ pop_FPU_state();
-  __ addptr(rsp, FPU_regs_live*sizeof(jdouble)); // Pop FPU registers
+  __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
 
   __ popf();
   __ popa();
@@ -308,6 +358,13 @@
   __ addptr(rsp, return_off * wordSize);
 }
 
+// Is vector's size (in bytes) bigger than a size saved by default?
+// 16 bytes XMM registers are saved by default using SSE2 movdqu instructions.
+// Note, MaxVectorSize == 0 with UseSSE < 2 and vectors are not generated.
+bool SharedRuntime::is_wide_vector(int size) {
+  return size > 16;
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
@@ -1346,12 +1403,12 @@
 }
 
 static void verify_oop_args(MacroAssembler* masm,
-                            int total_args_passed,
+                            methodHandle method,
                             const BasicType* sig_bt,
                             const VMRegPair* regs) {
   Register temp_reg = rbx;  // not part of any compiled calling seq
   if (VerifyOops) {
-    for (int i = 0; i < total_args_passed; i++) {
+    for (int i = 0; i < method->size_of_parameters(); i++) {
       if (sig_bt[i] == T_OBJECT ||
           sig_bt[i] == T_ARRAY) {
         VMReg r = regs[i].first();
@@ -1368,35 +1425,32 @@
 }
 
 static void gen_special_dispatch(MacroAssembler* masm,
-                                 int total_args_passed,
-                                 int comp_args_on_stack,
-                                 vmIntrinsics::ID special_dispatch,
+                                 methodHandle method,
                                  const BasicType* sig_bt,
                                  const VMRegPair* regs) {
-  verify_oop_args(masm, total_args_passed, sig_bt, regs);
+  verify_oop_args(masm, method, sig_bt, regs);
+  vmIntrinsics::ID iid = method->intrinsic_id();
 
   // Now write the args into the outgoing interpreter space
   bool     has_receiver   = false;
   Register receiver_reg   = noreg;
   int      member_arg_pos = -1;
   Register member_reg     = noreg;
-  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(special_dispatch);
+  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
   if (ref_kind != 0) {
-    member_arg_pos = total_args_passed - 1;  // trailing MemberName argument
+    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
     member_reg = rbx;  // known to be free at this point
     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
-  } else if (special_dispatch == vmIntrinsics::_invokeBasic) {
+  } else if (iid == vmIntrinsics::_invokeBasic) {
     has_receiver = true;
   } else {
-    guarantee(false, err_msg("special_dispatch=%d", special_dispatch));
+    fatal(err_msg_res("unexpected intrinsic id %d", iid));
   }
 
   if (member_reg != noreg) {
     // Load the member_arg into register, if necessary.
-    assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
-    assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
+    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
     VMReg r = regs[member_arg_pos].first();
-    assert(r->is_valid(), "bad member arg");
     if (r->is_stack()) {
       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
     } else {
@@ -1407,7 +1461,7 @@
 
   if (has_receiver) {
     // Make sure the receiver is loaded into a register.
-    assert(total_args_passed > 0, "oob");
+    assert(method->size_of_parameters() > 0, "oob");
     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
     VMReg r = regs[0].first();
     assert(r->is_valid(), "bad receiver arg");
@@ -1415,7 +1469,7 @@
       // Porting note:  This assumes that compiled calling conventions always
       // pass the receiver oop in a register.  If this is not true on some
       // platform, pick a temp and load the receiver from stack.
-      assert(false, "receiver always in a register");
+      fatal("receiver always in a register");
       receiver_reg = rcx;  // known to be free at this point
       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
     } else {
@@ -1425,7 +1479,7 @@
   }
 
   // Figure out which address we are really jumping to:
-  MethodHandles::generate_method_handle_dispatch(masm, special_dispatch,
+  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
 }
 
@@ -1461,8 +1515,6 @@
 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                 methodHandle method,
                                                 int compile_id,
-                                                int total_in_args,
-                                                int comp_args_on_stack,
                                                 BasicType* in_sig_bt,
                                                 VMRegPair* in_regs,
                                                 BasicType ret_type) {
@@ -1471,9 +1523,7 @@
     intptr_t start = (intptr_t)__ pc();
     int vep_offset = ((intptr_t)__ pc()) - start;
     gen_special_dispatch(masm,
-                         total_in_args,
-                         comp_args_on_stack,
-                         method->intrinsic_id(),
+                         method,
                          in_sig_bt,
                          in_regs);
     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
@@ -1506,6 +1556,7 @@
   // we convert the java signature to a C signature by inserting
   // the hidden arguments as arg[0] and possibly arg[1] (static method)
 
+  const int total_in_args = method->size_of_parameters();
   int total_c_args = total_in_args;
   if (!is_critical_native) {
     total_c_args += 1;
@@ -2738,7 +2789,6 @@
   return 0;
 }
 
-
 //------------------------------generate_deopt_blob----------------------------
 void SharedRuntime::generate_deopt_blob() {
   // allocate space for the code
@@ -3276,7 +3326,7 @@
 // setup oopmap, and calls safepoint code to stop the compiled code for
 // a safepoint.
 //
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
 
   // Account for thread arg in our frame
   const int additional_words = 1;
@@ -3296,17 +3346,18 @@
   const Register java_thread = rdi; // callee-saved for VC++
   address start   = __ pc();
   address call_pc = NULL;
-
+  bool cause_return = (poll_type == POLL_AT_RETURN);
+  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
   // If cause_return is true we are at a poll_return and there is
   // the return address on the stack to the caller on the nmethod
   // that is safepoint. We can leave this return on the stack and
   // effectively complete the return and safepoint in the caller.
   // Otherwise we push space for a return address that the safepoint
   // handler will install later to make the stack walking sensible.
-  if( !cause_return )
-    __ push(rbx);                // Make room for return address (or push it again)
-
-  map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false);
+  if (!cause_return)
+    __ push(rbx);  // Make room for return address (or push it again)
+
+  map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false, save_vectors);
 
   // The following is basically a call_VM. However, we need the precise
   // address of the call in order to generate an oopmap. Hence, we do all the
@@ -3318,7 +3369,7 @@
   __ set_last_Java_frame(java_thread, noreg, noreg, NULL);
 
   // if this was not a poll_return then we need to correct the return address now.
-  if( !cause_return ) {
+  if (!cause_return) {
     __ movptr(rax, Address(java_thread, JavaThread::saved_exception_pc_offset()));
     __ movptr(Address(rbp, wordSize), rax);
   }
@@ -3346,15 +3397,14 @@
   __ jcc(Assembler::equal, noException);
 
   // Exception pending
-
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 
   __ bind(noException);
 
   // Normal exit, register restoring and exit
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ ret(0);
 
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/sharedRuntime_x86_64.cpp
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -116,8 +116,8 @@
   };
 
  public:
-  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
-  static void restore_live_registers(MacroAssembler* masm);
+  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
+  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 
   // Offsets into the register save area
   // Used by deoptimization when it is managing result register
@@ -134,7 +134,19 @@
   static void restore_result_registers(MacroAssembler* masm);
 };
 
-OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {
+OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
+  int vect_words = 0;
+#ifdef COMPILER2
+  if (save_vectors) {
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    // Save upper half of YMM registes
+    vect_words = 16 * 16 / wordSize;
+    additional_frame_words += vect_words;
+  }
+#else
+  assert(!save_vectors, "vectors are generated only by C2");
+#endif
 
   // Always make the frame size 16-byte aligned
   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
@@ -155,6 +167,27 @@
 
   __ enter();          // rsp becomes 16-byte aligned here
   __ push_CPU_state(); // Push a multiple of 16 bytes
+
+  if (vect_words > 0) {
+    assert(vect_words*wordSize == 256, "");
+    __ subptr(rsp, 256); // Save upper half of YMM registes
+    __ vextractf128h(Address(rsp,  0),xmm0);
+    __ vextractf128h(Address(rsp, 16),xmm1);
+    __ vextractf128h(Address(rsp, 32),xmm2);
+    __ vextractf128h(Address(rsp, 48),xmm3);
+    __ vextractf128h(Address(rsp, 64),xmm4);
+    __ vextractf128h(Address(rsp, 80),xmm5);
+    __ vextractf128h(Address(rsp, 96),xmm6);
+    __ vextractf128h(Address(rsp,112),xmm7);
+    __ vextractf128h(Address(rsp,128),xmm8);
+    __ vextractf128h(Address(rsp,144),xmm9);
+    __ vextractf128h(Address(rsp,160),xmm10);
+    __ vextractf128h(Address(rsp,176),xmm11);
+    __ vextractf128h(Address(rsp,192),xmm12);
+    __ vextractf128h(Address(rsp,208),xmm13);
+    __ vextractf128h(Address(rsp,224),xmm14);
+    __ vextractf128h(Address(rsp,240),xmm15);
+  }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Allocate argument register save area
     __ subptr(rsp, frame::arg_reg_save_area_bytes);
@@ -167,112 +200,111 @@
 
   OopMapSet *oop_maps = new OopMapSet();
   OopMap* map = new OopMap(frame_size_in_slots, 0);
-  map->set_callee_saved(VMRegImpl::stack2reg( rax_off  + additional_frame_slots), rax->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rcx_off  + additional_frame_slots), rcx->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rdx_off  + additional_frame_slots), rdx->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rbx_off  + additional_frame_slots), rbx->as_VMReg());
+
+#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
+
+  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
   // rbp location is known implicitly by the frame sender code, needs no oopmap
   // and the location where rbp was saved by is ignored
-  map->set_callee_saved(VMRegImpl::stack2reg( rsi_off  + additional_frame_slots), rsi->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( rdi_off  + additional_frame_slots), rdi->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r8_off   + additional_frame_slots), r8->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r9_off   + additional_frame_slots), r9->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r10_off  + additional_frame_slots), r10->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r11_off  + additional_frame_slots), r11->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r12_off  + additional_frame_slots), r12->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r13_off  + additional_frame_slots), r13->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r14_off  + additional_frame_slots), r14->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg( r15_off  + additional_frame_slots), r15->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm0_off  + additional_frame_slots), xmm0->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm1_off  + additional_frame_slots), xmm1->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm2_off  + additional_frame_slots), xmm2->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm3_off  + additional_frame_slots), xmm3->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm4_off  + additional_frame_slots), xmm4->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm5_off  + additional_frame_slots), xmm5->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm6_off  + additional_frame_slots), xmm6->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm7_off  + additional_frame_slots), xmm7->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm8_off  + additional_frame_slots), xmm8->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm9_off  + additional_frame_slots), xmm9->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm10_off + additional_frame_slots), xmm10->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm11_off + additional_frame_slots), xmm11->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm12_off + additional_frame_slots), xmm12->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm13_off + additional_frame_slots), xmm13->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm14_off + additional_frame_slots), xmm14->as_VMReg());
-  map->set_callee_saved(VMRegImpl::stack2reg(xmm15_off + additional_frame_slots), xmm15->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
 
   // %%% These should all be a waste but we'll keep things as they were for now
   if (true) {
-    map->set_callee_saved(VMRegImpl::stack2reg( raxH_off  + additional_frame_slots),
-                          rax->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rcxH_off  + additional_frame_slots),
-                          rcx->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rdxH_off  + additional_frame_slots),
-                          rdx->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rbxH_off  + additional_frame_slots),
-                          rbx->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
     // rbp location is known implicitly by the frame sender code, needs no oopmap
-    map->set_callee_saved(VMRegImpl::stack2reg( rsiH_off  + additional_frame_slots),
-                          rsi->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( rdiH_off  + additional_frame_slots),
-                          rdi->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r8H_off   + additional_frame_slots),
-                          r8->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r9H_off   + additional_frame_slots),
-                          r9->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r10H_off  + additional_frame_slots),
-                          r10->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r11H_off  + additional_frame_slots),
-                          r11->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r12H_off  + additional_frame_slots),
-                          r12->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r13H_off  + additional_frame_slots),
-                          r13->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r14H_off  + additional_frame_slots),
-                          r14->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg( r15H_off  + additional_frame_slots),
-                          r15->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm0H_off  + additional_frame_slots),
-                          xmm0->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm1H_off  + additional_frame_slots),
-                          xmm1->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm2H_off  + additional_frame_slots),
-                          xmm2->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm3H_off  + additional_frame_slots),
-                          xmm3->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm4H_off  + additional_frame_slots),
-                          xmm4->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm5H_off  + additional_frame_slots),
-                          xmm5->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm6H_off  + additional_frame_slots),
-                          xmm6->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm7H_off  + additional_frame_slots),
-                          xmm7->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm8H_off  + additional_frame_slots),
-                          xmm8->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm9H_off  + additional_frame_slots),
-                          xmm9->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm10H_off + additional_frame_slots),
-                          xmm10->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm11H_off + additional_frame_slots),
-                          xmm11->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm12H_off + additional_frame_slots),
-                          xmm12->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm13H_off + additional_frame_slots),
-                          xmm13->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm14H_off + additional_frame_slots),
-                          xmm14->as_VMReg()->next());
-    map->set_callee_saved(VMRegImpl::stack2reg(xmm15H_off + additional_frame_slots),
-                          xmm15->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
   }
 
   return map;
 }
 
-void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
+void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   if (frame::arg_reg_save_area_bytes != 0) {
     // Pop arg register save area
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
+#ifdef COMPILER2
+  if (restore_vectors) {
+    // Restore upper half of YMM registes.
+    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    __ vinsertf128h(xmm0, Address(rsp,  0));
+    __ vinsertf128h(xmm1, Address(rsp, 16));
+    __ vinsertf128h(xmm2, Address(rsp, 32));
+    __ vinsertf128h(xmm3, Address(rsp, 48));
+    __ vinsertf128h(xmm4, Address(rsp, 64));
+    __ vinsertf128h(xmm5, Address(rsp, 80));
+    __ vinsertf128h(xmm6, Address(rsp, 96));
+    __ vinsertf128h(xmm7, Address(rsp,112));
+    __ vinsertf128h(xmm8, Address(rsp,128));
+    __ vinsertf128h(xmm9, Address(rsp,144));
+    __ vinsertf128h(xmm10, Address(rsp,160));
+    __ vinsertf128h(xmm11, Address(rsp,176));
+    __ vinsertf128h(xmm12, Address(rsp,192));
+    __ vinsertf128h(xmm13, Address(rsp,208));
+    __ vinsertf128h(xmm14, Address(rsp,224));
+    __ vinsertf128h(xmm15, Address(rsp,240));
+    __ addptr(rsp, 256);
+  }
+#else
+  assert(!restore_vectors, "vectors are generated only by C2");
+#endif
   // Recover CPU state
   __ pop_CPU_state();
   // Get the rbp described implicitly by the calling convention (no oopMap)
@@ -297,6 +329,12 @@
   __ addptr(rsp, return_offset_in_bytes());
 }
 
+// Is vector's size (in bytes) bigger than a size saved by default?
+// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
+bool SharedRuntime::is_wide_vector(int size) {
+  return size > 16;
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
@@ -1593,12 +1631,12 @@
 };
 
 static void verify_oop_args(MacroAssembler* masm,
-                            int total_args_passed,
+                            methodHandle method,
                             const BasicType* sig_bt,
                             const VMRegPair* regs) {
   Register temp_reg = rbx;  // not part of any compiled calling seq
   if (VerifyOops) {
-    for (int i = 0; i < total_args_passed; i++) {
+    for (int i = 0; i < method->size_of_parameters(); i++) {
       if (sig_bt[i] == T_OBJECT ||
           sig_bt[i] == T_ARRAY) {
         VMReg r = regs[i].first();
@@ -1615,35 +1653,32 @@
 }
 
 static void gen_special_dispatch(MacroAssembler* masm,
-                                 int total_args_passed,
-                                 int comp_args_on_stack,
-                                 vmIntrinsics::ID special_dispatch,
+                                 methodHandle method,
                                  const BasicType* sig_bt,
                                  const VMRegPair* regs) {
-  verify_oop_args(masm, total_args_passed, sig_bt, regs);
+  verify_oop_args(masm, method, sig_bt, regs);
+  vmIntrinsics::ID iid = method->intrinsic_id();
 
   // Now write the args into the outgoing interpreter space
   bool     has_receiver   = false;
   Register receiver_reg   = noreg;
   int      member_arg_pos = -1;
   Register member_reg     = noreg;
-  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(special_dispatch);
+  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
   if (ref_kind != 0) {
-    member_arg_pos = total_args_passed - 1;  // trailing MemberName argument
+    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
     member_reg = rbx;  // known to be free at this point
     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
-  } else if (special_dispatch == vmIntrinsics::_invokeBasic) {
+  } else if (iid == vmIntrinsics::_invokeBasic) {
     has_receiver = true;
   } else {
-    guarantee(false, err_msg("special_dispatch=%d", special_dispatch));
+    fatal(err_msg_res("unexpected intrinsic id %d", iid));
   }
 
   if (member_reg != noreg) {
     // Load the member_arg into register, if necessary.
-    assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
-    assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
+    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
     VMReg r = regs[member_arg_pos].first();
-    assert(r->is_valid(), "bad member arg");
     if (r->is_stack()) {
       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
     } else {
@@ -1654,7 +1689,7 @@
 
   if (has_receiver) {
     // Make sure the receiver is loaded into a register.
-    assert(total_args_passed > 0, "oob");
+    assert(method->size_of_parameters() > 0, "oob");
     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
     VMReg r = regs[0].first();
     assert(r->is_valid(), "bad receiver arg");
@@ -1662,7 +1697,7 @@
       // Porting note:  This assumes that compiled calling conventions always
       // pass the receiver oop in a register.  If this is not true on some
       // platform, pick a temp and load the receiver from stack.
-      assert(false, "receiver always in a register");
+      fatal("receiver always in a register");
       receiver_reg = j_rarg0;  // known to be free at this point
       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
     } else {
@@ -1672,7 +1707,7 @@
   }
 
   // Figure out which address we are really jumping to:
-  MethodHandles::generate_method_handle_dispatch(masm, special_dispatch,
+  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
 }
 
@@ -1708,8 +1743,6 @@
 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                 methodHandle method,
                                                 int compile_id,
-                                                int total_in_args,
-                                                int comp_args_on_stack,
                                                 BasicType* in_sig_bt,
                                                 VMRegPair* in_regs,
                                                 BasicType ret_type) {
@@ -1718,9 +1751,7 @@
     intptr_t start = (intptr_t)__ pc();
     int vep_offset = ((intptr_t)__ pc()) - start;
     gen_special_dispatch(masm,
-                         total_in_args,
-                         comp_args_on_stack,
-                         method->intrinsic_id(),
+                         method,
                          in_sig_bt,
                          in_regs);
     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
@@ -1754,6 +1785,7 @@
   // we convert the java signature to a C signature by inserting
   // the hidden arguments as arg[0] and possibly arg[1] (static method)
 
+  const int total_in_args = method->size_of_parameters();
   int total_c_args = total_in_args;
   if (!is_critical_native) {
     total_c_args += 1;
@@ -3241,7 +3273,6 @@
   return 0;
 }
 
-
 //------------------------------generate_deopt_blob----------------------------
 void SharedRuntime::generate_deopt_blob() {
   // Allocate space for the code
@@ -3746,7 +3777,7 @@
 // Generate a special Compile2Runtime blob that saves all registers,
 // and setup oopmap.
 //
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
   assert(StubRoutines::forward_exception_entry() != NULL,
          "must be generated before");
 
@@ -3761,6 +3792,8 @@
   address start   = __ pc();
   address call_pc = NULL;
   int frame_size_in_words;
+  bool cause_return = (poll_type == POLL_AT_RETURN);
+  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
 
   // Make room for return address (or push it again)
   if (!cause_return) {
@@ -3768,7 +3801,7 @@
   }
 
   // Save registers, fpu state, and flags
-  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
 
   // The following is basically a call_VM.  However, we need the precise
   // address of the call in order to generate an oopmap. Hence, we do all the
@@ -3805,7 +3838,7 @@
 
   // Exception pending
 
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 
@@ -3813,7 +3846,7 @@
   __ bind(noException);
 
   // Normal exit, restore registers and exit.
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ ret(0);
 
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/vm_version_x86.cpp
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -363,6 +363,11 @@
   }
 
   _supports_cx8 = supports_cmpxchg8();
+  // xchg and xadd instructions
+  _supports_atomic_getset4 = true;
+  _supports_atomic_getadd4 = true;
+  LP64_ONLY(_supports_atomic_getset8 = true);
+  LP64_ONLY(_supports_atomic_getadd8 = true);
 
 #ifdef _LP64
   // OS should support SSE for x64 and hardware should support at least SSE2.
@@ -562,10 +567,10 @@
         AllocatePrefetchInstr = 3;
       }
       // On family 15h processors use XMM and UnalignedLoadStores for Array Copy
-      if( supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+      if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
         UseXMMForArrayCopy = true;
       }
-      if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+      if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
         UseUnalignedLoadStores = true;
       }
     }
@@ -612,16 +617,16 @@
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
-      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+      if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
         UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
       }
-      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
-        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+      if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
           UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
         }
       }
-      if( supports_sse4_2() && UseSSE >= 4 ) {
-        if( FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
+      if (supports_sse4_2() && UseSSE >= 4) {
+        if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
           UseSSE42Intrinsics = true;
         }
       }
@@ -638,6 +643,13 @@
     FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
+#ifdef COMPILER2
+  if (FLAG_IS_DEFAULT(AlignVector)) {
+    // Modern processors allow misaligned memory operations for vectors.
+    AlignVector = !UseUnalignedLoadStores;
+  }
+#endif // COMPILER2
+
   assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value");
   assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value");
 
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/x86.ad
--- a/src/cpu/x86/vm/x86.ad	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/x86.ad	Mon Sep 24 14:46:06 2012 -0700
@@ -498,10 +498,18 @@
     case Op_PopCountL:
       if (!UsePopCountInstruction)
         return false;
+    break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
         return false;
     break;
+    case Op_CompareAndSwapL:
+#ifdef _LP64
+    case Op_CompareAndSwapP:
+#endif
+      if (!VM_Version::supports_cx8())
+        return false;
+    break;
   }
 
   return true;  // Per default match rules are supported.
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/x86_32.ad
--- a/src/cpu/x86/vm/x86_32.ad	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Mon Sep 24 14:46:06 2012 -0700
@@ -7762,6 +7762,7 @@
 // No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
 
 instruct compareAndSwapL( rRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
+  predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7798,6 +7799,47 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
+instruct xaddI_no_res( memory mem, Universe dummy, immI add, eFlagsReg cr) %{
+  predicate(n->as_LoadStore()->result_not_used());
+  match(Set dummy (GetAndAddI mem add));
+  effect(KILL cr);
+  format %{ "ADDL  [$mem],$add" %}
+  ins_encode %{
+    if (os::is_MP()) { __ lock(); }
+    __ addl($mem$$Address, $add$$constant);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xaddI( memory mem, rRegI newval, eFlagsReg cr) %{
+  match(Set newval (GetAndAddI mem newval));
+  effect(KILL cr);
+  format %{ "XADDL  [$mem],$newval" %}
+  ins_encode %{
+    if (os::is_MP()) { __ lock(); }
+    __ xaddl($mem$$Address, $newval$$Register);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xchgI( memory mem, rRegI newval) %{
+  match(Set newval (GetAndSetI mem newval));
+  format %{ "XCHGL  $newval,[$mem]" %}
+  ins_encode %{
+    __ xchgl($newval$$Register, $mem$$Address);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xchgP( memory mem, pRegP newval) %{
+  match(Set newval (GetAndSetP mem newval));
+  format %{ "XCHGL  $newval,[$mem]" %}
+  ins_encode %{
+    __ xchgl($newval$$Register, $mem$$Address);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
 //----------Subtraction Instructions-------------------------------------------
 // Integer Subtraction Instructions
 instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
diff -r 04ed664b7e30 -r c92f43386117 src/cpu/x86/vm/x86_64.ad
--- a/src/cpu/x86/vm/x86_64.ad	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Mon Sep 24 14:46:06 2012 -0700
@@ -7242,6 +7242,7 @@
                          rax_RegP oldval, rRegP newval,
                          rFlagsReg cr)
 %{
+  predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
 
@@ -7265,6 +7266,7 @@
                          rax_RegL oldval, rRegL newval,
                          rFlagsReg cr)
 %{
+  predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
 
@@ -7329,6 +7331,88 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
+instruct xaddI_no_res( memory mem, Universe dummy, immI add, rFlagsReg cr) %{
+  predicate(n->as_LoadStore()->result_not_used());
+  match(Set dummy (GetAndAddI mem add));
+  effect(KILL cr);
+  format %{ "ADDL  [$mem],$add" %}
+  ins_encode %{
+    if (os::is_MP()) { __ lock(); }
+    __ addl($mem$$Address, $add$$constant);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xaddI( memory mem, rRegI newval, rFlagsReg cr) %{
+  match(Set newval (GetAndAddI mem newval));
+  effect(KILL cr);
+  format %{ "XADDL  [$mem],$newval" %}
+  ins_encode %{
+    if (os::is_MP()) { __ lock(); }
+    __ xaddl($mem$$Address, $newval$$Register);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xaddL_no_res( memory mem, Universe dummy, immL add, rFlagsReg cr) %{
+  predicate(n->as_LoadStore()->result_not_used());
+  match(Set dummy (GetAndAddL mem add));
+  effect(KILL cr);
+  format %{ "ADDQ  [$mem],$add" %}
+  ins_encode %{
+    if (os::is_MP()) { __ lock(); }
+    __ addq($mem$$Address, $add$$constant);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xaddL( memory mem, rRegL newval, rFlagsReg cr) %{
+  match(Set newval (GetAndAddL mem newval));
+  effect(KILL cr);
+  format %{ "XADDQ  [$mem],$newval" %}
+  ins_encode %{
+    if (os::is_MP()) { __ lock(); }
+    __ xaddq($mem$$Address, $newval$$Register);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xchgI( memory mem, rRegI newval) %{
+  match(Set newval (GetAndSetI mem newval));
+  format %{ "XCHGL  $newval,[$mem]" %}
+  ins_encode %{
+    __ xchgl($newval$$Register, $mem$$Address);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xchgL( memory mem, rRegL newval) %{
+  match(Set newval (GetAndSetL mem newval));
+  format %{ "XCHGL  $newval,[$mem]" %}
+  ins_encode %{
+    __ xchgq($newval$$Register, $mem$$Address);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xchgP( memory mem, rRegP newval) %{
+  match(Set newval (GetAndSetP mem newval));
+  format %{ "XCHGQ  $newval,[$mem]" %}
+  ins_encode %{
+    __ xchgq($newval$$Register, $mem$$Address);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct xchgN( memory mem, rRegN newval) %{
+  match(Set newval (GetAndSetN mem newval));
+  format %{ "XCHGL  $newval,$mem]" %}
+  ins_encode %{
+    __ xchgl($newval$$Register, $mem$$Address);
+  %}
+  ins_pipe( pipe_cmpxchg );
+%}
+
 //----------Subtraction Instructions-------------------------------------------
 
 // Integer Subtraction Instructions
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/adlc/formssel.cpp
--- a/src/share/vm/adlc/formssel.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/adlc/formssel.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -751,6 +751,7 @@
         !strcmp(_matrule->_rChild->_opType,"DecodeN")    ||
         !strcmp(_matrule->_rChild->_opType,"EncodeP")    ||
         !strcmp(_matrule->_rChild->_opType,"LoadN")      ||
+        !strcmp(_matrule->_rChild->_opType,"GetAndSetN") ||
         !strcmp(_matrule->_rChild->_opType,"LoadNKlass") ||
         !strcmp(_matrule->_rChild->_opType,"CreateEx")   ||  // type of exception
         !strcmp(_matrule->_rChild->_opType,"CheckCastPP")) ) return true;
@@ -3399,7 +3400,9 @@
     "StorePConditional", "StoreIConditional", "StoreLConditional",
     "CompareAndSwapI", "CompareAndSwapL", "CompareAndSwapP", "CompareAndSwapN",
     "StoreCM",
-    "ClearArray"
+    "ClearArray",
+    "GetAndAddI", "GetAndSetI", "GetAndSetP",
+    "GetAndAddL", "GetAndSetL", "GetAndSetN",
   };
   int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*);
   if( strcmp(_opType,"PrefetchRead")==0 ||
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/asm/codeBuffer.cpp
--- a/src/share/vm/asm/codeBuffer.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/asm/codeBuffer.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -1026,25 +1026,30 @@
     }
     return a;
   }
+
+  // Convenience for add_comment.
+  CodeComment* find_last(intptr_t offset) {
+    CodeComment* a = find(offset);
+    if (a != NULL) {
+      while ((a->_next != NULL) && (a->_next->_offset == offset)) {
+        a = a->_next;
+      }
+    }
+    return a;
+  }
 };
 
 
 void CodeComments::add_comment(intptr_t offset, const char * comment) {
-  CodeComment* c = new CodeComment(offset, comment);
-  CodeComment* insert = NULL;
-  if (_comments != NULL) {
-    CodeComment* c = _comments->find(offset);
-    insert = c;
-    while (c && c->offset() == offset) {
-      insert = c;
-      c = c->next();
-    }
-  }
-  if (insert) {
-    // insert after comments with same offset
-    c->set_next(insert->next());
-    insert->set_next(c);
+  CodeComment* c      = new CodeComment(offset, comment);
+  CodeComment* inspos = (_comments == NULL) ? NULL : _comments->find_last(offset);
+
+  if (inspos) {
+    // insert after already existing comments with same offset
+    c->set_next(inspos->next());
+    inspos->set_next(c);
   } else {
+    // no comments with such offset, yet. Insert before anything else.
     c->set_next(_comments);
     _comments = c;
   }
@@ -1052,12 +1057,11 @@
 
 
 void CodeComments::assign(CodeComments& other) {
-  assert(_comments == NULL, "don't overwrite old value");
   _comments = other._comments;
 }
 
 
-void CodeComments::print_block_comment(outputStream* stream, intptr_t offset) {
+void CodeComments::print_block_comment(outputStream* stream, intptr_t offset) const {
   if (_comments != NULL) {
     CodeComment* c = _comments->find(offset);
     while (c && c->offset() == offset) {
@@ -1085,6 +1089,7 @@
 
 
 void CodeBuffer::decode() {
+  ttyLocker ttyl;
   Disassembler::decode(decode_begin(), insts_end());
   _decode_begin = insts_end();
 }
@@ -1096,6 +1101,7 @@
 
 
 void CodeBuffer::decode_all() {
+  ttyLocker ttyl;
   for (int n = 0; n < (int)SECT_LIMIT; n++) {
     // dump contents of each section
     CodeSection* cs = code_section(n);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/asm/codeBuffer.hpp
--- a/src/share/vm/asm/codeBuffer.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/asm/codeBuffer.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -253,7 +253,7 @@
   }
 
   void add_comment(intptr_t offset, const char * comment) PRODUCT_RETURN;
-  void print_block_comment(outputStream* stream, intptr_t offset)  PRODUCT_RETURN;
+  void print_block_comment(outputStream* stream, intptr_t offset) const PRODUCT_RETURN;
   void assign(CodeComments& other)  PRODUCT_RETURN;
   void free() PRODUCT_RETURN;
 };
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/asm/register.hpp
--- a/src/share/vm/asm/register.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/asm/register.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -103,8 +103,8 @@
 ) {
   assert(
     a != b,
-    err_msg("registers must be different: a=%d, b=%d",
-            a, b)
+    err_msg_res("registers must be different: a=%d, b=%d",
+                a, b)
   );
 }
 
@@ -117,8 +117,8 @@
   assert(
     a != b && a != c
            && b != c,
-    err_msg("registers must be different: a=%d, b=%d, c=%d",
-            a, b, c)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d",
+                a, b, c)
   );
 }
 
@@ -133,8 +133,8 @@
     a != b && a != c && a != d
            && b != c && b != d
                      && c != d,
-    err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d",
-            a, b, c, d)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d",
+                a, b, c, d)
   );
 }
 
@@ -151,8 +151,8 @@
            && b != c && b != d && b != e
                      && c != d && c != e
                                && d != e,
-    err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d",
-            a, b, c, d, e)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d",
+                a, b, c, d, e)
   );
 }
 
@@ -171,8 +171,8 @@
                      && c != d && c != e && c != f
                                && d != e && d != f
                                          && e != f,
-    err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d",
-            a, b, c, d, e, f)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d",
+                a, b, c, d, e, f)
   );
 }
 
@@ -193,8 +193,8 @@
                                && d != e && d != f && d != g
                                          && e != f && e != g
                                                    && f != g,
-    err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d",
-            a, b, c, d, e, f, g)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d",
+                a, b, c, d, e, f, g)
   );
 }
 
@@ -217,8 +217,8 @@
                                          && e != f && e != g && e != h
                                                    && f != g && f != h
                                                              && g != h,
-    err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d",
-            a, b, c, d, e, f, g, h)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d",
+                a, b, c, d, e, f, g, h)
   );
 }
 
@@ -243,8 +243,8 @@
                                                    && f != g && f != h && f != i
                                                              && g != h && g != i
                                                                        && h != i,
-    err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d, i=%d",
-            a, b, c, d, e, f, g, h, i)
+    err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d, i=%d",
+                a, b, c, d, e, f, g, h, i)
   );
 }
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_Canonicalizer.cpp
--- a/src/share/vm/c1/c1_Canonicalizer.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_Canonicalizer.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -931,6 +931,7 @@
 void Canonicalizer::do_UnsafePutRaw(UnsafePutRaw* x) { if (OptimizeUnsafes) do_UnsafeRawOp(x); }
 void Canonicalizer::do_UnsafeGetObject(UnsafeGetObject* x) {}
 void Canonicalizer::do_UnsafePutObject(UnsafePutObject* x) {}
+void Canonicalizer::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {}
 void Canonicalizer::do_UnsafePrefetchRead (UnsafePrefetchRead*  x) {}
 void Canonicalizer::do_UnsafePrefetchWrite(UnsafePrefetchWrite* x) {}
 void Canonicalizer::do_ProfileCall(ProfileCall* x) {}
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_Canonicalizer.hpp
--- a/src/share/vm/c1/c1_Canonicalizer.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_Canonicalizer.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -100,6 +100,7 @@
   virtual void do_UnsafePutRaw   (UnsafePutRaw*    x);
   virtual void do_UnsafeGetObject(UnsafeGetObject* x);
   virtual void do_UnsafePutObject(UnsafePutObject* x);
+  virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
   virtual void do_UnsafePrefetchRead (UnsafePrefetchRead*  x);
   virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
   virtual void do_ProfileCall    (ProfileCall*     x);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_Compilation.cpp
--- a/src/share/vm/c1/c1_Compilation.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_Compilation.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -346,7 +346,8 @@
     implicit_exception_table(),
     compiler(),
     _env->comp_level(),
-    has_unsafe_access()
+    has_unsafe_access(),
+    SharedRuntime::is_wide_vector(max_vector_size())
   );
 }
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_Compilation.hpp
--- a/src/share/vm/c1/c1_Compilation.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_Compilation.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -127,6 +127,7 @@
   bool has_exception_handlers() const            { return _has_exception_handlers; }
   bool has_fpu_code() const                      { return _has_fpu_code; }
   bool has_unsafe_access() const                 { return _has_unsafe_access; }
+  int max_vector_size() const                    { return 0; }
   ciMethod* method() const                       { return _method; }
   int osr_bci() const                            { return _osr_bci; }
   bool is_osr_compile() const                    { return osr_bci() >= 0; }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_GraphBuilder.cpp
--- a/src/share/vm/c1/c1_GraphBuilder.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_GraphBuilder.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -3383,6 +3383,41 @@
       append_unsafe_CAS(callee);
       return true;
 
+    case vmIntrinsics::_getAndAddInt:
+      if (!VM_Version::supports_atomic_getadd4()) {
+        return false;
+      }
+      return append_unsafe_get_and_set_obj(callee, true);
+    case vmIntrinsics::_getAndAddLong:
+      if (!VM_Version::supports_atomic_getadd8()) {
+        return false;
+      }
+      return append_unsafe_get_and_set_obj(callee, true);
+    case vmIntrinsics::_getAndSetInt:
+      if (!VM_Version::supports_atomic_getset4()) {
+        return false;
+      }
+      return append_unsafe_get_and_set_obj(callee, false);
+    case vmIntrinsics::_getAndSetLong:
+      if (!VM_Version::supports_atomic_getset8()) {
+        return false;
+      }
+      return append_unsafe_get_and_set_obj(callee, false);
+    case vmIntrinsics::_getAndSetObject:
+#ifdef _LP64
+      if (!UseCompressedOops && !VM_Version::supports_atomic_getset8()) {
+        return false;
+      }
+      if (UseCompressedOops && !VM_Version::supports_atomic_getset4()) {
+        return false;
+      }
+#else
+      if (!VM_Version::supports_atomic_getset4()) {
+        return false;
+      }
+#endif
+      return append_unsafe_get_and_set_obj(callee, false);
+
     case vmIntrinsics::_Reference_get:
       // Use the intrinsic version of Reference.get() so that the value in
       // the referent field can be registered by the G1 pre-barrier code.
@@ -4106,6 +4141,22 @@
   }
 }
 
+bool GraphBuilder::append_unsafe_get_and_set_obj(ciMethod* callee, bool is_add) {
+  if (InlineUnsafeOps) {
+    Values* args = state()->pop_arguments(callee->arg_size());
+    BasicType t = callee->return_type()->basic_type();
+    null_check(args->at(0));
+    Instruction* offset = args->at(2);
+#ifndef _LP64
+    offset = append(new Convert(Bytecodes::_l2i, offset, as_ValueType(T_INT)));
+#endif
+    Instruction* op = append(new UnsafeGetAndSetObject(t, args->at(1), offset, args->at(3), is_add));
+    compilation()->set_has_unsafe_access(true);
+    kill_all();
+    push(op->type(), op);
+  }
+  return InlineUnsafeOps;
+}
 
 #ifndef PRODUCT
 void GraphBuilder::print_stats() {
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_GraphBuilder.hpp
--- a/src/share/vm/c1/c1_GraphBuilder.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_GraphBuilder.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -367,6 +367,7 @@
   bool append_unsafe_put_raw(ciMethod* callee, BasicType t);
   bool append_unsafe_prefetch(ciMethod* callee, bool is_store, bool is_static);
   void append_unsafe_CAS(ciMethod* callee);
+  bool append_unsafe_get_and_set_obj(ciMethod* callee, bool is_add);
 
   void print_inlining(ciMethod* callee, const char* msg, bool success = true);
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_Instruction.hpp
--- a/src/share/vm/c1/c1_Instruction.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_Instruction.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -102,6 +102,7 @@
 class     UnsafeObjectOp;
 class       UnsafeGetObject;
 class       UnsafePutObject;
+class         UnsafeGetAndSetObject;
 class       UnsafePrefetch;
 class         UnsafePrefetchRead;
 class         UnsafePrefetchWrite;
@@ -202,6 +203,7 @@
   virtual void do_UnsafePutRaw   (UnsafePutRaw*    x) = 0;
   virtual void do_UnsafeGetObject(UnsafeGetObject* x) = 0;
   virtual void do_UnsafePutObject(UnsafePutObject* x) = 0;
+  virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) = 0;
   virtual void do_UnsafePrefetchRead (UnsafePrefetchRead*  x) = 0;
   virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x) = 0;
   virtual void do_ProfileCall    (ProfileCall*     x) = 0;
@@ -2273,6 +2275,27 @@
                                                    f->visit(&_value); }
 };
 
+LEAF(UnsafeGetAndSetObject, UnsafeObjectOp)
+ private:
+  Value _value;                                  // Value to be stored
+  bool  _is_add;
+ public:
+  UnsafeGetAndSetObject(BasicType basic_type, Value object, Value offset, Value value, bool is_add)
+  : UnsafeObjectOp(basic_type, object, offset, false, false)
+    , _value(value)
+    , _is_add(is_add)
+  {
+    ASSERT_VALUES
+  }
+
+  // accessors
+  bool is_add() const                            { return _is_add; }
+  Value value()                                  { return _value; }
+
+  // generic
+  virtual void input_values_do(ValueVisitor* f)   { UnsafeObjectOp::input_values_do(f);
+                                                   f->visit(&_value); }
+};
 
 BASE(UnsafePrefetch, UnsafeObjectOp)
  public:
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_InstructionPrinter.cpp
--- a/src/share/vm/c1/c1_InstructionPrinter.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_InstructionPrinter.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -831,6 +831,12 @@
   output()->put(')');
 }
 
+void InstructionPrinter::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
+  print_unsafe_object_op(x, x->is_add()?"UnsafeGetAndSetObject (add)":"UnsafeGetAndSetObject");
+  output()->print(", value ");
+  print_value(x->value());
+  output()->put(')');
+}
 
 void InstructionPrinter::do_UnsafePrefetchRead(UnsafePrefetchRead* x) {
   print_unsafe_object_op(x, "UnsafePrefetchRead");
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_InstructionPrinter.hpp
--- a/src/share/vm/c1/c1_InstructionPrinter.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_InstructionPrinter.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -128,6 +128,7 @@
   virtual void do_UnsafePutRaw   (UnsafePutRaw*    x);
   virtual void do_UnsafeGetObject(UnsafeGetObject* x);
   virtual void do_UnsafePutObject(UnsafePutObject* x);
+  virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
   virtual void do_UnsafePrefetchRead (UnsafePrefetchRead*  x);
   virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
   virtual void do_ProfileCall    (ProfileCall*     x);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_LIR.cpp
--- a/src/share/vm/c1/c1_LIR.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_LIR.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -264,6 +264,7 @@
 #ifdef ASSERT
   switch (code()) {
     case lir_cmove:
+    case lir_xchg:
       break;
 
     default:
@@ -630,6 +631,8 @@
     case lir_shl:
     case lir_shr:
     case lir_ushr:
+    case lir_xadd:
+    case lir_xchg:
     {
       assert(op->as_Op2() != NULL, "must be");
       LIR_Op2* op2 = (LIR_Op2*)op;
@@ -641,6 +644,13 @@
       if (op2->_opr2->is_valid())         do_input(op2->_opr2);
       if (op2->_tmp1->is_valid())         do_temp(op2->_tmp1);
       if (op2->_result->is_valid())       do_output(op2->_result);
+      if (op->code() == lir_xchg || op->code() == lir_xadd) {
+        // on ARM and PPC, return value is loaded first so could
+        // destroy inputs. On other platforms that implement those
+        // (x86, sparc), the extra constrainsts are harmless.
+        if (op2->_opr1->is_valid())       do_temp(op2->_opr1);
+        if (op2->_opr2->is_valid())       do_temp(op2->_opr2);
+      }
 
       break;
     }
@@ -1733,6 +1743,8 @@
      case lir_shr:                   s = "shift_right";   break;
      case lir_ushr:                  s = "ushift_right";  break;
      case lir_alloc_array:           s = "alloc_array";   break;
+     case lir_xadd:                  s = "xadd";          break;
+     case lir_xchg:                  s = "xchg";          break;
      // LIR_Op3
      case lir_idiv:                  s = "idiv";          break;
      case lir_irem:                  s = "irem";          break;
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_LIR.hpp
--- a/src/share/vm/c1/c1_LIR.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_LIR.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -963,6 +963,8 @@
       , lir_alloc_array
       , lir_throw
       , lir_compare_to
+      , lir_xadd
+      , lir_xchg
   , end_op2
   , begin_op3
       , lir_idiv
@@ -2191,6 +2193,9 @@
   void profile_call(ciMethod* method, int bci, ciMethod* callee, LIR_Opr mdo, LIR_Opr recv, LIR_Opr t1, ciKlass* cha_klass) {
     append(new LIR_OpProfileCall(lir_profile_call, method, bci, callee, mdo, recv, t1, cha_klass));
   }
+
+  void xadd(LIR_Opr src, LIR_Opr add, LIR_Opr res, LIR_Opr tmp) { append(new LIR_Op2(lir_xadd, src, add, res, tmp)); }
+  void xchg(LIR_Opr src, LIR_Opr set, LIR_Opr res, LIR_Opr tmp) { append(new LIR_Op2(lir_xchg, src, set, res, tmp)); }
 };
 
 void print_LIR(BlockList* blocks);
@@ -2287,16 +2292,21 @@
       LIR_Address* address = opr->as_address_ptr();
       if (address != NULL) {
         // special handling for addresses: add base and index register of the address
-        // both are always input operands!
+        // both are always input operands or temp if we want to extend
+        // their liveness!
+        if (mode == outputMode) {
+          mode = inputMode;
+        }
+        assert (mode == inputMode || mode == tempMode, "input or temp only for addresses");
         if (address->_base->is_valid()) {
           assert(address->_base->is_register(), "must be");
-          assert(_oprs_len[inputMode] < maxNumberOfOperands, "array overflow");
-          _oprs_new[inputMode][_oprs_len[inputMode]++] = &address->_base;
+          assert(_oprs_len[mode] < maxNumberOfOperands, "array overflow");
+          _oprs_new[mode][_oprs_len[mode]++] = &address->_base;
         }
         if (address->_index->is_valid()) {
           assert(address->_index->is_register(), "must be");
-          assert(_oprs_len[inputMode] < maxNumberOfOperands, "array overflow");
-          _oprs_new[inputMode][_oprs_len[inputMode]++] = &address->_index;
+          assert(_oprs_len[mode] < maxNumberOfOperands, "array overflow");
+          _oprs_new[mode][_oprs_len[mode]++] = &address->_index;
         }
 
       } else {
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_LIRAssembler.cpp
--- a/src/share/vm/c1/c1_LIRAssembler.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -773,6 +773,11 @@
       throw_op(op->in_opr1(), op->in_opr2(), op->info());
       break;
 
+    case lir_xadd:
+    case lir_xchg:
+      atomic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op->tmp1_opr());
+      break;
+
     default:
       Unimplemented();
       break;
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_LIRAssembler.hpp
--- a/src/share/vm/c1/c1_LIRAssembler.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_LIRAssembler.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -252,6 +252,8 @@
 
   void verify_oop_map(CodeEmitInfo* info);
 
+  void atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp);
+
 #ifdef TARGET_ARCH_x86
 # include "c1_LIRAssembler_x86.hpp"
 #endif
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_LIRGenerator.hpp
--- a/src/share/vm/c1/c1_LIRGenerator.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_LIRGenerator.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -527,6 +527,7 @@
   virtual void do_UnsafePutRaw   (UnsafePutRaw*    x);
   virtual void do_UnsafeGetObject(UnsafeGetObject* x);
   virtual void do_UnsafePutObject(UnsafePutObject* x);
+  virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
   virtual void do_UnsafePrefetchRead (UnsafePrefetchRead*  x);
   virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
   virtual void do_ProfileCall    (ProfileCall*     x);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_Optimizer.cpp
--- a/src/share/vm/c1/c1_Optimizer.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_Optimizer.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -505,6 +505,7 @@
   void do_UnsafePutRaw   (UnsafePutRaw*    x);
   void do_UnsafeGetObject(UnsafeGetObject* x);
   void do_UnsafePutObject(UnsafePutObject* x);
+  void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
   void do_UnsafePrefetchRead (UnsafePrefetchRead*  x);
   void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
   void do_ProfileCall    (ProfileCall*     x);
@@ -676,6 +677,7 @@
 void NullCheckVisitor::do_UnsafePutRaw   (UnsafePutRaw*    x) {}
 void NullCheckVisitor::do_UnsafeGetObject(UnsafeGetObject* x) {}
 void NullCheckVisitor::do_UnsafePutObject(UnsafePutObject* x) {}
+void NullCheckVisitor::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {}
 void NullCheckVisitor::do_UnsafePrefetchRead (UnsafePrefetchRead*  x) {}
 void NullCheckVisitor::do_UnsafePrefetchWrite(UnsafePrefetchWrite* x) {}
 void NullCheckVisitor::do_ProfileCall    (ProfileCall*     x) { nce()->clear_last_explicit_null_check(); }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/c1/c1_ValueMap.hpp
--- a/src/share/vm/c1/c1_ValueMap.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/c1/c1_ValueMap.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -157,6 +157,7 @@
   void do_Invoke         (Invoke*          x) { kill_memory(); }
   void do_UnsafePutRaw   (UnsafePutRaw*    x) { kill_memory(); }
   void do_UnsafePutObject(UnsafePutObject* x) { kill_memory(); }
+  void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) { kill_memory(); }
   void do_Intrinsic      (Intrinsic*       x) { if (!x->preserves_state()) kill_memory(); }
 
   void do_Phi            (Phi*             x) { /* nothing to do */ }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/ci/ciEnv.cpp
--- a/src/share/vm/ci/ciEnv.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/ci/ciEnv.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -921,7 +921,8 @@
                             ImplicitExceptionTable* inc_table,
                             AbstractCompiler* compiler,
                             int comp_level,
-                            bool has_unsafe_access) {
+                            bool has_unsafe_access,
+                            bool has_wide_vectors) {
   VM_ENTRY_MARK;
   nmethod* nm = NULL;
   {
@@ -1016,6 +1017,7 @@
       }
     } else {
       nm->set_has_unsafe_access(has_unsafe_access);
+      nm->set_has_wide_vectors(has_wide_vectors);
 
       // Record successful registration.
       // (Put nm into the task handle *before* publishing to the Java heap.)
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/ci/ciEnv.hpp
--- a/src/share/vm/ci/ciEnv.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/ci/ciEnv.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -362,7 +362,8 @@
                        ImplicitExceptionTable*   inc_table,
                        AbstractCompiler*         compiler,
                        int                       comp_level,
-                       bool                      has_unsafe_access);
+                       bool                      has_unsafe_access,
+                       bool                      has_wide_vectors);
 
 
   // Access to certain well known ciObjects.
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/classfile/vmSymbols.hpp
--- a/src/share/vm/classfile/vmSymbols.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/classfile/vmSymbols.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -873,6 +873,20 @@
    do_name(     putOrderedInt_name,                              "putOrderedInt")                                       \
    do_alias(    putOrderedInt_signature,                        /*(Ljava/lang/Object;JI)V*/ putInt_signature)           \
                                                                                                                         \
+  do_intrinsic(_getAndAddInt,             sun_misc_Unsafe,        getAndAddInt_name, getAndAddInt_signature, F_R)       \
+   do_name(     getAndAddInt_name,                                "getAndAddInt")                                       \
+   do_signature(getAndAddInt_signature,                           "(Ljava/lang/Object;JI)I" )                           \
+  do_intrinsic(_getAndAddLong,            sun_misc_Unsafe,        getAndAddLong_name, getAndAddLong_signature, F_R)     \
+   do_name(     getAndAddLong_name,                               "getAndAddLong")                                      \
+   do_signature(getAndAddLong_signature,                          "(Ljava/lang/Object;JJ)J" )                           \
+  do_intrinsic(_getAndSetInt,             sun_misc_Unsafe,        getAndSet_name, getAndSetInt_signature, F_R)          \
+   do_name(     getAndSet_name,                                   "getAndSet")                                          \
+   do_alias(    getAndSetInt_signature,                         /*"(Ljava/lang/Object;JI)I"*/ getAndAddInt_signature)   \
+  do_intrinsic(_getAndSetLong,            sun_misc_Unsafe,        getAndSet_name, getAndSetLong_signature, F_R)         \
+   do_alias(    getAndSetLong_signature,                        /*"(Ljava/lang/Object;JJ)J"*/ getAndAddLong_signature)  \
+  do_intrinsic(_getAndSetObject,          sun_misc_Unsafe,        getAndSet_name, getAndSetObject_signature,  F_R)      \
+   do_signature(getAndSetObject_signature,                        "(Ljava/lang/Object;JLjava/lang/Object;)Ljava/lang/Object;" ) \
+                                                                                                                        \
   /* prefetch_signature is shared by all prefetch variants */                                                           \
   do_signature( prefetch_signature,        "(Ljava/lang/Object;J)V")                                                    \
                                                                                                                         \
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/codeBlob.cpp
--- a/src/share/vm/code/codeBlob.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/codeBlob.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -162,8 +162,10 @@
     assert(strlen(name1) + strlen(name2) < sizeof(stub_id), "");
     jio_snprintf(stub_id, sizeof(stub_id), "%s%s", name1, name2);
     if (PrintStubCode) {
+      ttyLocker ttyl;
       tty->print_cr("Decoding %s " INTPTR_FORMAT, stub_id, (intptr_t) stub);
       Disassembler::decode(stub->code_begin(), stub->code_end());
+      tty->cr();
     }
     Forte::register_stub(stub_id, stub->code_begin(), stub->code_end());
 
@@ -548,6 +550,7 @@
 }
 
 void RuntimeStub::print_on(outputStream* st) const {
+  ttyLocker ttyl;
   CodeBlob::print_on(st);
   st->print("Runtime Stub (" INTPTR_FORMAT "): ", this);
   st->print_cr(name());
@@ -563,6 +566,7 @@
 }
 
 void SingletonBlob::print_on(outputStream* st) const {
+  ttyLocker ttyl;
   CodeBlob::print_on(st);
   st->print_cr(name());
   Disassembler::decode((CodeBlob*)this, st);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/codeBlob.hpp
--- a/src/share/vm/code/codeBlob.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/codeBlob.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -184,7 +184,7 @@
   static void trace_new_stub(CodeBlob* blob, const char* name1, const char* name2 = "");
 
   // Print the comment associated with offset on stream, if there is one
-  virtual void print_block_comment(outputStream* stream, address block_begin) {
+  virtual void print_block_comment(outputStream* stream, address block_begin) const {
     intptr_t offset = (intptr_t)(block_begin - code_begin());
     _comments.print_block_comment(stream, offset);
   }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/icBuffer.hpp
--- a/src/share/vm/code/icBuffer.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/icBuffer.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -25,6 +25,7 @@
 #ifndef SHARE_VM_CODE_ICBUFFER_HPP
 #define SHARE_VM_CODE_ICBUFFER_HPP
 
+#include "asm/codeBuffer.hpp"
 #include "code/stubs.hpp"
 #include "interpreter/bytecodes.hpp"
 #include "memory/allocation.hpp"
@@ -48,7 +49,8 @@
  protected:
   friend class ICStubInterface;
   // This will be called only by ICStubInterface
-  void    initialize(int size) { _size = size; _ic_site = NULL; }
+  void    initialize(int size,
+                     CodeComments comments)      { _size = size; _ic_site = NULL; }
   void    finalize(); // called when a method is removed
 
   // General info
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/nmethod.cpp
--- a/src/share/vm/code/nmethod.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/nmethod.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -463,6 +463,7 @@
   _has_unsafe_access          = 0;
   _has_method_handle_invokes  = 0;
   _lazy_critical_native       = 0;
+  _has_wide_vectors           = 0;
   _marked_for_deoptimization  = 0;
   _lock_count                 = 0;
   _stack_traversal_mark       = 0;
@@ -700,7 +701,9 @@
     // then print the requested information
     if (PrintNativeNMethods) {
       print_code();
-      oop_maps->print();
+      if (oop_maps != NULL) {
+        oop_maps->print();
+      }
     }
     if (PrintRelocations) {
       print_relocations();
@@ -2669,7 +2672,7 @@
   return NULL;
 }
 
-void nmethod::print_nmethod_labels(outputStream* stream, address block_begin) {
+void nmethod::print_nmethod_labels(outputStream* stream, address block_begin) const {
   if (block_begin == entry_point())             stream->print_cr("[Entry Point]");
   if (block_begin == verified_entry_point())    stream->print_cr("[Verified Entry Point]");
   if (block_begin == exception_begin())         stream->print_cr("[Exception Handler]");
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/nmethod.hpp
--- a/src/share/vm/code/nmethod.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/nmethod.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -177,6 +177,7 @@
   unsigned int _has_unsafe_access:1;         // May fault due to unsafe access.
   unsigned int _has_method_handle_invokes:1; // Has this method MethodHandle invokes?
   unsigned int _lazy_critical_native:1;      // Lazy JNI critical native
+  unsigned int _has_wide_vectors:1;          // Preserve wide vectors at safepoints
 
   // Protected by Patching_lock
   unsigned char _state;                      // {alive, not_entrant, zombie, unloaded}
@@ -442,6 +443,9 @@
   bool  is_lazy_critical_native() const           { return _lazy_critical_native; }
   void  set_lazy_critical_native(bool z)          { _lazy_critical_native = z; }
 
+  bool  has_wide_vectors() const                  { return _has_wide_vectors; }
+  void  set_has_wide_vectors(bool z)              { _has_wide_vectors = z; }
+
   int   comp_level() const                        { return _comp_level; }
 
   // Support for oops in scopes and relocs:
@@ -649,11 +653,11 @@
   void log_state_change() const;
 
   // Prints block-level comments, including nmethod specific block labels:
-  virtual void print_block_comment(outputStream* stream, address block_begin) {
+  virtual void print_block_comment(outputStream* stream, address block_begin) const {
     print_nmethod_labels(stream, block_begin);
     CodeBlob::print_block_comment(stream, block_begin);
   }
-  void print_nmethod_labels(outputStream* stream, address block_begin);
+  void print_nmethod_labels(outputStream* stream, address block_begin) const;
 
   // Prints a comment for one native instruction (reloc info, pc desc)
   void print_code_comment_on(outputStream* st, int column, address begin, address end);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/stubs.cpp
--- a/src/share/vm/code/stubs.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/stubs.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -101,7 +101,8 @@
 
 Stub* StubQueue::request_committed(int code_size) {
   Stub* s = request(code_size);
-  if (s != NULL) commit(code_size);
+  CodeComments comments;
+  if (s != NULL) commit(code_size, comments);
   return s;
 }
 
@@ -118,7 +119,8 @@
       assert(_buffer_limit == _buffer_size, "buffer must be fully usable");
       if (_queue_end + requested_size <= _buffer_size) {
         // code fits in at the end => nothing to do
-        stub_initialize(s, requested_size);
+        CodeComments comments;
+        stub_initialize(s, requested_size, comments);
         return s;
       } else {
         // stub doesn't fit in at the queue end
@@ -135,7 +137,8 @@
     // Queue: |XXX|.......|XXXXXXX|.......|
     //        ^0  ^end    ^begin  ^limit  ^size
     s = current_stub();
-    stub_initialize(s, requested_size);
+    CodeComments comments;
+    stub_initialize(s, requested_size, comments);
     return s;
   }
   // Not enough space left
@@ -144,12 +147,12 @@
 }
 
 
-void StubQueue::commit(int committed_code_size) {
+void StubQueue::commit(int committed_code_size, CodeComments& comments) {
   assert(committed_code_size > 0, "committed_code_size must be > 0");
   int committed_size = round_to(stub_code_size_to_size(committed_code_size), CodeEntryAlignment);
   Stub* s = current_stub();
   assert(committed_size <= stub_size(s), "committed size must not exceed requested size");
-  stub_initialize(s, committed_size);
+  stub_initialize(s, committed_size, comments);
   _queue_end += committed_size;
   _number_of_stubs++;
   if (_mutex != NULL) _mutex->unlock();
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/code/stubs.hpp
--- a/src/share/vm/code/stubs.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/code/stubs.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -25,6 +25,7 @@
 #ifndef SHARE_VM_CODE_STUBS_HPP
 #define SHARE_VM_CODE_STUBS_HPP
 
+#include "asm/codeBuffer.hpp"
 #include "memory/allocation.hpp"
 #ifdef TARGET_OS_FAMILY_linux
 # include "os_linux.inline.hpp"
@@ -71,7 +72,8 @@
 class Stub VALUE_OBJ_CLASS_SPEC {
  public:
   // Initialization/finalization
-  void    initialize(int size)                   { ShouldNotCallThis(); }                // called to initialize/specify the stub's size
+  void    initialize(int size,
+                     CodeComments& comments)     { ShouldNotCallThis(); }                // called to initialize/specify the stub's size
   void    finalize()                             { ShouldNotCallThis(); }                // called before the stub is deallocated
 
   // General info/converters
@@ -104,7 +106,8 @@
 class StubInterface: public CHeapObj<mtCode> {
  public:
   // Initialization/finalization
-  virtual void    initialize(Stub* self, int size)         = 0; // called after creation (called twice if allocated via (request, commit))
+  virtual void    initialize(Stub* self, int size,
+                             CodeComments& comments)       = 0; // called after creation (called twice if allocated via (request, commit))
   virtual void    finalize(Stub* self)                     = 0; // called before deallocation
 
   // General info/converters
@@ -132,7 +135,8 @@
                                                            \
    public:                                                 \
     /* Initialization/finalization */                      \
-    virtual void    initialize(Stub* self, int size)       { cast(self)->initialize(size); }       \
+    virtual void    initialize(Stub* self, int size,       \
+                               CodeComments& comments)     { cast(self)->initialize(size, comments); } \
     virtual void    finalize(Stub* self)                   { cast(self)->finalize(); }             \
                                                            \
     /* General info */                                     \
@@ -171,7 +175,8 @@
   Stub* current_stub() const                     { return stub_at(_queue_end); }
 
   // Stub functionality accessed via interface
-  void  stub_initialize(Stub* s, int size)       { assert(size % CodeEntryAlignment == 0, "size not aligned"); _stub_interface->initialize(s, size); }
+  void  stub_initialize(Stub* s, int size,
+                        CodeComments& comments)  { assert(size % CodeEntryAlignment == 0, "size not aligned"); _stub_interface->initialize(s, size, comments); }
   void  stub_finalize(Stub* s)                   { _stub_interface->finalize(s); }
   int   stub_size(Stub* s) const                 { return _stub_interface->size(s); }
   bool  stub_contains(Stub* s, address pc) const { return _stub_interface->code_begin(s) <= pc && pc < _stub_interface->code_end(s); }
@@ -200,7 +205,8 @@
   // Stub allocation (atomic transactions)
   Stub* request_committed(int code_size);        // request a stub that provides exactly code_size space for code
   Stub* request(int requested_code_size);        // request a stub with a (maximum) code space - locks the queue
-  void  commit (int committed_code_size);        // commit the previously requested stub - unlocks the queue
+  void  commit (int committed_code_size,
+                CodeComments& comments);         // commit the previously requested stub - unlocks the queue
 
   // Stub deallocation
   void  remove_first();                          // remove the first stub in the queue
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/compiler/disassembler.cpp
--- a/src/share/vm/compiler/disassembler.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/compiler/disassembler.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -148,6 +148,7 @@
  private:
   nmethod*      _nm;
   CodeBlob*     _code;
+  CodeComments  _comments;
   outputStream* _output;
   address       _start, _end;
 
@@ -187,7 +188,7 @@
   void print_address(address value);
 
  public:
-  decode_env(CodeBlob* code, outputStream* output);
+  decode_env(CodeBlob* code, outputStream* output, CodeComments c = CodeComments());
 
   address decode_instructions(address start, address end);
 
@@ -229,12 +230,13 @@
   const char* options() { return _option_buf; }
 };
 
-decode_env::decode_env(CodeBlob* code, outputStream* output) {
+decode_env::decode_env(CodeBlob* code, outputStream* output, CodeComments c) {
   memset(this, 0, sizeof(*this));
   _output = output ? output : tty;
   _code = code;
   if (code != NULL && code->is_nmethod())
     _nm = (nmethod*) code;
+  _comments.assign(c);
 
   // by default, output pc but not bytes:
   _print_pc       = true;
@@ -356,6 +358,7 @@
   if (cb != NULL) {
     cb->print_block_comment(st, p);
   }
+  _comments.print_block_comment(st, (intptr_t)(p - _start));
   if (_print_pc) {
     st->print("  " PTR_FORMAT ": ", p);
   }
@@ -467,10 +470,9 @@
   env.decode_instructions(cb->code_begin(), cb->code_end());
 }
 
-
-void Disassembler::decode(address start, address end, outputStream* st) {
+void Disassembler::decode(address start, address end, outputStream* st, CodeComments c) {
   if (!load_library())  return;
-  decode_env env(CodeCache::find_blob_unsafe(start), st);
+  decode_env env(CodeCache::find_blob_unsafe(start), st, c);
   env.decode_instructions(start, end);
 }
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/compiler/disassembler.hpp
--- a/src/share/vm/compiler/disassembler.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/compiler/disassembler.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -25,6 +25,7 @@
 #ifndef SHARE_VM_COMPILER_DISASSEMBLER_HPP
 #define SHARE_VM_COMPILER_DISASSEMBLER_HPP
 
+#include "asm/codeBuffer.hpp"
 #include "runtime/globals.hpp"
 #ifdef TARGET_OS_FAMILY_linux
 # include "os_linux.inline.hpp"
@@ -87,7 +88,7 @@
   }
   static void decode(CodeBlob *cb,               outputStream* st = NULL);
   static void decode(nmethod* nm,                outputStream* st = NULL);
-  static void decode(address begin, address end, outputStream* st = NULL);
+  static void decode(address begin, address end, outputStream* st = NULL, CodeComments c = CodeComments());
 };
 
 #endif // SHARE_VM_COMPILER_DISASSEMBLER_HPP
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/interpreter/interpreter.cpp
--- a/src/share/vm/interpreter/interpreter.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/interpreter/interpreter.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -60,6 +60,8 @@
 
 
 void InterpreterCodelet::print_on(outputStream* st) const {
+  ttyLocker ttyl;
+
   if (PrintInterpreter) {
     st->cr();
     st->print_cr("----------------------------------------------------------------------");
@@ -72,7 +74,7 @@
 
   if (PrintInterpreter) {
     st->cr();
-    Disassembler::decode(code_begin(), code_end(), st);
+    Disassembler::decode(code_begin(), code_end(), st, DEBUG_ONLY(_comments) NOT_DEBUG(CodeComments()));
   }
 }
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/interpreter/interpreter.hpp
--- a/src/share/vm/interpreter/interpreter.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/interpreter/interpreter.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -48,10 +48,12 @@
   int         _size;                             // the size in bytes
   const char* _description;                      // a description of the codelet, for debugging & printing
   Bytecodes::Code _bytecode;                     // associated bytecode if any
+  DEBUG_ONLY(CodeComments _comments;)            // Comments for annotating assembler output.
 
  public:
   // Initialization/finalization
-  void    initialize(int size)                   { _size = size; }
+  void    initialize(int size,
+                     CodeComments& comments)     { _size = size; DEBUG_ONLY(_comments.assign(comments);) }
   void    finalize()                             { ShouldNotCallThis(); }
 
   // General info/converters
@@ -129,7 +131,7 @@
 
 
     // commit Codelet
-    AbstractInterpreter::code()->commit((*_masm)->code()->pure_insts_size());
+    AbstractInterpreter::code()->commit((*_masm)->code()->pure_insts_size(), (*_masm)->code()->comments());
     // make sure nobody can use _masm outside a CodeletMark lifespan
     *_masm = NULL;
   }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/oops/method.cpp
--- a/src/share/vm/oops/method.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/oops/method.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -251,8 +251,12 @@
 
 
 int Method::bci_from(address bcp) const {
+#ifdef ASSERT
+  { ResourceMark rm;
   assert(is_native() && bcp == code_base() || contains(bcp) || is_error_reported(),
          err_msg("bcp doesn't belong to this method: bcp: " INTPTR_FORMAT ", method: %s", bcp, name_and_sig_as_C_string()));
+  }
+#endif
   return bcp - code_base();
 }
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/c2_globals.hpp
--- a/src/share/vm/opto/c2_globals.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -85,7 +85,7 @@
           "Max vector size in bytes, "                                      \
           "actual size could be less depending on elements type")           \
                                                                             \
-  product(bool, AlignVector, false,                                         \
+  product(bool, AlignVector, true,                                          \
           "Perform vector store/load alignment in loop")                    \
                                                                             \
   product(intx, NumberOfLoopInstrToAlign, 4,                                \
@@ -535,7 +535,7 @@
   notproduct(bool, TraceSpilling, false,                                    \
           "Trace spilling")                                                 \
                                                                             \
-  notproduct(bool, TraceTypeProfile, false,                                 \
+  diagnostic(bool, TraceTypeProfile, false,                                 \
           "Trace type profile")                                             \
                                                                             \
   develop(bool, PoisonOSREntry, true,                                       \
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/classes.hpp
--- a/src/share/vm/opto/classes.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/classes.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -83,6 +83,12 @@
 macro(CompareAndSwapL)
 macro(CompareAndSwapP)
 macro(CompareAndSwapN)
+macro(GetAndAddI)
+macro(GetAndAddL)
+macro(GetAndSetI)
+macro(GetAndSetL)
+macro(GetAndSetP)
+macro(GetAndSetN)
 macro(Con)
 macro(ConN)
 macro(ConD)
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/compile.cpp
--- a/src/share/vm/opto/compile.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/compile.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -825,7 +825,8 @@
                            &_handler_table, &_inc_table,
                            compiler,
                            env()->comp_level(),
-                           has_unsafe_access()
+                           has_unsafe_access(),
+                           SharedRuntime::is_wide_vector(max_vector_size())
                            );
   }
 }
@@ -963,6 +964,7 @@
   _trap_can_recompile = false;  // no traps emitted yet
   _major_progress = true; // start out assuming good things will happen
   set_has_unsafe_access(false);
+  set_max_vector_size(0);
   Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist));
   set_decompile_count(0);
 
@@ -2274,6 +2276,12 @@
   case Op_CompareAndSwapL:
   case Op_CompareAndSwapP:
   case Op_CompareAndSwapN:
+  case Op_GetAndAddI:
+  case Op_GetAndAddL:
+  case Op_GetAndSetI:
+  case Op_GetAndSetL:
+  case Op_GetAndSetP:
+  case Op_GetAndSetN:
   case Op_StoreP:
   case Op_StoreN:
   case Op_LoadB:
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/compile.hpp
--- a/src/share/vm/opto/compile.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/compile.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -279,6 +279,7 @@
   bool                  _has_split_ifs;         // True if the method _may_ have some split-if
   bool                  _has_unsafe_access;     // True if the method _may_ produce faults in unsafe loads or stores.
   bool                  _has_stringbuilder;     // True StringBuffers or StringBuilders are allocated
+  int                   _max_vector_size;       // Maximum size of generated vectors
   uint                  _trap_hist[trapHistLength];  // Cumulative traps
   bool                  _trap_can_recompile;    // Have we emitted a recompiling trap?
   uint                  _decompile_count;       // Cumulative decompilation counts.
@@ -443,6 +444,8 @@
   void          set_has_unsafe_access(bool z)   { _has_unsafe_access = z; }
   bool              has_stringbuilder() const   { return _has_stringbuilder; }
   void          set_has_stringbuilder(bool z)   { _has_stringbuilder = z; }
+  int               max_vector_size() const     { return _max_vector_size; }
+  void          set_max_vector_size(int s)      { _max_vector_size = s; }
   void          set_trap_count(uint r, uint c)  { assert(r < trapHistLength, "oob");        _trap_hist[r] = c; }
   uint              trap_count(uint r) const    { assert(r < trapHistLength, "oob"); return _trap_hist[r]; }
   bool              trap_can_recompile() const  { return _trap_can_recompile; }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/connode.cpp
--- a/src/share/vm/opto/connode.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/connode.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -480,7 +480,9 @@
         opc == Op_CheckCastPP ||
         opc == Op_StorePConditional ||
         opc == Op_CompareAndSwapP ||
-        opc == Op_CompareAndSwapN;
+        opc == Op_CompareAndSwapN ||
+        opc == Op_GetAndSetP ||
+        opc == Op_GetAndSetN;
   }
   return possible_alias;
 }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/doCall.cpp
--- a/src/share/vm/opto/doCall.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/doCall.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -40,11 +40,10 @@
 #include "prims/nativeLookup.hpp"
 #include "runtime/sharedRuntime.hpp"
 
-#ifndef PRODUCT
 void trace_type_profile(ciMethod *method, int depth, int bci, ciMethod *prof_method, ciKlass *prof_klass, int site_count, int receiver_count) {
-  if (TraceTypeProfile || PrintInlining || PrintOptoInlining) {
+  if (TraceTypeProfile || PrintInlining NOT_PRODUCT(|| PrintOptoInlining)) {
     if (!PrintInlining) {
-      if (!PrintOpto && !PrintCompilation) {
+      if (NOT_PRODUCT(!PrintOpto &&) !PrintCompilation) {
         method->print_short_name();
         tty->cr();
       }
@@ -56,7 +55,6 @@
     tty->cr();
   }
 }
-#endif
 
 CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool call_is_virtual,
                                        JVMState* jvms, bool allow_inline,
@@ -225,13 +223,13 @@
           }
           if (miss_cg != NULL) {
             if (next_hit_cg != NULL) {
-              NOT_PRODUCT(trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1)));
+              trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
               // We don't need to record dependency on a receiver here and below.
               // Whenever we inline, the dependency is added by Parse::Parse().
               miss_cg = CallGenerator::for_predicted_call(profile.receiver(1), miss_cg, next_hit_cg, PROB_MAX);
             }
             if (miss_cg != NULL) {
-              NOT_PRODUCT(trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count));
+              trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count);
               CallGenerator* cg = CallGenerator::for_predicted_call(profile.receiver(0), miss_cg, hit_cg, profile.receiver_prob(0));
               if (cg != NULL)  return cg;
             }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/escape.cpp
--- a/src/share/vm/opto/escape.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/escape.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -282,6 +282,26 @@
   return has_non_escaping_obj;
 }
 
+// Utility function for nodes that load an object
+void ConnectionGraph::add_objload_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist) {
+  // Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
+  // ThreadLocal has RawPtr type.
+  const Type* t = _igvn->type(n);
+  if (t->make_ptr() != NULL) {
+    Node* adr = n->in(MemNode::Address);
+#ifdef ASSERT
+    if (!adr->is_AddP()) {
+      assert(_igvn->type(adr)->isa_rawptr(), "sanity");
+    } else {
+      assert((ptnode_adr(adr->_idx) == NULL ||
+              ptnode_adr(adr->_idx)->as_Field()->is_oop()), "sanity");
+    }
+#endif
+    add_local_var_and_edge(n, PointsToNode::NoEscape,
+                           adr, delayed_worklist);
+  }
+}
+
 // Populate Connection Graph with PointsTo nodes and create simple
 // connection graph edges.
 void ConnectionGraph::add_node_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist) {
@@ -387,22 +407,7 @@
     case Op_LoadP:
     case Op_LoadN:
     case Op_LoadPLocked: {
-      // Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
-      // ThreadLocal has RawPrt type.
-      const Type* t = igvn->type(n);
-      if (t->make_ptr() != NULL) {
-        Node* adr = n->in(MemNode::Address);
-#ifdef ASSERT
-        if (!adr->is_AddP()) {
-          assert(igvn->type(adr)->isa_rawptr(), "sanity");
-        } else {
-          assert((ptnode_adr(adr->_idx) == NULL ||
-                  ptnode_adr(adr->_idx)->as_Field()->is_oop()), "sanity");
-        }
-#endif
-        add_local_var_and_edge(n, PointsToNode::NoEscape,
-                               adr, delayed_worklist);
-      }
+      add_objload_to_connection_graph(n, delayed_worklist);
       break;
     }
     case Op_Parm: {
@@ -417,7 +422,7 @@
     }
     case Op_Phi: {
       // Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
-      // ThreadLocal has RawPrt type.
+      // ThreadLocal has RawPtr type.
       const Type* t = n->as_Phi()->type();
       if (t->make_ptr() != NULL) {
         add_local_var(n, PointsToNode::NoEscape);
@@ -446,6 +451,11 @@
       }
       break;
     }
+    case Op_GetAndSetP:
+    case Op_GetAndSetN: {
+      add_objload_to_connection_graph(n, delayed_worklist);
+      // fallthrough
+    }
     case Op_StoreP:
     case Op_StoreN:
     case Op_StorePConditional:
@@ -585,7 +595,7 @@
     case Op_LoadN:
     case Op_LoadPLocked: {
       // Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
-      // ThreadLocal has RawPrt type.
+      // ThreadLocal has RawPtr type.
       const Type* t = _igvn->type(n);
       if (t->make_ptr() != NULL) {
         Node* adr = n->in(MemNode::Address);
@@ -596,7 +606,7 @@
     }
     case Op_Phi: {
       // Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
-      // ThreadLocal has RawPrt type.
+      // ThreadLocal has RawPtr type.
       const Type* t = n->as_Phi()->type();
       if (t->make_ptr() != NULL) {
         for (uint i = 1; i < n->req(); i++) {
@@ -638,8 +648,16 @@
     case Op_StoreN:
     case Op_StorePConditional:
     case Op_CompareAndSwapP:
-    case Op_CompareAndSwapN: {
+    case Op_CompareAndSwapN:
+    case Op_GetAndSetP:
+    case Op_GetAndSetN: {
       Node* adr = n->in(MemNode::Address);
+      if (opcode == Op_GetAndSetP || opcode == Op_GetAndSetN) {
+        const Type* t = _igvn->type(n);
+        if (t->make_ptr() != NULL) {
+          add_local_var_and_edge(n, PointsToNode::NoEscape, adr, NULL);
+        }
+      }
       const Type *adr_type = _igvn->type(adr);
       adr_type = adr_type->make_ptr();
       if (adr_type->isa_oopptr() ||
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/escape.hpp
--- a/src/share/vm/opto/escape.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/escape.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -371,6 +371,8 @@
     _nodes.at_put(n->_idx, ptn);
   }
 
+  // Utility function for nodes that load an object
+  void add_objload_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist);
   // Create PointsToNode node and add it to Connection Graph.
   void add_node_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist);
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/library_call.cpp
--- a/src/share/vm/opto/library_call.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/library_call.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -65,6 +65,8 @@
  private:
   LibraryIntrinsic* _intrinsic;   // the library intrinsic being called
 
+  const TypeOopPtr* sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type, bool is_native_ptr = false);
+
  public:
   LibraryCallKit(JVMState* caller, LibraryIntrinsic* intrinsic)
     : GraphKit(caller),
@@ -241,7 +243,8 @@
                                     Node* src,  Node* src_offset,
                                     Node* dest, Node* dest_offset,
                                     Node* copy_length, bool dest_uninitialized);
-  bool inline_unsafe_CAS(BasicType type);
+  typedef enum { LS_xadd, LS_xchg, LS_cmpxchg } LoadStoreKind;
+  bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind);
   bool inline_unsafe_ordered_store(BasicType type);
   bool inline_fp_conversions(vmIntrinsics::ID id);
   bool inline_numberOfLeadingZeros(vmIntrinsics::ID id);
@@ -290,6 +293,11 @@
     case vmIntrinsics::_compareTo:
     case vmIntrinsics::_equals:
     case vmIntrinsics::_equalsC:
+    case vmIntrinsics::_getAndAddInt:
+    case vmIntrinsics::_getAndAddLong:
+    case vmIntrinsics::_getAndSetInt:
+    case vmIntrinsics::_getAndSetLong:
+    case vmIntrinsics::_getAndSetObject:
       break;  // InlineNatives does not control String.compareTo
     case vmIntrinsics::_Reference_get:
       break;  // InlineNatives does not control Reference.get
@@ -369,6 +377,42 @@
     // across safepoint since GC can change it value.
     break;
 
+  case vmIntrinsics::_compareAndSwapObject:
+#ifdef _LP64
+    if (!UseCompressedOops && !Matcher::match_rule_supported(Op_CompareAndSwapP)) return NULL;
+#endif
+    break;
+
+  case vmIntrinsics::_compareAndSwapLong:
+    if (!Matcher::match_rule_supported(Op_CompareAndSwapL)) return NULL;
+    break;
+
+  case vmIntrinsics::_getAndAddInt:
+    if (!Matcher::match_rule_supported(Op_GetAndAddI)) return NULL;
+    break;
+
+  case vmIntrinsics::_getAndAddLong:
+    if (!Matcher::match_rule_supported(Op_GetAndAddL)) return NULL;
+    break;
+
+  case vmIntrinsics::_getAndSetInt:
+    if (!Matcher::match_rule_supported(Op_GetAndSetI)) return NULL;
+    break;
+
+  case vmIntrinsics::_getAndSetLong:
+    if (!Matcher::match_rule_supported(Op_GetAndSetL)) return NULL;
+    break;
+
+  case vmIntrinsics::_getAndSetObject:
+#ifdef _LP64
+    if (!UseCompressedOops && !Matcher::match_rule_supported(Op_GetAndSetP)) return NULL;
+    if (UseCompressedOops && !Matcher::match_rule_supported(Op_GetAndSetN)) return NULL;
+    break;
+#else
+    if (!Matcher::match_rule_supported(Op_GetAndSetP)) return NULL;
+    break;
+#endif
+
  default:
     assert(id <= vmIntrinsics::LAST_COMPILER_INLINE, "caller responsibility");
     assert(id != vmIntrinsics::_Object_init && id != vmIntrinsics::_invoke, "enum out of order?");
@@ -620,11 +664,11 @@
     return inline_unsafe_prefetch(!is_native_ptr, is_store, is_static);
 
   case vmIntrinsics::_compareAndSwapObject:
-    return inline_unsafe_CAS(T_OBJECT);
+    return inline_unsafe_load_store(T_OBJECT, LS_cmpxchg);
   case vmIntrinsics::_compareAndSwapInt:
-    return inline_unsafe_CAS(T_INT);
+    return inline_unsafe_load_store(T_INT, LS_cmpxchg);
   case vmIntrinsics::_compareAndSwapLong:
-    return inline_unsafe_CAS(T_LONG);
+    return inline_unsafe_load_store(T_LONG, LS_cmpxchg);
 
   case vmIntrinsics::_putOrderedObject:
     return inline_unsafe_ordered_store(T_OBJECT);
@@ -633,6 +677,17 @@
   case vmIntrinsics::_putOrderedLong:
     return inline_unsafe_ordered_store(T_LONG);
 
+  case vmIntrinsics::_getAndAddInt:
+    return inline_unsafe_load_store(T_INT, LS_xadd);
+  case vmIntrinsics::_getAndAddLong:
+    return inline_unsafe_load_store(T_LONG, LS_xadd);
+  case vmIntrinsics::_getAndSetInt:
+    return inline_unsafe_load_store(T_INT, LS_xchg);
+  case vmIntrinsics::_getAndSetLong:
+    return inline_unsafe_load_store(T_LONG, LS_xchg);
+  case vmIntrinsics::_getAndSetObject:
+    return inline_unsafe_load_store(T_OBJECT, LS_xchg);
+
   case vmIntrinsics::_currentThread:
     return inline_native_currentThread();
   case vmIntrinsics::_isInterrupted:
@@ -2301,6 +2356,43 @@
 // Interpret Unsafe.fieldOffset cookies correctly:
 extern jlong Unsafe_field_offset_to_byte_offset(jlong field_offset);
 
+const TypeOopPtr* LibraryCallKit::sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type, bool is_native_ptr) {
+  // Attempt to infer a sharper value type from the offset and base type.
+  ciKlass* sharpened_klass = NULL;
+
+  // See if it is an instance field, with an object type.
+  if (alias_type->field() != NULL) {
+    assert(!is_native_ptr, "native pointer op cannot use a java address");
+    if (alias_type->field()->type()->is_klass()) {
+      sharpened_klass = alias_type->field()->type()->as_klass();
+    }
+  }
+
+  // See if it is a narrow oop array.
+  if (adr_type->isa_aryptr()) {
+    if (adr_type->offset() >= objArrayOopDesc::base_offset_in_bytes()) {
+      const TypeOopPtr *elem_type = adr_type->is_aryptr()->elem()->isa_oopptr();
+      if (elem_type != NULL) {
+        sharpened_klass = elem_type->klass();
+      }
+    }
+  }
+
+  if (sharpened_klass != NULL) {
+    const TypeOopPtr* tjp = TypeOopPtr::make_from_klass(sharpened_klass);
+
+#ifndef PRODUCT
+    if (PrintIntrinsics || PrintInlining || PrintOptoInlining) {
+      tty->print("  from base type:  ");   adr_type->dump();
+      tty->print("  sharpened value: ");   tjp->dump();
+    }
+#endif
+    // Sharpen the value type.
+    return tjp;
+  }
+  return NULL;
+}
+
 bool LibraryCallKit::inline_unsafe_access(bool is_native_ptr, bool is_store, BasicType type, bool is_volatile) {
   if (callee()->is_static())  return false;  // caller must have the capability!
 
@@ -2430,39 +2522,9 @@
                            offset != top() && heap_base_oop != top();
 
   if (!is_store && type == T_OBJECT) {
-    // Attempt to infer a sharper value type from the offset and base type.
-    ciKlass* sharpened_klass = NULL;
-
-    // See if it is an instance field, with an object type.
-    if (alias_type->field() != NULL) {
-      assert(!is_native_ptr, "native pointer op cannot use a java address");
-      if (alias_type->field()->type()->is_klass()) {
-        sharpened_klass = alias_type->field()->type()->as_klass();
-      }
-    }
-
-    // See if it is a narrow oop array.
-    if (adr_type->isa_aryptr()) {
-      if (adr_type->offset() >= objArrayOopDesc::base_offset_in_bytes()) {
-        const TypeOopPtr *elem_type = adr_type->is_aryptr()->elem()->isa_oopptr();
-        if (elem_type != NULL) {
-          sharpened_klass = elem_type->klass();
-        }
-      }
-    }
-
-    if (sharpened_klass != NULL) {
-      const TypeOopPtr* tjp = TypeOopPtr::make_from_klass(sharpened_klass);
-
-      // Sharpen the value type.
+    const TypeOopPtr* tjp = sharpen_unsafe_type(alias_type, adr_type, is_native_ptr);
+    if (tjp != NULL) {
       value_type = tjp;
-
-#ifndef PRODUCT
-      if (PrintIntrinsics || PrintInlining || PrintOptoInlining) {
-        tty->print("  from base type:  ");   adr_type->dump();
-        tty->print("  sharpened value: "); value_type->dump();
-      }
-#endif
     }
   }
 
@@ -2673,9 +2735,9 @@
   return true;
 }
 
-//----------------------------inline_unsafe_CAS----------------------------
-
-bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
+//----------------------------inline_unsafe_load_store----------------------------
+
+bool LibraryCallKit::inline_unsafe_load_store(BasicType type, LoadStoreKind kind) {
   // This basic scheme here is the same as inline_unsafe_access, but
   // differs in enough details that combining them would make the code
   // overly confusing.  (This is a true fact! I originally combined
@@ -2686,37 +2748,47 @@
   if (callee()->is_static())  return false;  // caller must have the capability!
 
 #ifndef PRODUCT
+  BasicType rtype;
   {
     ResourceMark rm;
-    // Check the signatures.
     ciSignature* sig = signature();
+    rtype = sig->return_type()->basic_type();
+    if (kind == LS_xadd || kind == LS_xchg) {
+      // Check the signatures.
 #ifdef ASSERT
-    BasicType rtype = sig->return_type()->basic_type();
-    assert(rtype == T_BOOLEAN, "CAS must return boolean");
-    assert(sig->count() == 4, "CAS has 4 arguments");
-    assert(sig->type_at(0)->basic_type() == T_OBJECT, "CAS base is object");
-    assert(sig->type_at(1)->basic_type() == T_LONG, "CAS offset is long");
+      assert(rtype == type, "get and set must return the expected type");
+      assert(sig->count() == 3, "get and set has 3 arguments");
+      assert(sig->type_at(0)->basic_type() == T_OBJECT, "get and set base is object");
+      assert(sig->type_at(1)->basic_type() == T_LONG, "get and set offset is long");
+      assert(sig->type_at(2)->basic_type() == type, "get and set must take expected type as new value/delta");
 #endif // ASSERT
+    } else if (kind == LS_cmpxchg) {
+      // Check the signatures.
+#ifdef ASSERT
+      assert(rtype == T_BOOLEAN, "CAS must return boolean");
+      assert(sig->count() == 4, "CAS has 4 arguments");
+      assert(sig->type_at(0)->basic_type() == T_OBJECT, "CAS base is object");
+      assert(sig->type_at(1)->basic_type() == T_LONG, "CAS offset is long");
+#endif // ASSERT
+    } else {
+      ShouldNotReachHere();
+    }
   }
 #endif //PRODUCT
 
   // number of stack slots per value argument (1 or 2)
   int type_words = type2size[type];
 
-  // Cannot inline wide CAS on machines that don't support it natively
-  if (type2aelembytes(type) > BytesPerInt && !VM_Version::supports_cx8())
-    return false;
-
   C->set_has_unsafe_access(true);  // Mark eventual nmethod as "unsafe".
 
-  // Argument words:  "this" plus oop plus offset plus oldvalue plus newvalue;
-  int nargs = 1 + 1 + 2  + type_words + type_words;
-
-  // pop arguments: newval, oldval, offset, base, and receiver
+  // Argument words:  "this" plus oop plus offset (plus oldvalue) plus newvalue/delta;
+  int nargs = 1 + 1 + 2  + ((kind == LS_cmpxchg) ? type_words : 0) + type_words;
+
+  // pop arguments: newval, offset, base, and receiver
   debug_only(int saved_sp = _sp);
   _sp += nargs;
   Node* newval   = (type_words == 1) ? pop() : pop_pair();
-  Node* oldval   = (type_words == 1) ? pop() : pop_pair();
+  Node* oldval   = (kind == LS_cmpxchg) ? ((type_words == 1) ? pop() : pop_pair()) : NULL;
   Node *offset   = pop_pair();
   Node *base     = pop();
   Node *receiver = pop();
@@ -2740,16 +2812,24 @@
   Node* adr = make_unsafe_address(base, offset);
   const TypePtr *adr_type = _gvn.type(adr)->isa_ptr();
 
-  // (Unlike inline_unsafe_access, there seems no point in trying
-  // to refine types. Just use the coarse types here.
+  // For CAS, unlike inline_unsafe_access, there seems no point in
+  // trying to refine types. Just use the coarse types here.
   const Type *value_type = Type::get_const_basic_type(type);
   Compile::AliasType* alias_type = C->alias_type(adr_type);
   assert(alias_type->index() != Compile::AliasIdxBot, "no bare pointers here");
+
+  if (kind == LS_xchg && type == T_OBJECT) {
+    const TypeOopPtr* tjp = sharpen_unsafe_type(alias_type, adr_type);
+    if (tjp != NULL) {
+      value_type = tjp;
+    }
+  }
+
   int alias_idx = C->get_alias_index(adr_type);
 
-  // Memory-model-wise, a CAS acts like a little synchronized block,
-  // so needs barriers on each side.  These don't translate into
-  // actual barriers on most machines, but we still need rest of
+  // Memory-model-wise, a LoadStore acts like a little synchronized
+  // block, so needs barriers on each side.  These don't translate
+  // into actual barriers on most machines, but we still need rest of
   // compiler to respect ordering.
 
   insert_mem_bar(Op_MemBarRelease);
@@ -2762,13 +2842,29 @@
 
   // For now, we handle only those cases that actually exist: ints,
   // longs, and Object. Adding others should be straightforward.
-  Node* cas;
+  Node* load_store;
   switch(type) {
   case T_INT:
-    cas = _gvn.transform(new (C, 5) CompareAndSwapINode(control(), mem, adr, newval, oldval));
+    if (kind == LS_xadd) {
+      load_store = _gvn.transform(new (C, 4) GetAndAddINode(control(), mem, adr, newval, adr_type));
+    } else if (kind == LS_xchg) {
+      load_store = _gvn.transform(new (C, 4) GetAndSetINode(control(), mem, adr, newval, adr_type));
+    } else if (kind == LS_cmpxchg) {
+      load_store = _gvn.transform(new (C, 5) CompareAndSwapINode(control(), mem, adr, newval, oldval));
+    } else {
+      ShouldNotReachHere();
+    }
     break;
   case T_LONG:
-    cas = _gvn.transform(new (C, 5) CompareAndSwapLNode(control(), mem, adr, newval, oldval));
+    if (kind == LS_xadd) {
+      load_store = _gvn.transform(new (C, 4) GetAndAddLNode(control(), mem, adr, newval, adr_type));
+    } else if (kind == LS_xchg) {
+      load_store = _gvn.transform(new (C, 4) GetAndSetLNode(control(), mem, adr, newval, adr_type));
+    } else if (kind == LS_cmpxchg) {
+      load_store = _gvn.transform(new (C, 5) CompareAndSwapLNode(control(), mem, adr, newval, oldval));
+    } else {
+      ShouldNotReachHere();
+    }
     break;
   case T_OBJECT:
     // Transformation of a value which could be NULL pointer (CastPP #NULL)
@@ -2778,7 +2874,6 @@
       newval = _gvn.makecon(TypePtr::NULL_PTR);
 
     // Reference stores need a store barrier.
-    // (They don't if CAS fails, but it isn't worth checking.)
     pre_barrier(true /* do_load*/,
                 control(), base, adr, alias_idx, newval, value_type->make_oopptr(),
                 NULL /* pre_val*/,
@@ -2786,32 +2881,50 @@
 #ifdef _LP64
     if (adr->bottom_type()->is_ptr_to_narrowoop()) {
       Node *newval_enc = _gvn.transform(new (C, 2) EncodePNode(newval, newval->bottom_type()->make_narrowoop()));
-      Node *oldval_enc = _gvn.transform(new (C, 2) EncodePNode(oldval, oldval->bottom_type()->make_narrowoop()));
-      cas = _gvn.transform(new (C, 5) CompareAndSwapNNode(control(), mem, adr,
-                                                          newval_enc, oldval_enc));
+      if (kind == LS_xchg) {
+        load_store = _gvn.transform(new (C, 4) GetAndSetNNode(control(), mem, adr,
+                                                              newval_enc, adr_type, value_type->make_narrowoop()));
+      } else {
+        assert(kind == LS_cmpxchg, "wrong LoadStore operation");
+        Node *oldval_enc = _gvn.transform(new (C, 2) EncodePNode(oldval, oldval->bottom_type()->make_narrowoop()));
+        load_store = _gvn.transform(new (C, 5) CompareAndSwapNNode(control(), mem, adr,
+                                                                   newval_enc, oldval_enc));
+      }
     } else
 #endif
     {
-      cas = _gvn.transform(new (C, 5) CompareAndSwapPNode(control(), mem, adr, newval, oldval));
+      if (kind == LS_xchg) {
+        load_store = _gvn.transform(new (C, 4) GetAndSetPNode(control(), mem, adr, newval, adr_type, value_type->is_oopptr()));
+      } else {
+        assert(kind == LS_cmpxchg, "wrong LoadStore operation");
+        load_store = _gvn.transform(new (C, 5) CompareAndSwapPNode(control(), mem, adr, newval, oldval));
+      }
     }
-    post_barrier(control(), cas, base, adr, alias_idx, newval, T_OBJECT, true);
+    post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true);
     break;
   default:
     ShouldNotReachHere();
     break;
   }
 
-  // SCMemProjNodes represent the memory state of CAS. Their main
-  // role is to prevent CAS nodes from being optimized away when their
-  // results aren't used.
-  Node* proj = _gvn.transform( new (C, 1) SCMemProjNode(cas));
+  // SCMemProjNodes represent the memory state of a LoadStore. Their
+  // main role is to prevent LoadStore nodes from being optimized away
+  // when their results aren't used.
+  Node* proj = _gvn.transform( new (C, 1) SCMemProjNode(load_store));
   set_memory(proj, alias_idx);
 
   // Add the trailing membar surrounding the access
   insert_mem_bar(Op_MemBarCPUOrder);
   insert_mem_bar(Op_MemBarAcquire);
 
-  push(cas);
+#ifdef _LP64
+  if (type == T_OBJECT && adr->bottom_type()->is_ptr_to_narrowoop() && kind == LS_xchg) {
+    load_store = _gvn.transform(new (C, 2) DecodeNNode(load_store, load_store->bottom_type()->make_ptr()));
+  }
+#endif
+
+  assert(type2size[load_store->bottom_type()->basic_type()] == type2size[rtype], "result type should match");
+  push_node(load_store->bottom_type()->basic_type(), load_store);
   return true;
 }
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/matcher.cpp
--- a/src/share/vm/opto/matcher.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/matcher.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -2134,10 +2134,10 @@
       case Op_CompareAndSwapP:
       case Op_CompareAndSwapN: {   // Convert trinary to binary-tree
         Node *newval = n->in(MemNode::ValueIn );
-        Node *oldval  = n->in(LoadStoreNode::ExpectedIn);
+        Node *oldval  = n->in(LoadStoreConditionalNode::ExpectedIn);
         Node *pair = new (C, 3) BinaryNode( oldval, newval );
         n->set_req(MemNode::ValueIn,pair);
-        n->del_req(LoadStoreNode::ExpectedIn);
+        n->del_req(LoadStoreConditionalNode::ExpectedIn);
         break;
       }
       case Op_CMoveD:              // Convert trinary to binary-tree
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/memnode.cpp
--- a/src/share/vm/opto/memnode.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/memnode.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -2552,14 +2552,38 @@
 }
 
 //=============================================================================
-LoadStoreNode::LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex ) : Node(5) {
+//----------------------------------LoadStoreNode------------------------------
+LoadStoreNode::LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* rt, uint required )
+  : Node(required),
+    _type(rt),
+    _adr_type(at)
+{
   init_req(MemNode::Control, c  );
   init_req(MemNode::Memory , mem);
   init_req(MemNode::Address, adr);
   init_req(MemNode::ValueIn, val);
-  init_req(         ExpectedIn, ex );
   init_class_id(Class_LoadStore);
-
+}
+
+uint LoadStoreNode::ideal_reg() const {
+  return _type->ideal_reg();
+}
+
+bool LoadStoreNode::result_not_used() const {
+  for( DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++ ) {
+    Node *x = fast_out(i);
+    if (x->Opcode() == Op_SCMemProj) continue;
+    return false;
+  }
+  return true;
+}
+
+uint LoadStoreNode::size_of() const { return sizeof(*this); }
+
+//=============================================================================
+//----------------------------------LoadStoreConditionalNode--------------------
+LoadStoreConditionalNode::LoadStoreConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex ) : LoadStoreNode(c, mem, adr, val, NULL, TypeInt::BOOL, 5) {
+  init_req(ExpectedIn, ex );
 }
 
 //=============================================================================
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/memnode.hpp
--- a/src/share/vm/opto/memnode.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/memnode.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -657,23 +657,36 @@
 //------------------------------LoadStoreNode---------------------------
 // Note: is_Mem() method returns 'true' for this class.
 class LoadStoreNode : public Node {
+private:
+  const Type* const _type;      // What kind of value is loaded?
+  const TypePtr* _adr_type;     // What kind of memory is being addressed?
+  virtual uint size_of() const; // Size is bigger
+public:
+  LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* rt, uint required );
+  virtual bool depends_only_on_test() const { return false; }
+  virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
+
+  virtual const Type *bottom_type() const { return _type; }
+  virtual uint ideal_reg() const;
+  virtual const class TypePtr *adr_type() const { return _adr_type; }  // returns bottom_type of address
+
+  bool result_not_used() const;
+};
+
+class LoadStoreConditionalNode : public LoadStoreNode {
 public:
   enum {
     ExpectedIn = MemNode::ValueIn+1 // One more input than MemNode
   };
-  LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex);
-  virtual bool depends_only_on_test() const { return false; }
-  virtual const Type *bottom_type() const { return TypeInt::BOOL; }
-  virtual uint ideal_reg() const { return Op_RegI; }
-  virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
+  LoadStoreConditionalNode(Node *c, Node *mem, Node *adr, Node *val, Node *ex);
 };
 
 //------------------------------StorePConditionalNode---------------------------
 // Conditionally store pointer to memory, if no change since prior
 // load-locked.  Sets flags for success or failure of the store.
-class StorePConditionalNode : public LoadStoreNode {
+class StorePConditionalNode : public LoadStoreConditionalNode {
 public:
-  StorePConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreNode(c, mem, adr, val, ll) { }
+  StorePConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreConditionalNode(c, mem, adr, val, ll) { }
   virtual int Opcode() const;
   // Produces flags
   virtual uint ideal_reg() const { return Op_RegFlags; }
@@ -682,9 +695,9 @@
 //------------------------------StoreIConditionalNode---------------------------
 // Conditionally store int to memory, if no change since prior
 // load-locked.  Sets flags for success or failure of the store.
-class StoreIConditionalNode : public LoadStoreNode {
+class StoreIConditionalNode : public LoadStoreConditionalNode {
 public:
-  StoreIConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ii ) : LoadStoreNode(c, mem, adr, val, ii) { }
+  StoreIConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ii ) : LoadStoreConditionalNode(c, mem, adr, val, ii) { }
   virtual int Opcode() const;
   // Produces flags
   virtual uint ideal_reg() const { return Op_RegFlags; }
@@ -693,9 +706,9 @@
 //------------------------------StoreLConditionalNode---------------------------
 // Conditionally store long to memory, if no change since prior
 // load-locked.  Sets flags for success or failure of the store.
-class StoreLConditionalNode : public LoadStoreNode {
+class StoreLConditionalNode : public LoadStoreConditionalNode {
 public:
-  StoreLConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreNode(c, mem, adr, val, ll) { }
+  StoreLConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreConditionalNode(c, mem, adr, val, ll) { }
   virtual int Opcode() const;
   // Produces flags
   virtual uint ideal_reg() const { return Op_RegFlags; }
@@ -703,32 +716,75 @@
 
 
 //------------------------------CompareAndSwapLNode---------------------------
-class CompareAndSwapLNode : public LoadStoreNode {
+class CompareAndSwapLNode : public LoadStoreConditionalNode {
 public:
-  CompareAndSwapLNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
+  CompareAndSwapLNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
   virtual int Opcode() const;
 };
 
 
 //------------------------------CompareAndSwapINode---------------------------
-class CompareAndSwapINode : public LoadStoreNode {
+class CompareAndSwapINode : public LoadStoreConditionalNode {
 public:
-  CompareAndSwapINode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
+  CompareAndSwapINode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
   virtual int Opcode() const;
 };
 
 
 //------------------------------CompareAndSwapPNode---------------------------
-class CompareAndSwapPNode : public LoadStoreNode {
+class CompareAndSwapPNode : public LoadStoreConditionalNode {
 public:
-  CompareAndSwapPNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
+  CompareAndSwapPNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
   virtual int Opcode() const;
 };
 
 //------------------------------CompareAndSwapNNode---------------------------
-class CompareAndSwapNNode : public LoadStoreNode {
+class CompareAndSwapNNode : public LoadStoreConditionalNode {
+public:
+  CompareAndSwapNNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
+  virtual int Opcode() const;
+};
+
+//------------------------------GetAndAddINode---------------------------
+class GetAndAddINode : public LoadStoreNode {
+public:
+  GetAndAddINode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeInt::INT, 4) { }
+  virtual int Opcode() const;
+};
+
+//------------------------------GetAndAddLNode---------------------------
+class GetAndAddLNode : public LoadStoreNode {
 public:
-  CompareAndSwapNNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
+  GetAndAddLNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeLong::LONG, 4) { }
+  virtual int Opcode() const;
+};
+
+
+//------------------------------GetAndSetINode---------------------------
+class GetAndSetINode : public LoadStoreNode {
+public:
+  GetAndSetINode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeInt::INT, 4) { }
+  virtual int Opcode() const;
+};
+
+//------------------------------GetAndSetINode---------------------------
+class GetAndSetLNode : public LoadStoreNode {
+public:
+  GetAndSetLNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeLong::LONG, 4) { }
+  virtual int Opcode() const;
+};
+
+//------------------------------GetAndSetPNode---------------------------
+class GetAndSetPNode : public LoadStoreNode {
+public:
+  GetAndSetPNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* t ) : LoadStoreNode(c, mem, adr, val, at, t, 4) { }
+  virtual int Opcode() const;
+};
+
+//------------------------------GetAndSetNNode---------------------------
+class GetAndSetNNode : public LoadStoreNode {
+public:
+  GetAndSetNNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* t ) : LoadStoreNode(c, mem, adr, val, at, t, 4) { }
   virtual int Opcode() const;
 };
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/output.cpp
--- a/src/share/vm/opto/output.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/output.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -1869,7 +1869,9 @@
   if (!do_scheduling())
     return;
 
-  assert(MaxVectorSize <= 8, "scheduling code works only with pairs");
+  // Scheduling code works only with pairs (8 bytes) maximum.
+  if (max_vector_size() > 8)
+    return;
 
   NOT_PRODUCT( TracePhase t2("isched", &_t_instrSched, TimeCompiler); )
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/superword.cpp
--- a/src/share/vm/opto/superword.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/superword.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -179,6 +179,7 @@
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
+        n->Opcode() != Op_LoadUI2L &&
         is_java_primitive(n->as_Mem()->memory_type())) {
       int align = memory_alignment(n->as_Mem(), 0);
       if (align != bottom_align) {
@@ -481,12 +482,19 @@
   int vw       = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
   int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
-  int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
+  // At least one iteration is executed in pre-loop by default. As result
+  // several iterations are needed to align memory operations in main-loop even
+  // if offset is 0.
+  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+  int elt_size = align_to_ref_p.memory_size();
+  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
 
 #ifndef PRODUCT
   if (TraceSuperWord)
     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
-                  offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
+                  offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 #endif
   return iv_adjustment;
 }
@@ -1350,11 +1358,14 @@
     insert_extracts(_packset.at(i));
   }
 
+  Compile* C = _phase->C;
+  uint max_vlen_in_bytes = 0;
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
     Node_List* p = my_pack(n);
     if (p && n == executed_last(p)) {
       uint vlen = p->size();
+      uint vlen_in_bytes = 0;
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
@@ -1364,7 +1375,8 @@
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
+        vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_LoadVector()->memory_size();
       } else if (n->is_Store()) {
         // Promote value to be stored to vector
         Node* val = vector_opd(p, MemNode::ValueIn);
@@ -1372,7 +1384,8 @@
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
+        vn = StoreVectorNode::make(C, opc, ctl, mem, adr, atyp, val, vlen);
+        vlen_in_bytes = vn->as_StoreVector()->memory_size();
       } else if (n->req() == 3) {
         // Promote operands to vector
         Node* in1 = vector_opd(p, 1);
@@ -1383,7 +1396,8 @@
           in1 = in2;
           in2 = tmp;
         }
-        vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
+        vn = VectorNode::make(C, opc, in1, in2, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else {
         ShouldNotReachHere();
       }
@@ -1395,6 +1409,10 @@
         _igvn.replace_node(pm, vn);
       }
       _igvn._worklist.push(vn);
+
+      if (vlen_in_bytes > max_vlen_in_bytes) {
+        max_vlen_in_bytes = vlen_in_bytes;
+      }
 #ifdef ASSERT
       if (TraceNewVectors) {
         tty->print("new Vector node: ");
@@ -1403,6 +1421,7 @@
 #endif
     }
   }
+  C->set_max_vector_size(max_vlen_in_bytes);
 }
 
 //------------------------------vector_opd---------------------------
@@ -1439,7 +1458,7 @@
         }
         assert(opd->bottom_type()->isa_int(), "int type only");
         // Move non constant shift count into XMM register.
-        cnt = new (_phase->C, 2) MoveI2FNode(cnt);
+        cnt = new (C, 2) MoveI2FNode(cnt);
       }
       if (cnt != opd) {
         _phase->_igvn.register_new_node_with_optimizer(cnt);
@@ -1480,10 +1499,10 @@
   _phase->_igvn.register_new_node_with_optimizer(pk);
   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
 #ifdef ASSERT
-    if (TraceNewVectors) {
-      tty->print("new Vector node: ");
-      pk->dump();
-    }
+  if (TraceNewVectors) {
+    tty->print("new Vector node: ");
+    pk->dump();
+  }
 #endif
   return pk;
 }
@@ -1805,7 +1824,7 @@
 
 //------------------------------memory_alignment---------------------------
 // Alignment within a vector memory reference
-int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
+int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
   SWPointer p(s, this);
   if (!p.valid()) {
     return bottom_align;
@@ -1815,7 +1834,7 @@
     return bottom_align; // No vectors for this type
   }
   int offset  = p.offset_in_bytes();
-  offset     += iv_adjust_in_bytes;
+  offset     += iv_adjust*p.memory_size();
   int off_rem = offset % vw;
   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
   return off_mod;
@@ -1838,7 +1857,7 @@
 
 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
   const Type* vt1 = velt_type(n1);
-  const Type* vt2 = velt_type(n1);
+  const Type* vt2 = velt_type(n2);
   if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
     // Compare vectors element sizes for integer types.
     return data_size(n1) == data_size(n2);
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/opto/superword.hpp
--- a/src/share/vm/opto/superword.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/opto/superword.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -400,7 +400,7 @@
   // Return the node executed last in pack p.
   Node* executed_last(Node_List* p);
   // Alignment within a vector memory reference
-  int memory_alignment(MemNode* s, int iv_adjust_in_bytes);
+  int memory_alignment(MemNode* s, int iv_adjust);
   // (Start, end] half-open range defining which operands are vector
   void vector_opd_range(Node* n, uint* start, uint* end);
   // Smallest type containing range of values
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/runtime/sharedRuntime.cpp
--- a/src/share/vm/runtime/sharedRuntime.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/runtime/sharedRuntime.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -88,6 +88,7 @@
 RuntimeStub*        SharedRuntime::_resolve_static_call_blob;
 
 DeoptimizationBlob* SharedRuntime::_deopt_blob;
+SafepointBlob*      SharedRuntime::_polling_page_vectors_safepoint_handler_blob;
 SafepointBlob*      SharedRuntime::_polling_page_safepoint_handler_blob;
 SafepointBlob*      SharedRuntime::_polling_page_return_handler_blob;
 
@@ -104,8 +105,14 @@
   _resolve_virtual_call_blob           = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C),      "resolve_virtual_call");
   _resolve_static_call_blob            = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C),       "resolve_static_call");
 
-  _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), false);
-  _polling_page_return_handler_blob    = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), true);
+#ifdef COMPILER2
+  // Vectors are generated only by C2.
+  if (is_wide_vector(MaxVectorSize)) {
+    _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_VECTOR_LOOP);
+  }
+#endif // COMPILER2
+  _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP);
+  _polling_page_return_handler_blob    = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_RETURN);
 
   generate_deopt_blob();
 
@@ -535,10 +542,15 @@
     "Only polling locations are used for safepoint");
 
   bool at_poll_return = ((nmethod*)cb)->is_at_poll_return(pc);
+  bool has_wide_vectors = ((nmethod*)cb)->has_wide_vectors();
   if (at_poll_return) {
     assert(SharedRuntime::polling_page_return_handler_blob() != NULL,
            "polling page return stub not created yet");
     stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
+  } else if (has_wide_vectors) {
+    assert(SharedRuntime::polling_page_vectors_safepoint_handler_blob() != NULL,
+           "polling page vectors safepoint stub not created yet");
+    stub = SharedRuntime::polling_page_vectors_safepoint_handler_blob()->entry_point();
   } else {
     assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL,
            "polling page safepoint stub not created yet");
@@ -1618,6 +1630,31 @@
   return callee_method;
 }
 
+#ifdef ASSERT
+void SharedRuntime::check_member_name_argument_is_last_argument(methodHandle method,
+                                                                const BasicType* sig_bt,
+                                                                const VMRegPair* regs) {
+  ResourceMark rm;
+  const int total_args_passed = method->size_of_parameters();
+  const VMRegPair*    regs_with_member_name = regs;
+        VMRegPair* regs_without_member_name = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed - 1);
+
+  const int member_arg_pos = total_args_passed - 1;
+  assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
+  assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
+
+  const bool is_outgoing = method->is_method_handle_intrinsic();
+  int comp_args_on_stack = java_calling_convention(sig_bt, regs_without_member_name, total_args_passed - 1, is_outgoing);
+
+  for (int i = 0; i < member_arg_pos; i++) {
+    VMReg a =    regs_with_member_name[i].first();
+    VMReg b = regs_without_member_name[i].first();
+    assert(a->value() == b->value(), err_msg_res("register allocation mismatch: a=%d, b=%d", a->value(), b->value()));
+  }
+  assert(regs_with_member_name[member_arg_pos].first()->is_valid(), "bad member arg");
+}
+#endif
+
 // ---------------------------------------------------------------------------
 // We are calling the interpreter via a c2i. Normally this would mean that
 // we were called by a compiled method. However we could have lost a race
@@ -2423,6 +2460,7 @@
 #ifndef PRODUCT
     // debugging suppport
     if (PrintAdapterHandlers || PrintStubCode) {
+      ttyLocker ttyl;
       entry->print_adapter_on(tty);
       tty->print_cr("i2c argument handler #%d for: %s %s (%d bytes generated)",
                     _adapters->number_of_entries(), (method->is_static() ? "static" : "receiver"),
@@ -2430,8 +2468,10 @@
       tty->print_cr("c2i argument handler starts at %p",entry->get_c2i_entry());
       if (Verbose || PrintStubCode) {
         address first_pc = entry->base_address();
-        if (first_pc != NULL)
+        if (first_pc != NULL) {
           Disassembler::decode(first_pc, first_pc + insts_size);
+          tty->cr();
+        }
       }
     }
 #endif
@@ -2546,10 +2586,10 @@
       MacroAssembler _masm(&buffer);
 
       // Fill in the signature array, for the calling-convention call.
-      int total_args_passed = method->size_of_parameters();
-
-      BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType,total_args_passed);
-      VMRegPair*   regs = NEW_RESOURCE_ARRAY(VMRegPair,total_args_passed);
+      const int total_args_passed = method->size_of_parameters();
+
+      BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
+      VMRegPair*   regs = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
       int i=0;
       if( !method->is_static() )  // Pass in receiver first
         sig_bt[i++] = T_OBJECT;
@@ -2559,7 +2599,7 @@
         if( ss.type() == T_LONG || ss.type() == T_DOUBLE )
           sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
       }
-      assert( i==total_args_passed, "" );
+      assert(i == total_args_passed, "");
       BasicType ret_type = ss.type();
 
       // Now get the compiled-Java layout as input (or output) arguments.
@@ -2572,9 +2612,8 @@
       nm = SharedRuntime::generate_native_wrapper(&_masm,
                                                   method,
                                                   compile_id,
-                                                  total_args_passed,
-                                                  comp_args_on_stack,
-                                                  sig_bt,regs,
+                                                  sig_bt,
+                                                  regs,
                                                   ret_type);
     }
   }
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/runtime/sharedRuntime.hpp
--- a/src/share/vm/runtime/sharedRuntime.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/runtime/sharedRuntime.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -62,6 +62,7 @@
 
   static DeoptimizationBlob* _deopt_blob;
 
+  static SafepointBlob*      _polling_page_vectors_safepoint_handler_blob;
   static SafepointBlob*      _polling_page_safepoint_handler_blob;
   static SafepointBlob*      _polling_page_return_handler_blob;
 
@@ -75,7 +76,8 @@
 #endif // !PRODUCT
 
  private:
-  static SafepointBlob* generate_handler_blob(address call_ptr, bool cause_return);
+  enum { POLL_AT_RETURN,  POLL_AT_LOOP, POLL_AT_VECTOR_LOOP };
+  static SafepointBlob* generate_handler_blob(address call_ptr, int poll_type);
   static RuntimeStub*   generate_resolve_blob(address destination, const char* name);
 
  public:
@@ -223,6 +225,7 @@
 
   static SafepointBlob* polling_page_return_handler_blob()     { return _polling_page_return_handler_blob; }
   static SafepointBlob* polling_page_safepoint_handler_blob()  { return _polling_page_safepoint_handler_blob; }
+  static SafepointBlob* polling_page_vectors_safepoint_handler_blob()  { return _polling_page_vectors_safepoint_handler_blob; }
 
   // Counters
 #ifndef PRODUCT
@@ -345,7 +348,11 @@
   // the bottom of the frame the first 16 words will be skipped and SharedInfo::stack0
   // will be just above it. (
   // return value is the maximum number of VMReg stack slots the convention will use.
-  static int java_calling_convention(const BasicType *sig_bt, VMRegPair *regs, int total_args_passed, int is_outgoing);
+  static int java_calling_convention(const BasicType* sig_bt, VMRegPair* regs, int total_args_passed, int is_outgoing);
+
+  static void check_member_name_argument_is_last_argument(methodHandle method,
+                                                          const BasicType* sig_bt,
+                                                          const VMRegPair* regs) NOT_DEBUG_RETURN;
 
   // Ditto except for calling C
   static int c_calling_convention(const BasicType *sig_bt, VMRegPair *regs, int total_args_passed);
@@ -412,6 +419,10 @@
   // when an interrupt occurs.
   static uint out_preserve_stack_slots();
 
+  // Is vector's size (in bytes) bigger than a size saved by default?
+  // For example, on x86 16 bytes XMM registers are saved by default.
+  static bool is_wide_vector(int size);
+
   // Save and restore a native result
   static void    save_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots );
   static void restore_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots );
@@ -425,13 +436,11 @@
   // The wrapper may contain special-case code if the given method
   // is a JNI critical method, or a compiled method handle adapter,
   // such as _invokeBasic, _linkToVirtual, etc.
-  static nmethod *generate_native_wrapper(MacroAssembler* masm,
+  static nmethod* generate_native_wrapper(MacroAssembler* masm,
                                           methodHandle method,
                                           int compile_id,
-                                          int total_args_passed,
-                                          int max_arg,
-                                          BasicType *sig_bt,
-                                          VMRegPair *regs,
+                                          BasicType* sig_bt,
+                                          VMRegPair* regs,
                                           BasicType ret_type );
 
   // Block before entering a JNI critical method
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/runtime/vm_version.cpp
--- a/src/share/vm/runtime/vm_version.cpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/runtime/vm_version.cpp	Mon Sep 24 14:46:06 2012 -0700
@@ -45,6 +45,10 @@
 const char* Abstract_VM_Version::_s_vm_release = Abstract_VM_Version::vm_release();
 const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string();
 bool Abstract_VM_Version::_supports_cx8 = false;
+bool Abstract_VM_Version::_supports_atomic_getset4 = false;
+bool Abstract_VM_Version::_supports_atomic_getset8 = false;
+bool Abstract_VM_Version::_supports_atomic_getadd4 = false;
+bool Abstract_VM_Version::_supports_atomic_getadd8 = false;
 unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
 int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;
 
diff -r 04ed664b7e30 -r c92f43386117 src/share/vm/runtime/vm_version.hpp
--- a/src/share/vm/runtime/vm_version.hpp	Fri Sep 21 14:39:56 2012 -0700
+++ b/src/share/vm/runtime/vm_version.hpp	Mon Sep 24 14:46:06 2012 -0700
@@ -37,6 +37,10 @@
   static const char*  _s_internal_vm_info_string;
   // These are set by machine-dependent initializations
   static bool         _supports_cx8;
+  static bool         _supports_atomic_getset4;
+  static bool         _supports_atomic_getset8;
+  static bool         _supports_atomic_getadd4;
+  static bool         _supports_atomic_getadd8;
   static unsigned int _logical_processors_per_package;
   static int          _vm_major_version;
   static int          _vm_minor_version;
@@ -75,6 +79,13 @@
 
   // does HW support an 8-byte compare-exchange operation?
   static bool supports_cx8()  {return _supports_cx8;}
+  // does HW support atomic get-and-set or atomic get-and-add?  Used
+  // to guide intrinsification decisions for Unsafe atomic ops
+  static bool supports_atomic_getset4()  {return _supports_atomic_getset4;}
+  static bool supports_atomic_getset8()  {return _supports_atomic_getset8;}
+  static bool supports_atomic_getadd4()  {return _supports_atomic_getadd4;}
+  static bool supports_atomic_getadd8()  {return _supports_atomic_getadd8;}
+
   static unsigned int logical_processors_per_package() {
     return _logical_processors_per_package;
   }
diff -r 04ed664b7e30 -r c92f43386117 test/compiler/7196199/Test7196199.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7196199/Test7196199.java	Mon Sep 24 14:46:06 2012 -0700
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7196199
+ * @summary java/text/Bidi/Bug6665028.java failed: Bidi run count incorrect
+ *
+ * @run main/othervm/timeout=400 -Xmx32m -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:CompileCommand=exclude,Test7196199.test -XX:+SafepointALot -XX:GuaranteedSafepointInterval=100 Test7196199
+ */
+
+
+public class Test7196199 {
+  private static final int ARRLEN = 97;
+  private static final int ITERS  = 5000;
+  private static final int INI_ITERS  = 1000;
+  private static final int SFP_ITERS  = 10000;
+  private static final float SFP_ITERS_F  = 10000.f;
+  private static final float VALUE = 15.f;
+  public static void main(String args[]) {
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    float[] a0 = new float[ARRLEN];
+    float[] a1 = new float[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a0[i] = 0.f;
+      a1[i] = (float)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<INI_ITERS; i++) {
+      test_incrc(a0);
+      test_incrv(a0, VALUE);
+      test_addc(a0, a1);
+      test_addv(a0, a1, VALUE);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    for (int i=0; i<ARRLEN; i++)
+      a0[i] = 0.f;
+
+    System.out.println("  test_incrc");
+    for (int j=0; j<ITERS; j++) {
+      test_incrc(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_incrc: ", i, a0[i], VALUE*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    System.out.println("  test_incrv");
+    for (int j=0; j<ITERS; j++) {
+      test_incrv(a0, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_incrv: ", i, a0[i], VALUE*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    System.out.println("  test_addc");
+    for (int j=0; j<ITERS; j++) {
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], ((float)i + VALUE)*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    System.out.println("  test_addv");
+    for (int j=0; j<ITERS; j++) {
+      test_addv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], ((float)i + VALUE)*SFP_ITERS_F);
+        a0[i] = 0.f; // Reset
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_incrc(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_incrc: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_incrv(a0, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_incrv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<INI_ITERS; i++) {
+      test_addv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_incrc(float[] a0) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += VALUE;
+      }
+    }
+  }
+  static void test_incrv(float[] a0, float b) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += b;
+      }
+    }
+  }
+  static void test_addc(float[] a0, float[] a1) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += a1[i]+VALUE;
+      }
+    }
+  }
+  static void test_addv(float[] a0, float[] a1, float b) {
+    // Non-counted loop with safepoint.
+    for (long l = 0; l < SFP_ITERS; l++) {
+      // Counted and vectorized loop.
+      for (int i = 0; i < a0.length; i+=1) {
+        a0[i] += a1[i]+b;
+      }
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+