Mercurial > hg > graal-compiler

--- a/.hgtags	Wed Aug 22 10:01:51 2012 +0200
+++ b/.hgtags	Fri Aug 24 19:45:42 2012 -0700
@@ -269,3 +269,6 @@
 3b3ad16429701b2eb6712851c2f7c5a726eb2cbe hs24-b19
 663fc23da8d51c4c0552cbcb17ffc85f5869d4fd jdk8-b51
 4c8f2a12e757e7a808aa85827573e09f75d7459f hs24-b20
+6d0436885201db3f581523344a734793bb989549 jdk8-b52
+54240c1b8e87758f28da2c6a569a926fd9e0910a jdk8-b53
+9e3ae661284dc04185b029d85440fe7811f1ed07 hs24-b21
--- a/make/hotspot_version	Wed Aug 22 10:01:51 2012 +0200
+++ b/make/hotspot_version	Fri Aug 24 19:45:42 2012 -0700
@@ -35,7 +35,7 @@

 HS_MAJOR_VER=24
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=21
+HS_BUILD_NUMBER=22

 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
--- a/make/jprt.properties	Wed Aug 22 10:01:51 2012 +0200
+++ b/make/jprt.properties	Fri Aug 24 19:45:42 2012 -0700
@@ -54,77 +54,77 @@
 # Define the Solaris platforms we want for the various releases
 jprt.my.solaris.sparc.jdk8=solaris_sparc_5.10
 jprt.my.solaris.sparc.jdk7=solaris_sparc_5.10
-jprt.my.solaris.sparc.jdk7u6=${jprt.my.solaris.sparc.jdk7}
+jprt.my.solaris.sparc.jdk7u8=${jprt.my.solaris.sparc.jdk7}
 jprt.my.solaris.sparc=${jprt.my.solaris.sparc.${jprt.tools.default.release}}

 jprt.my.solaris.sparcv9.jdk8=solaris_sparcv9_5.10
 jprt.my.solaris.sparcv9.jdk7=solaris_sparcv9_5.10
-jprt.my.solaris.sparcv9.jdk7u6=${jprt.my.solaris.sparcv9.jdk7}
+jprt.my.solaris.sparcv9.jdk7u8=${jprt.my.solaris.sparcv9.jdk7}
 jprt.my.solaris.sparcv9=${jprt.my.solaris.sparcv9.${jprt.tools.default.release}}

 jprt.my.solaris.i586.jdk8=solaris_i586_5.10
 jprt.my.solaris.i586.jdk7=solaris_i586_5.10
-jprt.my.solaris.i586.jdk7u6=${jprt.my.solaris.i586.jdk7}
+jprt.my.solaris.i586.jdk7u8=${jprt.my.solaris.i586.jdk7}
 jprt.my.solaris.i586=${jprt.my.solaris.i586.${jprt.tools.default.release}}

 jprt.my.solaris.x64.jdk8=solaris_x64_5.10
 jprt.my.solaris.x64.jdk7=solaris_x64_5.10
-jprt.my.solaris.x64.jdk7u6=${jprt.my.solaris.x64.jdk7}
+jprt.my.solaris.x64.jdk7u8=${jprt.my.solaris.x64.jdk7}
 jprt.my.solaris.x64=${jprt.my.solaris.x64.${jprt.tools.default.release}}

 jprt.my.linux.i586.jdk8=linux_i586_2.6
 jprt.my.linux.i586.jdk7=linux_i586_2.6
-jprt.my.linux.i586.jdk7u6=${jprt.my.linux.i586.jdk7}
+jprt.my.linux.i586.jdk7u8=${jprt.my.linux.i586.jdk7}
 jprt.my.linux.i586=${jprt.my.linux.i586.${jprt.tools.default.release}}

 jprt.my.linux.x64.jdk8=linux_x64_2.6
 jprt.my.linux.x64.jdk7=linux_x64_2.6
-jprt.my.linux.x64.jdk7u6=${jprt.my.linux.x64.jdk7}
+jprt.my.linux.x64.jdk7u8=${jprt.my.linux.x64.jdk7}
 jprt.my.linux.x64=${jprt.my.linux.x64.${jprt.tools.default.release}}

 jprt.my.linux.ppc.jdk8=linux_ppc_2.6
 jprt.my.linux.ppc.jdk7=linux_ppc_2.6
-jprt.my.linux.ppc.jdk7u6=${jprt.my.linux.ppc.jdk7}
+jprt.my.linux.ppc.jdk7u8=${jprt.my.linux.ppc.jdk7}
 jprt.my.linux.ppc=${jprt.my.linux.ppc.${jprt.tools.default.release}}

 jprt.my.linux.ppcv2.jdk8=linux_ppcv2_2.6
 jprt.my.linux.ppcv2.jdk7=linux_ppcv2_2.6
-jprt.my.linux.ppcv2.jdk7u6=${jprt.my.linux.ppcv2.jdk7}
+jprt.my.linux.ppcv2.jdk7u8=${jprt.my.linux.ppcv2.jdk7}
 jprt.my.linux.ppcv2=${jprt.my.linux.ppcv2.${jprt.tools.default.release}}

 jprt.my.linux.ppcsflt.jdk8=linux_ppcsflt_2.6
 jprt.my.linux.ppcsflt.jdk7=linux_ppcsflt_2.6
-jprt.my.linux.ppcsflt.jdk7u6=${jprt.my.linux.ppcsflt.jdk7}
+jprt.my.linux.ppcsflt.jdk7u8=${jprt.my.linux.ppcsflt.jdk7}
 jprt.my.linux.ppcsflt=${jprt.my.linux.ppcsflt.${jprt.tools.default.release}}

 jprt.my.linux.armvfp.jdk8=linux_armvfp_2.6
 jprt.my.linux.armvfp.jdk7=linux_armvfp_2.6
-jprt.my.linux.armvfp.jdk7u6=${jprt.my.linux.armvfp.jdk7}
+jprt.my.linux.armvfp.jdk7u8=${jprt.my.linux.armvfp.jdk7}
 jprt.my.linux.armvfp=${jprt.my.linux.armvfp.${jprt.tools.default.release}}

 jprt.my.linux.armv6.jdk8=linux_armv6_2.6
 jprt.my.linux.armv6.jdk7=linux_armv6_2.6
-jprt.my.linux.armv6.jdk7u6=${jprt.my.linux.armv6.jdk7}
+jprt.my.linux.armv6.jdk7u8=${jprt.my.linux.armv6.jdk7}
 jprt.my.linux.armv6=${jprt.my.linux.armv6.${jprt.tools.default.release}}

 jprt.my.linux.armsflt.jdk8=linux_armsflt_2.6
 jprt.my.linux.armsflt.jdk7=linux_armsflt_2.6
-jprt.my.linux.armsflt.jdk7u6=${jprt.my.linux.armsflt.jdk7}
+jprt.my.linux.armsflt.jdk7u8=${jprt.my.linux.armsflt.jdk7}
 jprt.my.linux.armsflt=${jprt.my.linux.armsflt.${jprt.tools.default.release}}

 jprt.my.macosx.x64.jdk8=macosx_x64_10.7
 jprt.my.macosx.x64.jdk7=macosx_x64_10.7
-jprt.my.macosx.x64.jdk7u6=${jprt.my.macosx.x64.jdk7}
+jprt.my.macosx.x64.jdk7u8=${jprt.my.macosx.x64.jdk7}
 jprt.my.macosx.x64=${jprt.my.macosx.x64.${jprt.tools.default.release}}

 jprt.my.windows.i586.jdk8=windows_i586_5.1
 jprt.my.windows.i586.jdk7=windows_i586_5.1
-jprt.my.windows.i586.jdk7u6=${jprt.my.windows.i586.jdk7}
+jprt.my.windows.i586.jdk7u8=${jprt.my.windows.i586.jdk7}
 jprt.my.windows.i586=${jprt.my.windows.i586.${jprt.tools.default.release}}

 jprt.my.windows.x64.jdk8=windows_x64_5.2
 jprt.my.windows.x64.jdk7=windows_x64_5.2
-jprt.my.windows.x64.jdk7u6=${jprt.my.windows.x64.jdk7}
+jprt.my.windows.x64.jdk7u8=${jprt.my.windows.x64.jdk7}
 jprt.my.windows.x64=${jprt.my.windows.x64.${jprt.tools.default.release}}

 # Standard list of jprt build targets for this source tree
@@ -159,7 +159,7 @@

 jprt.build.targets.jdk8=${jprt.build.targets.all}
 jprt.build.targets.jdk7=${jprt.build.targets.all}
-jprt.build.targets.jdk7u6=${jprt.build.targets.all}
+jprt.build.targets.jdk7u8=${jprt.build.targets.all}
 jprt.build.targets=${jprt.build.targets.${jprt.tools.default.release}}

 # Subset lists of test targets for this source tree
@@ -452,7 +452,7 @@

 jprt.test.targets.jdk8=${jprt.test.targets.standard}
 jprt.test.targets.jdk7=${jprt.test.targets.standard}
-jprt.test.targets.jdk7u6=${jprt.test.targets.jdk7}
+jprt.test.targets.jdk7u8=${jprt.test.targets.jdk7}
 jprt.test.targets=${jprt.test.targets.${jprt.tools.default.release}}

 # The default test/Makefile targets that should be run
@@ -512,7 +512,7 @@

 jprt.make.rule.test.targets.jdk8=${jprt.make.rule.test.targets.standard}
 jprt.make.rule.test.targets.jdk7=${jprt.make.rule.test.targets.standard}
-jprt.make.rule.test.targets.jdk7u6=${jprt.make.rule.test.targets.jdk7}
+jprt.make.rule.test.targets.jdk7u8=${jprt.make.rule.test.targets.jdk7}
 jprt.make.rule.test.targets=${jprt.make.rule.test.targets.${jprt.tools.default.release}}

 # 7155453: Work-around to prevent popups on OSX from blocking test completion
--- a/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -435,85 +435,6 @@

 }

-void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
-  // At this point we know that offset == referent_offset.
-  //
-  // So we might have to emit:
-  //   if (src == null) goto continuation.
-  //
-  // and we definitely have to emit:
-  //   if (klass(src).reference_type == REF_NONE) goto continuation
-  //   if (!marking_active) goto continuation
-  //   if (pre_val == null) goto continuation
-  //   call pre_barrier(pre_val)
-  //   goto continuation
-  //
-  __ bind(_entry);
-
-  assert(src()->is_register(), "sanity");
-  Register src_reg = src()->as_register();
-
-  if (gen_src_check()) {
-    // The original src operand was not a constant.
-    // Generate src == null?
-    if (__ is_in_wdisp16_range(_continuation)) {
-      __ br_null(src_reg, /*annul*/false, Assembler::pt, _continuation);
-    } else {
-      __ cmp(src_reg, G0);
-      __ brx(Assembler::equal, false, Assembler::pt, _continuation);
-    }
-    __ delayed()->nop();
-  }
-
-  // Generate src->_klass->_reference_type() == REF_NONE)?
-  assert(tmp()->is_register(), "sanity");
-  Register tmp_reg = tmp()->as_register();
-
-  __ load_klass(src_reg, tmp_reg);
-
-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
-  __ ldub(ref_type_adr, tmp_reg);
-
-  // _reference_type field is of type ReferenceType (enum)
-  assert(REF_NONE == 0, "check this code");
-  __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
-  __ delayed()->nop();
-
-  // Is marking active?
-  assert(thread()->is_register(), "precondition");
-  Register thread_reg = thread()->as_pointer_register();
-
-  Address in_progress(thread_reg, in_bytes(JavaThread::satb_mark_queue_offset() +
-                                       PtrQueue::byte_offset_of_active()));
-
-  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
-    __ ld(in_progress, tmp_reg);
-  } else {
-    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
-    __ ldsb(in_progress, tmp_reg);
-  }
-
-  __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
-  __ delayed()->nop();
-
-  // val == null?
-  assert(val()->is_register(), "Precondition.");
-  Register val_reg = val()->as_register();
-
-  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_null(val_reg, /*annul*/false, Assembler::pt, _continuation);
-  } else {
-    __ cmp(val_reg, G0);
-    __ brx(Assembler::equal, false, Assembler::pt, _continuation);
-  }
-  __ delayed()->nop();
-
-  __ call(Runtime1::entry_for(Runtime1::Runtime1::g1_pre_barrier_slow_id));
-  __ delayed()->mov(val_reg, G4);
-  __ br(Assembler::always, false, Assembler::pt, _continuation);
-  __ delayed()->nop();
-}
-
 jbyte* G1PostBarrierStub::_byte_map_base = NULL;

 jbyte* G1PostBarrierStub::byte_map_base_slow() {
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -106,10 +106,10 @@
     if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
       FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
     }
-    // When using CMS, we cannot use memset() in BOT updates because
-    // the sun4v/CMT version in libc_psr uses BIS which exposes
-    // "phantom zeros" to concurrent readers. See 6948537.
-    if (FLAG_IS_DEFAULT(UseMemSetInBOT) && UseConcMarkSweepGC) {
+    // When using CMS or G1, we cannot use memset() in BOT updates
+    // because the sun4v/CMT version in libc_psr uses BIS which
+    // exposes "phantom zeros" to concurrent readers. See 6948537.
+    if (FLAG_IS_DEFAULT(UseMemSetInBOT) && (UseConcMarkSweepGC || UseG1GC)) {
       FLAG_SET_DEFAULT(UseMemSetInBOT, false);
     }
 #ifdef _LP64
--- a/src/cpu/x86/vm/assembler_x86.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -999,32 +999,22 @@

 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
 }

 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_operand(dst, src);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
 }

 void Assembler::addss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }

 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_operand(dst, src);
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }

 void Assembler::andl(Address dst, int32_t imm32) {
@@ -1052,36 +1042,6 @@
   emit_arith(0x23, 0xC0, dst, src);
 }

-void Assembler::andpd(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::andpd(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x54);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::andps(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::andps(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x54);
-  emit_byte(0xC0 | encode);
-}
-
 void Assembler::bsfl(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_byte(0x0F);
@@ -1246,61 +1206,42 @@
   // NOTE: dbx seems to decode this as comiss even though the
   // 0x66 is there. Strangly ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
-  emit_byte(0x2F);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
 }

 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x2F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
 }

 void Assembler::comiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2F);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
 }

 void Assembler::comiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
 }

 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
-  emit_byte(0xE6);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
 }

 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x5B);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
 }

 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5A);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
 }

 void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
 }

 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
@@ -1312,10 +1253,7 @@

 void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x2A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
 }

 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
@@ -1327,25 +1265,17 @@

 void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x2A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3);
 }

 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5A);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
 }

 void Assembler::cvtss2sd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5A);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
 }


@@ -1373,32 +1303,22 @@

 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
 }

 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
 }

 void Assembler::divss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
 }

 void Assembler::divss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
 }

 void Assembler::emms() {
@@ -1634,16 +1554,12 @@

 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x28);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
 }

 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x28);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
 }

 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
@@ -1712,24 +1628,17 @@

 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x6F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }

 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
-  emit_byte(0x6F);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }

 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
-  emit_byte(0x6F);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }

 void Assembler::movdqu(Address dst, XMMRegister src) {
@@ -1810,10 +1719,7 @@
 // The selection is done in MacroAssembler::movdbl() and movflt().
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x12);
-  emit_operand(dst, src);
+  emit_simd_arith(0x12, dst, src, VEX_SIMD_66);
 }

 void Assembler::movq( MMXRegister dst, Address src ) {
@@ -1870,17 +1776,12 @@

 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x10);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
 }

 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F2);
-  emit_byte(0x10);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
 }

 void Assembler::movsd(Address dst, XMMRegister src) {
@@ -1893,17 +1794,12 @@

 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x10);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3);
 }

 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
-  emit_byte(0x10);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3);
 }

 void Assembler::movss(Address dst, XMMRegister src) {
@@ -2001,32 +1897,22 @@

 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_operand(dst, src);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
 }

 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
 }

 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_operand(dst, src);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }

 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }

 void Assembler::negl(Register dst) {
@@ -2315,17 +2201,12 @@
 void Assembler::packuswb(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x67);
-  emit_operand(dst, src);
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
 }

 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x67);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
 }

 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
@@ -2339,7 +2220,7 @@

 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
   emit_byte(0x61);
   emit_byte(0xC0 | encode);
   emit_byte(imm8);
@@ -2355,7 +2236,7 @@

 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x30);
   emit_byte(0xC0 | encode);
 }
@@ -2456,28 +2337,10 @@
   a_byte(p);
 }

-void Assembler::por(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEB);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::por(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEB);
-  emit_operand(dst, src);
-}
-
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x70);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
   emit_byte(mode & 0xFF);

 }
@@ -2496,9 +2359,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
-  emit_byte(0x70);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2);
   emit_byte(mode & 0xFF);
 }

@@ -2513,18 +2374,6 @@
   emit_byte(mode & 0xFF);
 }

-void Assembler::psrlq(XMMRegister dst, int shift) {
-  // Shift 64 bit value logically right by specified number of bits.
-  // HMM Table D-1 says sse2 or mmx.
-  // Do not confuse it with psrldq SSE2 instruction which
-  // shifts 128 bit value in xmm register by number of bytes.
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
-  emit_byte(0x73);
-  emit_byte(0xC0 | encode);
-  emit_byte(shift);
-}
-
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -2545,7 +2394,7 @@

 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x17);
   emit_byte(0xC0 | encode);
 }
@@ -2553,40 +2402,28 @@
 void Assembler::punpcklbw(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x60);
-  emit_operand(dst, src);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
 }

 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x60);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
 }

 void Assembler::punpckldq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x62);
-  emit_operand(dst, src);
+  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }

 void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x62);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }

 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x6C);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
 }

 void Assembler::push(int32_t imm32) {
@@ -2616,22 +2453,6 @@
 }
 #endif

-void Assembler::pxor(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEF);
-  emit_operand(dst, src);
-}
-
-void Assembler::pxor(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0xEF);
-  emit_byte(0xC0 | encode);
-}
-
 void Assembler::rcll(Register dst, int imm8) {
   assert(isShiftCount(imm8), "illegal shift count");
   int encode = prefix_and_encode(dst->encoding());
@@ -2790,32 +2611,22 @@

 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x51);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
 }

 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x51);
-  emit_operand(dst, src);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
 }

 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x51);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }

 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x51);
-  emit_operand(dst, src);
+  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }

 void Assembler::stmxcsr( Address dst) {
@@ -2865,32 +2676,22 @@

 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
 }

 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
 }

 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }

 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }

 void Assembler::testb(Register dst, int imm8) {
@@ -2928,32 +2729,22 @@

 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
-  emit_byte(0x2E);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
 }

 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
-  emit_byte(0x2E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
 }

 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2E);
-  emit_operand(dst, src);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
 }

 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
-  emit_byte(0x2E);
-  emit_byte(0xC0 | encode);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
 }


@@ -2995,211 +2786,714 @@
   emit_arith(0x33, 0xC0, dst, src);
 }

-void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x57);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::xorpd(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66);
-  emit_byte(0x57);
-  emit_operand(dst, src);
-}
-
-
-void Assembler::xorps(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x57);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::xorps(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
-  emit_byte(0x57);
-  emit_operand(dst, src);
-}
-
-// AVX 3-operands non destructive source instructions (encoded with VEX prefix)
+
+// AVX 3-operands scalar float-point arithmetic instructions

 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_operand(dst, src);
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_operand(dst, src);
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
 }

 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x58);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
-  emit_byte(0x54);
-  emit_operand(dst, src);
-}
-
-void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
-  emit_byte(0x54);
-  emit_operand(dst, src);
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
 }

 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_operand(dst, src);
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
 }

 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5E);
-  emit_byte(0xC0 | encode);
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
 }

 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_operand(dst, src);
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_operand(dst, src);
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
 }

 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x59);
-  emit_byte(0xC0 | encode);
-}
-
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}

 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
 }

 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_operand(dst, src);
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
 }

 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
-  emit_byte(0x5C);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+}
+
+//====================VECTOR ARITHMETIC=====================================
+
+// Float-point vector arithmetic
+
+void Assembler::addpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::addps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::subpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::subps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::mulps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
-  emit_byte(0x57);
-  emit_operand(dst, src);
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::divpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::divps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::andpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::andps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::andps(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::andpd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::xorps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
+}
+
+void Assembler::xorpd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::xorps(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
 }

 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
-  emit_byte(0x57);
-  emit_byte(0xC0 | encode);
-}
-
-void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
-  emit_byte(0x57);
-  emit_operand(dst, src);
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
 }

 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
   assert(VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);
-  emit_byte(0x57);
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
+}
+
+
+// Integer vector arithmetic
+void Assembler::paddb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::paddw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::paddd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::paddq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::psubb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::psubw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::psubd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::psubq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_sse4_1(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x40);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_byte(0x40);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  InstructionMark im(this);
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
+  emit_byte(0x40);
+  emit_operand(dst, src);
+}
+
+// Shift packed integers left by specified number of bits.
+void Assembler::psllw(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM6 is for /6 encoding: 66 0F 71 /6 ib
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  emit_byte(0x71);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::pslld(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM6 is for /6 encoding: 66 0F 72 /6 ib
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  emit_byte(0x72);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psllq(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM6 is for /6 encoding: 66 0F 73 /6 ib
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  emit_byte(0x73);
   emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM6 is for /6 encoding: 66 0F 71 /6 ib
+  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM6 is for /6 encoding: 66 0F 72 /6 ib
+  emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM6 is for /6 encoding: 66 0F 73 /6 ib
+  emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+// Shift packed integers logically right by specified number of bits.
+void Assembler::psrlw(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  emit_byte(0x71);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrld(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  emit_byte(0x72);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrlq(XMMRegister dst, int shift) {
+  // Do not confuse it with psrldq SSE2 instruction which
+  // shifts 128 bit value in xmm register by number of bytes.
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  emit_byte(0x73);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+// Shift packed integers arithmetically right by specified number of bits.
+void Assembler::psraw(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
+  emit_byte(0x71);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psrad(XMMRegister dst, int shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  // XMM4 is for /4 encoding: 66 0F 72 /4 ib
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
+  emit_byte(0x72);
+  emit_byte(0xC0 | encode);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
+}
+
+void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
+  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  // XMM4 is for /4 encoding: 66 0F 71 /4 ib
+  emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256);
+  emit_byte(shift & 0xFF);
+}
+
+void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256);
+}
+
+
+// AND packed integers
+void Assembler::pand(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::por(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
+}
+
+void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::pxor(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
 }

 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx2() || (!vector256) && VM_Version::supports_avx(), "");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
-  emit_byte(0xEF);
-  emit_byte(0xC0 | encode);
-}
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
+}
+
+void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
+}
+

 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
@@ -3805,6 +4099,49 @@
   }
 }

+void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, pre);
+  emit_byte(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
+  int encode = simd_prefix_and_encode(dst, dst, src, pre);
+  emit_byte(opcode);
+  emit_byte(0xC0 | encode);
+}
+
+// Versions with no second source register (non-destructive source).
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, pre);
+  emit_byte(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre);
+  emit_byte(opcode);
+  emit_byte(0xC0 | encode);
+}
+
+// 3-operands AVX instructions
+void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                               Address src, VexSimdPrefix pre, bool vector256) {
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, pre, vector256);
+  emit_byte(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                               XMMRegister src, VexSimdPrefix pre, bool vector256) {
+  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256);
+  emit_byte(opcode);
+  emit_byte(0xC0 | encode);
+}
+
 #ifndef _LP64

 void Assembler::incl(Register dst) {
@@ -7968,21 +8305,21 @@
   }
 }

-void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vandpd(dst, nds, as_Address(src));
+    vandpd(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vandpd(dst, nds, Address(rscratch1, 0));
-  }
-}
-
-void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+    vandpd(dst, nds, Address(rscratch1, 0), vector256);
+  }
+}
+
+void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vandps(dst, nds, as_Address(src));
+    vandps(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vandps(dst, nds, Address(rscratch1, 0));
+    vandps(dst, nds, Address(rscratch1, 0), vector256);
   }
 }

@@ -8040,21 +8377,21 @@
   }
 }

-void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vxorpd(dst, nds, as_Address(src));
+    vxorpd(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vxorpd(dst, nds, Address(rscratch1, 0));
-  }
-}
-
-void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+    vxorpd(dst, nds, Address(rscratch1, 0), vector256);
+  }
+}
+
+void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
   if (reachable(src)) {
-    vxorps(dst, nds, as_Address(src));
+    vxorps(dst, nds, as_Address(src), vector256);
   } else {
     lea(rscratch1, src);
-    vxorps(dst, nds, Address(rscratch1, 0));
+    vxorps(dst, nds, Address(rscratch1, 0), vector256);
   }
 }
--- a/src/cpu/x86/vm/assembler_x86.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -617,6 +617,7 @@
                    VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
     simd_prefix(dst, xnoreg, src, pre, opc);
   }
+
   void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
     simd_prefix(src, dst, pre);
   }
@@ -626,16 +627,10 @@
     simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
   }

-
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                              VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
                              bool rex_w = false, bool vector256 = false);

-  int simd_prefix_and_encode(XMMRegister dst, XMMRegister src,
-                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
-    return simd_prefix_and_encode(dst, xnoreg, src, pre, opc);
-  }
-
   // Move/convert 32-bit integer value.
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
                              VexSimdPrefix pre) {
@@ -677,6 +672,15 @@
   void emit_arith(int op1, int op2, Register dst, jobject obj);
   void emit_arith(int op1, int op2, Register dst, Register src);

+  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
+  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
+  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
+  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
+  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                      Address src, VexSimdPrefix pre, bool vector256);
+  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
+                      XMMRegister src, VexSimdPrefix pre, bool vector256);
+
   void emit_operand(Register reg,
                     Register base, Register index, Address::ScaleFactor scale,
                     int disp,
@@ -891,12 +895,6 @@
   void andq(Register dst, Address src);
   void andq(Register dst, Register src);

-  // Bitwise Logical AND of Packed Double-Precision Floating-Point Values
-  void andpd(XMMRegister dst, XMMRegister src);
-
-  // Bitwise Logical AND of Packed Single-Precision Floating-Point Values
-  void andps(XMMRegister dst, XMMRegister src);
-
   void bsfl(Register dst, Register src);
   void bsrl(Register dst, Register src);

@@ -1436,10 +1434,6 @@
   void prefetcht2(Address src);
   void prefetchw(Address src);

-  // POR - Bitwise logical OR
-  void por(XMMRegister dst, XMMRegister src);
-  void por(XMMRegister dst, Address src);
-
   // Shuffle Packed Doublewords
   void pshufd(XMMRegister dst, XMMRegister src, int mode);
   void pshufd(XMMRegister dst, Address src,     int mode);
@@ -1448,9 +1442,6 @@
   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
   void pshuflw(XMMRegister dst, Address src,     int mode);

-  // Shift Right by bits Logical Quadword Immediate
-  void psrlq(XMMRegister dst, int shift);
-
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);

@@ -1475,10 +1466,6 @@

   void pushq(Address src);

-  // Xor Packed Byte Integer Values
-  void pxor(XMMRegister dst, Address src);
-  void pxor(XMMRegister dst, XMMRegister src);
-
   void rcll(Register dst, int imm8);

   void rclq(Register dst, int imm8);
@@ -1601,15 +1588,10 @@
   void xorq(Register dst, Address src);
   void xorq(Register dst, Register src);

-  // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
-  void xorpd(XMMRegister dst, XMMRegister src);
-
-  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
-  void xorps(XMMRegister dst, XMMRegister src);
-
   void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0

   // AVX 3-operands scalar instructions (encoded with VEX prefix)
+
   void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vaddss(XMMRegister dst, XMMRegister nds, Address src);
@@ -1627,14 +1609,147 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src);
   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);

-  // AVX Vector instrucitons.
-  void vandpd(XMMRegister dst, XMMRegister nds, Address src);
-  void vandps(XMMRegister dst, XMMRegister nds, Address src);
-  void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
-  void vxorps(XMMRegister dst, XMMRegister nds, Address src);
+
+  //====================VECTOR ARITHMETIC=====================================
+
+  // Add Packed Floating-Point Values
+  void addpd(XMMRegister dst, XMMRegister src);
+  void addps(XMMRegister dst, XMMRegister src);
+  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Subtract Packed Floating-Point Values
+  void subpd(XMMRegister dst, XMMRegister src);
+  void subps(XMMRegister dst, XMMRegister src);
+  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Multiply Packed Floating-Point Values
+  void mulpd(XMMRegister dst, XMMRegister src);
+  void mulps(XMMRegister dst, XMMRegister src);
+  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Divide Packed Floating-Point Values
+  void divpd(XMMRegister dst, XMMRegister src);
+  void divps(XMMRegister dst, XMMRegister src);
+  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Bitwise Logical AND of Packed Floating-Point Values
+  void andpd(XMMRegister dst, XMMRegister src);
+  void andps(XMMRegister dst, XMMRegister src);
+  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Bitwise Logical XOR of Packed Floating-Point Values
+  void xorpd(XMMRegister dst, XMMRegister src);
+  void xorps(XMMRegister dst, XMMRegister src);
   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Add packed integers
+  void paddb(XMMRegister dst, XMMRegister src);
+  void paddw(XMMRegister dst, XMMRegister src);
+  void paddd(XMMRegister dst, XMMRegister src);
+  void paddq(XMMRegister dst, XMMRegister src);
+  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Sub packed integers
+  void psubb(XMMRegister dst, XMMRegister src);
+  void psubw(XMMRegister dst, XMMRegister src);
+  void psubd(XMMRegister dst, XMMRegister src);
+  void psubq(XMMRegister dst, XMMRegister src);
+  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Multiply packed integers (only shorts and ints)
+  void pmullw(XMMRegister dst, XMMRegister src);
+  void pmulld(XMMRegister dst, XMMRegister src);
+  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Shift left packed integers
+  void psllw(XMMRegister dst, int shift);
+  void pslld(XMMRegister dst, int shift);
+  void psllq(XMMRegister dst, int shift);
+  void psllw(XMMRegister dst, XMMRegister shift);
+  void pslld(XMMRegister dst, XMMRegister shift);
+  void psllq(XMMRegister dst, XMMRegister shift);
+  void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+
+  // Logical shift right packed integers
+  void psrlw(XMMRegister dst, int shift);
+  void psrld(XMMRegister dst, int shift);
+  void psrlq(XMMRegister dst, int shift);
+  void psrlw(XMMRegister dst, XMMRegister shift);
+  void psrld(XMMRegister dst, XMMRegister shift);
+  void psrlq(XMMRegister dst, XMMRegister shift);
+  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+
+  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
+  void psraw(XMMRegister dst, int shift);
+  void psrad(XMMRegister dst, int shift);
+  void psraw(XMMRegister dst, XMMRegister shift);
+  void psrad(XMMRegister dst, XMMRegister shift);
+  void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
+  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+
+  // And packed integers
+  void pand(XMMRegister dst, XMMRegister src);
+  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Or packed integers
+  void por(XMMRegister dst, XMMRegister src);
+  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Xor packed integers
+  void pxor(XMMRegister dst, XMMRegister src);
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+
+  // Copy low 128bit into high 128bit of YMM registers.
   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);

@@ -2532,11 +2647,13 @@
   void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

-  void vandpd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandpd(dst, nds, src); }
-  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
-
-  void vandps(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandps(dst, nds, src); }
-  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256)     { Assembler::vandpd(dst, nds, src, vector256); }
+  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
+
+  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
+  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256)     { Assembler::vandps(dst, nds, src, vector256); }
+  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);

   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
   void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
@@ -2565,12 +2682,12 @@
   // AVX Vector instructions

   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
-  void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
-  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);

   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
-  void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
-  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
+  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);

   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
     if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
@@ -2578,6 +2695,12 @@
     else
       Assembler::vxorpd(dst, nds, src, vector256);
   }
+  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+    if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
+      Assembler::vpxor(dst, nds, src, vector256);
+    else
+      Assembler::vxorpd(dst, nds, src, vector256);
+  }

   // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
--- a/src/cpu/x86/vm/c1_CodeStubs_x86.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/x86/vm/c1_CodeStubs_x86.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -488,68 +488,6 @@

 }

-void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
-  // At this point we know that offset == referent_offset.
-  //
-  // So we might have to emit:
-  //   if (src == null) goto continuation.
-  //
-  // and we definitely have to emit:
-  //   if (klass(src).reference_type == REF_NONE) goto continuation
-  //   if (!marking_active) goto continuation
-  //   if (pre_val == null) goto continuation
-  //   call pre_barrier(pre_val)
-  //   goto continuation
-  //
-  __ bind(_entry);
-
-  assert(src()->is_register(), "sanity");
-  Register src_reg = src()->as_register();
-
-  if (gen_src_check()) {
-    // The original src operand was not a constant.
-    // Generate src == null?
-    __ cmpptr(src_reg, (int32_t) NULL_WORD);
-    __ jcc(Assembler::equal, _continuation);
-  }
-
-  // Generate src->_klass->_reference_type == REF_NONE)?
-  assert(tmp()->is_register(), "sanity");
-  Register tmp_reg = tmp()->as_register();
-
-  __ load_klass(tmp_reg, src_reg);
-
-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
-  __ cmpb(ref_type_adr, REF_NONE);
-  __ jcc(Assembler::equal, _continuation);
-
-  // Is marking active?
-  assert(thread()->is_register(), "precondition");
-  Register thread_reg = thread()->as_pointer_register();
-
-  Address in_progress(thread_reg, in_bytes(JavaThread::satb_mark_queue_offset() +
-                                       PtrQueue::byte_offset_of_active()));
-
-  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
-    __ cmpl(in_progress, 0);
-  } else {
-    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
-    __ cmpb(in_progress, 0);
-  }
-  __ jcc(Assembler::equal, _continuation);
-
-  // val == null?
-  assert(val()->is_register(), "Precondition.");
-  Register val_reg = val()->as_register();
-
-  __ cmpptr(val_reg, (int32_t) NULL_WORD);
-  __ jcc(Assembler::equal, _continuation);
-
-  ce->store_parameter(val()->as_register(), 0);
-  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id)));
-  __ jmp(_continuation);
-}
-
 jbyte* G1PostBarrierStub::_byte_map_base = NULL;

 jbyte* G1PostBarrierStub::byte_map_base_slow() {
--- a/src/cpu/x86/vm/x86.ad	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/x86/vm/x86.ad	Fri Aug 24 19:45:42 2012 -0700
@@ -500,6 +500,24 @@
   0  /*bottom*/
 };

+const bool Matcher::match_rule_supported(int opcode) {
+  if (!has_match_rule(opcode))
+    return false;
+
+  switch (opcode) {
+    case Op_PopCountI:
+    case Op_PopCountL:
+      if (!UsePopCountInstruction)
+        return false;
+    case Op_MulVI:
+      if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
+        return false;
+    break;
+  }
+
+  return true;  // Per default match rules are supported.
+}
+
 // Max vector size in bytes. 0 if not supported.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
   assert(is_java_primitive(bt), "only primitive type vectors");
@@ -1439,8 +1457,9 @@
   ins_cost(150);
   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
   ins_encode %{
+    bool vector256 = false;
     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()));
+              ExternalAddress(float_signmask()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1464,8 +1483,9 @@
   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
             "# abs double by sign masking" %}
   ins_encode %{
+    bool vector256 = false;
     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()));
+              ExternalAddress(double_signmask()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1487,8 +1507,9 @@
   ins_cost(150);
   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
   ins_encode %{
+    bool vector256 = false;
     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signflip()));
+              ExternalAddress(float_signflip()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1512,8 +1533,9 @@
   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
+    bool vector256 = false;
     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signflip()));
+              ExternalAddress(double_signflip()), vector256);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2382,3 +2404,2416 @@
   ins_pipe( fpu_reg_reg );
 %}

+// ====================VECTOR ARITHMETIC=======================================
+
+// --------------------------------- ADD --------------------------------------
+
+// Bytes vector add
+instruct vadd4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVB dst src));
+  format %{ "paddb   $dst,$src\t! add packed4B" %}
+  ins_encode %{
+    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVB dst src));
+  format %{ "paddb   $dst,$src\t! add packed8B" %}
+  ins_encode %{
+    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (AddVB dst src));
+  format %{ "paddb   $dst,$src\t! add packed16B" %}
+  ins_encode %{
+    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Shorts/Chars vector add
+instruct vadd2S(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVS dst src));
+  format %{ "paddw   $dst,$src\t! add packed2S" %}
+  ins_encode %{
+    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4S(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVS dst src));
+  format %{ "paddw   $dst,$src\t! add packed4S" %}
+  ins_encode %{
+    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8S(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVS dst src));
+  format %{ "paddw   $dst,$src\t! add packed8S" %}
+  ins_encode %{
+    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector add
+instruct vadd2I(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI dst src));
+  format %{ "paddd   $dst,$src\t! add packed2I" %}
+  ins_encode %{
+    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4I(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVI dst src));
+  format %{ "paddd   $dst,$src\t! add packed4I" %}
+  ins_encode %{
+    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector add
+instruct vadd2L(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVL dst src));
+  format %{ "paddq   $dst,$src\t! add packed2L" %}
+  ins_encode %{
+    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVL src1 src2));
+  format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVL src (LoadVector mem)));
+  format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVL src1 src2));
+  format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVL src (LoadVector mem)));
+  format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Floats vector add
+instruct vadd2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVF dst src));
+  format %{ "addps   $dst,$src\t! add packed2F" %}
+  ins_encode %{
+    __ addps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVF dst src));
+  format %{ "addps   $dst,$src\t! add packed4F" %}
+  ins_encode %{
+    __ addps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector add
+instruct vadd2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVD dst src));
+  format %{ "addpd   $dst,$src\t! add packed2D" %}
+  ins_encode %{
+    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVD src1 src2));
+  format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVD src (LoadVector mem)));
+  format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVD src1 src2));
+  format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVD src (LoadVector mem)));
+  format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- SUB --------------------------------------
+
+// Bytes vector sub
+instruct vsub4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVB dst src));
+  format %{ "psubb   $dst,$src\t! sub packed4B" %}
+  ins_encode %{
+    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (SubVB dst src));
+  format %{ "psubb   $dst,$src\t! sub packed8B" %}
+  ins_encode %{
+    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (SubVB dst src));
+  format %{ "psubb   $dst,$src\t! sub packed16B" %}
+  ins_encode %{
+    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Shorts/Chars vector sub
+instruct vsub2S(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVS dst src));
+  format %{ "psubw   $dst,$src\t! sub packed2S" %}
+  ins_encode %{
+    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4S(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVS dst src));
+  format %{ "psubw   $dst,$src\t! sub packed4S" %}
+  ins_encode %{
+    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8S(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (SubVS dst src));
+  format %{ "psubw   $dst,$src\t! sub packed8S" %}
+  ins_encode %{
+    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector sub
+instruct vsub2I(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVI dst src));
+  format %{ "psubd   $dst,$src\t! sub packed2I" %}
+  ins_encode %{
+    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4I(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVI dst src));
+  format %{ "psubd   $dst,$src\t! sub packed4I" %}
+  ins_encode %{
+    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector sub
+instruct vsub2L(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVL dst src));
+  format %{ "psubq   $dst,$src\t! sub packed2L" %}
+  ins_encode %{
+    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVL src1 src2));
+  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVL src (LoadVector mem)));
+  format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVL src1 src2));
+  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVL src (LoadVector mem)));
+  format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Floats vector sub
+instruct vsub2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVF dst src));
+  format %{ "subps   $dst,$src\t! sub packed2F" %}
+  ins_encode %{
+    __ subps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVF dst src));
+  format %{ "subps   $dst,$src\t! sub packed4F" %}
+  ins_encode %{
+    __ subps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector sub
+instruct vsub2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVD dst src));
+  format %{ "subpd   $dst,$src\t! sub packed2D" %}
+  ins_encode %{
+    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVD src1 src2));
+  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SubVD src (LoadVector mem)));
+  format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVD src1 src2));
+  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SubVD src (LoadVector mem)));
+  format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- MUL --------------------------------------
+
+// Shorts/Chars vector mul
+instruct vmul2S(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVS dst src));
+  format %{ "pmullw $dst,$src\t! mul packed2S" %}
+  ins_encode %{
+    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4S(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVS dst src));
+  format %{ "pmullw  $dst,$src\t! mul packed4S" %}
+  ins_encode %{
+    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8S(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (MulVS dst src));
+  format %{ "pmullw  $dst,$src\t! mul packed8S" %}
+  ins_encode %{
+    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector mul (sse4_1)
+instruct vmul2I(vecD dst, vecD src) %{
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVI dst src));
+  format %{ "pmulld  $dst,$src\t! mul packed2I" %}
+  ins_encode %{
+    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I(vecX dst, vecX src) %{
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI dst src));
+  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
+  ins_encode %{
+    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Floats vector mul
+instruct vmul2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVF dst src));
+  format %{ "mulps   $dst,$src\t! mul packed2F" %}
+  ins_encode %{
+    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVF dst src));
+  format %{ "mulps   $dst,$src\t! mul packed4F" %}
+  ins_encode %{
+    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector mul
+instruct vmul2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVD dst src));
+  format %{ "mulpd   $dst,$src\t! mul packed2D" %}
+  ins_encode %{
+    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVD src1 src2));
+  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulVD src (LoadVector mem)));
+  format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVD src1 src2));
+  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulVD src (LoadVector mem)));
+  format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- DIV --------------------------------------
+
+// Floats vector div
+instruct vdiv2F(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVF dst src));
+  format %{ "divps   $dst,$src\t! div packed2F" %}
+  ins_encode %{
+    __ divps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv4F(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (DivVF dst src));
+  format %{ "divps   $dst,$src\t! div packed4F" %}
+  ins_encode %{
+    __ divps($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Doubles vector div
+instruct vdiv2D(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVD dst src));
+  format %{ "divpd   $dst,$src\t! div packed2D" %}
+  ins_encode %{
+    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVD src1 src2));
+  format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (DivVD src (LoadVector mem)));
+  format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVD src1 src2));
+  format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (DivVD src (LoadVector mem)));
+  format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ------------------------------ LeftShift -----------------------------------
+
+// Shorts/Chars vector left shift
+instruct vsll2S(vecS dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2S_imm(vecS dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2S_reg(vecS dst, vecS src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4S(vecD dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4S_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4S_reg(vecD dst, vecD src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS dst shift));
+  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll16S_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector left shift
+instruct vsll2I(vecD dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2I_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2I_reg(vecD dst, vecD src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI dst shift));
+  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8I_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector left shift
+instruct vsll2L(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL dst shift));
+  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2L_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL dst shift));
+  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    __ psllq($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2L_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4L_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ----------------------- LogicalRightShift -----------------------------------
+
+// Shorts/Chars vector logical right shift produces incorrect Java result
+// for negative data because java code convert short value into int with
+// sign extension before a shift.
+
+// Integers vector logical right shift
+instruct vsrl2I(vecD dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2I_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2I_reg(vecD dst, vecD src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI dst shift));
+  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8I_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Longs vector logical right shift
+instruct vsrl2L(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL dst shift));
+  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2L_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL dst shift));
+  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2L_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4L_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ------------------- ArithmeticRightShift -----------------------------------
+
+// Shorts/Chars vector arithmetic right shift
+instruct vsra2S(vecS dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2S_imm(vecS dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2S_reg(vecS dst, vecS src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S(vecD dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S_reg(vecD dst, vecD src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS dst shift));
+  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra16S_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integers vector arithmetic right shift
+instruct vsra2I(vecD dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2I_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2I_reg(vecD dst, vecD src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I(vecX dst, regF shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI dst shift));
+  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I_reg(vecX dst, vecX src, regF shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8I_reg(vecY dst, vecY src, regF shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// There are no longs vector arithmetic right shift instructions.
+
+
+// --------------------------------- AND --------------------------------------
+
+instruct vand4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (AndV dst src));
+  format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
+  ins_encode %{
+    __ pand($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV dst src));
+  format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    __ pand($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV dst src));
+  format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    __ pand($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- OR ---------------------------------------
+
+instruct vor4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV dst src));
+  format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
+  ins_encode %{
+    __ por($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV dst src));
+  format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    __ por($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV dst src));
+  format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    __ por($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- XOR --------------------------------------
+
+instruct vxor4B(vecS dst, vecS src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (XorV dst src));
+  format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor8B(vecD dst, vecD src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV dst src));
+  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor16B(vecX dst, vecX src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV dst src));
+  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
--- a/src/cpu/x86/vm/x86_32.ad	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/x86/vm/x86_32.ad	Fri Aug 24 19:45:42 2012 -0700
@@ -1367,22 +1367,6 @@
   return offset;
 }

-
-const bool Matcher::match_rule_supported(int opcode) {
-  if (!has_match_rule(opcode))
-    return false;
-
-  switch (opcode) {
-    case Op_PopCountI:
-    case Op_PopCountL:
-      if (!UsePopCountInstruction)
-        return false;
-    break;
-  }
-
-  return true;  // Per default match rules are supported.
-}
-
 int Matcher::regnum_to_fpu_offset(int regnum) {
   return regnum - 32; // The FP registers are in the second chunk
 }
--- a/src/cpu/x86/vm/x86_64.ad	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/cpu/x86/vm/x86_64.ad	Fri Aug 24 19:45:42 2012 -0700
@@ -1513,22 +1513,6 @@
   return offset;
 }

-
-const bool Matcher::match_rule_supported(int opcode) {
-  if (!has_match_rule(opcode))
-    return false;
-
-  switch (opcode) {
-    case Op_PopCountI:
-    case Op_PopCountL:
-      if (!UsePopCountInstruction)
-        return false;
-    break;
-  }
-
-  return true;  // Per default match rules are supported.
-}
-
 int Matcher::regnum_to_fpu_offset(int regnum)
 {
   return regnum - 32; // The FP registers are in the second chunk
@@ -6427,6 +6411,31 @@
   ins_pipe(ialu_reg_reg); // XXX
 %}

+// Convert oop into int for vectors alignment masking
+instruct convP2I(rRegI dst, rRegP src)
+%{
+  match(Set dst (ConvL2I (CastP2X src)));
+
+  format %{ "movl    $dst, $src\t# ptr -> int" %}
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg_reg); // XXX
+%}
+
+// Convert compressed oop into int for vectors alignment masking
+// in case of 32bit oops (heap < 4Gb).
+instruct convN2I(rRegI dst, rRegN src)
+%{
+  predicate(Universe::narrow_oop_shift() == 0);
+  match(Set dst (ConvL2I (CastP2X (DecodeN src))));
+
+  format %{ "movl    $dst, $src\t# compressed ptr -> int" %}
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg_reg); // XXX
+%}

 // Convert oop pointer into compressed form
 instruct encodeHeapOop(rRegN dst, rRegP src, rFlagsReg cr) %{
@@ -10049,11 +10058,10 @@
   ins_pipe( pipe_slow );
 %}

-// The next instructions have long latency and use Int unit. Set high cost.
 instruct MoveI2F_reg_reg(regF dst, rRegI src) %{
   match(Set dst (MoveI2F src));
   effect(DEF dst, USE src);
-  ins_cost(300);
+  ins_cost(100);
   format %{ "movd    $dst,$src\t# MoveI2F" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
@@ -10064,7 +10072,7 @@
 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
-  ins_cost(300);
+  ins_cost(100);
   format %{ "movd    $dst,$src\t# MoveL2D" %}
   ins_encode %{
      __ movdq($dst$$XMMRegister, $src$$Register);
--- a/src/share/vm/c1/c1_CodeStubs.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_CodeStubs.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -574,71 +574,6 @@
 #endif // PRODUCT
 };

-// This G1 barrier code stub is used in Unsafe.getObject.
-// It generates a sequence of guards around the SATB
-// barrier code that are used to detect when we have
-// the referent field of a Reference object.
-// The first check is assumed to have been generated
-// in the code generated for Unsafe.getObject().
-
-class G1UnsafeGetObjSATBBarrierStub: public CodeStub {
- private:
-  LIR_Opr _val;
-  LIR_Opr _src;
-
-  LIR_Opr _tmp;
-  LIR_Opr _thread;
-
-  bool _gen_src_check;
-
- public:
-  // A G1 barrier that is guarded by generated guards that determine whether
-  // val (which is the result of Unsafe.getObject() should be recorded in an
-  // SATB log buffer. We could be reading the referent field of a Reference object
-  // using Unsafe.getObject() and we need to record the referent.
-  //
-  // * val is the operand returned by the unsafe.getObject routine.
-  // * src is the base object
-  // * tmp is a temp used to load the klass of src, and then reference type
-  // * thread is the thread object.
-
-  G1UnsafeGetObjSATBBarrierStub(LIR_Opr val, LIR_Opr src,
-                                LIR_Opr tmp, LIR_Opr thread,
-                                bool gen_src_check) :
-    _val(val), _src(src),
-    _tmp(tmp), _thread(thread),
-    _gen_src_check(gen_src_check)
-  {
-    assert(_val->is_register(), "should have already been loaded");
-    assert(_src->is_register(), "should have already been loaded");
-
-    assert(_tmp->is_register(), "should be a temporary register");
-  }
-
-  LIR_Opr val() const { return _val; }
-  LIR_Opr src() const { return _src; }
-
-  LIR_Opr tmp() const { return _tmp; }
-  LIR_Opr thread() const { return _thread; }
-
-  bool gen_src_check() const { return _gen_src_check; }
-
-  virtual void emit_code(LIR_Assembler* e);
-
-  virtual void visit(LIR_OpVisitState* visitor) {
-    visitor->do_slow_case();
-    visitor->do_input(_val);
-    visitor->do_input(_src);
-    visitor->do_input(_thread);
-
-    visitor->do_temp(_tmp);
-  }
-
-#ifndef PRODUCT
-  virtual void print_name(outputStream* out) const { out->print("G1UnsafeGetObjSATBBarrierStub"); }
-#endif // PRODUCT
-};
-
 class G1PostBarrierStub: public CodeStub {
  private:
   LIR_Opr _addr;
--- a/src/share/vm/c1/c1_GraphBuilder.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_GraphBuilder.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1646,10 +1646,6 @@


 void GraphBuilder::invoke(Bytecodes::Code code) {
-  const bool has_receiver =
-    code == Bytecodes::_invokespecial   ||
-    code == Bytecodes::_invokevirtual   ||
-    code == Bytecodes::_invokeinterface;
   const bool is_invokedynamic = (code == Bytecodes::_invokedynamic);

   bool will_link;
@@ -1690,8 +1686,12 @@
   // convert them directly to an invokespecial or invokestatic.
   if (target->is_loaded() && !target->is_abstract() && target->can_be_statically_bound()) {
     switch (bc_raw) {
-    case Bytecodes::_invokevirtual:  code = Bytecodes::_invokespecial;  break;
-    case Bytecodes::_invokehandle:   code = Bytecodes::_invokestatic;   break;
+    case Bytecodes::_invokevirtual:
+      code = Bytecodes::_invokespecial;
+      break;
+    case Bytecodes::_invokehandle:
+      code = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokespecial;
+      break;
     }
   }

@@ -1878,11 +1878,13 @@
   // inlining not successful => standard invoke
   bool is_loaded = target->is_loaded();
   ValueType* result_type = as_ValueType(target->return_type());
-
-  // We require the debug info to be the "state before" because
-  // invokedynamics may deoptimize.
-  ValueStack* state_before = is_invokedynamic ? copy_state_before() : copy_state_exhandling();
-
+  ValueStack* state_before = copy_state_exhandling();
+
+  // The bytecode (code) might change in this method so we are checking this very late.
+  const bool has_receiver =
+    code == Bytecodes::_invokespecial   ||
+    code == Bytecodes::_invokevirtual   ||
+    code == Bytecodes::_invokeinterface;
   Values* args = state()->pop_arguments(target->arg_size_no_receiver());
   Value recv = has_receiver ? apop() : NULL;
   int vtable_index = methodOopDesc::invalid_vtable_index;
@@ -3058,7 +3060,7 @@

   case vmIntrinsics::_Reference_get:
     {
-      if (UseG1GC) {
+      {
         // With java.lang.ref.reference.get() we must go through the
         // intrinsic - when G1 is enabled - even when get() is the root
         // method of the compile so that, if necessary, the value in
@@ -3070,6 +3072,9 @@
         // object removed from the list of discovered references during
         // reference processing.

+        // Also we need intrinsic to prevent commoning reads from this field
+        // across safepoint since GC can change its value.
+
         // Set up a stream so that appending instructions works properly.
         ciBytecodeStream s(scope->method());
         s.reset_to_bci(0);
@@ -3226,7 +3231,6 @@


 bool GraphBuilder::try_inline_intrinsics(ciMethod* callee) {
-  if (!InlineNatives           ) INLINE_BAILOUT("intrinsic method inlining disabled");
   if (callee->is_synchronized()) {
     // We don't currently support any synchronized intrinsics
     return false;
@@ -3234,9 +3238,13 @@

   // callee seems like a good candidate
   // determine id
+  vmIntrinsics::ID id = callee->intrinsic_id();
+  if (!InlineNatives && id != vmIntrinsics::_Reference_get) {
+    // InlineNatives does not control Reference.get
+    INLINE_BAILOUT("intrinsic method inlining disabled");
+  }
   bool preserves_state = false;
   bool cantrap = true;
-  vmIntrinsics::ID id = callee->intrinsic_id();
   switch (id) {
     case vmIntrinsics::_arraycopy:
       if (!InlineArrayCopy) return false;
@@ -3376,11 +3384,10 @@
       return true;

     case vmIntrinsics::_Reference_get:
-      // It is only when G1 is enabled that we absolutely
-      // need to use the intrinsic version of Reference.get()
-      // so that the value in the referent field, if necessary,
-      // can be registered by the pre-barrier code.
-      if (!UseG1GC) return false;
+      // Use the intrinsic version of Reference.get() so that the value in
+      // the referent field can be registered by the G1 pre-barrier code.
+      // Also to prevent commoning reads from this field across safepoint
+      // since GC can change its value.
       preserves_state = true;
       break;
--- a/src/share/vm/c1/c1_Instruction.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_Instruction.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -369,9 +369,6 @@
   _signature = new BasicTypeList(number_of_arguments() + (has_receiver() ? 1 : 0));
   if (has_receiver()) {
     _signature->append(as_BasicType(receiver()->type()));
-  } else if (is_invokedynamic()) {
-    // Add the synthetic MethodHandle argument to the signature.
-    _signature->append(T_OBJECT);
   }
   for (int i = 0; i < number_of_arguments(); i++) {
     ValueType* t = argument_at(i)->type();
--- a/src/share/vm/c1/c1_LIRAssembler.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -448,10 +448,10 @@

   switch (op->code()) {
   case lir_static_call:
+  case lir_dynamic_call:
     call(op, relocInfo::static_call_type);
     break;
   case lir_optvirtual_call:
-  case lir_dynamic_call:
     call(op, relocInfo::opt_virtual_call_type);
     break;
   case lir_icvirtual_call:
@@ -460,7 +460,9 @@
   case lir_virtual_call:
     vtable_call(op);
     break;
-  default: ShouldNotReachHere();
+  default:
+    fatal(err_msg_res("unexpected op code: %s", op->name()));
+    break;
   }

   // JSR 292
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -920,7 +920,8 @@


 LIR_Opr LIRGenerator::force_to_spill(LIR_Opr value, BasicType t) {
-  assert(type2size[t] == type2size[value->type()], "size mismatch");
+  assert(type2size[t] == type2size[value->type()],
+         err_msg_res("size mismatch: t=%s, value->type()=%s", type2name(t), type2name(value->type())));
   if (!value->is_register()) {
     // force into a register
     LIR_Opr r = new_register(value->type());
@@ -2176,9 +2177,9 @@
   off.load_item();
   src.load_item();

-  LIR_Opr reg = rlock_result(x, x->basic_type());
-
-  get_Object_unsafe(reg, src.result(), off.result(), type, x->is_volatile());
+  LIR_Opr value = rlock_result(x, x->basic_type());
+
+  get_Object_unsafe(value, src.result(), off.result(), type, x->is_volatile());

 #ifndef SERIALGC
   // We might be reading the value of the referent field of a
@@ -2191,19 +2192,16 @@
   // if (offset == java_lang_ref_Reference::referent_offset) {
   //   if (src != NULL) {
   //     if (klass(src)->reference_type() != REF_NONE) {
-  //       pre_barrier(..., reg, ...);
+  //       pre_barrier(..., value, ...);
   //     }
   //   }
   // }
-  //
-  // The first non-constant check of either the offset or
-  // the src operand will be done here; the remainder
-  // will take place in the generated code stub.

   if (UseG1GC && type == T_OBJECT) {
-    bool gen_code_stub = true;       // Assume we need to generate the slow code stub.
-    bool gen_offset_check = true;       // Assume the code stub has to generate the offset guard.
-    bool gen_source_check = true;       // Assume the code stub has to check the src object for null.
+    bool gen_pre_barrier = true;     // Assume we need to generate pre_barrier.
+    bool gen_offset_check = true;    // Assume we need to generate the offset guard.
+    bool gen_source_check = true;    // Assume we need to check the src object for null.
+    bool gen_type_check = true;      // Assume we need to check the reference_type.

     if (off.is_constant()) {
       jlong off_con = (off.type()->is_int() ?
@@ -2215,7 +2213,7 @@
         // The constant offset is something other than referent_offset.
         // We can skip generating/checking the remaining guards and
         // skip generation of the code stub.
-        gen_code_stub = false;
+        gen_pre_barrier = false;
       } else {
         // The constant offset is the same as referent_offset -
         // we do not need to generate a runtime offset check.
@@ -2224,11 +2222,11 @@
     }

     // We don't need to generate stub if the source object is an array
-    if (gen_code_stub && src.type()->is_array()) {
-      gen_code_stub = false;
+    if (gen_pre_barrier && src.type()->is_array()) {
+      gen_pre_barrier = false;
     }

-    if (gen_code_stub) {
+    if (gen_pre_barrier) {
       // We still need to continue with the checks.
       if (src.is_constant()) {
         ciObject* src_con = src.get_jobject_constant();
@@ -2236,7 +2234,7 @@
         if (src_con->is_null_object()) {
           // The constant src object is null - We can skip
           // generating the code stub.
-          gen_code_stub = false;
+          gen_pre_barrier = false;
         } else {
           // Non-null constant source object. We still have to generate
           // the slow stub - but we don't need to generate the runtime
@@ -2245,20 +2243,28 @@
         }
       }
     }
-
-    if (gen_code_stub) {
-      // Temoraries.
-      LIR_Opr src_klass = new_register(T_OBJECT);
-
-      // Get the thread pointer for the pre-barrier
-      LIR_Opr thread = getThreadPointer();
-
-      CodeStub* stub;
+    if (gen_pre_barrier && !PatchALot) {
+      // Can the klass of object be statically determined to be
+      // a sub-class of Reference?
+      ciType* type = src.value()->declared_type();
+      if ((type != NULL) && type->is_loaded()) {
+        if (type->is_subtype_of(compilation()->env()->Reference_klass())) {
+          gen_type_check = false;
+        } else if (type->is_klass() &&
+                   !compilation()->env()->Object_klass()->is_subtype_of(type->as_klass())) {
+          // Not Reference and not Object klass.
+          gen_pre_barrier = false;
+        }
+      }
+    }
+
+    if (gen_pre_barrier) {
+      LabelObj* Lcont = new LabelObj();

       // We can have generate one runtime check here. Let's start with
       // the offset check.
       if (gen_offset_check) {
-        // if (offset == referent_offset) -> slow code stub
+        // if (offset != referent_offset) -> continue
         // If offset is an int then we can do the comparison with the
         // referent_offset constant; otherwise we need to move
         // referent_offset into a temporary register and generate
@@ -2273,43 +2279,36 @@
           referent_off = new_register(T_LONG);
           __ move(LIR_OprFact::longConst(java_lang_ref_Reference::referent_offset), referent_off);
         }
-
-        __ cmp(lir_cond_equal, off.result(), referent_off);
-
-        // Optionally generate "src == null" check.
-        stub = new G1UnsafeGetObjSATBBarrierStub(reg, src.result(),
-                                                    src_klass, thread,
-                                                    gen_source_check);
-
-        __ branch(lir_cond_equal, as_BasicType(off.type()), stub);
-      } else {
-        if (gen_source_check) {
-          // offset is a const and equals referent offset
-          // if (source != null) -> slow code stub
-          __ cmp(lir_cond_notEqual, src.result(), LIR_OprFact::oopConst(NULL));
-
-          // Since we are generating the "if src == null" guard here,
-          // there is no need to generate the "src == null" check again.
-          stub = new G1UnsafeGetObjSATBBarrierStub(reg, src.result(),
-                                                    src_klass, thread,
-                                                    false);
-
-          __ branch(lir_cond_notEqual, T_OBJECT, stub);
-        } else {
-          // We have statically determined that offset == referent_offset
-          // && src != null so we unconditionally branch to code stub
-          // to perform the guards and record reg in the SATB log buffer.
-
-          stub = new G1UnsafeGetObjSATBBarrierStub(reg, src.result(),
-                                                    src_klass, thread,
-                                                    false);
-
-          __ branch(lir_cond_always, T_ILLEGAL, stub);
-        }
+        __ cmp(lir_cond_notEqual, off.result(), referent_off);
+        __ branch(lir_cond_notEqual, as_BasicType(off.type()), Lcont->label());
+      }
+      if (gen_source_check) {
+        // offset is a const and equals referent offset
+        // if (source == null) -> continue
+        __ cmp(lir_cond_equal, src.result(), LIR_OprFact::oopConst(NULL));
+        __ branch(lir_cond_equal, T_OBJECT, Lcont->label());
       }
-
-      // Continuation point
-      __ branch_destination(stub->continuation());
+      LIR_Opr src_klass = new_register(T_OBJECT);
+      if (gen_type_check) {
+        // We have determined that offset == referent_offset && src != null.
+        // if (src->_klass->_reference_type == REF_NONE) -> continue
+        __ move(new LIR_Address(src.result(), oopDesc::klass_offset_in_bytes(), T_OBJECT), src_klass);
+        LIR_Address* reference_type_addr = new LIR_Address(src_klass, in_bytes(instanceKlass::reference_type_offset()), T_BYTE);
+        LIR_Opr reference_type = new_register(T_INT);
+        __ move(reference_type_addr, reference_type);
+        __ cmp(lir_cond_equal, reference_type, LIR_OprFact::intConst(REF_NONE));
+        __ branch(lir_cond_equal, T_INT, Lcont->label());
+      }
+      {
+        // We have determined that src->_klass->_reference_type != REF_NONE
+        // so register the value in the referent field with the pre-barrier.
+        pre_barrier(LIR_OprFact::illegalOpr /* addr_opr */,
+                    value  /* pre_val */,
+                    false  /* do_load */,
+                    false  /* patch */,
+                    NULL   /* info */);
+      }
+      __ branch_destination(Lcont->label());
     }
   }
 #endif // SERIALGC
@@ -2664,8 +2663,9 @@


 void LIRGenerator::invoke_load_arguments(Invoke* x, LIRItemList* args, const LIR_OprList* arg_list) {
-  int i = (x->has_receiver() || x->is_invokedynamic()) ? 1 : 0;
-  for (; i < args->length(); i++) {
+  assert(args->length() == arg_list->length(),
+         err_msg_res("args=%d, arg_list=%d", args->length(), arg_list->length()));
+  for (int i = x->has_receiver() ? 1 : 0; i < args->length(); i++) {
     LIRItem* param = args->at(i);
     LIR_Opr loc = arg_list->at(i);
     if (loc->is_register()) {
@@ -2705,15 +2705,9 @@
     LIRItem* receiver = new LIRItem(x->receiver(), this);
     argument_items->append(receiver);
   }
-  if (x->is_invokedynamic()) {
-    // Insert a dummy for the synthetic MethodHandle argument.
-    argument_items->append(NULL);
-  }
-  int idx = x->has_receiver() ? 1 : 0;
   for (int i = 0; i < x->number_of_arguments(); i++) {
     LIRItem* param = new LIRItem(x->argument_at(i), this);
     argument_items->append(param);
-    idx += (param->type()->is_double_word() ? 2 : 1);
   }
   return argument_items;
 }
@@ -2758,9 +2752,6 @@

   CodeEmitInfo* info = state_for(x, x->state());

-  // invokedynamics can deoptimize.
-  CodeEmitInfo* deopt_info = x->is_invokedynamic() ? state_for(x, x->state_before()) : NULL;
-
   invoke_load_arguments(x, args, arg_list);

   if (x->has_receiver()) {
@@ -2809,41 +2800,8 @@
       }
       break;
     case Bytecodes::_invokedynamic: {
-      ciBytecodeStream bcs(x->scope()->method());
-      bcs.force_bci(x->state()->bci());
-      assert(bcs.cur_bc() == Bytecodes::_invokedynamic, "wrong stream");
-      ciCPCache* cpcache = bcs.get_cpcache();
-
-      // Get CallSite offset from constant pool cache pointer.
-      int index = bcs.get_method_index();
-      size_t call_site_offset = cpcache->get_f1_offset(index);
-
-      // Load CallSite object from constant pool cache.
-      LIR_Opr call_site = new_register(objectType);
-      __ oop2reg(cpcache->constant_encoding(), call_site);
-      __ move_wide(new LIR_Address(call_site, call_site_offset, T_OBJECT), call_site);
-
-      // If this invokedynamic call site hasn't been executed yet in
-      // the interpreter, the CallSite object in the constant pool
-      // cache is still null and we need to deoptimize.
-      if (cpcache->is_f1_null_at(index)) {
-        // Only deoptimize if the CallSite object is still null; we don't
-        // recompile methods in C1 after deoptimization so this call site
-        // might be resolved the next time we execute it after OSR.
-        DeoptimizeStub* deopt_stub = new DeoptimizeStub(deopt_info);
-        __ cmp(lir_cond_equal, call_site, LIR_OprFact::oopConst(NULL));
-        __ branch(lir_cond_equal, T_OBJECT, deopt_stub);
-      }
-
-      // Use the receiver register for the synthetic MethodHandle
-      // argument.
-      receiver = LIR_Assembler::receiverOpr();
-
-      // Load target MethodHandle from CallSite object.
-      __ load(new LIR_Address(call_site, java_lang_invoke_CallSite::target_offset_in_bytes(), T_OBJECT), receiver);
-
       __ call_dynamic(target, receiver, result_register,
-                      SharedRuntime::get_resolve_opt_virtual_call_stub(),
+                      SharedRuntime::get_resolve_static_call_stub(),
                       arg_list, info);
       break;
     }
--- a/src/share/vm/c1/c1_ValueMap.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_ValueMap.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -190,7 +190,7 @@
   LoadField* lf = value->as_LoadField();                                                 \
   bool must_kill = lf != NULL                                                            \
                    && lf->field()->holder() == field->holder()                           \
-                   && lf->field()->offset() == field->offset();
+                   && (all_offsets || lf->field()->offset() == field->offset());

 #define MUST_KILL_EXCEPTION(must_kill, entry, value)                                     \
   assert(entry->nesting() < nesting(), "must not find bigger nesting than current");     \
@@ -205,7 +205,7 @@
   GENERIC_KILL_VALUE(MUST_KILL_ARRAY);
 }

-void ValueMap::kill_field(ciField* field) {
+void ValueMap::kill_field(ciField* field, bool all_offsets) {
   GENERIC_KILL_VALUE(MUST_KILL_FIELD);
 }

@@ -280,9 +280,9 @@
   ValueMap* value_map_of(BlockBegin* block)      { return _gvn->value_map_of(block); }

   // implementation for abstract methods of ValueNumberingVisitor
-  void      kill_memory()                        { _too_complicated_loop = true; }
-  void      kill_field(ciField* field)           { current_map()->kill_field(field); };
-  void      kill_array(ValueType* type)          { current_map()->kill_array(type); };
+  void      kill_memory()                                 { _too_complicated_loop = true; }
+  void      kill_field(ciField* field, bool all_offsets)  { current_map()->kill_field(field, all_offsets); };
+  void      kill_array(ValueType* type)                   { current_map()->kill_array(type); };

  public:
   ShortLoopOptimizer(GlobalValueNumbering* gvn)
--- a/src/share/vm/c1/c1_ValueMap.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/c1/c1_ValueMap.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -114,7 +114,7 @@
   Value find_insert(Value x);

   void kill_memory();
-  void kill_field(ciField* field);
+  void kill_field(ciField* field, bool all_offsets);
   void kill_array(ValueType* type);
   void kill_exception();
   void kill_map(ValueMap* map);
@@ -136,7 +136,7 @@
  protected:
   // called by visitor functions for instructions that kill values
   virtual void kill_memory() = 0;
-  virtual void kill_field(ciField* field) = 0;
+  virtual void kill_field(ciField* field, bool all_offsets) = 0;
   virtual void kill_array(ValueType* type) = 0;

   // visitor functions
@@ -148,7 +148,7 @@
         x->field()->is_volatile()) {
       kill_memory();
     } else {
-      kill_field(x->field());
+      kill_field(x->field(), x->needs_patching());
     }
   }
   void do_StoreIndexed   (StoreIndexed*    x) { kill_array(x->type()); }
@@ -214,9 +214,9 @@

  public:
   // implementation for abstract methods of ValueNumberingVisitor
-  void          kill_memory()                    { _map->kill_memory(); }
-  void          kill_field(ciField* field)       { _map->kill_field(field); }
-  void          kill_array(ValueType* type)      { _map->kill_array(type); }
+  void          kill_memory()                                 { _map->kill_memory(); }
+  void          kill_field(ciField* field, bool all_offsets)  { _map->kill_field(field, all_offsets); }
+  void          kill_array(ValueType* type)                   { _map->kill_array(type); }

   ValueNumberingEffects(ValueMap* map): _map(map) {}
 };
@@ -234,9 +234,9 @@
   void          set_value_map_of(BlockBegin* block, ValueMap* map)   { assert(value_map_of(block) == NULL, ""); _value_maps.at_put(block->linear_scan_number(), map); }

   // implementation for abstract methods of ValueNumberingVisitor
-  void          kill_memory()                    { current_map()->kill_memory(); }
-  void          kill_field(ciField* field)       { current_map()->kill_field(field); }
-  void          kill_array(ValueType* type)      { current_map()->kill_array(type); }
+  void          kill_memory()                                 { current_map()->kill_memory(); }
+  void          kill_field(ciField* field, bool all_offsets)  { current_map()->kill_field(field, all_offsets); }
+  void          kill_array(ValueType* type)                   { current_map()->kill_array(type); }

   // main entry point that performs global value numbering
   GlobalValueNumbering(IR* ir);
--- a/src/share/vm/gc_implementation/g1/g1BlockOffsetTable.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1BlockOffsetTable.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -159,14 +159,30 @@
            "right address out of range");
     assert(left  < right, "Heap addresses out of order");
     size_t num_cards = pointer_delta(right, left) >> LogN_words;
-    memset(&_offset_array[index_for(left)], offset, num_cards);
+    if (UseMemSetInBOT) {
+      memset(&_offset_array[index_for(left)], offset, num_cards);
+    } else {
+      size_t i = index_for(left);
+      const size_t end = i + num_cards;
+      for (; i < end; i++) {
+        _offset_array[i] = offset;
+      }
+    }
   }

   void set_offset_array(size_t left, size_t right, u_char offset) {
     assert(right < _vs.committed_size(), "right address out of range");
-    assert(left  <= right, "indexes out of order");
+    assert(left <= right, "indexes out of order");
     size_t num_cards = right - left + 1;
-    memset(&_offset_array[left], offset, num_cards);
+    if (UseMemSetInBOT) {
+      memset(&_offset_array[left], offset, num_cards);
+    } else {
+      size_t i = left;
+      const size_t end = i + num_cards;
+      for (; i < end; i++) {
+        _offset_array[i] = offset;
+      }
+    }
   }

   void check_offset_array(size_t index, HeapWord* high, HeapWord* low) const {
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1891,6 +1891,8 @@
   _young_list(new YoungList(this)),
   _gc_time_stamp(0),
   _retained_old_gc_alloc_region(NULL),
+  _survivor_plab_stats(YoungPLABSize, PLABWeight),
+  _old_plab_stats(OldPLABSize, PLABWeight),
   _expand_heap_after_alloc_failure(true),
   _surviving_young_words(NULL),
   _old_marking_cycles_started(0),
@@ -1932,6 +1934,14 @@
   clear_cset_start_regions();

   guarantee(_task_queues != NULL, "task_queues allocation failure.");
+#ifdef SPARC
+  // Issue a stern warning, but allow use for experimentation and debugging.
+  if (VM_Version::is_sun4v() && UseMemSetInBOT) {
+    assert(!FLAG_IS_DEFAULT(UseMemSetInBOT), "Error");
+    warning("Experimental flag -XX:+UseMemSetInBOT is known to cause instability"
+            " on sun4v; please understand that you are using at your own risk!");
+  }
+#endif
 }

 jint G1CollectedHeap::initialize() {
@@ -3580,15 +3590,11 @@
   DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
   size_t buffer_size = dcqs.buffer_size();
   size_t buffer_num = dcqs.completed_buffers_num();
-  return buffer_size * buffer_num + extra_cards;
-}
-
-size_t G1CollectedHeap::max_pending_card_num() {
-  DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
-  size_t buffer_size = dcqs.buffer_size();
-  size_t buffer_num  = dcqs.completed_buffers_num();
-  int thread_num  = Threads::number_of_threads();
-  return (buffer_num + thread_num) * buffer_size;
+
+  // PtrQueueSet::buffer_size() and PtrQueue:size() return sizes
+  // in bytes - not the number of 'entries'. We need to convert
+  // into a number of cards.
+  return (buffer_size * buffer_num + extra_cards) / oopSize;
 }

 size_t G1CollectedHeap::cards_scanned() {
@@ -4099,17 +4105,22 @@
   size_t gclab_word_size;
   switch (purpose) {
     case GCAllocForSurvived:
-      gclab_word_size = YoungPLABSize;
+      gclab_word_size = _survivor_plab_stats.desired_plab_sz();
       break;
     case GCAllocForTenured:
-      gclab_word_size = OldPLABSize;
+      gclab_word_size = _old_plab_stats.desired_plab_sz();
       break;
     default:
       assert(false, "unknown GCAllocPurpose");
-      gclab_word_size = OldPLABSize;
+      gclab_word_size = _old_plab_stats.desired_plab_sz();
       break;
   }
-  return gclab_word_size;
+
+  // Prevent humongous PLAB sizes for two reasons:
+  // * PLABs are allocated using a similar paths as oops, but should
+  //   never be in a humongous region
+  // * Allowing humongous PLABs needlessly churns the region free lists
+  return MIN2(_humongous_object_threshold_in_words, gclab_word_size);
 }

 void G1CollectedHeap::init_mutator_alloc_region() {
@@ -4165,6 +4176,11 @@
   // want either way so no reason to check explicitly for either
   // condition.
   _retained_old_gc_alloc_region = _old_gc_alloc_region.release();
+
+  if (ResizePLAB) {
+    _survivor_plab_stats.adjust_desired_plab_sz();
+    _old_plab_stats.adjust_desired_plab_sz();
+  }
 }

 void G1CollectedHeap::abandon_gc_alloc_regions() {
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -33,7 +33,7 @@
 #include "gc_implementation/g1/heapRegionSeq.hpp"
 #include "gc_implementation/g1/heapRegionSets.hpp"
 #include "gc_implementation/shared/hSpaceCounters.hpp"
-#include "gc_implementation/parNew/parGCAllocBuffer.hpp"
+#include "gc_implementation/shared/parGCAllocBuffer.hpp"
 #include "memory/barrierSet.hpp"
 #include "memory/memRegion.hpp"
 #include "memory/sharedHeap.hpp"
@@ -278,10 +278,33 @@
   // survivor objects.
   SurvivorGCAllocRegion _survivor_gc_alloc_region;

+  // PLAB sizing policy for survivors.
+  PLABStats _survivor_plab_stats;
+
   // Alloc region used to satisfy allocation requests by the GC for
   // old objects.
   OldGCAllocRegion _old_gc_alloc_region;

+  // PLAB sizing policy for tenured objects.
+  PLABStats _old_plab_stats;
+
+  PLABStats* stats_for_purpose(GCAllocPurpose purpose) {
+    PLABStats* stats = NULL;
+
+    switch (purpose) {
+    case GCAllocForSurvived:
+      stats = &_survivor_plab_stats;
+      break;
+    case GCAllocForTenured:
+      stats = &_old_plab_stats;
+      break;
+    default:
+      assert(false, "unrecognized GCAllocPurpose");
+    }
+
+    return stats;
+  }
+
   // The last old region we allocated to during the last GC.
   // Typically, it is not full so we should re-use it during the next GC.
   HeapRegion* _retained_old_gc_alloc_region;
@@ -314,7 +337,7 @@
   G1MonitoringSupport* _g1mm;

   // Determines PLAB size for a particular allocation purpose.
-  static size_t desired_plab_sz(GCAllocPurpose purpose);
+  size_t desired_plab_sz(GCAllocPurpose purpose);

   // Outside of GC pauses, the number of bytes used in all regions other
   // than the current allocation region.
@@ -1683,7 +1706,6 @@
   void stop_conc_gc_threads();

   size_t pending_card_num();
-  size_t max_pending_card_num();
   size_t cards_scanned();

 protected:
@@ -1811,19 +1833,19 @@
   }

   HeapWord* allocate_slow(GCAllocPurpose purpose, size_t word_sz) {
-
     HeapWord* obj = NULL;
     size_t gclab_word_size = _g1h->desired_plab_sz(purpose);
     if (word_sz * 100 < gclab_word_size * ParallelGCBufferWastePct) {
       G1ParGCAllocBuffer* alloc_buf = alloc_buffer(purpose);
-      assert(gclab_word_size == alloc_buf->word_sz(),
-             "dynamic resizing is not supported");
       add_to_alloc_buffer_waste(alloc_buf->words_remaining());
-      alloc_buf->retire(false, false);
+      alloc_buf->flush_stats_and_retire(_g1h->stats_for_purpose(purpose),
+                                        false /* end_of_gc */,
+                                        false /* retain */);

       HeapWord* buf = _g1h->par_allocate_during_gc(purpose, gclab_word_size);
       if (buf == NULL) return NULL; // Let caller handle allocation failure.
       // Otherwise.
+      alloc_buf->set_word_size(gclab_word_size);
       alloc_buf->set_buf(buf);

       obj = alloc_buf->allocate(word_sz);
@@ -1908,7 +1930,9 @@
     for (int ap = 0; ap < GCAllocPurposeCount; ++ap) {
       size_t waste = _alloc_buffers[ap]->words_remaining();
       add_to_alloc_buffer_waste(waste);
-      _alloc_buffers[ap]->retire(true, false);
+      _alloc_buffers[ap]->flush_stats_and_retire(_g1h->stats_for_purpose((GCAllocPurpose)ap),
+                                                 true /* end_of_gc */,
+                                                 false /* retain */);
     }
   }
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -90,7 +90,6 @@

   _alloc_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
   _prev_collection_pause_end_ms(0.0),
-  _pending_card_diff_seq(new TruncatedSeq(TruncatedSeqLength)),
   _rs_length_diff_seq(new TruncatedSeq(TruncatedSeqLength)),
   _cost_per_card_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
   _young_cards_per_entry_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
@@ -197,7 +196,6 @@

   int index = MIN2(_parallel_gc_threads - 1, 7);

-  _pending_card_diff_seq->add(0.0);
   _rs_length_diff_seq->add(rs_length_diff_defaults[index]);
   _cost_per_card_ms_seq->add(cost_per_card_ms_defaults[index]);
   _young_cards_per_entry_ratio_seq->add(
@@ -657,7 +655,7 @@
   for (HeapRegion * r = _recorded_survivor_head;
        r != NULL && r != _recorded_survivor_tail->get_next_young_region();
        r = r->get_next_young_region()) {
-    survivor_regions_evac_time += predict_region_elapsed_time_ms(r, true);
+    survivor_regions_evac_time += predict_region_elapsed_time_ms(r, gcs_are_young());
   }
   return survivor_regions_evac_time;
 }
@@ -801,9 +799,8 @@
   _cur_collection_pause_used_at_start_bytes = start_used;
   _cur_collection_pause_used_regions_at_start = _g1->used_regions();
   _pending_cards = _g1->pending_card_num();
-  _max_pending_cards = _g1->max_pending_card_num();

-  _bytes_in_collection_set_before_gc = 0;
+  _collection_set_bytes_used_before = 0;
   _bytes_copied_during_gc = 0;

   YoungList* young_list = _g1->young_list();
@@ -1036,12 +1033,6 @@
   // do that for any other surv rate groupsx

   if (update_stats) {
-    size_t diff = 0;
-    if (_max_pending_cards >= _pending_cards) {
-      diff = _max_pending_cards - _pending_cards;
-    }
-    _pending_card_diff_seq->add((double) diff);
-
     double cost_per_card_ms = 0.0;
     if (_pending_cards > 0) {
       cost_per_card_ms = phase_times()->_update_rs_time / (double) _pending_cards;
@@ -1126,9 +1117,9 @@
     _constant_other_time_ms_seq->add(constant_other_time_ms);

     double survival_ratio = 0.0;
-    if (_bytes_in_collection_set_before_gc > 0) {
+    if (_collection_set_bytes_used_before > 0) {
       survival_ratio = (double) _bytes_copied_during_gc /
-                                   (double) _bytes_in_collection_set_before_gc;
+                                   (double) _collection_set_bytes_used_before;
     }

     _pending_cards_seq->add((double) _pending_cards);
@@ -1229,18 +1220,6 @@
 }

 double
-G1CollectorPolicy::predict_base_elapsed_time_ms(size_t pending_cards) {
-  size_t rs_length = predict_rs_length_diff();
-  size_t card_num;
-  if (gcs_are_young()) {
-    card_num = predict_young_card_num(rs_length);
-  } else {
-    card_num = predict_non_young_card_num(rs_length);
-  }
-  return predict_base_elapsed_time_ms(pending_cards, card_num);
-}
-
-double
 G1CollectorPolicy::predict_base_elapsed_time_ms(size_t pending_cards,
                                                 size_t scanned_cards) {
   return
@@ -1250,27 +1229,15 @@
 }

 double
-G1CollectorPolicy::predict_region_elapsed_time_ms(HeapRegion* hr,
-                                                  bool young) {
-  size_t rs_length = hr->rem_set()->occupied();
+G1CollectorPolicy::predict_base_elapsed_time_ms(size_t pending_cards) {
+  size_t rs_length = predict_rs_length_diff();
   size_t card_num;
   if (gcs_are_young()) {
     card_num = predict_young_card_num(rs_length);
   } else {
     card_num = predict_non_young_card_num(rs_length);
   }
-  size_t bytes_to_copy = predict_bytes_to_copy(hr);
-
-  double region_elapsed_time_ms =
-    predict_rs_scan_time_ms(card_num) +
-    predict_object_copy_time_ms(bytes_to_copy);
-
-  if (young)
-    region_elapsed_time_ms += predict_young_other_time_ms(1);
-  else
-    region_elapsed_time_ms += predict_non_young_other_time_ms(1);
-
-  return region_elapsed_time_ms;
+  return predict_base_elapsed_time_ms(pending_cards, card_num);
 }

 size_t G1CollectorPolicy::predict_bytes_to_copy(HeapRegion* hr) {
@@ -1286,6 +1253,35 @@
   return bytes_to_copy;
 }

+double
+G1CollectorPolicy::predict_region_elapsed_time_ms(HeapRegion* hr,
+                                                  bool for_young_gc) {
+  size_t rs_length = hr->rem_set()->occupied();
+  size_t card_num;
+
+  // Predicting the number of cards is based on which type of GC
+  // we're predicting for.
+  if (for_young_gc) {
+    card_num = predict_young_card_num(rs_length);
+  } else {
+    card_num = predict_non_young_card_num(rs_length);
+  }
+  size_t bytes_to_copy = predict_bytes_to_copy(hr);
+
+  double region_elapsed_time_ms =
+    predict_rs_scan_time_ms(card_num) +
+    predict_object_copy_time_ms(bytes_to_copy);
+
+  // The prediction of the "other" time for this region is based
+  // upon the region type and NOT the GC type.
+  if (hr->is_young()) {
+    region_elapsed_time_ms += predict_young_other_time_ms(1);
+  } else {
+    region_elapsed_time_ms += predict_non_young_other_time_ms(1);
+  }
+  return region_elapsed_time_ms;
+}
+
 void
 G1CollectorPolicy::init_cset_region_lengths(uint eden_cset_region_length,
                                             uint survivor_cset_region_length) {
@@ -1342,22 +1338,6 @@
   }
 }

-class CountCSClosure: public HeapRegionClosure {
-  G1CollectorPolicy* _g1_policy;
-public:
-  CountCSClosure(G1CollectorPolicy* g1_policy) :
-    _g1_policy(g1_policy) {}
-  bool doHeapRegion(HeapRegion* r) {
-    _g1_policy->_bytes_in_collection_set_before_gc += r->used();
-    return false;
-  }
-};
-
-void G1CollectorPolicy::count_CS_bytes_used() {
-  CountCSClosure cs_closure(this);
-  _g1->collection_set_iterate(&cs_closure);
-}
-
 void G1CollectorPolicy::print_tracing_info() const {
   _trace_gen0_time_data.print();
   _trace_gen1_time_data.print();
@@ -1696,7 +1676,7 @@
   // retiring the current allocation region) or a concurrent
   // refine thread (RSet sampling).

-  double region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, true);
+  double region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, gcs_are_young());
   size_t used_bytes = hr->used();
   _inc_cset_recorded_rs_lengths += rs_length;
   _inc_cset_predicted_elapsed_time_ms += region_elapsed_time_ms;
@@ -1731,7 +1711,7 @@
   _inc_cset_recorded_rs_lengths_diffs += rs_lengths_diff;

   double old_elapsed_time_ms = hr->predicted_elapsed_time_ms();
-  double new_region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, true);
+  double new_region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, gcs_are_young());
   double elapsed_ms_diff = new_region_elapsed_time_ms - old_elapsed_time_ms;
   _inc_cset_predicted_elapsed_time_ms_diffs += elapsed_ms_diff;

@@ -1854,8 +1834,7 @@
 }

 void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
-  // Set this here - in case we're not doing young collections.
-  double non_young_start_time_sec = os::elapsedTime();
+  double young_start_time_sec = os::elapsedTime();

   YoungList* young_list = _g1->young_list();
   finalize_incremental_cset_building();
@@ -1869,17 +1848,14 @@
   double predicted_pause_time_ms = base_time_ms;
   double time_remaining_ms = target_pause_time_ms - base_time_ms;

-  ergo_verbose3(ErgoCSetConstruction | ErgoHigh,
+  ergo_verbose4(ErgoCSetConstruction | ErgoHigh,
                 "start choosing CSet",
+                ergo_format_size("_pending_cards")
                 ergo_format_ms("predicted base time")
                 ergo_format_ms("remaining time")
                 ergo_format_ms("target pause time"),
-                base_time_ms, time_remaining_ms, target_pause_time_ms);
+                _pending_cards, base_time_ms, time_remaining_ms, target_pause_time_ms);

-  HeapRegion* hr;
-  double young_start_time_sec = os::elapsedTime();
-
-  _collection_set_bytes_used_before = 0;
   _last_gc_was_young = gcs_are_young() ? true : false;

   if (_last_gc_was_young) {
@@ -1895,7 +1871,8 @@
   uint survivor_region_length = young_list->survivor_length();
   uint eden_region_length = young_list->length() - survivor_region_length;
   init_cset_region_lengths(eden_region_length, survivor_region_length);
-  hr = young_list->first_survivor_region();
+
+  HeapRegion* hr = young_list->first_survivor_region();
   while (hr != NULL) {
     assert(hr->is_survivor(), "badly formed young list");
     hr->set_young();
@@ -1926,8 +1903,8 @@
   phase_times()->_recorded_young_cset_choice_time_ms =
     (young_end_time_sec - young_start_time_sec) * 1000.0;

-  // We are doing young collections so reset this.
-  non_young_start_time_sec = young_end_time_sec;
+  // Set the start of the non-young choice time.
+  double non_young_start_time_sec = young_end_time_sec;

   if (!gcs_are_young()) {
     CollectionSetChooser* cset_chooser = _collectionSetChooser;
@@ -1937,6 +1914,7 @@

     uint expensive_region_num = 0;
     bool check_time_remaining = adaptive_young_list_length();
+
     HeapRegion* hr = cset_chooser->peek();
     while (hr != NULL) {
       if (old_cset_region_length() >= max_old_cset_length) {
@@ -1950,7 +1928,7 @@
         break;
       }

-      double predicted_time_ms = predict_region_elapsed_time_ms(hr, false);
+      double predicted_time_ms = predict_region_elapsed_time_ms(hr, gcs_are_young());
       if (check_time_remaining) {
         if (predicted_time_ms > time_remaining_ms) {
           // Too expensive for the current CSet.
@@ -2025,8 +2003,6 @@

   stop_incremental_cset_building();

-  count_CS_bytes_used();
-
   ergo_verbose5(ErgoCSetConstruction,
                 "finish choosing CSet",
                 ergo_format_region("eden")
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -228,7 +228,6 @@
   TruncatedSeq* _alloc_rate_ms_seq;
   double        _prev_collection_pause_end_ms;

-  TruncatedSeq* _pending_card_diff_seq;
   TruncatedSeq* _rs_length_diff_seq;
   TruncatedSeq* _cost_per_card_ms_seq;
   TruncatedSeq* _young_cards_per_entry_ratio_seq;
@@ -295,7 +294,6 @@
   double _pause_time_target_ms;

   size_t _pending_cards;
-  size_t _max_pending_cards;

 public:
   // Accessors
@@ -325,28 +323,6 @@
     _max_rs_lengths = rs_lengths;
   }

-  size_t predict_pending_card_diff() {
-    double prediction = get_new_neg_prediction(_pending_card_diff_seq);
-    if (prediction < 0.00001) {
-      return 0;
-    } else {
-      return (size_t) prediction;
-    }
-  }
-
-  size_t predict_pending_cards() {
-    size_t max_pending_card_num = _g1->max_pending_card_num();
-    size_t diff = predict_pending_card_diff();
-    size_t prediction;
-    if (diff > max_pending_card_num) {
-      prediction = max_pending_card_num;
-    } else {
-      prediction = max_pending_card_num - diff;
-    }
-
-    return prediction;
-  }
-
   size_t predict_rs_length_diff() {
     return (size_t) get_new_prediction(_rs_length_diff_seq);
   }
@@ -439,7 +415,7 @@
   double predict_base_elapsed_time_ms(size_t pending_cards,
                                       size_t scanned_cards);
   size_t predict_bytes_to_copy(HeapRegion* hr);
-  double predict_region_elapsed_time_ms(HeapRegion* hr, bool young);
+  double predict_region_elapsed_time_ms(HeapRegion* hr, bool for_young_gc);

   void set_recorded_rs_lengths(size_t rs_lengths);

@@ -495,12 +471,6 @@
   }

 private:
-  size_t _bytes_in_collection_set_before_gc;
-  size_t _bytes_copied_during_gc;
-
-  // Used to count used bytes in CS.
-  friend class CountCSClosure;
-
   // Statistics kept per GC stoppage, pause or full.
   TruncatedSeq* _recent_prev_end_times_for_all_gcs_sec;

@@ -514,9 +484,13 @@

   // The number of bytes in the collection set before the pause. Set from
   // the incrementally built collection set at the start of an evacuation
-  // pause.
+  // pause, and incremented in finalize_cset() when adding old regions
+  // (if any) to the collection set.
   size_t _collection_set_bytes_used_before;

+  // The number of bytes copied during the GC.
+  size_t _bytes_copied_during_gc;
+
   // The associated information that is maintained while the incremental
   // collection set is being built with young regions. Used to populate
   // the recorded info for the evacuation pause.
@@ -646,9 +620,6 @@
   bool predict_will_fit(uint young_length, double base_time_ms,
                         uint base_free_regions, double target_pause_time_ms);

-  // Count the number of bytes used in the CS.
-  void count_CS_bytes_used();
-
 public:

   G1CollectorPolicy();
@@ -666,10 +637,6 @@
   // higher, recalculate the young list target length prediction.
   void revise_young_list_target_length_if_necessary();

-  size_t bytes_in_collection_set() {
-    return _bytes_in_collection_set_before_gc;
-  }
-
   // This should be called after the heap is resized.
   void record_new_heap_size(uint new_number_of_regions);
--- a/src/share/vm/gc_implementation/g1/g1ErgoVerbose.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1ErgoVerbose.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -125,6 +125,7 @@
 #define ergo_format_double(_name_)   ", " _name_ ": %1.2f"
 #define ergo_format_perc(_name_)     ", " _name_ ": %1.2f %%"
 #define ergo_format_ms(_name_)       ", " _name_ ": %1.2f ms"
+#define ergo_format_size(_name_)     ", " _name_ ": "SIZE_FORMAT

 // Double parameter format strings
 #define ergo_format_byte_perc(_name_)                                   \
--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -287,17 +287,17 @@
           "The number of times we'll force an overflow during "             \
           "concurrent marking")                                             \
                                                                             \
-  develop(uintx, G1DefaultMinNewGenPercent, 20,                             \
+  experimental(uintx, G1DefaultMinNewGenPercent, 20,                        \
           "Percentage (0-100) of the heap size to use as minimum "          \
           "young gen size.")                                                \
                                                                             \
-  develop(uintx, G1DefaultMaxNewGenPercent, 80,                             \
+  experimental(uintx, G1DefaultMaxNewGenPercent, 80,                        \
           "Percentage (0-100) of the heap size to use as maximum "          \
           "young gen size.")                                                \
                                                                             \
-  develop(uintx, G1OldCSetRegionLiveThresholdPercent, 90,                   \
+  experimental(uintx, G1OldCSetRegionLiveThresholdPercent, 90,              \
           "Threshold for regions to be added to the collection set. "       \
-          "Regions with more live bytes that this will not be collected.")  \
+          "Regions with more live bytes than this will not be collected.")  \
                                                                             \
   product(uintx, G1HeapWastePercent, 5,                                     \
           "Amount of space, expressed as a percentage of the heap size, "   \
@@ -306,7 +306,7 @@
   product(uintx, G1MixedGCCountTarget, 4,                                   \
           "The target number of mixed GCs after a marking cycle.")          \
                                                                             \
-  develop(uintx, G1OldCSetRegionThresholdPercent, 10,                       \
+  experimental(uintx, G1OldCSetRegionThresholdPercent, 10,                  \
           "An upper bound for the number of old CSet regions expressed "    \
           "as a percentage of the heap size.")                              \
                                                                             \
--- a/src/share/vm/gc_implementation/g1/heapRegion.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/g1/heapRegion.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -384,10 +384,17 @@
 }

 void HeapRegion::calc_gc_efficiency() {
+  // GC efficiency is the ratio of how much space would be
+  // reclaimed over how long we predict it would take to reclaim it.
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
   G1CollectorPolicy* g1p = g1h->g1_policy();
-  _gc_efficiency = (double) reclaimable_bytes() /
-                            g1p->predict_region_elapsed_time_ms(this, false);
+
+  // Retrieve a prediction of the elapsed time for this region for
+  // a mixed gc because the region will only be evacuated during a
+  // mixed gc.
+  double region_elapsed_time_ms =
+    g1p->predict_region_elapsed_time_ms(this, false /* for_young_gc */);
+  _gc_efficiency = (double) reclaimable_bytes() / region_elapsed_time_ms;
 }

 void HeapRegion::set_startsHumongous(HeapWord* new_top, HeapWord* new_end) {
--- a/src/share/vm/gc_implementation/parNew/parGCAllocBuffer.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#include "precompiled.hpp"
-#include "gc_implementation/parNew/parGCAllocBuffer.hpp"
-#include "memory/sharedHeap.hpp"
-#include "oops/arrayOop.hpp"
-#include "oops/oop.inline.hpp"
-
-ParGCAllocBuffer::ParGCAllocBuffer(size_t desired_plab_sz_) :
-  _word_sz(desired_plab_sz_), _bottom(NULL), _top(NULL),
-  _end(NULL), _hard_end(NULL),
-  _retained(false), _retained_filler(),
-  _allocated(0), _wasted(0)
-{
-  assert (min_size() > AlignmentReserve, "Inconsistency!");
-  // arrayOopDesc::header_size depends on command line initialization.
-  FillerHeaderSize = align_object_size(arrayOopDesc::header_size(T_INT));
-  AlignmentReserve = oopDesc::header_size() > MinObjAlignment ? FillerHeaderSize : 0;
-}
-
-size_t ParGCAllocBuffer::FillerHeaderSize;
-
-// If the minimum object size is greater than MinObjAlignment, we can
-// end up with a shard at the end of the buffer that's smaller than
-// the smallest object.  We can't allow that because the buffer must
-// look like it's full of objects when we retire it, so we make
-// sure we have enough space for a filler int array object.
-size_t ParGCAllocBuffer::AlignmentReserve;
-
-void ParGCAllocBuffer::retire(bool end_of_gc, bool retain) {
-  assert(!retain || end_of_gc, "Can only retain at GC end.");
-  if (_retained) {
-    // If the buffer had been retained shorten the previous filler object.
-    assert(_retained_filler.end() <= _top, "INVARIANT");
-    CollectedHeap::fill_with_object(_retained_filler);
-    // Wasted space book-keeping, otherwise (normally) done in invalidate()
-    _wasted += _retained_filler.word_size();
-    _retained = false;
-  }
-  assert(!end_of_gc || !_retained, "At this point, end_of_gc ==> !_retained.");
-  if (_top < _hard_end) {
-    CollectedHeap::fill_with_object(_top, _hard_end);
-    if (!retain) {
-      invalidate();
-    } else {
-      // Is there wasted space we'd like to retain for the next GC?
-      if (pointer_delta(_end, _top) > FillerHeaderSize) {
-        _retained = true;
-        _retained_filler = MemRegion(_top, FillerHeaderSize);
-        _top = _top + FillerHeaderSize;
-      } else {
-        invalidate();
-      }
-    }
-  }
-}
-
-void ParGCAllocBuffer::flush_stats(PLABStats* stats) {
-  assert(ResizePLAB, "Wasted work");
-  stats->add_allocated(_allocated);
-  stats->add_wasted(_wasted);
-  stats->add_unused(pointer_delta(_end, _top));
-}
-
-// Compute desired plab size and latch result for later
-// use. This should be called once at the end of parallel
-// scavenge; it clears the sensor accumulators.
-void PLABStats::adjust_desired_plab_sz() {
-  assert(ResizePLAB, "Not set");
-  if (_allocated == 0) {
-    assert(_unused == 0, "Inconsistency in PLAB stats");
-    _allocated = 1;
-  }
-  double wasted_frac    = (double)_unused/(double)_allocated;
-  size_t target_refills = (size_t)((wasted_frac*TargetSurvivorRatio)/
-                                   TargetPLABWastePct);
-  if (target_refills == 0) {
-    target_refills = 1;
-  }
-  _used = _allocated - _wasted - _unused;
-  size_t plab_sz = _used/(target_refills*ParallelGCThreads);
-  if (PrintPLAB) gclog_or_tty->print(" (plab_sz = %d ", plab_sz);
-  // Take historical weighted average
-  _filter.sample(plab_sz);
-  // Clip from above and below, and align to object boundary
-  plab_sz = MAX2(min_size(), (size_t)_filter.average());
-  plab_sz = MIN2(max_size(), plab_sz);
-  plab_sz = align_object_size(plab_sz);
-  // Latch the result
-  if (PrintPLAB) gclog_or_tty->print(" desired_plab_sz = %d) ", plab_sz);
-  if (ResizePLAB) {
-    _desired_plab_sz = plab_sz;
-  }
-  // Now clear the accumulators for next round:
-  // note this needs to be fixed in the case where we
-  // are retaining across scavenges. FIX ME !!! XXX
-  _allocated = 0;
-  _wasted    = 0;
-  _unused    = 0;
-}
-
-#ifndef PRODUCT
-void ParGCAllocBuffer::print() {
-  gclog_or_tty->print("parGCAllocBuffer: _bottom: %p  _top: %p  _end: %p  _hard_end: %p"
-             "_retained: %c _retained_filler: [%p,%p)\n",
-             _bottom, _top, _end, _hard_end,
-             "FT"[_retained], _retained_filler.start(), _retained_filler.end());
-}
-#endif // !PRODUCT
-
-const size_t ParGCAllocBufferWithBOT::ChunkSizeInWords =
-MIN2(CardTableModRefBS::par_chunk_heapword_alignment(),
-     ((size_t)Generation::GenGrain)/HeapWordSize);
-const size_t ParGCAllocBufferWithBOT::ChunkSizeInBytes =
-MIN2(CardTableModRefBS::par_chunk_heapword_alignment() * HeapWordSize,
-     (size_t)Generation::GenGrain);
-
-ParGCAllocBufferWithBOT::ParGCAllocBufferWithBOT(size_t word_sz,
-                                                 BlockOffsetSharedArray* bsa) :
-  ParGCAllocBuffer(word_sz),
-  _bsa(bsa),
-  _bt(bsa, MemRegion(_bottom, _hard_end)),
-  _true_end(_hard_end)
-{}
-
-// The buffer comes with its own BOT, with a shared (obviously) underlying
-// BlockOffsetSharedArray. We manipulate this BOT in the normal way
-// as we would for any contiguous space. However, on accasion we
-// need to do some buffer surgery at the extremities before we
-// start using the body of the buffer for allocations. Such surgery
-// (as explained elsewhere) is to prevent allocation on a card that
-// is in the process of being walked concurrently by another GC thread.
-// When such surgery happens at a point that is far removed (to the
-// right of the current allocation point, top), we use the "contig"
-// parameter below to directly manipulate the shared array without
-// modifying the _next_threshold state in the BOT.
-void ParGCAllocBufferWithBOT::fill_region_with_block(MemRegion mr,
-                                                     bool contig) {
-  CollectedHeap::fill_with_object(mr);
-  if (contig) {
-    _bt.alloc_block(mr.start(), mr.end());
-  } else {
-    _bt.BlockOffsetArray::alloc_block(mr.start(), mr.end());
-  }
-}
-
-HeapWord* ParGCAllocBufferWithBOT::allocate_slow(size_t word_sz) {
-  HeapWord* res = NULL;
-  if (_true_end > _hard_end) {
-    assert((HeapWord*)align_size_down(intptr_t(_hard_end),
-                                      ChunkSizeInBytes) == _hard_end,
-           "or else _true_end should be equal to _hard_end");
-    assert(_retained, "or else _true_end should be equal to _hard_end");
-    assert(_retained_filler.end() <= _top, "INVARIANT");
-    CollectedHeap::fill_with_object(_retained_filler);
-    if (_top < _hard_end) {
-      fill_region_with_block(MemRegion(_top, _hard_end), true);
-    }
-    HeapWord* next_hard_end = MIN2(_true_end, _hard_end + ChunkSizeInWords);
-    _retained_filler = MemRegion(_hard_end, FillerHeaderSize);
-    _bt.alloc_block(_retained_filler.start(), _retained_filler.word_size());
-    _top      = _retained_filler.end();
-    _hard_end = next_hard_end;
-    _end      = _hard_end - AlignmentReserve;
-    res       = ParGCAllocBuffer::allocate(word_sz);
-    if (res != NULL) {
-      _bt.alloc_block(res, word_sz);
-    }
-  }
-  return res;
-}
-
-void
-ParGCAllocBufferWithBOT::undo_allocation(HeapWord* obj, size_t word_sz) {
-  ParGCAllocBuffer::undo_allocation(obj, word_sz);
-  // This may back us up beyond the previous threshold, so reset.
-  _bt.set_region(MemRegion(_top, _hard_end));
-  _bt.initialize_threshold();
-}
-
-void ParGCAllocBufferWithBOT::retire(bool end_of_gc, bool retain) {
-  assert(!retain || end_of_gc, "Can only retain at GC end.");
-  if (_retained) {
-    // We're about to make the retained_filler into a block.
-    _bt.BlockOffsetArray::alloc_block(_retained_filler.start(),
-                                      _retained_filler.end());
-  }
-  // Reset _hard_end to _true_end (and update _end)
-  if (retain && _hard_end != NULL) {
-    assert(_hard_end <= _true_end, "Invariant.");
-    _hard_end = _true_end;
-    _end      = MAX2(_top, _hard_end - AlignmentReserve);
-    assert(_end <= _hard_end, "Invariant.");
-  }
-  _true_end = _hard_end;
-  HeapWord* pre_top = _top;
-
-  ParGCAllocBuffer::retire(end_of_gc, retain);
-  // Now any old _retained_filler is cut back to size, the free part is
-  // filled with a filler object, and top is past the header of that
-  // object.
-
-  if (retain && _top < _end) {
-    assert(end_of_gc && retain, "Or else retain should be false.");
-    // If the lab does not start on a card boundary, we don't want to
-    // allocate onto that card, since that might lead to concurrent
-    // allocation and card scanning, which we don't support.  So we fill
-    // the first card with a garbage object.
-    size_t first_card_index = _bsa->index_for(pre_top);
-    HeapWord* first_card_start = _bsa->address_for_index(first_card_index);
-    if (first_card_start < pre_top) {
-      HeapWord* second_card_start =
-        _bsa->inc_by_region_size(first_card_start);
-
-      // Ensure enough room to fill with the smallest block
-      second_card_start = MAX2(second_card_start, pre_top + AlignmentReserve);
-
-      // If the end is already in the first card, don't go beyond it!
-      // Or if the remainder is too small for a filler object, gobble it up.
-      if (_hard_end < second_card_start ||
-          pointer_delta(_hard_end, second_card_start) < AlignmentReserve) {
-        second_card_start = _hard_end;
-      }
-      if (pre_top < second_card_start) {
-        MemRegion first_card_suffix(pre_top, second_card_start);
-        fill_region_with_block(first_card_suffix, true);
-      }
-      pre_top = second_card_start;
-      _top = pre_top;
-      _end = MAX2(_top, _hard_end - AlignmentReserve);
-    }
-
-    // If the lab does not end on a card boundary, we don't want to
-    // allocate onto that card, since that might lead to concurrent
-    // allocation and card scanning, which we don't support.  So we fill
-    // the last card with a garbage object.
-    size_t last_card_index = _bsa->index_for(_hard_end);
-    HeapWord* last_card_start = _bsa->address_for_index(last_card_index);
-    if (last_card_start < _hard_end) {
-
-      // Ensure enough room to fill with the smallest block
-      last_card_start = MIN2(last_card_start, _hard_end - AlignmentReserve);
-
-      // If the top is already in the last card, don't go back beyond it!
-      // Or if the remainder is too small for a filler object, gobble it up.
-      if (_top > last_card_start ||
-          pointer_delta(last_card_start, _top) < AlignmentReserve) {
-        last_card_start = _top;
-      }
-      if (last_card_start < _hard_end) {
-        MemRegion last_card_prefix(last_card_start, _hard_end);
-        fill_region_with_block(last_card_prefix, false);
-      }
-      _hard_end = last_card_start;
-      _end      = MAX2(_top, _hard_end - AlignmentReserve);
-      _true_end = _hard_end;
-      assert(_end <= _hard_end, "Invariant.");
-    }
-
-    // At this point:
-    //   1) we had a filler object from the original top to hard_end.
-    //   2) We've filled in any partial cards at the front and back.
-    if (pre_top < _hard_end) {
-      // Now we can reset the _bt to do allocation in the given area.
-      MemRegion new_filler(pre_top, _hard_end);
-      fill_region_with_block(new_filler, false);
-      _top = pre_top + ParGCAllocBuffer::FillerHeaderSize;
-      // If there's no space left, don't retain.
-      if (_top >= _end) {
-        _retained = false;
-        invalidate();
-        return;
-      }
-      _retained_filler = MemRegion(pre_top, _top);
-      _bt.set_region(MemRegion(_top, _hard_end));
-      _bt.initialize_threshold();
-      assert(_bt.threshold() > _top, "initialize_threshold failed!");
-
-      // There may be other reasons for queries into the middle of the
-      // filler object.  When such queries are done in parallel with
-      // allocation, bad things can happen, if the query involves object
-      // iteration.  So we ensure that such queries do not involve object
-      // iteration, by putting another filler object on the boundaries of
-      // such queries.  One such is the object spanning a parallel card
-      // chunk boundary.
-
-      // "chunk_boundary" is the address of the first chunk boundary less
-      // than "hard_end".
-      HeapWord* chunk_boundary =
-        (HeapWord*)align_size_down(intptr_t(_hard_end-1), ChunkSizeInBytes);
-      assert(chunk_boundary < _hard_end, "Or else above did not work.");
-      assert(pointer_delta(_true_end, chunk_boundary) >= AlignmentReserve,
-             "Consequence of last card handling above.");
-
-      if (_top <= chunk_boundary) {
-        assert(_true_end == _hard_end, "Invariant.");
-        while (_top <= chunk_boundary) {
-          assert(pointer_delta(_hard_end, chunk_boundary) >= AlignmentReserve,
-                 "Consequence of last card handling above.");
-          _bt.BlockOffsetArray::alloc_block(chunk_boundary, _hard_end);
-          CollectedHeap::fill_with_object(chunk_boundary, _hard_end);
-          _hard_end = chunk_boundary;
-          chunk_boundary -= ChunkSizeInWords;
-        }
-        _end = _hard_end - AlignmentReserve;
-        assert(_top <= _end, "Invariant.");
-        // Now reset the initial filler chunk so it doesn't overlap with
-        // the one(s) inserted above.
-        MemRegion new_filler(pre_top, _hard_end);
-        fill_region_with_block(new_filler, false);
-      }
-    } else {
-      _retained = false;
-      invalidate();
-    }
-  } else {
-    assert(!end_of_gc ||
-           (!_retained && _true_end == _hard_end), "Checking.");
-  }
-  assert(_end <= _hard_end, "Invariant.");
-  assert(_top < _end || _top == _hard_end, "Invariant");
-}
--- a/src/share/vm/gc_implementation/parNew/parGCAllocBuffer.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,249 +0,0 @@
-/*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARGCALLOCBUFFER_HPP
-#define SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARGCALLOCBUFFER_HPP
-
-#include "memory/allocation.hpp"
-#include "memory/blockOffsetTable.hpp"
-#include "memory/threadLocalAllocBuffer.hpp"
-#include "utilities/globalDefinitions.hpp"
-
-// Forward decl.
-
-class PLABStats;
-
-// A per-thread allocation buffer used during GC.
-class ParGCAllocBuffer: public CHeapObj<mtGC> {
-protected:
-  char head[32];
-  size_t _word_sz;          // in HeapWord units
-  HeapWord* _bottom;
-  HeapWord* _top;
-  HeapWord* _end;       // last allocatable address + 1
-  HeapWord* _hard_end;  // _end + AlignmentReserve
-  bool      _retained;  // whether we hold a _retained_filler
-  MemRegion _retained_filler;
-  // In support of ergonomic sizing of PLAB's
-  size_t    _allocated;     // in HeapWord units
-  size_t    _wasted;        // in HeapWord units
-  char tail[32];
-  static size_t FillerHeaderSize;
-  static size_t AlignmentReserve;
-
-public:
-  // Initializes the buffer to be empty, but with the given "word_sz".
-  // Must get initialized with "set_buf" for an allocation to succeed.
-  ParGCAllocBuffer(size_t word_sz);
-
-  static const size_t min_size() {
-    return ThreadLocalAllocBuffer::min_size();
-  }
-
-  static const size_t max_size() {
-    return ThreadLocalAllocBuffer::max_size();
-  }
-
-  // If an allocation of the given "word_sz" can be satisfied within the
-  // buffer, do the allocation, returning a pointer to the start of the
-  // allocated block.  If the allocation request cannot be satisfied,
-  // return NULL.
-  HeapWord* allocate(size_t word_sz) {
-    HeapWord* res = _top;
-    if (pointer_delta(_end, _top) >= word_sz) {
-      _top = _top + word_sz;
-      return res;
-    } else {
-      return NULL;
-    }
-  }
-
-  // Undo the last allocation in the buffer, which is required to be of the
-  // "obj" of the given "word_sz".
-  void undo_allocation(HeapWord* obj, size_t word_sz) {
-    assert(pointer_delta(_top, _bottom) >= word_sz, "Bad undo");
-    assert(pointer_delta(_top, obj)     == word_sz, "Bad undo");
-    _top = obj;
-  }
-
-  // The total (word) size of the buffer, including both allocated and
-  // unallocted space.
-  size_t word_sz() { return _word_sz; }
-
-  // Should only be done if we are about to reset with a new buffer of the
-  // given size.
-  void set_word_size(size_t new_word_sz) {
-    assert(new_word_sz > AlignmentReserve, "Too small");
-    _word_sz = new_word_sz;
-  }
-
-  // The number of words of unallocated space remaining in the buffer.
-  size_t words_remaining() {
-    assert(_end >= _top, "Negative buffer");
-    return pointer_delta(_end, _top, HeapWordSize);
-  }
-
-  bool contains(void* addr) {
-    return (void*)_bottom <= addr && addr < (void*)_hard_end;
-  }
-
-  // Sets the space of the buffer to be [buf, space+word_sz()).
-  void set_buf(HeapWord* buf) {
-    _bottom   = buf;
-    _top      = _bottom;
-    _hard_end = _bottom + word_sz();
-    _end      = _hard_end - AlignmentReserve;
-    assert(_end >= _top, "Negative buffer");
-    // In support of ergonomic sizing
-    _allocated += word_sz();
-  }
-
-  // Flush the stats supporting ergonomic sizing of PLAB's
-  void flush_stats(PLABStats* stats);
-  void flush_stats_and_retire(PLABStats* stats, bool retain) {
-    // We flush the stats first in order to get a reading of
-    // unused space in the last buffer.
-    if (ResizePLAB) {
-      flush_stats(stats);
-    }
-    // Retire the last allocation buffer.
-    retire(true, retain);
-  }
-
-  // Force future allocations to fail and queries for contains()
-  // to return false
-  void invalidate() {
-    assert(!_retained, "Shouldn't retain an invalidated buffer.");
-    _end    = _hard_end;
-    _wasted += pointer_delta(_end, _top);  // unused  space
-    _top    = _end;      // force future allocations to fail
-    _bottom = _end;      // force future contains() queries to return false
-  }
-
-  // Fills in the unallocated portion of the buffer with a garbage object.
-  // If "end_of_gc" is TRUE, is after the last use in the GC.  IF "retain"
-  // is true, attempt to re-use the unused portion in the next GC.
-  void retire(bool end_of_gc, bool retain);
-
-  void print() PRODUCT_RETURN;
-};
-
-// PLAB stats book-keeping
-class PLABStats VALUE_OBJ_CLASS_SPEC {
-  size_t _allocated;      // total allocated
-  size_t _wasted;         // of which wasted (internal fragmentation)
-  size_t _unused;         // Unused in last buffer
-  size_t _used;           // derived = allocated - wasted - unused
-  size_t _desired_plab_sz;// output of filter (below), suitably trimmed and quantized
-  AdaptiveWeightedAverage
-         _filter;         // integrator with decay
-
- public:
-  PLABStats(size_t desired_plab_sz_, unsigned wt) :
-    _allocated(0),
-    _wasted(0),
-    _unused(0),
-    _used(0),
-    _desired_plab_sz(desired_plab_sz_),
-    _filter(wt)
-  {
-    size_t min_sz = min_size();
-    size_t max_sz = max_size();
-    size_t aligned_min_sz = align_object_size(min_sz);
-    size_t aligned_max_sz = align_object_size(max_sz);
-    assert(min_sz <= aligned_min_sz && max_sz >= aligned_max_sz &&
-           min_sz <= max_sz,
-           "PLAB clipping computation in adjust_desired_plab_sz()"
-           " may be incorrect");
-  }
-
-  static const size_t min_size() {
-    return ParGCAllocBuffer::min_size();
-  }
-
-  static const size_t max_size() {
-    return ParGCAllocBuffer::max_size();
-  }
-
-  size_t desired_plab_sz() {
-    return _desired_plab_sz;
-  }
-
-  void adjust_desired_plab_sz(); // filter computation, latches output to
-                                 // _desired_plab_sz, clears sensor accumulators
-
-  void add_allocated(size_t v) {
-    Atomic::add_ptr(v, &_allocated);
-  }
-
-  void add_unused(size_t v) {
-    Atomic::add_ptr(v, &_unused);
-  }
-
-  void add_wasted(size_t v) {
-    Atomic::add_ptr(v, &_wasted);
-  }
-};
-
-class ParGCAllocBufferWithBOT: public ParGCAllocBuffer {
-  BlockOffsetArrayContigSpace _bt;
-  BlockOffsetSharedArray*     _bsa;
-  HeapWord*                   _true_end;  // end of the whole ParGCAllocBuffer
-
-  static const size_t ChunkSizeInWords;
-  static const size_t ChunkSizeInBytes;
-  HeapWord* allocate_slow(size_t word_sz);
-
-  void fill_region_with_block(MemRegion mr, bool contig);
-
-public:
-  ParGCAllocBufferWithBOT(size_t word_sz, BlockOffsetSharedArray* bsa);
-
-  HeapWord* allocate(size_t word_sz) {
-    HeapWord* res = ParGCAllocBuffer::allocate(word_sz);
-    if (res != NULL) {
-      _bt.alloc_block(res, word_sz);
-    } else {
-      res = allocate_slow(word_sz);
-    }
-    return res;
-  }
-
-  void undo_allocation(HeapWord* obj, size_t word_sz);
-
-  void set_buf(HeapWord* buf_start) {
-    ParGCAllocBuffer::set_buf(buf_start);
-    _true_end = _hard_end;
-    _bt.set_region(MemRegion(buf_start, word_sz()));
-    _bt.initialize_threshold();
-  }
-
-  void retire(bool end_of_gc, bool retain);
-
-  MemRegion range() {
-    return MemRegion(_top, _true_end);
-  }
-};
-
-#endif // SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARGCALLOCBUFFER_HPP
--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -24,11 +24,11 @@

 #include "precompiled.hpp"
 #include "gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp"
-#include "gc_implementation/parNew/parGCAllocBuffer.hpp"
 #include "gc_implementation/parNew/parNewGeneration.hpp"
 #include "gc_implementation/parNew/parOopClosures.inline.hpp"
 #include "gc_implementation/shared/adaptiveSizePolicy.hpp"
 #include "gc_implementation/shared/ageTable.hpp"
+#include "gc_implementation/shared/parGCAllocBuffer.hpp"
 #include "gc_implementation/shared/spaceDecorator.hpp"
 #include "memory/defNewGeneration.inline.hpp"
 #include "memory/genCollectedHeap.hpp"
@@ -453,7 +453,8 @@
     // retire the last buffer.
     par_scan_state.to_space_alloc_buffer()->
       flush_stats_and_retire(_gen.plab_stats(),
-                             false /* !retain */);
+                             true /* end_of_gc */,
+                             false /* retain */);

     // Every thread has its own age table.  We need to merge
     // them all into one.
--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,7 +25,7 @@
 #ifndef SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARNEWGENERATION_HPP
 #define SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARNEWGENERATION_HPP

-#include "gc_implementation/parNew/parGCAllocBuffer.hpp"
+#include "gc_implementation/shared/parGCAllocBuffer.hpp"
 #include "memory/defNewGeneration.hpp"
 #include "utilities/taskqueue.hpp"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/shared/parGCAllocBuffer.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc_implementation/shared/parGCAllocBuffer.hpp"
+#include "memory/sharedHeap.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/oop.inline.hpp"
+
+ParGCAllocBuffer::ParGCAllocBuffer(size_t desired_plab_sz_) :
+  _word_sz(desired_plab_sz_), _bottom(NULL), _top(NULL),
+  _end(NULL), _hard_end(NULL),
+  _retained(false), _retained_filler(),
+  _allocated(0), _wasted(0)
+{
+  assert (min_size() > AlignmentReserve, "Inconsistency!");
+  // arrayOopDesc::header_size depends on command line initialization.
+  FillerHeaderSize = align_object_size(arrayOopDesc::header_size(T_INT));
+  AlignmentReserve = oopDesc::header_size() > MinObjAlignment ? FillerHeaderSize : 0;
+}
+
+size_t ParGCAllocBuffer::FillerHeaderSize;
+
+// If the minimum object size is greater than MinObjAlignment, we can
+// end up with a shard at the end of the buffer that's smaller than
+// the smallest object.  We can't allow that because the buffer must
+// look like it's full of objects when we retire it, so we make
+// sure we have enough space for a filler int array object.
+size_t ParGCAllocBuffer::AlignmentReserve;
+
+void ParGCAllocBuffer::retire(bool end_of_gc, bool retain) {
+  assert(!retain || end_of_gc, "Can only retain at GC end.");
+  if (_retained) {
+    // If the buffer had been retained shorten the previous filler object.
+    assert(_retained_filler.end() <= _top, "INVARIANT");
+    CollectedHeap::fill_with_object(_retained_filler);
+    // Wasted space book-keeping, otherwise (normally) done in invalidate()
+    _wasted += _retained_filler.word_size();
+    _retained = false;
+  }
+  assert(!end_of_gc || !_retained, "At this point, end_of_gc ==> !_retained.");
+  if (_top < _hard_end) {
+    CollectedHeap::fill_with_object(_top, _hard_end);
+    if (!retain) {
+      invalidate();
+    } else {
+      // Is there wasted space we'd like to retain for the next GC?
+      if (pointer_delta(_end, _top) > FillerHeaderSize) {
+        _retained = true;
+        _retained_filler = MemRegion(_top, FillerHeaderSize);
+        _top = _top + FillerHeaderSize;
+      } else {
+        invalidate();
+      }
+    }
+  }
+}
+
+void ParGCAllocBuffer::flush_stats(PLABStats* stats) {
+  assert(ResizePLAB, "Wasted work");
+  stats->add_allocated(_allocated);
+  stats->add_wasted(_wasted);
+  stats->add_unused(pointer_delta(_end, _top));
+}
+
+// Compute desired plab size and latch result for later
+// use. This should be called once at the end of parallel
+// scavenge; it clears the sensor accumulators.
+void PLABStats::adjust_desired_plab_sz() {
+  assert(ResizePLAB, "Not set");
+  if (_allocated == 0) {
+    assert(_unused == 0, "Inconsistency in PLAB stats");
+    _allocated = 1;
+  }
+  double wasted_frac    = (double)_unused/(double)_allocated;
+  size_t target_refills = (size_t)((wasted_frac*TargetSurvivorRatio)/
+                                   TargetPLABWastePct);
+  if (target_refills == 0) {
+    target_refills = 1;
+  }
+  _used = _allocated - _wasted - _unused;
+  size_t plab_sz = _used/(target_refills*ParallelGCThreads);
+  if (PrintPLAB) gclog_or_tty->print(" (plab_sz = %d ", plab_sz);
+  // Take historical weighted average
+  _filter.sample(plab_sz);
+  // Clip from above and below, and align to object boundary
+  plab_sz = MAX2(min_size(), (size_t)_filter.average());
+  plab_sz = MIN2(max_size(), plab_sz);
+  plab_sz = align_object_size(plab_sz);
+  // Latch the result
+  if (PrintPLAB) gclog_or_tty->print(" desired_plab_sz = %d) ", plab_sz);
+  _desired_plab_sz = plab_sz;
+  // Now clear the accumulators for next round:
+  // note this needs to be fixed in the case where we
+  // are retaining across scavenges. FIX ME !!! XXX
+  _allocated = 0;
+  _wasted    = 0;
+  _unused    = 0;
+}
+
+#ifndef PRODUCT
+void ParGCAllocBuffer::print() {
+  gclog_or_tty->print("parGCAllocBuffer: _bottom: %p  _top: %p  _end: %p  _hard_end: %p"
+             "_retained: %c _retained_filler: [%p,%p)\n",
+             _bottom, _top, _end, _hard_end,
+             "FT"[_retained], _retained_filler.start(), _retained_filler.end());
+}
+#endif // !PRODUCT
+
+const size_t ParGCAllocBufferWithBOT::ChunkSizeInWords =
+MIN2(CardTableModRefBS::par_chunk_heapword_alignment(),
+     ((size_t)Generation::GenGrain)/HeapWordSize);
+const size_t ParGCAllocBufferWithBOT::ChunkSizeInBytes =
+MIN2(CardTableModRefBS::par_chunk_heapword_alignment() * HeapWordSize,
+     (size_t)Generation::GenGrain);
+
+ParGCAllocBufferWithBOT::ParGCAllocBufferWithBOT(size_t word_sz,
+                                                 BlockOffsetSharedArray* bsa) :
+  ParGCAllocBuffer(word_sz),
+  _bsa(bsa),
+  _bt(bsa, MemRegion(_bottom, _hard_end)),
+  _true_end(_hard_end)
+{}
+
+// The buffer comes with its own BOT, with a shared (obviously) underlying
+// BlockOffsetSharedArray. We manipulate this BOT in the normal way
+// as we would for any contiguous space. However, on accasion we
+// need to do some buffer surgery at the extremities before we
+// start using the body of the buffer for allocations. Such surgery
+// (as explained elsewhere) is to prevent allocation on a card that
+// is in the process of being walked concurrently by another GC thread.
+// When such surgery happens at a point that is far removed (to the
+// right of the current allocation point, top), we use the "contig"
+// parameter below to directly manipulate the shared array without
+// modifying the _next_threshold state in the BOT.
+void ParGCAllocBufferWithBOT::fill_region_with_block(MemRegion mr,
+                                                     bool contig) {
+  CollectedHeap::fill_with_object(mr);
+  if (contig) {
+    _bt.alloc_block(mr.start(), mr.end());
+  } else {
+    _bt.BlockOffsetArray::alloc_block(mr.start(), mr.end());
+  }
+}
+
+HeapWord* ParGCAllocBufferWithBOT::allocate_slow(size_t word_sz) {
+  HeapWord* res = NULL;
+  if (_true_end > _hard_end) {
+    assert((HeapWord*)align_size_down(intptr_t(_hard_end),
+                                      ChunkSizeInBytes) == _hard_end,
+           "or else _true_end should be equal to _hard_end");
+    assert(_retained, "or else _true_end should be equal to _hard_end");
+    assert(_retained_filler.end() <= _top, "INVARIANT");
+    CollectedHeap::fill_with_object(_retained_filler);
+    if (_top < _hard_end) {
+      fill_region_with_block(MemRegion(_top, _hard_end), true);
+    }
+    HeapWord* next_hard_end = MIN2(_true_end, _hard_end + ChunkSizeInWords);
+    _retained_filler = MemRegion(_hard_end, FillerHeaderSize);
+    _bt.alloc_block(_retained_filler.start(), _retained_filler.word_size());
+    _top      = _retained_filler.end();
+    _hard_end = next_hard_end;
+    _end      = _hard_end - AlignmentReserve;
+    res       = ParGCAllocBuffer::allocate(word_sz);
+    if (res != NULL) {
+      _bt.alloc_block(res, word_sz);
+    }
+  }
+  return res;
+}
+
+void
+ParGCAllocBufferWithBOT::undo_allocation(HeapWord* obj, size_t word_sz) {
+  ParGCAllocBuffer::undo_allocation(obj, word_sz);
+  // This may back us up beyond the previous threshold, so reset.
+  _bt.set_region(MemRegion(_top, _hard_end));
+  _bt.initialize_threshold();
+}
+
+void ParGCAllocBufferWithBOT::retire(bool end_of_gc, bool retain) {
+  assert(!retain || end_of_gc, "Can only retain at GC end.");
+  if (_retained) {
+    // We're about to make the retained_filler into a block.
+    _bt.BlockOffsetArray::alloc_block(_retained_filler.start(),
+                                      _retained_filler.end());
+  }
+  // Reset _hard_end to _true_end (and update _end)
+  if (retain && _hard_end != NULL) {
+    assert(_hard_end <= _true_end, "Invariant.");
+    _hard_end = _true_end;
+    _end      = MAX2(_top, _hard_end - AlignmentReserve);
+    assert(_end <= _hard_end, "Invariant.");
+  }
+  _true_end = _hard_end;
+  HeapWord* pre_top = _top;
+
+  ParGCAllocBuffer::retire(end_of_gc, retain);
+  // Now any old _retained_filler is cut back to size, the free part is
+  // filled with a filler object, and top is past the header of that
+  // object.
+
+  if (retain && _top < _end) {
+    assert(end_of_gc && retain, "Or else retain should be false.");
+    // If the lab does not start on a card boundary, we don't want to
+    // allocate onto that card, since that might lead to concurrent
+    // allocation and card scanning, which we don't support.  So we fill
+    // the first card with a garbage object.
+    size_t first_card_index = _bsa->index_for(pre_top);
+    HeapWord* first_card_start = _bsa->address_for_index(first_card_index);
+    if (first_card_start < pre_top) {
+      HeapWord* second_card_start =
+        _bsa->inc_by_region_size(first_card_start);
+
+      // Ensure enough room to fill with the smallest block
+      second_card_start = MAX2(second_card_start, pre_top + AlignmentReserve);
+
+      // If the end is already in the first card, don't go beyond it!
+      // Or if the remainder is too small for a filler object, gobble it up.
+      if (_hard_end < second_card_start ||
+          pointer_delta(_hard_end, second_card_start) < AlignmentReserve) {
+        second_card_start = _hard_end;
+      }
+      if (pre_top < second_card_start) {
+        MemRegion first_card_suffix(pre_top, second_card_start);
+        fill_region_with_block(first_card_suffix, true);
+      }
+      pre_top = second_card_start;
+      _top = pre_top;
+      _end = MAX2(_top, _hard_end - AlignmentReserve);
+    }
+
+    // If the lab does not end on a card boundary, we don't want to
+    // allocate onto that card, since that might lead to concurrent
+    // allocation and card scanning, which we don't support.  So we fill
+    // the last card with a garbage object.
+    size_t last_card_index = _bsa->index_for(_hard_end);
+    HeapWord* last_card_start = _bsa->address_for_index(last_card_index);
+    if (last_card_start < _hard_end) {
+
+      // Ensure enough room to fill with the smallest block
+      last_card_start = MIN2(last_card_start, _hard_end - AlignmentReserve);
+
+      // If the top is already in the last card, don't go back beyond it!
+      // Or if the remainder is too small for a filler object, gobble it up.
+      if (_top > last_card_start ||
+          pointer_delta(last_card_start, _top) < AlignmentReserve) {
+        last_card_start = _top;
+      }
+      if (last_card_start < _hard_end) {
+        MemRegion last_card_prefix(last_card_start, _hard_end);
+        fill_region_with_block(last_card_prefix, false);
+      }
+      _hard_end = last_card_start;
+      _end      = MAX2(_top, _hard_end - AlignmentReserve);
+      _true_end = _hard_end;
+      assert(_end <= _hard_end, "Invariant.");
+    }
+
+    // At this point:
+    //   1) we had a filler object from the original top to hard_end.
+    //   2) We've filled in any partial cards at the front and back.
+    if (pre_top < _hard_end) {
+      // Now we can reset the _bt to do allocation in the given area.
+      MemRegion new_filler(pre_top, _hard_end);
+      fill_region_with_block(new_filler, false);
+      _top = pre_top + ParGCAllocBuffer::FillerHeaderSize;
+      // If there's no space left, don't retain.
+      if (_top >= _end) {
+        _retained = false;
+        invalidate();
+        return;
+      }
+      _retained_filler = MemRegion(pre_top, _top);
+      _bt.set_region(MemRegion(_top, _hard_end));
+      _bt.initialize_threshold();
+      assert(_bt.threshold() > _top, "initialize_threshold failed!");
+
+      // There may be other reasons for queries into the middle of the
+      // filler object.  When such queries are done in parallel with
+      // allocation, bad things can happen, if the query involves object
+      // iteration.  So we ensure that such queries do not involve object
+      // iteration, by putting another filler object on the boundaries of
+      // such queries.  One such is the object spanning a parallel card
+      // chunk boundary.
+
+      // "chunk_boundary" is the address of the first chunk boundary less
+      // than "hard_end".
+      HeapWord* chunk_boundary =
+        (HeapWord*)align_size_down(intptr_t(_hard_end-1), ChunkSizeInBytes);
+      assert(chunk_boundary < _hard_end, "Or else above did not work.");
+      assert(pointer_delta(_true_end, chunk_boundary) >= AlignmentReserve,
+             "Consequence of last card handling above.");
+
+      if (_top <= chunk_boundary) {
+        assert(_true_end == _hard_end, "Invariant.");
+        while (_top <= chunk_boundary) {
+          assert(pointer_delta(_hard_end, chunk_boundary) >= AlignmentReserve,
+                 "Consequence of last card handling above.");
+          _bt.BlockOffsetArray::alloc_block(chunk_boundary, _hard_end);
+          CollectedHeap::fill_with_object(chunk_boundary, _hard_end);
+          _hard_end = chunk_boundary;
+          chunk_boundary -= ChunkSizeInWords;
+        }
+        _end = _hard_end - AlignmentReserve;
+        assert(_top <= _end, "Invariant.");
+        // Now reset the initial filler chunk so it doesn't overlap with
+        // the one(s) inserted above.
+        MemRegion new_filler(pre_top, _hard_end);
+        fill_region_with_block(new_filler, false);
+      }
+    } else {
+      _retained = false;
+      invalidate();
+    }
+  } else {
+    assert(!end_of_gc ||
+           (!_retained && _true_end == _hard_end), "Checking.");
+  }
+  assert(_end <= _hard_end, "Invariant.");
+  assert(_top < _end || _top == _hard_end, "Invariant");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/shared/parGCAllocBuffer.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARGCALLOCBUFFER_HPP
+#define SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARGCALLOCBUFFER_HPP
+
+#include "memory/allocation.hpp"
+#include "memory/blockOffsetTable.hpp"
+#include "memory/threadLocalAllocBuffer.hpp"
+#include "utilities/globalDefinitions.hpp"
+
+// Forward decl.
+
+class PLABStats;
+
+// A per-thread allocation buffer used during GC.
+class ParGCAllocBuffer: public CHeapObj<mtGC> {
+protected:
+  char head[32];
+  size_t _word_sz;          // in HeapWord units
+  HeapWord* _bottom;
+  HeapWord* _top;
+  HeapWord* _end;       // last allocatable address + 1
+  HeapWord* _hard_end;  // _end + AlignmentReserve
+  bool      _retained;  // whether we hold a _retained_filler
+  MemRegion _retained_filler;
+  // In support of ergonomic sizing of PLAB's
+  size_t    _allocated;     // in HeapWord units
+  size_t    _wasted;        // in HeapWord units
+  char tail[32];
+  static size_t FillerHeaderSize;
+  static size_t AlignmentReserve;
+
+public:
+  // Initializes the buffer to be empty, but with the given "word_sz".
+  // Must get initialized with "set_buf" for an allocation to succeed.
+  ParGCAllocBuffer(size_t word_sz);
+
+  static const size_t min_size() {
+    return ThreadLocalAllocBuffer::min_size();
+  }
+
+  static const size_t max_size() {
+    return ThreadLocalAllocBuffer::max_size();
+  }
+
+  // If an allocation of the given "word_sz" can be satisfied within the
+  // buffer, do the allocation, returning a pointer to the start of the
+  // allocated block.  If the allocation request cannot be satisfied,
+  // return NULL.
+  HeapWord* allocate(size_t word_sz) {
+    HeapWord* res = _top;
+    if (pointer_delta(_end, _top) >= word_sz) {
+      _top = _top + word_sz;
+      return res;
+    } else {
+      return NULL;
+    }
+  }
+
+  // Undo the last allocation in the buffer, which is required to be of the
+  // "obj" of the given "word_sz".
+  void undo_allocation(HeapWord* obj, size_t word_sz) {
+    assert(pointer_delta(_top, _bottom) >= word_sz, "Bad undo");
+    assert(pointer_delta(_top, obj)     == word_sz, "Bad undo");
+    _top = obj;
+  }
+
+  // The total (word) size of the buffer, including both allocated and
+  // unallocted space.
+  size_t word_sz() { return _word_sz; }
+
+  // Should only be done if we are about to reset with a new buffer of the
+  // given size.
+  void set_word_size(size_t new_word_sz) {
+    assert(new_word_sz > AlignmentReserve, "Too small");
+    _word_sz = new_word_sz;
+  }
+
+  // The number of words of unallocated space remaining in the buffer.
+  size_t words_remaining() {
+    assert(_end >= _top, "Negative buffer");
+    return pointer_delta(_end, _top, HeapWordSize);
+  }
+
+  bool contains(void* addr) {
+    return (void*)_bottom <= addr && addr < (void*)_hard_end;
+  }
+
+  // Sets the space of the buffer to be [buf, space+word_sz()).
+  void set_buf(HeapWord* buf) {
+    _bottom   = buf;
+    _top      = _bottom;
+    _hard_end = _bottom + word_sz();
+    _end      = _hard_end - AlignmentReserve;
+    assert(_end >= _top, "Negative buffer");
+    // In support of ergonomic sizing
+    _allocated += word_sz();
+  }
+
+  // Flush the stats supporting ergonomic sizing of PLAB's
+  void flush_stats(PLABStats* stats);
+  void flush_stats_and_retire(PLABStats* stats, bool end_of_gc, bool retain) {
+    // We flush the stats first in order to get a reading of
+    // unused space in the last buffer.
+    if (ResizePLAB) {
+      flush_stats(stats);
+    }
+    // Retire the last allocation buffer.
+    retire(end_of_gc, retain);
+  }
+
+  // Force future allocations to fail and queries for contains()
+  // to return false
+  void invalidate() {
+    assert(!_retained, "Shouldn't retain an invalidated buffer.");
+    _end    = _hard_end;
+    _wasted += pointer_delta(_end, _top);  // unused  space
+    _top    = _end;      // force future allocations to fail
+    _bottom = _end;      // force future contains() queries to return false
+  }
+
+  // Fills in the unallocated portion of the buffer with a garbage object.
+  // If "end_of_gc" is TRUE, is after the last use in the GC.  IF "retain"
+  // is true, attempt to re-use the unused portion in the next GC.
+  void retire(bool end_of_gc, bool retain);
+
+  void print() PRODUCT_RETURN;
+};
+
+// PLAB stats book-keeping
+class PLABStats VALUE_OBJ_CLASS_SPEC {
+  size_t _allocated;      // total allocated
+  size_t _wasted;         // of which wasted (internal fragmentation)
+  size_t _unused;         // Unused in last buffer
+  size_t _used;           // derived = allocated - wasted - unused
+  size_t _desired_plab_sz;// output of filter (below), suitably trimmed and quantized
+  AdaptiveWeightedAverage
+         _filter;         // integrator with decay
+
+ public:
+  PLABStats(size_t desired_plab_sz_, unsigned wt) :
+    _allocated(0),
+    _wasted(0),
+    _unused(0),
+    _used(0),
+    _desired_plab_sz(desired_plab_sz_),
+    _filter(wt)
+  {
+    size_t min_sz = min_size();
+    size_t max_sz = max_size();
+    size_t aligned_min_sz = align_object_size(min_sz);
+    size_t aligned_max_sz = align_object_size(max_sz);
+    assert(min_sz <= aligned_min_sz && max_sz >= aligned_max_sz &&
+           min_sz <= max_sz,
+           "PLAB clipping computation in adjust_desired_plab_sz()"
+           " may be incorrect");
+  }
+
+  static const size_t min_size() {
+    return ParGCAllocBuffer::min_size();
+  }
+
+  static const size_t max_size() {
+    return ParGCAllocBuffer::max_size();
+  }
+
+  size_t desired_plab_sz() {
+    return _desired_plab_sz;
+  }
+
+  void adjust_desired_plab_sz(); // filter computation, latches output to
+                                 // _desired_plab_sz, clears sensor accumulators
+
+  void add_allocated(size_t v) {
+    Atomic::add_ptr(v, &_allocated);
+  }
+
+  void add_unused(size_t v) {
+    Atomic::add_ptr(v, &_unused);
+  }
+
+  void add_wasted(size_t v) {
+    Atomic::add_ptr(v, &_wasted);
+  }
+};
+
+class ParGCAllocBufferWithBOT: public ParGCAllocBuffer {
+  BlockOffsetArrayContigSpace _bt;
+  BlockOffsetSharedArray*     _bsa;
+  HeapWord*                   _true_end;  // end of the whole ParGCAllocBuffer
+
+  static const size_t ChunkSizeInWords;
+  static const size_t ChunkSizeInBytes;
+  HeapWord* allocate_slow(size_t word_sz);
+
+  void fill_region_with_block(MemRegion mr, bool contig);
+
+public:
+  ParGCAllocBufferWithBOT(size_t word_sz, BlockOffsetSharedArray* bsa);
+
+  HeapWord* allocate(size_t word_sz) {
+    HeapWord* res = ParGCAllocBuffer::allocate(word_sz);
+    if (res != NULL) {
+      _bt.alloc_block(res, word_sz);
+    } else {
+      res = allocate_slow(word_sz);
+    }
+    return res;
+  }
+
+  void undo_allocation(HeapWord* obj, size_t word_sz);
+
+  void set_buf(HeapWord* buf_start) {
+    ParGCAllocBuffer::set_buf(buf_start);
+    _true_end = _hard_end;
+    _bt.set_region(MemRegion(buf_start, word_sz()));
+    _bt.initialize_threshold();
+  }
+
+  void retire(bool end_of_gc, bool retain);
+
+  MemRegion range() {
+    return MemRegion(_top, _true_end);
+  }
+};
+
+#endif // SHARE_VM_GC_IMPLEMENTATION_PARNEW_PARGCALLOCBUFFER_HPP
--- a/src/share/vm/memory/tenuredGeneration.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/memory/tenuredGeneration.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -23,8 +23,8 @@
  */

 #include "precompiled.hpp"
-#include "gc_implementation/parNew/parGCAllocBuffer.hpp"
 #include "gc_implementation/shared/collectorCounters.hpp"
+#include "gc_implementation/shared/parGCAllocBuffer.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/blockOffsetTable.inline.hpp"
 #include "memory/generation.inline.hpp"
--- a/src/share/vm/opto/callGenerator.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/callGenerator.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -158,74 +158,6 @@
   return kit.transfer_exceptions_into_jvms();
 }

-//---------------------------DynamicCallGenerator-----------------------------
-// Internal class which handles all out-of-line invokedynamic calls.
-class DynamicCallGenerator : public CallGenerator {
-public:
-  DynamicCallGenerator(ciMethod* method)
-    : CallGenerator(method)
-  {
-  }
-  virtual JVMState* generate(JVMState* jvms);
-};
-
-JVMState* DynamicCallGenerator::generate(JVMState* jvms) {
-  GraphKit kit(jvms);
-  Compile* C = kit.C;
-  PhaseGVN& gvn = kit.gvn();
-
-  if (C->log() != NULL) {
-    C->log()->elem("dynamic_call bci='%d'", jvms->bci());
-  }
-
-  // Get the constant pool cache from the caller class.
-  ciMethod* caller_method = jvms->method();
-  ciBytecodeStream str(caller_method);
-  str.force_bci(jvms->bci());  // Set the stream to the invokedynamic bci.
-  assert(str.cur_bc() == Bytecodes::_invokedynamic, "wrong place to issue a dynamic call!");
-  ciCPCache* cpcache = str.get_cpcache();
-
-  // Get the offset of the CallSite from the constant pool cache
-  // pointer.
-  int index = str.get_method_index();
-  size_t call_site_offset = cpcache->get_f1_offset(index);
-
-  // Load the CallSite object from the constant pool cache.
-  const TypeOopPtr* cpcache_type   = TypeOopPtr::make_from_constant(cpcache);  // returns TypeAryPtr of type T_OBJECT
-  const TypeOopPtr* call_site_type = TypeOopPtr::make_from_klass(C->env()->CallSite_klass());
-  Node* cpcache_adr   = kit.makecon(cpcache_type);
-  Node* call_site_adr = kit.basic_plus_adr(cpcache_adr, call_site_offset);
-  // The oops in the constant pool cache are not compressed; load then as raw pointers.
-  Node* call_site     = kit.make_load(kit.control(), call_site_adr, call_site_type, T_ADDRESS, Compile::AliasIdxRaw);
-
-  // Load the target MethodHandle from the CallSite object.
-  const TypeOopPtr* target_type = TypeOopPtr::make_from_klass(C->env()->MethodHandle_klass());
-  Node* target_mh_adr = kit.basic_plus_adr(call_site, java_lang_invoke_CallSite::target_offset_in_bytes());
-  Node* target_mh     = kit.make_load(kit.control(), target_mh_adr, target_type, T_OBJECT);
-
-  address resolve_stub = SharedRuntime::get_resolve_opt_virtual_call_stub();
-
-  CallStaticJavaNode* call = new (C, tf()->domain()->cnt()) CallStaticJavaNode(tf(), resolve_stub, method(), kit.bci());
-  // invokedynamic is treated as an optimized invokevirtual.
-  call->set_optimized_virtual(true);
-  // Take extra care (in the presence of argument motion) not to trash the SP:
-  call->set_method_handle_invoke(true);
-
-  // Pass the target MethodHandle as first argument and shift the
-  // other arguments.
-  call->init_req(0 + TypeFunc::Parms, target_mh);
-  uint nargs = call->method()->arg_size();
-  for (uint i = 1; i < nargs; i++) {
-    Node* arg = kit.argument(i - 1);
-    call->init_req(i + TypeFunc::Parms, arg);
-  }
-
-  kit.set_edges_for_java_call(call);
-  Node* ret = kit.set_results_for_java_call(call);
-  kit.push_node(method()->return_type()->basic_type(), ret);
-  return kit.transfer_exceptions_into_jvms();
-}
-
 //--------------------------VirtualCallGenerator------------------------------
 // Internal class which handles all out-of-line calls checking receiver type.
 class VirtualCallGenerator : public CallGenerator {
@@ -328,12 +260,6 @@
   return new VirtualCallGenerator(m, vtable_index);
 }

-CallGenerator* CallGenerator::for_dynamic_call(ciMethod* m) {
-  assert(m->is_compiled_lambda_form(), "for_dynamic_call mismatch");
-  //@@ FIXME: this should be done via a direct call
-  return new DynamicCallGenerator(m);
-}
-
 // Allow inlining decisions to be delayed
 class LateInlineCallGenerator : public DirectCallGenerator {
   CallGenerator* _inline_cg;
@@ -347,7 +273,7 @@
   // Convert the CallStaticJava into an inline
   virtual void do_late_inline();

-  JVMState* generate(JVMState* jvms) {
+  virtual JVMState* generate(JVMState* jvms) {
     // Record that this call site should be revisited once the main
     // parse is finished.
     Compile::current()->add_late_inline(this);
--- a/src/share/vm/opto/chaitin.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/chaitin.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -484,24 +484,33 @@
     if (_names[i]) {           // Live range associated with Node?
       LRG &lrg = lrgs(_names[i]);
       if (!lrg.alive()) {
-        _node_regs[i].set_bad();
+        set_bad(i);
       } else if (lrg.num_regs() == 1) {
-        _node_regs[i].set1(lrg.reg());
-      } else {                  // Must be a register-pair
-        if (!lrg._fat_proj) {   // Must be aligned adjacent register pair
+        set1(i, lrg.reg());
+      } else {                  // Must be a register-set
+        if (!lrg._fat_proj) {   // Must be aligned adjacent register set
           // Live ranges record the highest register in their mask.
           // We want the low register for the AD file writer's convenience.
-          _node_regs[i].set2( OptoReg::add(lrg.reg(),(1-lrg.num_regs())) );
+          OptoReg::Name hi = lrg.reg(); // Get hi register
+          OptoReg::Name lo = OptoReg::add(hi, (1-lrg.num_regs())); // Find lo
+          // We have to use pair [lo,lo+1] even for wide vectors because
+          // the rest of code generation works only with pairs. It is safe
+          // since for registers encoding only 'lo' is used.
+          // Second reg from pair is used in ScheduleAndBundle on SPARC where
+          // vector max size is 8 which corresponds to registers pair.
+          // It is also used in BuildOopMaps but oop operations are not
+          // vectorized.
+          set2(i, lo);
         } else {                // Misaligned; extract 2 bits
           OptoReg::Name hi = lrg.reg(); // Get hi register
           lrg.Remove(hi);       // Yank from mask
           int lo = lrg.mask().find_first_elem(); // Find lo
-          _node_regs[i].set_pair( hi, lo );
+          set_pair(i, hi, lo);
         }
       }
       if( lrg._is_oop ) _node_oops.set(i);
     } else {
-      _node_regs[i].set_bad();
+      set_bad(i);
     }
   }

@@ -1121,6 +1130,33 @@

 }

+//------------------------------is_legal_reg-----------------------------------
+// Is 'reg' register legal for 'lrg'?
+static bool is_legal_reg(LRG &lrg, OptoReg::Name reg, int chunk) {
+  if (reg >= chunk && reg < (chunk + RegMask::CHUNK_SIZE) &&
+      lrg.mask().Member(OptoReg::add(reg,-chunk))) {
+    // RA uses OptoReg which represent the highest element of a registers set.
+    // For example, vectorX (128bit) on x86 uses [XMM,XMMb,XMMc,XMMd] set
+    // in which XMMd is used by RA to represent such vectors. A double value
+    // uses [XMM,XMMb] pairs and XMMb is used by RA for it.
+    // The register mask uses largest bits set of overlapping register sets.
+    // On x86 with AVX it uses 8 bits for each XMM registers set.
+    //
+    // The 'lrg' already has cleared-to-set register mask (done in Select()
+    // before calling choose_color()). Passing mask.Member(reg) check above
+    // indicates that the size (num_regs) of 'reg' set is less or equal to
+    // 'lrg' set size.
+    // For set size 1 any register which is member of 'lrg' mask is legal.
+    if (lrg.num_regs()==1)
+      return true;
+    // For larger sets only an aligned register with the same set size is legal.
+    int mask = lrg.num_regs()-1;
+    if ((reg&mask) == mask)
+      return true;
+  }
+  return false;
+}
+
 //------------------------------bias_color-------------------------------------
 // Choose a color using the biasing heuristic
 OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) {
@@ -1137,10 +1173,7 @@
     while ((datum = elements.next()) != 0) {
       OptoReg::Name reg = lrgs(datum).reg();
       // If this LRG's register is legal for us, choose it
-      if( reg >= chunk && reg < chunk + RegMask::CHUNK_SIZE &&
-          lrg.mask().Member(OptoReg::add(reg,-chunk)) &&
-          (lrg.num_regs()==1 || // either size 1
-           (reg&1) == 1) )      // or aligned (adjacent reg is available since we already cleared-to-pairs)
+      if (is_legal_reg(lrg, reg, chunk))
         return reg;
     }
   }
@@ -1151,10 +1184,7 @@
     if( !(*(_ifg->_yanked))[copy_lrg] ) {
       OptoReg::Name reg = lrgs(copy_lrg).reg();
       //  And it is legal for you,
-      if( reg >= chunk && reg < chunk + RegMask::CHUNK_SIZE &&
-          lrg.mask().Member(OptoReg::add(reg,-chunk)) &&
-          (lrg.num_regs()==1 || // either size 1
-           (reg&1) == 1) )      // or aligned (adjacent reg is available since we already cleared-to-pairs)
+      if (is_legal_reg(lrg, reg, chunk))
         return reg;
     } else if( chunk == 0 ) {
       // Choose a color which is legal for him
--- a/src/share/vm/opto/classes.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/classes.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -256,6 +256,8 @@
 macro(SubVL)
 macro(SubVF)
 macro(SubVD)
+macro(MulVS)
+macro(MulVI)
 macro(MulVF)
 macro(MulVD)
 macro(DivVF)
@@ -263,9 +265,15 @@
 macro(LShiftVB)
 macro(LShiftVS)
 macro(LShiftVI)
+macro(LShiftVL)
 macro(RShiftVB)
 macro(RShiftVS)
 macro(RShiftVI)
+macro(RShiftVL)
+macro(URShiftVB)
+macro(URShiftVS)
+macro(URShiftVI)
+macro(URShiftVL)
 macro(AndV)
 macro(OrV)
 macro(XorV)
--- a/src/share/vm/opto/compile.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/compile.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -2604,7 +2604,7 @@
     if (n->req()-1 > 2) {
       // Replace many operand PackNodes with a binary tree for matching
       PackNode* p = (PackNode*) n;
-      Node* btp = p->binaryTreePack(Compile::current(), 1, n->req());
+      Node* btp = p->binary_tree_pack(Compile::current(), 1, n->req());
       n->subsume_by(btp);
     }
     break;
--- a/src/share/vm/opto/idealKit.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/idealKit.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -295,7 +295,11 @@
   if (_delay_all_transforms) {
     return delay_transform(n);
   } else {
-    return gvn().transform(n);
+    n = gvn().transform(n);
+    if (!gvn().is_IterGVN()) {
+      C->record_for_igvn(n);
+    }
+    return n;
   }
 }
--- a/src/share/vm/opto/library_call.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/library_call.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -171,7 +171,7 @@
   // Helper for inline_unsafe_access.
   // Generates the guards that check whether the result of
   // Unsafe.getObject should be recorded in an SATB log buffer.
-  void insert_g1_pre_barrier(Node* base_oop, Node* offset, Node* pre_val);
+  void insert_pre_barrier(Node* base_oop, Node* offset, Node* pre_val, int nargs, bool need_mem_bar);
   bool inline_unsafe_access(bool is_native_ptr, bool is_store, BasicType type, bool is_volatile);
   bool inline_unsafe_prefetch(bool is_native_ptr, bool is_store, bool is_static);
   bool inline_unsafe_allocate();
@@ -291,6 +291,8 @@
     case vmIntrinsics::_equals:
     case vmIntrinsics::_equalsC:
       break;  // InlineNatives does not control String.compareTo
+    case vmIntrinsics::_Reference_get:
+      break;  // InlineNatives does not control Reference.get
     default:
       return NULL;
     }
@@ -361,11 +363,10 @@
     break;

   case vmIntrinsics::_Reference_get:
-    // It is only when G1 is enabled that we absolutely
-    // need to use the intrinsic version of Reference.get()
-    // so that the value in the referent field, if necessary,
-    // can be registered by the pre-barrier code.
-    if (!UseG1GC) return NULL;
+    // Use the intrinsic version of Reference.get() so that the value in
+    // the referent field can be registered by the G1 pre-barrier code.
+    // Also add memory barrier to prevent commoning reads from this field
+    // across safepoint since GC can change it value.
     break;

  default:
@@ -2195,14 +2196,17 @@

 const static BasicType T_ADDRESS_HOLDER = T_LONG;

-// Helper that guards and inserts a G1 pre-barrier.
-void LibraryCallKit::insert_g1_pre_barrier(Node* base_oop, Node* offset, Node* pre_val) {
-  assert(UseG1GC, "should not call this otherwise");
-
+// Helper that guards and inserts a pre-barrier.
+void LibraryCallKit::insert_pre_barrier(Node* base_oop, Node* offset,
+                                        Node* pre_val, int nargs, bool need_mem_bar) {
   // We could be accessing the referent field of a reference object. If so, when G1
   // is enabled, we need to log the value in the referent field in an SATB buffer.
   // This routine performs some compile time filters and generates suitable
   // runtime filters that guard the pre-barrier code.
+  // Also add memory barrier for non volatile load from the referent field
+  // to prevent commoning of loads across safepoint.
+  if (!UseG1GC && !need_mem_bar)
+    return;

   // Some compile time checks.

@@ -2224,11 +2228,12 @@

     const TypeInstPtr* itype = btype->isa_instptr();
     if (itype != NULL) {
-      // Can the klass of base_oop be statically determined
-      // to be _not_ a sub-class of Reference?
+      // Can the klass of base_oop be statically determined to be
+      // _not_ a sub-class of Reference and _not_ Object?
       ciKlass* klass = itype->klass();
-      if (klass->is_subtype_of(env()->Reference_klass()) &&
-          !env()->Reference_klass()->is_subtype_of(klass)) {
+      if ( klass->is_loaded() &&
+          !klass->is_subtype_of(env()->Reference_klass()) &&
+          !env()->Object_klass()->is_subtype_of(klass)) {
         return;
       }
     }
@@ -2238,10 +2243,8 @@
   // we need to generate the following runtime filters
   //
   // if (offset == java_lang_ref_Reference::_reference_offset) {
-  //   if (base != null) {
-  //     if (instance_of(base, java.lang.ref.Reference)) {
-  //       pre_barrier(_, pre_val, ...);
-  //     }
+  //   if (instance_of(base, java.lang.ref.Reference)) {
+  //     pre_barrier(_, pre_val, ...);
   //   }
   // }

@@ -2254,19 +2257,19 @@
   Node* referent_off = __ ConX(java_lang_ref_Reference::referent_offset);

   __ if_then(offset, BoolTest::eq, referent_off, unlikely); {
-    __ if_then(base_oop, BoolTest::ne, null(), likely); {
-
       // Update graphKit memory and control from IdealKit.
       sync_kit(ideal);

       Node* ref_klass_con = makecon(TypeKlassPtr::make(env()->Reference_klass()));
+      _sp += nargs;  // gen_instanceof might do an uncommon trap
       Node* is_instof = gen_instanceof(base_oop, ref_klass_con);
+      _sp -= nargs;

       // Update IdealKit memory and control from graphKit.
       __ sync_kit(this);

       Node* one = __ ConI(1);
-
+      // is_instof == 0 if base_oop == NULL
       __ if_then(is_instof, BoolTest::eq, one, unlikely); {

         // Update graphKit from IdeakKit.
@@ -2278,12 +2281,15 @@
                     NULL /* obj */, NULL /* adr */, max_juint /* alias_idx */, NULL /* val */, NULL /* val_type */,
                     pre_val /* pre_val */,
                     T_OBJECT);
-
+        if (need_mem_bar) {
+          // Add memory barrier to prevent commoning reads from this field
+          // across safepoint since GC can change its value.
+          insert_mem_bar(Op_MemBarCPUOrder);
+        }
         // Update IdealKit from graphKit.
         __ sync_kit(this);

       } __ end_if(); // _ref_type != ref_none
-    } __ end_if(); // base  != NULL
   } __ end_if(); // offset == referent_offset

   // Final sync IdealKit and GraphKit.
@@ -2418,7 +2424,9 @@
   // object (either by using Unsafe directly or through reflection)
   // then, if G1 is enabled, we need to record the referent in an
   // SATB log buffer using the pre-barrier mechanism.
-  bool need_read_barrier = UseG1GC && !is_native_ptr && !is_store &&
+  // Also we need to add memory barrier to prevent commoning reads
+  // from this field across safepoint since GC can change its value.
+  bool need_read_barrier = !is_native_ptr && !is_store &&
                            offset != top() && heap_base_oop != top();

   if (!is_store && type == T_OBJECT) {
@@ -2508,7 +2516,7 @@
       break;
     case T_OBJECT:
       if (need_read_barrier) {
-        insert_g1_pre_barrier(heap_base_oop, offset, p);
+        insert_pre_barrier(heap_base_oop, offset, p, nargs, !(is_volatile || need_mem_bar));
       }
       push(p);
       break;
@@ -5484,6 +5492,10 @@
               result /* pre_val */,
               T_OBJECT);

+  // Add memory barrier to prevent commoning reads from this field
+  // across safepoint since GC can change its value.
+  insert_mem_bar(Op_MemBarCPUOrder);
+
   push(result);
   return true;
 }
--- a/src/share/vm/opto/loopnode.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/loopnode.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1773,6 +1773,8 @@
     if (stride_con > 0) tty->print("+");
     tty->print("%d", stride_con);

+    tty->print(" (%d iters) ", (int)cl->profile_trip_cnt());
+
     if (cl->is_pre_loop ()) tty->print(" pre" );
     if (cl->is_main_loop()) tty->print(" main");
     if (cl->is_post_loop()) tty->print(" post");
--- a/src/share/vm/opto/output.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/output.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1871,6 +1871,8 @@
   if (!do_scheduling())
     return;

+  assert(MaxVectorSize <= 8, "scheduling code works only with pairs");
+
   NOT_PRODUCT( TracePhase t2("isched", &_t_instrSched, TimeCompiler); )

   // Create a data structure for all the scheduling information
--- a/src/share/vm/opto/superword.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/superword.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1058,12 +1058,27 @@
   return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
 }

+//------------------------------same_inputs--------------------------
+// For pack p, are all idx operands the same?
+static bool same_inputs(Node_List* p, int idx) {
+  Node* p0 = p->at(0);
+  uint vlen = p->size();
+  Node* p0_def = p0->in(idx);
+  for (uint i = 1; i < vlen; i++) {
+    Node* pi = p->at(i);
+    Node* pi_def = pi->in(idx);
+    if (p0_def != pi_def)
+      return false;
+  }
+  return true;
+}
+
 //------------------------------profitable---------------------------
 // For pack p, are all operands and all uses (with in the block) vector?
 bool SuperWord::profitable(Node_List* p) {
   Node* p0 = p->at(0);
   uint start, end;
-  vector_opd_range(p0, &start, &end);
+  VectorNode::vector_operands(p0, &start, &end);

   // Return false if some input is not vector and inside block
   for (uint i = start; i < end; i++) {
@@ -1071,15 +1086,20 @@
       // For now, return false if not scalar promotion case (inputs are the same.)
       // Later, implement PackNode and allow differing, non-vector inputs
       // (maybe just the ones from outside the block.)
-      Node* p0_def = p0->in(i);
-      for (uint j = 1; j < p->size(); j++) {
-        Node* use = p->at(j);
-        Node* def = use->in(i);
-        if (p0_def != def)
-          return false;
+      if (!same_inputs(p, i)) {
+        return false;
       }
     }
   }
+  if (VectorNode::is_shift(p0)) {
+    // For now, return false if shift count is vector because
+    // hw does not support it.
+    if (is_vector_use(p0, 2))
+      return false;
+    // For the same reason return false if different shift counts.
+    if (!same_inputs(p, 2))
+      return false;
+  }
   if (!p0->is_Store()) {
     // For now, return false if not all uses are vector.
     // Later, implement ExtractNode and allow non-vector uses (maybe
@@ -1357,6 +1377,12 @@
         // Promote operands to vector
         Node* in1 = vector_opd(p, 1);
         Node* in2 = vector_opd(p, 2);
+        if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
+          // Move invariant vector input into second position to avoid register spilling.
+          Node* tmp = in1;
+          in1 = in2;
+          in2 = tmp;
+        }
         vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
       } else {
         ShouldNotReachHere();
@@ -1386,19 +1412,40 @@
   uint vlen = p->size();
   Node* opd = p0->in(opd_idx);

-  bool same_opd = true;
-  for (uint i = 1; i < vlen; i++) {
-    Node* pi = p->at(i);
-    Node* in = pi->in(opd_idx);
-    if (opd != in) {
-      same_opd = false;
-      break;
+  if (same_inputs(p, opd_idx)) {
+    if (opd->is_Vector() || opd->is_LoadVector()) {
+      assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
+      return opd; // input is matching vector
     }
-  }
-
-  if (same_opd) {
-    if (opd->is_Vector() || opd->is_LoadVector()) {
-      return opd; // input is matching vector
+    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
+      // No vector is needed for shift count.
+      // Vector instructions do not mask shift count, do it here.
+      Compile* C = _phase->C;
+      Node* cnt = opd;
+      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
+      const TypeInt* t = opd->find_int_type();
+      if (t != NULL && t->is_con()) {
+        juint shift = t->get_con();
+        if (shift > mask) { // Unsigned cmp
+          cnt = ConNode::make(C, TypeInt::make(shift & mask));
+        }
+      } else {
+        if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
+          cnt = ConNode::make(C, TypeInt::make(mask));
+          _phase->_igvn.register_new_node_with_optimizer(cnt);
+          cnt = new (C, 3) AndINode(opd, cnt);
+          _phase->_igvn.register_new_node_with_optimizer(cnt);
+          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+        }
+        assert(opd->bottom_type()->isa_int(), "int type only");
+        // Move non constant shift count into XMM register.
+        cnt = new (_phase->C, 2) MoveI2FNode(cnt);
+      }
+      if (cnt != opd) {
+        _phase->_igvn.register_new_node_with_optimizer(cnt);
+        _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+      }
+      return cnt;
     }
     assert(!opd->is_StoreVector(), "such vector is not expected here");
     // Convert scalar input to vector with the same number of elements as
@@ -1428,7 +1475,7 @@
     Node* in = pi->in(opd_idx);
     assert(my_pack(in) == NULL, "Should already have been unpacked");
     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
-    pk->add_opd(i, in);
+    pk->add_opd(in);
   }
   _phase->_igvn.register_new_node_with_optimizer(pk);
   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
@@ -1718,37 +1765,27 @@
   for (int i = _block.length() - 1; i >= 0; i--) {
     Node* n = _block.at(i);
     // Only integer types need be examined
-    if (n->bottom_type()->isa_int()) {
+    const Type* vt = velt_type(n);
+    if (vt->basic_type() == T_INT) {
       uint start, end;
-      vector_opd_range(n, &start, &end);
+      VectorNode::vector_operands(n, &start, &end);
       const Type* vt = velt_type(n);

       for (uint j = start; j < end; j++) {
         Node* in  = n->in(j);
-        // Don't propagate through a type conversion
-        if (n->bottom_type() != in->bottom_type())
-          continue;
-        switch(in->Opcode()) {
-        case Op_AddI:    case Op_AddL:
-        case Op_SubI:    case Op_SubL:
-        case Op_MulI:    case Op_MulL:
-        case Op_AndI:    case Op_AndL:
-        case Op_OrI:     case Op_OrL:
-        case Op_XorI:    case Op_XorL:
-        case Op_LShiftI: case Op_LShiftL:
-        case Op_CMoveI:  case Op_CMoveL:
-          if (in_bb(in)) {
-            bool same_type = true;
-            for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
-              Node *use = in->fast_out(k);
-              if (!in_bb(use) || !same_velt_type(use, n)) {
-                same_type = false;
-                break;
-              }
+        // Don't propagate through a memory
+        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
+            data_size(n) < data_size(in)) {
+          bool same_type = true;
+          for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
+            Node *use = in->fast_out(k);
+            if (!in_bb(use) || !same_velt_type(use, n)) {
+              same_type = false;
+              break;
             }
-            if (same_type) {
-              set_velt_type(in, vt);
-            }
+          }
+          if (same_type) {
+            set_velt_type(in, vt);
           }
         }
       }
@@ -1792,10 +1829,8 @@
   }
   const Type* t = _igvn.type(n);
   if (t->basic_type() == T_INT) {
-    if (t->higher_equal(TypeInt::BOOL))  return TypeInt::BOOL;
-    if (t->higher_equal(TypeInt::BYTE))  return TypeInt::BYTE;
-    if (t->higher_equal(TypeInt::CHAR))  return TypeInt::CHAR;
-    if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT;
+    // A narrow type of arithmetic operations will be determined by
+    // propagating the type of memory operations.
     return TypeInt::INT;
   }
   return t;
@@ -1811,38 +1846,6 @@
   return vt1 == vt2;
 }

-//-------------------------vector_opd_range-----------------------
-// (Start, end] half-open range defining which operands are vector
-void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
-  switch (n->Opcode()) {
-  case Op_LoadB:   case Op_LoadUB:
-  case Op_LoadS:   case Op_LoadUS:
-  case Op_LoadI:   case Op_LoadL:
-  case Op_LoadF:   case Op_LoadD:
-  case Op_LoadP:
-    *start = 0;
-    *end   = 0;
-    return;
-  case Op_StoreB:  case Op_StoreC:
-  case Op_StoreI:  case Op_StoreL:
-  case Op_StoreF:  case Op_StoreD:
-  case Op_StoreP:
-    *start = MemNode::ValueIn;
-    *end   = *start + 1;
-    return;
-  case Op_LShiftI: case Op_LShiftL:
-    *start = 1;
-    *end   = 2;
-    return;
-  case Op_CMoveI:  case Op_CMoveL:  case Op_CMoveF:  case Op_CMoveD:
-    *start = 2;
-    *end   = n->req();
-    return;
-  }
-  *start = 1;
-  *end   = n->req(); // default is all operands
-}
-
 //------------------------------in_packset---------------------------
 // Are s1 and s2 in a pack pair and ordered as s1,s2?
 bool SuperWord::in_packset(Node* s1, Node* s2) {
@@ -1940,7 +1943,7 @@
   //     lim0 == original pre loop limit
   //     V == v_align (power of 2)
   //     invar == extra invariant piece of the address expression
-  //     e == k [ +/- invar ]
+  //     e == offset [ +/- invar ]
   //
   // When reassociating expressions involving '%' the basic rules are:
   //     (a - b) % k == 0   =>  a % k == b % k
@@ -1993,13 +1996,12 @@
   int elt_size = align_to_ref_p.memory_size();
   int v_align  = vw / elt_size;
   assert(v_align > 1, "sanity");
-  int k        = align_to_ref_p.offset_in_bytes() / elt_size;
-
-  Node *kn   = _igvn.intcon(k);
+  int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
+  Node *offsn  = _igvn.intcon(offset);

-  Node *e = kn;
+  Node *e = offsn;
   if (align_to_ref_p.invar() != NULL) {
-    // incorporate any extra invariant piece producing k +/- invar >>> log2(elt)
+    // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
     Node* aref     = new (_phase->C, 3) URShiftINode(align_to_ref_p.invar(), log2_elt);
     _phase->_igvn.register_new_node_with_optimizer(aref);
@@ -2014,15 +2016,15 @@
   }
   if (vw > ObjectAlignmentInBytes) {
     // incorporate base e +/- base && Mask >>> log2(elt)
-    Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
     Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
     _phase->_igvn.register_new_node_with_optimizer(xbase);
-    Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
+#ifdef _LP64
+    xbase  = new (_phase->C, 2) ConvL2INode(xbase);
+    _phase->_igvn.register_new_node_with_optimizer(xbase);
+#endif
+    Node* mask = _igvn.intcon(vw-1);
+    Node* masked_xbase  = new (_phase->C, 3) AndINode(xbase, mask);
     _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
-#ifdef _LP64
-    masked_xbase  = new (_phase->C, 2) ConvL2INode(masked_xbase);
-    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
-#endif
     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
     Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
     _phase->_igvn.register_new_node_with_optimizer(bref);
--- a/src/share/vm/opto/vectornode.cpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/vectornode.cpp	Fri Aug 24 19:45:42 2012 -0700
@@ -31,7 +31,7 @@
 // Return the vector operator for the specified scalar operation
 // and vector length.  Also used to check if the code generator
 // supports the vector operation.
-int VectorNode::opcode(int sopc, uint vlen, BasicType bt) {
+int VectorNode::opcode(int sopc, BasicType bt) {
   switch (sopc) {
   case Op_AddI:
     switch (bt) {
@@ -69,6 +69,15 @@
   case Op_SubD:
     assert(bt == T_DOUBLE, "must be");
     return Op_SubVD;
+  case Op_MulI:
+    switch (bt) {
+    case T_BOOLEAN:
+    case T_BYTE:   return 0;   // Unimplemented
+    case T_CHAR:
+    case T_SHORT:  return Op_MulVS;
+    case T_INT:    return Matcher::match_rule_supported(Op_MulVI) ? Op_MulVI : 0; // SSE4_1
+    }
+    ShouldNotReachHere();
   case Op_MulF:
     assert(bt == T_FLOAT, "must be");
     return Op_MulVF;
@@ -90,6 +99,9 @@
     case T_INT:    return Op_LShiftVI;
     }
     ShouldNotReachHere();
+  case Op_LShiftL:
+    assert(bt == T_LONG, "must be");
+    return Op_LShiftVL;
   case Op_RShiftI:
     switch (bt) {
     case T_BOOLEAN:
@@ -99,6 +111,21 @@
     case T_INT:    return Op_RShiftVI;
     }
     ShouldNotReachHere();
+  case Op_RShiftL:
+    assert(bt == T_LONG, "must be");
+    return Op_RShiftVL;
+  case Op_URShiftI:
+    switch (bt) {
+    case T_BOOLEAN:
+    case T_BYTE:   return Op_URShiftVB;
+    case T_CHAR:
+    case T_SHORT:  return Op_URShiftVS;
+    case T_INT:    return Op_URShiftVI;
+    }
+    ShouldNotReachHere();
+  case Op_URShiftL:
+    assert(bt == T_LONG, "must be");
+    return Op_URShiftVL;
   case Op_AndI:
   case Op_AndL:
     return Op_AndV;
@@ -134,16 +161,88 @@
   if (is_java_primitive(bt) &&
       (vlen > 1) && is_power_of_2(vlen) &&
       Matcher::vector_size_supported(bt, vlen)) {
-    int vopc = VectorNode::opcode(opc, vlen, bt);
+    int vopc = VectorNode::opcode(opc, bt);
     return vopc > 0 && Matcher::has_match_rule(vopc);
   }
   return false;
 }

+bool VectorNode::is_shift(Node* n) {
+  switch (n->Opcode()) {
+  case Op_LShiftI:
+  case Op_LShiftL:
+  case Op_RShiftI:
+  case Op_RShiftL:
+  case Op_URShiftI:
+  case Op_URShiftL:
+    return true;
+  }
+  return false;
+}
+
+// Check if input is loop invariant vector.
+bool VectorNode::is_invariant_vector(Node* n) {
+  // Only Replicate vector nodes are loop invariant for now.
+  switch (n->Opcode()) {
+  case Op_ReplicateB:
+  case Op_ReplicateS:
+  case Op_ReplicateI:
+  case Op_ReplicateL:
+  case Op_ReplicateF:
+  case Op_ReplicateD:
+    return true;
+  }
+  return false;
+}
+
+// [Start, end) half-open range defining which operands are vectors
+void VectorNode::vector_operands(Node* n, uint* start, uint* end) {
+  switch (n->Opcode()) {
+  case Op_LoadB:   case Op_LoadUB:
+  case Op_LoadS:   case Op_LoadUS:
+  case Op_LoadI:   case Op_LoadL:
+  case Op_LoadF:   case Op_LoadD:
+  case Op_LoadP:   case Op_LoadN:
+    *start = 0;
+    *end   = 0; // no vector operands
+    break;
+  case Op_StoreB:  case Op_StoreC:
+  case Op_StoreI:  case Op_StoreL:
+  case Op_StoreF:  case Op_StoreD:
+  case Op_StoreP:  case Op_StoreN:
+    *start = MemNode::ValueIn;
+    *end   = MemNode::ValueIn + 1; // 1 vector operand
+    break;
+  case Op_LShiftI:  case Op_LShiftL:
+  case Op_RShiftI:  case Op_RShiftL:
+  case Op_URShiftI: case Op_URShiftL:
+    *start = 1;
+    *end   = 2; // 1 vector operand
+    break;
+  case Op_AddI: case Op_AddL: case Op_AddF: case Op_AddD:
+  case Op_SubI: case Op_SubL: case Op_SubF: case Op_SubD:
+  case Op_MulI: case Op_MulL: case Op_MulF: case Op_MulD:
+  case Op_DivF: case Op_DivD:
+  case Op_AndI: case Op_AndL:
+  case Op_OrI:  case Op_OrL:
+  case Op_XorI: case Op_XorL:
+    *start = 1;
+    *end   = 3; // 2 vector operands
+    break;
+  case Op_CMoveI:  case Op_CMoveL:  case Op_CMoveF:  case Op_CMoveD:
+    *start = 2;
+    *end   = n->req();
+    break;
+  default:
+    *start = 1;
+    *end   = n->req(); // default is all operands
+  }
+}
+
 // Return the vector version of a scalar operation node.
 VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt) {
   const TypeVect* vt = TypeVect::make(bt, vlen);
-  int vopc = VectorNode::opcode(opc, vlen, bt);
+  int vopc = VectorNode::opcode(opc, bt);

   switch (vopc) {
   case Op_AddVB: return new (C, 3) AddVBNode(n1, n2, vt);
@@ -160,6 +259,8 @@
   case Op_SubVF: return new (C, 3) SubVFNode(n1, n2, vt);
   case Op_SubVD: return new (C, 3) SubVDNode(n1, n2, vt);

+  case Op_MulVS: return new (C, 3) MulVSNode(n1, n2, vt);
+  case Op_MulVI: return new (C, 3) MulVINode(n1, n2, vt);
   case Op_MulVF: return new (C, 3) MulVFNode(n1, n2, vt);
   case Op_MulVD: return new (C, 3) MulVDNode(n1, n2, vt);

@@ -169,10 +270,17 @@
   case Op_LShiftVB: return new (C, 3) LShiftVBNode(n1, n2, vt);
   case Op_LShiftVS: return new (C, 3) LShiftVSNode(n1, n2, vt);
   case Op_LShiftVI: return new (C, 3) LShiftVINode(n1, n2, vt);
+  case Op_LShiftVL: return new (C, 3) LShiftVLNode(n1, n2, vt);

   case Op_RShiftVB: return new (C, 3) RShiftVBNode(n1, n2, vt);
   case Op_RShiftVS: return new (C, 3) RShiftVSNode(n1, n2, vt);
   case Op_RShiftVI: return new (C, 3) RShiftVINode(n1, n2, vt);
+  case Op_RShiftVL: return new (C, 3) RShiftVLNode(n1, n2, vt);
+
+  case Op_URShiftVB: return new (C, 3) URShiftVBNode(n1, n2, vt);
+  case Op_URShiftVS: return new (C, 3) URShiftVSNode(n1, n2, vt);
+  case Op_URShiftVI: return new (C, 3) URShiftVINode(n1, n2, vt);
+  case Op_URShiftVL: return new (C, 3) URShiftVLNode(n1, n2, vt);

   case Op_AndV: return new (C, 3) AndVNode(n1, n2, vt);
   case Op_OrV:  return new (C, 3) OrVNode (n1, n2, vt);
@@ -214,38 +322,39 @@
   switch (bt) {
   case T_BOOLEAN:
   case T_BYTE:
-    return new (C, vlen+1) PackBNode(s, vt);
+    return new (C, 2) PackBNode(s, vt);
   case T_CHAR:
   case T_SHORT:
-    return new (C, vlen+1) PackSNode(s, vt);
+    return new (C, 2) PackSNode(s, vt);
   case T_INT:
-    return new (C, vlen+1) PackINode(s, vt);
+    return new (C, 2) PackINode(s, vt);
   case T_LONG:
-    return new (C, vlen+1) PackLNode(s, vt);
+    return new (C, 2) PackLNode(s, vt);
   case T_FLOAT:
-    return new (C, vlen+1) PackFNode(s, vt);
+    return new (C, 2) PackFNode(s, vt);
   case T_DOUBLE:
-    return new (C, vlen+1) PackDNode(s, vt);
+    return new (C, 2) PackDNode(s, vt);
   }
   ShouldNotReachHere();
   return NULL;
 }

 // Create a binary tree form for Packs. [lo, hi) (half-open) range
-Node* PackNode::binaryTreePack(Compile* C, int lo, int hi) {
+PackNode* PackNode::binary_tree_pack(Compile* C, int lo, int hi) {
   int ct = hi - lo;
   assert(is_power_of_2(ct), "power of 2");
   if (ct == 2) {
     PackNode* pk = PackNode::make(C, in(lo), 2, vect_type()->element_basic_type());
-    pk->add_opd(1, in(lo+1));
+    pk->add_opd(in(lo+1));
     return pk;

   } else {
     int mid = lo + ct/2;
-    Node* n1 = binaryTreePack(C, lo,  mid);
-    Node* n2 = binaryTreePack(C, mid, hi );
+    PackNode* n1 = binary_tree_pack(C, lo,  mid);
+    PackNode* n2 = binary_tree_pack(C, mid, hi );

-    BasicType bt = vect_type()->element_basic_type();
+    BasicType bt = n1->vect_type()->element_basic_type();
+    assert(bt == n2->vect_type()->element_basic_type(), "should be the same");
     switch (bt) {
     case T_BOOLEAN:
     case T_BYTE:
--- a/src/share/vm/opto/vectornode.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/opto/vectornode.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -46,6 +46,7 @@

   const TypeVect* vect_type() const { return type()->is_vect(); }
   uint length() const { return vect_type()->length(); } // Vector length
+  uint length_in_bytes() const { return vect_type()->length_in_bytes(); }

   virtual int Opcode() const;

@@ -55,9 +56,12 @@

   static VectorNode* make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt);

-  static int  opcode(int opc, uint vlen, BasicType bt);
+  static int  opcode(int opc, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
-
+  static bool is_shift(Node* n);
+  static bool is_invariant_vector(Node* n);
+  // [Start, end) half-open range defining which operands are vectors
+  static void vector_operands(Node* n, uint* start, uint* end);
 };

 //===========================Vector=ALU=Operations====================================
@@ -158,6 +162,22 @@
   virtual int Opcode() const;
 };

+//------------------------------MulVSNode---------------------------------------
+// Vector multiply short
+class MulVSNode : public VectorNode {
+ public:
+  MulVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------MulVINode---------------------------------------
+// Vector multiply int
+class MulVINode : public VectorNode {
+ public:
+  MulVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------MulVFNode---------------------------------------
 // Vector multiply float
 class MulVFNode : public VectorNode {
@@ -191,7 +211,7 @@
 };

 //------------------------------LShiftVBNode---------------------------------------
-// Vector lshift byte
+// Vector left shift bytes
 class LShiftVBNode : public VectorNode {
  public:
   LShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -199,7 +219,7 @@
 };

 //------------------------------LShiftVSNode---------------------------------------
-// Vector lshift shorts
+// Vector left shift shorts
 class LShiftVSNode : public VectorNode {
  public:
   LShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -207,39 +227,88 @@
 };

 //------------------------------LShiftVINode---------------------------------------
-// Vector lshift ints
+// Vector left shift ints
 class LShiftVINode : public VectorNode {
  public:
   LShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };

-//------------------------------URShiftVBNode---------------------------------------
-// Vector urshift bytes
+//------------------------------LShiftVLNode---------------------------------------
+// Vector left shift longs
+class LShiftVLNode : public VectorNode {
+ public:
+  LShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------RShiftVBNode---------------------------------------
+// Vector right arithmetic (signed) shift bytes
 class RShiftVBNode : public VectorNode {
  public:
   RShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };

-//------------------------------URShiftVSNode---------------------------------------
-// Vector urshift shorts
+//------------------------------RShiftVSNode---------------------------------------
+// Vector right arithmetic (signed) shift shorts
 class RShiftVSNode : public VectorNode {
  public:
   RShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };

-//------------------------------URShiftVINode---------------------------------------
-// Vector urshift ints
+//------------------------------RShiftVINode---------------------------------------
+// Vector right arithmetic (signed) shift ints
 class RShiftVINode : public VectorNode {
  public:
   RShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };

+//------------------------------RShiftVLNode---------------------------------------
+// Vector right arithmetic (signed) shift longs
+class RShiftVLNode : public VectorNode {
+ public:
+  RShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVBNode---------------------------------------
+// Vector right logical (unsigned) shift bytes
+class URShiftVBNode : public VectorNode {
+ public:
+  URShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVSNode---------------------------------------
+// Vector right logical (unsigned) shift shorts
+class URShiftVSNode : public VectorNode {
+ public:
+  URShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVINode---------------------------------------
+// Vector right logical (unsigned) shift ints
+class URShiftVINode : public VectorNode {
+ public:
+  URShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------URShiftVLNode---------------------------------------
+// Vector right logical (unsigned) shift longs
+class URShiftVLNode : public VectorNode {
+ public:
+  URShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+  virtual int Opcode() const;
+};
+
+
 //------------------------------AndVNode---------------------------------------
-// Vector and
+// Vector and integer
 class AndVNode : public VectorNode {
  public:
   AndVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -247,7 +316,7 @@
 };

 //------------------------------OrVNode---------------------------------------
-// Vector or
+// Vector or integer
 class OrVNode : public VectorNode {
  public:
   OrVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -255,7 +324,7 @@
 };

 //------------------------------XorVNode---------------------------------------
-// Vector xor
+// Vector xor integer
 class XorVNode : public VectorNode {
  public:
   XorVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
@@ -373,12 +442,12 @@
   PackNode(Node* in1, Node* n2, const TypeVect* vt) : VectorNode(in1, n2, vt) {}
   virtual int Opcode() const;

-  void add_opd(uint i, Node* n) {
-    init_req(i+1, n);
+  void add_opd(Node* n) {
+    add_req(n);
   }

   // Create a binary tree form for Packs. [lo, hi) (half-open) range
-  Node* binaryTreePack(Compile* C, int lo, int hi);
+  PackNode* binary_tree_pack(Compile* C, int lo, int hi);

   static PackNode* make(Compile* C, Node* s, uint vlen, BasicType bt);
 };
--- a/src/share/vm/precompiled/precompiled.hpp	Wed Aug 22 10:01:51 2012 +0200
+++ b/src/share/vm/precompiled/precompiled.hpp	Fri Aug 24 19:45:42 2012 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -306,7 +306,6 @@
 # include "gc_implementation/g1/g1_specialized_oop_closures.hpp"
 # include "gc_implementation/g1/ptrQueue.hpp"
 # include "gc_implementation/g1/satbQueue.hpp"
-# include "gc_implementation/parNew/parGCAllocBuffer.hpp"
 # include "gc_implementation/parNew/parOopClosures.hpp"
 # include "gc_implementation/parallelScavenge/objectStartArray.hpp"
 # include "gc_implementation/parallelScavenge/parMarkBitMap.hpp"
@@ -322,6 +321,7 @@
 # include "gc_implementation/parallelScavenge/psYoungGen.hpp"
 # include "gc_implementation/shared/gcAdaptivePolicyCounters.hpp"
 # include "gc_implementation/shared/gcPolicyCounters.hpp"
+# include "gc_implementation/shared/parGCAllocBuffer.hpp"
 #endif // SERIALGC

 #endif // !DONT_USE_PRECOMPILED_HEADER
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/6340864/TestByteVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestByteVect
+ */
+
+public class TestByteVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int ADD_INIT = 0;
+  private static final int BIT_MASK = 0xB7;
+  private static final int VALUE = 3;
+  private static final int SHIFT = 8;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Byte vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    byte[] a0 = new byte[ARRLEN];
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    byte[] a3 = new byte[ARRLEN];
+    byte[] a4 = new byte[ARRLEN];
+    short[] p2 = new short[ARRLEN/2];
+      int[] p4 = new   int[ARRLEN/4];
+     long[] p8 = new  long[ARRLEN/8];
+    // Initialize
+    int gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      byte val = (byte)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (byte)VALUE;
+      a3[i] = (byte)-VALUE;
+      a4[i] = (byte)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (byte)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (byte)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (byte)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (byte)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (byte)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (byte)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (byte)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (byte)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (byte)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+      test_pack2(p2, a1);
+      test_unpack2(a0, p2);
+      test_pack2_swap(p2, a1);
+      test_unpack2_swap(a0, p2);
+      test_pack4(p4, a1);
+      test_unpack4(a0, p4);
+      test_pack4_swap(p4, a1);
+      test_unpack4_swap(a0, p4);
+      test_pack8(p8, a1);
+      test_unpack8(a0, p8);
+      test_pack8_swap(p8, a1);
+      test_unpack8_swap(a0, p8);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      int sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (byte)((byte)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (byte)((byte)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (byte)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (byte)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (byte)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (byte)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (byte)((byte)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (byte)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (byte)((byte)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (byte)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (byte)((byte)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+      test_pack2(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2: ", i, p2[i], (short)(((short)(ADD_INIT+2*i) & 0xFF) | ((short)(ADD_INIT+2*i+1) << 8)));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack2_swap(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2_swap: ", i, p2[i], (short)(((short)(ADD_INIT+2*i+1) & 0xFF) | ((short)(ADD_INIT+2*i) << 8)));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2_swap(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2_swap: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack4(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4: ", i, p4[i],  ((int)(ADD_INIT+4*i+0) & 0xFF) |
+                                                 (((int)(ADD_INIT+4*i+1) & 0xFF) <<  8)  |
+                                                 (((int)(ADD_INIT+4*i+2) & 0xFF) << 16)  |
+                                                 (((int)(ADD_INIT+4*i+3) & 0xFF) << 24));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack4_swap(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4_swap: ", i, p4[i],  ((int)(ADD_INIT+4*i+3) & 0xFF) |
+                                                      (((int)(ADD_INIT+4*i+2) & 0xFF) <<  8)  |
+                                                      (((int)(ADD_INIT+4*i+1) & 0xFF) << 16)  |
+                                                      (((int)(ADD_INIT+4*i+0) & 0xFF) << 24));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4_swap(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4_swap: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack8(p8, a1);
+      for (int i=0; i<ARRLEN/8; i++) {
+        errn += verify("test_pack8: ", i, p8[i],  ((long)(ADD_INIT+8*i+0) & 0xFFl) |
+                                                 (((long)(ADD_INIT+8*i+1) & 0xFFl) <<  8)  |
+                                                 (((long)(ADD_INIT+8*i+2) & 0xFFl) << 16)  |
+                                                 (((long)(ADD_INIT+8*i+3) & 0xFFl) << 24)  |
+                                                 (((long)(ADD_INIT+8*i+4) & 0xFFl) << 32)  |
+                                                 (((long)(ADD_INIT+8*i+5) & 0xFFl) << 40)  |
+                                                 (((long)(ADD_INIT+8*i+6) & 0xFFl) << 48)  |
+                                                 (((long)(ADD_INIT+8*i+7) & 0xFFl) << 56));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack8(a0, p8);
+      for (int i=0; i<(ARRLEN&(-8)); i++) {
+        errn += verify("test_unpack8: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+      test_pack8_swap(p8, a1);
+      for (int i=0; i<ARRLEN/8; i++) {
+        errn += verify("test_pack8_swap: ", i, p8[i],  ((long)(ADD_INIT+8*i+7) & 0xFFl) |
+                                                      (((long)(ADD_INIT+8*i+6) & 0xFFl) <<  8)  |
+                                                      (((long)(ADD_INIT+8*i+5) & 0xFFl) << 16)  |
+                                                      (((long)(ADD_INIT+8*i+4) & 0xFFl) << 24)  |
+                                                      (((long)(ADD_INIT+8*i+3) & 0xFFl) << 32)  |
+                                                      (((long)(ADD_INIT+8*i+2) & 0xFFl) << 40)  |
+                                                      (((long)(ADD_INIT+8*i+1) & 0xFFl) << 48)  |
+                                                      (((long)(ADD_INIT+8*i+0) & 0xFFl) << 56));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack8_swap(a0, p8);
+      for (int i=0; i<(ARRLEN&(-8)); i++) {
+        errn += verify("test_unpack8_swap: ", i, a0[i], (byte)(ADD_INIT+i));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (byte)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (byte)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (byte)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (byte)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (byte)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (byte)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2_swap(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2_swap(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2_swap: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack4(p4, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack4: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack4(a0, p4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack4: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack4_swap(p4, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack4_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack4_swap(a0, p4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack4_swap: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack8(p8, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack8: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack8(a0, p8);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack8: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack8_swap(p8, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack8_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack8_swap(a0, p8);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack8_swap: " + (end - start));
+
+    return errn;
+  }
+
+  static int test_sum(byte[] a1) {
+    int sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+VALUE);
+    }
+  }
+  static void test_addv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+b);
+    }
+  }
+  static void test_adda(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]-VALUE);
+    }
+  }
+  static void test_subv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]-b);
+    }
+  }
+  static void test_suba(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*b);
+    }
+  }
+  static void test_mula(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/b);
+    }
+  }
+  static void test_diva(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]/a2[i]);
+    }
+  }
+
+  static void test_andc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]&BIT_MASK);
+    }
+  }
+  static void test_andv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]&b);
+    }
+  }
+  static void test_anda(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]&a2[i]);
+    }
+  }
+
+  static void test_orc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]|BIT_MASK);
+    }
+  }
+  static void test_orv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]|b);
+    }
+  }
+  static void test_ora(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]|a2[i]);
+    }
+  }
+
+  static void test_xorc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]^BIT_MASK);
+    }
+  }
+  static void test_xorv(byte[] a0, byte[] a1, byte b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]^b);
+    }
+  }
+  static void test_xora(byte[] a0, byte[] a1, byte[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]^a2[i]);
+    }
+  }
+
+  static void test_sllc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<VALUE);
+    }
+  }
+  static void test_sllc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<(-VALUE));
+    }
+  }
+  static void test_sllc_o(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<SHIFT);
+    }
+  }
+  static void test_sllc_on(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<(-SHIFT));
+    }
+  }
+  static void test_sllv(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<b);
+    }
+  }
+
+  static void test_srlc(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>VALUE);
+    }
+  }
+  static void test_srlc_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>(-VALUE));
+    }
+  }
+  static void test_srlc_o(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>SHIFT);
+    }
+  }
+  static void test_srlc_on(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>(-SHIFT));
+    }
+  }
+  static void test_srlv(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>>b);
+    }
+  }
+
+  static void test_srac(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>VALUE);
+    }
+  }
+  static void test_srac_n(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>(-VALUE));
+    }
+  }
+  static void test_srac_o(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>SHIFT);
+    }
+  }
+  static void test_srac_on(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>(-SHIFT));
+    }
+  }
+  static void test_srav(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]>>b);
+    }
+  }
+
+  static void test_pack2(short[] p2, byte[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l0 = (short)a1[i*2+0];
+      short l1 = (short)a1[i*2+1];
+      p2[i] = (short)((l1 << 8) | (l0 & 0xFF));
+    }
+  }
+  static void test_unpack2(byte[] a0, short[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l = p2[i];
+      a0[i*2+0] = (byte)(l & 0xFF);
+      a0[i*2+1] = (byte)(l >> 8);
+    }
+  }
+  static void test_pack2_swap(short[] p2, byte[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l0 = (short)a1[i*2+0];
+      short l1 = (short)a1[i*2+1];
+      p2[i] = (short)((l0 << 8) | (l1 & 0xFF));
+    }
+  }
+  static void test_unpack2_swap(byte[] a0, short[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      short l = p2[i];
+      a0[i*2+0] = (byte)(l >> 8);
+      a0[i*2+1] = (byte)(l & 0xFF);
+    }
+  }
+
+  static void test_pack4(int[] p4, byte[] a1) {
+    if (p4.length*4 > a1.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l0 = (int)a1[i*4+0];
+      int l1 = (int)a1[i*4+1];
+      int l2 = (int)a1[i*4+2];
+      int l3 = (int)a1[i*4+3];
+      p4[i] = (l0 & 0xFF) |
+             ((l1 & 0xFF) <<  8) |
+             ((l2 & 0xFF) << 16) |
+             ((l3 & 0xFF) << 24);
+    }
+  }
+  static void test_unpack4(byte[] a0, int[] p4) {
+    if (p4.length*4 > a0.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l = p4[i];
+      a0[i*4+0] = (byte)(l & 0xFF);
+      a0[i*4+1] = (byte)(l >>  8);
+      a0[i*4+2] = (byte)(l >> 16);
+      a0[i*4+3] = (byte)(l >> 24);
+    }
+  }
+  static void test_pack4_swap(int[] p4, byte[] a1) {
+    if (p4.length*4 > a1.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l0 = (int)a1[i*4+0];
+      int l1 = (int)a1[i*4+1];
+      int l2 = (int)a1[i*4+2];
+      int l3 = (int)a1[i*4+3];
+      p4[i] = (l3 & 0xFF) |
+             ((l2 & 0xFF) <<  8) |
+             ((l1 & 0xFF) << 16) |
+             ((l0 & 0xFF) << 24);
+    }
+  }
+  static void test_unpack4_swap(byte[] a0, int[] p4) {
+    if (p4.length*4 > a0.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      int l = p4[i];
+      a0[i*4+0] = (byte)(l >> 24);
+      a0[i*4+1] = (byte)(l >> 16);
+      a0[i*4+2] = (byte)(l >>  8);
+      a0[i*4+3] = (byte)(l & 0xFF);
+    }
+  }
+
+  static void test_pack8(long[] p8, byte[] a1) {
+    if (p8.length*8 > a1.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l0 = (long)a1[i*8+0];
+      long l1 = (long)a1[i*8+1];
+      long l2 = (long)a1[i*8+2];
+      long l3 = (long)a1[i*8+3];
+      long l4 = (long)a1[i*8+4];
+      long l5 = (long)a1[i*8+5];
+      long l6 = (long)a1[i*8+6];
+      long l7 = (long)a1[i*8+7];
+      p8[i] = (l0 & 0xFFl) |
+             ((l1 & 0xFFl) <<  8) |
+             ((l2 & 0xFFl) << 16) |
+             ((l3 & 0xFFl) << 24) |
+             ((l4 & 0xFFl) << 32) |
+             ((l5 & 0xFFl) << 40) |
+             ((l6 & 0xFFl) << 48) |
+             ((l7 & 0xFFl) << 56);
+    }
+  }
+  static void test_unpack8(byte[] a0, long[] p8) {
+    if (p8.length*8 > a0.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l = p8[i];
+      a0[i*8+0] = (byte)(l & 0xFFl);
+      a0[i*8+1] = (byte)(l >>  8);
+      a0[i*8+2] = (byte)(l >> 16);
+      a0[i*8+3] = (byte)(l >> 24);
+      a0[i*8+4] = (byte)(l >> 32);
+      a0[i*8+5] = (byte)(l >> 40);
+      a0[i*8+6] = (byte)(l >> 48);
+      a0[i*8+7] = (byte)(l >> 56);
+    }
+  }
+  static void test_pack8_swap(long[] p8, byte[] a1) {
+    if (p8.length*8 > a1.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l0 = (long)a1[i*8+0];
+      long l1 = (long)a1[i*8+1];
+      long l2 = (long)a1[i*8+2];
+      long l3 = (long)a1[i*8+3];
+      long l4 = (long)a1[i*8+4];
+      long l5 = (long)a1[i*8+5];
+      long l6 = (long)a1[i*8+6];
+      long l7 = (long)a1[i*8+7];
+      p8[i] = (l7 & 0xFFl) |
+             ((l6 & 0xFFl) <<  8) |
+             ((l5 & 0xFFl) << 16) |
+             ((l4 & 0xFFl) << 24) |
+             ((l3 & 0xFFl) << 32) |
+             ((l2 & 0xFFl) << 40) |
+             ((l1 & 0xFFl) << 48) |
+             ((l0 & 0xFFl) << 56);
+    }
+  }
+  static void test_unpack8_swap(byte[] a0, long[] p8) {
+    if (p8.length*8 > a0.length) return;
+    for (int i = 0; i < p8.length; i+=1) {
+      long l = p8[i];
+      a0[i*8+0] = (byte)(l >> 56);
+      a0[i*8+1] = (byte)(l >> 48);
+      a0[i*8+2] = (byte)(l >> 40);
+      a0[i*8+3] = (byte)(l >> 32);
+      a0[i*8+4] = (byte)(l >> 24);
+      a0[i*8+5] = (byte)(l >> 16);
+      a0[i*8+6] = (byte)(l >>  8);
+      a0[i*8+7] = (byte)(l & 0xFFl);
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + Integer.toHexString(elem) + " != " + Integer.toHexString(val));
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val));
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/6340864/TestDoubleVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestDoubleVect
+ */
+
+public class TestDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final double ADD_INIT = -7500.;
+  private static final double VALUE = 15.;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Double vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    double[] a0 = new double[ARRLEN];
+    double[] a1 = new double[ARRLEN];
+    double[] a2 = new double[ARRLEN];
+    double[] a3 = new double[ARRLEN];
+    // Initialize
+    double gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      double val = ADD_INIT+(double)i;
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = VALUE;
+      a3[i] = -VALUE;
+    }
+
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, -VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, -VALUE);
+      test_diva(a0, a1, a3);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      double sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+      // Overwrite with NaN values
+      a1[0] = Double.NaN;
+      a1[1] = Double.POSITIVE_INFINITY;
+      a1[2] = Double.NEGATIVE_INFINITY;
+      a1[3] = Double.MAX_VALUE;
+      a1[4] = Double.MIN_VALUE;
+      a1[5] = Double.MIN_NORMAL;
+
+      a2[6] = a1[0];
+      a2[7] = a1[1];
+      a2[8] = a1[2];
+      a2[9] = a1[3];
+      a2[10] = a1[4];
+      a2[11] = a1[5];
+
+      a3[6] = -a2[6];
+      a3[7] = -a2[7];
+      a3[8] = -a2[8];
+      a3[9] = -a2[9];
+      a3[10] = -a2[10];
+      a3[11] = -a2[11];
+
+      test_addc(a0, a1);
+      errn += verify("test_addc: ", 0, a0[0], (Double.NaN+VALUE));
+      errn += verify("test_addc: ", 1, a0[1], (Double.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 2, a0[2], (Double.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 3, a0[3], (Double.MAX_VALUE+VALUE));
+      errn += verify("test_addc: ", 4, a0[4], (Double.MIN_VALUE+VALUE));
+      errn += verify("test_addc: ", 5, a0[5], (Double.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, VALUE);
+      errn += verify("test_addv: ", 0, a0[0], (Double.NaN+VALUE));
+      errn += verify("test_addv: ", 1, a0[1], (Double.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 2, a0[2], (Double.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 3, a0[3], (Double.MAX_VALUE+VALUE));
+      errn += verify("test_addv: ", 4, a0[4], (Double.MIN_VALUE+VALUE));
+      errn += verify("test_addv: ", 5, a0[5], (Double.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      errn += verify("test_adda: ", 0, a0[0], (Double.NaN+VALUE));
+      errn += verify("test_adda: ", 1, a0[1], (Double.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 2, a0[2], (Double.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 3, a0[3], (Double.MAX_VALUE+VALUE));
+      errn += verify("test_adda: ", 4, a0[4], (Double.MIN_VALUE+VALUE));
+      errn += verify("test_adda: ", 5, a0[5], (Double.MIN_NORMAL+VALUE));
+      errn += verify("test_adda: ", 6, a0[6], ((ADD_INIT+6)+Double.NaN));
+      errn += verify("test_adda: ", 7, a0[7], ((ADD_INIT+7)+Double.POSITIVE_INFINITY));
+      errn += verify("test_adda: ", 8, a0[8], ((ADD_INIT+8)+Double.NEGATIVE_INFINITY));
+      errn += verify("test_adda: ", 9, a0[9], ((ADD_INIT+9)+Double.MAX_VALUE));
+      errn += verify("test_adda: ", 10, a0[10], ((ADD_INIT+10)+Double.MIN_VALUE));
+      errn += verify("test_adda: ", 11, a0[11], ((ADD_INIT+11)+Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      errn += verify("test_subc: ", 0, a0[0], (Double.NaN-VALUE));
+      errn += verify("test_subc: ", 1, a0[1], (Double.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 2, a0[2], (Double.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 3, a0[3], (Double.MAX_VALUE-VALUE));
+      errn += verify("test_subc: ", 4, a0[4], (Double.MIN_VALUE-VALUE));
+      errn += verify("test_subc: ", 5, a0[5], (Double.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, VALUE);
+      errn += verify("test_subv: ", 0, a0[0], (Double.NaN-VALUE));
+      errn += verify("test_subv: ", 1, a0[1], (Double.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 2, a0[2], (Double.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 3, a0[3], (Double.MAX_VALUE-VALUE));
+      errn += verify("test_subv: ", 4, a0[4], (Double.MIN_VALUE-VALUE));
+      errn += verify("test_subv: ", 5, a0[5], (Double.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      errn += verify("test_suba: ", 0, a0[0], (Double.NaN-VALUE));
+      errn += verify("test_suba: ", 1, a0[1], (Double.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 2, a0[2], (Double.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 3, a0[3], (Double.MAX_VALUE-VALUE));
+      errn += verify("test_suba: ", 4, a0[4], (Double.MIN_VALUE-VALUE));
+      errn += verify("test_suba: ", 5, a0[5], (Double.MIN_NORMAL-VALUE));
+      errn += verify("test_suba: ", 6, a0[6], ((ADD_INIT+6)-Double.NaN));
+      errn += verify("test_suba: ", 7, a0[7], ((ADD_INIT+7)-Double.POSITIVE_INFINITY));
+      errn += verify("test_suba: ", 8, a0[8], ((ADD_INIT+8)-Double.NEGATIVE_INFINITY));
+      errn += verify("test_suba: ", 9, a0[9], ((ADD_INIT+9)-Double.MAX_VALUE));
+      errn += verify("test_suba: ", 10, a0[10], ((ADD_INIT+10)-Double.MIN_VALUE));
+      errn += verify("test_suba: ", 11, a0[11], ((ADD_INIT+11)-Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      errn += verify("test_mulc: ", 0, a0[0], (Double.NaN*VALUE));
+      errn += verify("test_mulc: ", 1, a0[1], (Double.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 2, a0[2], (Double.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 3, a0[3], (Double.MAX_VALUE*VALUE));
+      errn += verify("test_mulc: ", 4, a0[4], (Double.MIN_VALUE*VALUE));
+      errn += verify("test_mulc: ", 5, a0[5], (Double.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, VALUE);
+      errn += verify("test_mulv: ", 0, a0[0], (Double.NaN*VALUE));
+      errn += verify("test_mulv: ", 1, a0[1], (Double.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 2, a0[2], (Double.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 3, a0[3], (Double.MAX_VALUE*VALUE));
+      errn += verify("test_mulv: ", 4, a0[4], (Double.MIN_VALUE*VALUE));
+      errn += verify("test_mulv: ", 5, a0[5], (Double.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      errn += verify("test_mula: ", 0, a0[0], (Double.NaN*VALUE));
+      errn += verify("test_mula: ", 1, a0[1], (Double.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 2, a0[2], (Double.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 3, a0[3], (Double.MAX_VALUE*VALUE));
+      errn += verify("test_mula: ", 4, a0[4], (Double.MIN_VALUE*VALUE));
+      errn += verify("test_mula: ", 5, a0[5], (Double.MIN_NORMAL*VALUE));
+      errn += verify("test_mula: ", 6, a0[6], ((ADD_INIT+6)*Double.NaN));
+      errn += verify("test_mula: ", 7, a0[7], ((ADD_INIT+7)*Double.POSITIVE_INFINITY));
+      errn += verify("test_mula: ", 8, a0[8], ((ADD_INIT+8)*Double.NEGATIVE_INFINITY));
+      errn += verify("test_mula: ", 9, a0[9], ((ADD_INIT+9)*Double.MAX_VALUE));
+      errn += verify("test_mula: ", 10, a0[10], ((ADD_INIT+10)*Double.MIN_VALUE));
+      errn += verify("test_mula: ", 11, a0[11], ((ADD_INIT+11)*Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      errn += verify("test_divc: ", 0, a0[0], (Double.NaN/VALUE));
+      errn += verify("test_divc: ", 1, a0[1], (Double.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 2, a0[2], (Double.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 3, a0[3], (Double.MAX_VALUE/VALUE));
+      errn += verify("test_divc: ", 4, a0[4], (Double.MIN_VALUE/VALUE));
+      errn += verify("test_divc: ", 5, a0[5], (Double.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, VALUE);
+      errn += verify("test_divv: ", 0, a0[0], (Double.NaN/VALUE));
+      errn += verify("test_divv: ", 1, a0[1], (Double.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 2, a0[2], (Double.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 3, a0[3], (Double.MAX_VALUE/VALUE));
+      errn += verify("test_divv: ", 4, a0[4], (Double.MIN_VALUE/VALUE));
+      errn += verify("test_divv: ", 5, a0[5], (Double.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      errn += verify("test_diva: ", 0, a0[0], (Double.NaN/VALUE));
+      errn += verify("test_diva: ", 1, a0[1], (Double.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 2, a0[2], (Double.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 3, a0[3], (Double.MAX_VALUE/VALUE));
+      errn += verify("test_diva: ", 4, a0[4], (Double.MIN_VALUE/VALUE));
+      errn += verify("test_diva: ", 5, a0[5], (Double.MIN_NORMAL/VALUE));
+      errn += verify("test_diva: ", 6, a0[6], ((ADD_INIT+6)/Double.NaN));
+      errn += verify("test_diva: ", 7, a0[7], ((ADD_INIT+7)/Double.POSITIVE_INFINITY));
+      errn += verify("test_diva: ", 8, a0[8], ((ADD_INIT+8)/Double.NEGATIVE_INFINITY));
+      errn += verify("test_diva: ", 9, a0[9], ((ADD_INIT+9)/Double.MAX_VALUE));
+      errn += verify("test_diva: ", 10, a0[10], ((ADD_INIT+10)/Double.MIN_VALUE));
+      errn += verify("test_diva: ", 11, a0[11], ((ADD_INIT+11)/Double.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      errn += verify("test_mulc_n: ", 0, a0[0], (Double.NaN*(-VALUE)));
+      errn += verify("test_mulc_n: ", 1, a0[1], (Double.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 3, a0[3], (Double.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 4, a0[4], (Double.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 5, a0[5], (Double.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, -VALUE);
+      errn += verify("test_mulv_n: ", 0, a0[0], (Double.NaN*(-VALUE)));
+      errn += verify("test_mulv_n: ", 1, a0[1], (Double.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 3, a0[3], (Double.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 4, a0[4], (Double.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 5, a0[5], (Double.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      errn += verify("test_mula_n: ", 0, a0[0], (Double.NaN*(-VALUE)));
+      errn += verify("test_mula_n: ", 1, a0[1], (Double.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 3, a0[3], (Double.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 4, a0[4], (Double.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 5, a0[5], (Double.MIN_NORMAL*(-VALUE)));
+      errn += verify("test_mula_n: ", 6, a0[6], ((ADD_INIT+6)*(-Double.NaN)));
+      errn += verify("test_mula_n: ", 7, a0[7], ((ADD_INIT+7)*(-Double.POSITIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 8, a0[8], ((ADD_INIT+8)*(-Double.NEGATIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 9, a0[9], ((ADD_INIT+9)*(-Double.MAX_VALUE)));
+      errn += verify("test_mula_n: ", 10, a0[10], ((ADD_INIT+10)*(-Double.MIN_VALUE)));
+      errn += verify("test_mula_n: ", 11, a0[11], ((ADD_INIT+11)*(-Double.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      errn += verify("test_divc_n: ", 0, a0[0], (Double.NaN/(-VALUE)));
+      errn += verify("test_divc_n: ", 1, a0[1], (Double.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 3, a0[3], (Double.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 4, a0[4], (Double.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 5, a0[5], (Double.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, -VALUE);
+      errn += verify("test_divv_n: ", 0, a0[0], (Double.NaN/(-VALUE)));
+      errn += verify("test_divv_n: ", 1, a0[1], (Double.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 3, a0[3], (Double.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 4, a0[4], (Double.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 5, a0[5], (Double.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      errn += verify("test_diva_n: ", 0, a0[0], (Double.NaN/(-VALUE)));
+      errn += verify("test_diva_n: ", 1, a0[1], (Double.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 2, a0[2], (Double.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 3, a0[3], (Double.MAX_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 4, a0[4], (Double.MIN_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 5, a0[5], (Double.MIN_NORMAL/(-VALUE)));
+      errn += verify("test_diva_n: ", 6, a0[6], ((ADD_INIT+6)/(-Double.NaN)));
+      errn += verify("test_diva_n: ", 7, a0[7], ((ADD_INIT+7)/(-Double.POSITIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 8, a0[8], ((ADD_INIT+8)/(-Double.NEGATIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 9, a0[9], ((ADD_INIT+9)/(-Double.MAX_VALUE)));
+      errn += verify("test_diva_n: ", 10, a0[10], ((ADD_INIT+10)/(-Double.MIN_VALUE)));
+      errn += verify("test_diva_n: ", 11, a0[11], ((ADD_INIT+11)/(-Double.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    return errn;
+  }
+
+  static double test_sum(double[] a1) {
+    double sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+VALUE);
+    }
+  }
+  static void test_addv(double[] a0, double[] a1, double b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+b);
+    }
+  }
+  static void test_adda(double[] a0, double[] a1, double[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-VALUE);
+    }
+  }
+  static void test_subv(double[] a0, double[] a1, double b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-b);
+    }
+  }
+  static void test_suba(double[] a0, double[] a1, double[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(double[] a0, double[] a1, double b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*b);
+    }
+  }
+  static void test_mula(double[] a0, double[] a1, double[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(double[] a0, double[] a1, double b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/b);
+    }
+  }
+  static void test_diva(double[] a0, double[] a1, double[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/a2[i]);
+    }
+  }
+
+  static int verify(String text, int i, double elem, double val) {
+    if (elem != val && !(Double.isNaN(elem) && Double.isNaN(val))) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/6340864/TestFloatVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestFloatVect
+ */
+
+public class TestFloatVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final float ADD_INIT = -7500.f;
+  private static final float VALUE = 15.f;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Float vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    float[] a0 = new float[ARRLEN];
+    float[] a1 = new float[ARRLEN];
+    float[] a2 = new float[ARRLEN];
+    float[] a3 = new float[ARRLEN];
+    // Initialize
+    float gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      float val = ADD_INIT+(float)i;
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = VALUE;
+      a3[i] = -VALUE;
+    }
+
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, -VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, -VALUE);
+      test_diva(a0, a1, a3);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      float sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+      // Overwrite with NaN values
+      a1[0] = Float.NaN;
+      a1[1] = Float.POSITIVE_INFINITY;
+      a1[2] = Float.NEGATIVE_INFINITY;
+      a1[3] = Float.MAX_VALUE;
+      a1[4] = Float.MIN_VALUE;
+      a1[5] = Float.MIN_NORMAL;
+
+      a2[6] = a1[0];
+      a2[7] = a1[1];
+      a2[8] = a1[2];
+      a2[9] = a1[3];
+      a2[10] = a1[4];
+      a2[11] = a1[5];
+
+      a3[6] = -a2[6];
+      a3[7] = -a2[7];
+      a3[8] = -a2[8];
+      a3[9] = -a2[9];
+      a3[10] = -a2[10];
+      a3[11] = -a2[11];
+
+      test_addc(a0, a1);
+      errn += verify("test_addc: ", 0, a0[0], (Float.NaN+VALUE));
+      errn += verify("test_addc: ", 1, a0[1], (Float.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 2, a0[2], (Float.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addc: ", 3, a0[3], (Float.MAX_VALUE+VALUE));
+      errn += verify("test_addc: ", 4, a0[4], (Float.MIN_VALUE+VALUE));
+      errn += verify("test_addc: ", 5, a0[5], (Float.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, VALUE);
+      errn += verify("test_addv: ", 0, a0[0], (Float.NaN+VALUE));
+      errn += verify("test_addv: ", 1, a0[1], (Float.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 2, a0[2], (Float.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_addv: ", 3, a0[3], (Float.MAX_VALUE+VALUE));
+      errn += verify("test_addv: ", 4, a0[4], (Float.MIN_VALUE+VALUE));
+      errn += verify("test_addv: ", 5, a0[5], (Float.MIN_NORMAL+VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      errn += verify("test_adda: ", 0, a0[0], (Float.NaN+VALUE));
+      errn += verify("test_adda: ", 1, a0[1], (Float.POSITIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 2, a0[2], (Float.NEGATIVE_INFINITY+VALUE));
+      errn += verify("test_adda: ", 3, a0[3], (Float.MAX_VALUE+VALUE));
+      errn += verify("test_adda: ", 4, a0[4], (Float.MIN_VALUE+VALUE));
+      errn += verify("test_adda: ", 5, a0[5], (Float.MIN_NORMAL+VALUE));
+      errn += verify("test_adda: ", 6, a0[6], ((ADD_INIT+6)+Float.NaN));
+      errn += verify("test_adda: ", 7, a0[7], ((ADD_INIT+7)+Float.POSITIVE_INFINITY));
+      errn += verify("test_adda: ", 8, a0[8], ((ADD_INIT+8)+Float.NEGATIVE_INFINITY));
+      errn += verify("test_adda: ", 9, a0[9], ((ADD_INIT+9)+Float.MAX_VALUE));
+      errn += verify("test_adda: ", 10, a0[10], ((ADD_INIT+10)+Float.MIN_VALUE));
+      errn += verify("test_adda: ", 11, a0[11], ((ADD_INIT+11)+Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], ((ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      errn += verify("test_subc: ", 0, a0[0], (Float.NaN-VALUE));
+      errn += verify("test_subc: ", 1, a0[1], (Float.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 2, a0[2], (Float.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subc: ", 3, a0[3], (Float.MAX_VALUE-VALUE));
+      errn += verify("test_subc: ", 4, a0[4], (Float.MIN_VALUE-VALUE));
+      errn += verify("test_subc: ", 5, a0[5], (Float.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, VALUE);
+      errn += verify("test_subv: ", 0, a0[0], (Float.NaN-VALUE));
+      errn += verify("test_subv: ", 1, a0[1], (Float.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 2, a0[2], (Float.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_subv: ", 3, a0[3], (Float.MAX_VALUE-VALUE));
+      errn += verify("test_subv: ", 4, a0[4], (Float.MIN_VALUE-VALUE));
+      errn += verify("test_subv: ", 5, a0[5], (Float.MIN_NORMAL-VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      errn += verify("test_suba: ", 0, a0[0], (Float.NaN-VALUE));
+      errn += verify("test_suba: ", 1, a0[1], (Float.POSITIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 2, a0[2], (Float.NEGATIVE_INFINITY-VALUE));
+      errn += verify("test_suba: ", 3, a0[3], (Float.MAX_VALUE-VALUE));
+      errn += verify("test_suba: ", 4, a0[4], (Float.MIN_VALUE-VALUE));
+      errn += verify("test_suba: ", 5, a0[5], (Float.MIN_NORMAL-VALUE));
+      errn += verify("test_suba: ", 6, a0[6], ((ADD_INIT+6)-Float.NaN));
+      errn += verify("test_suba: ", 7, a0[7], ((ADD_INIT+7)-Float.POSITIVE_INFINITY));
+      errn += verify("test_suba: ", 8, a0[8], ((ADD_INIT+8)-Float.NEGATIVE_INFINITY));
+      errn += verify("test_suba: ", 9, a0[9], ((ADD_INIT+9)-Float.MAX_VALUE));
+      errn += verify("test_suba: ", 10, a0[10], ((ADD_INIT+10)-Float.MIN_VALUE));
+      errn += verify("test_suba: ", 11, a0[11], ((ADD_INIT+11)-Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], ((ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      errn += verify("test_mulc: ", 0, a0[0], (Float.NaN*VALUE));
+      errn += verify("test_mulc: ", 1, a0[1], (Float.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 2, a0[2], (Float.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulc: ", 3, a0[3], (Float.MAX_VALUE*VALUE));
+      errn += verify("test_mulc: ", 4, a0[4], (Float.MIN_VALUE*VALUE));
+      errn += verify("test_mulc: ", 5, a0[5], (Float.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, VALUE);
+      errn += verify("test_mulv: ", 0, a0[0], (Float.NaN*VALUE));
+      errn += verify("test_mulv: ", 1, a0[1], (Float.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 2, a0[2], (Float.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mulv: ", 3, a0[3], (Float.MAX_VALUE*VALUE));
+      errn += verify("test_mulv: ", 4, a0[4], (Float.MIN_VALUE*VALUE));
+      errn += verify("test_mulv: ", 5, a0[5], (Float.MIN_NORMAL*VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      errn += verify("test_mula: ", 0, a0[0], (Float.NaN*VALUE));
+      errn += verify("test_mula: ", 1, a0[1], (Float.POSITIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 2, a0[2], (Float.NEGATIVE_INFINITY*VALUE));
+      errn += verify("test_mula: ", 3, a0[3], (Float.MAX_VALUE*VALUE));
+      errn += verify("test_mula: ", 4, a0[4], (Float.MIN_VALUE*VALUE));
+      errn += verify("test_mula: ", 5, a0[5], (Float.MIN_NORMAL*VALUE));
+      errn += verify("test_mula: ", 6, a0[6], ((ADD_INIT+6)*Float.NaN));
+      errn += verify("test_mula: ", 7, a0[7], ((ADD_INIT+7)*Float.POSITIVE_INFINITY));
+      errn += verify("test_mula: ", 8, a0[8], ((ADD_INIT+8)*Float.NEGATIVE_INFINITY));
+      errn += verify("test_mula: ", 9, a0[9], ((ADD_INIT+9)*Float.MAX_VALUE));
+      errn += verify("test_mula: ", 10, a0[10], ((ADD_INIT+10)*Float.MIN_VALUE));
+      errn += verify("test_mula: ", 11, a0[11], ((ADD_INIT+11)*Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], ((ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      errn += verify("test_divc: ", 0, a0[0], (Float.NaN/VALUE));
+      errn += verify("test_divc: ", 1, a0[1], (Float.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 2, a0[2], (Float.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divc: ", 3, a0[3], (Float.MAX_VALUE/VALUE));
+      errn += verify("test_divc: ", 4, a0[4], (Float.MIN_VALUE/VALUE));
+      errn += verify("test_divc: ", 5, a0[5], (Float.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, VALUE);
+      errn += verify("test_divv: ", 0, a0[0], (Float.NaN/VALUE));
+      errn += verify("test_divv: ", 1, a0[1], (Float.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 2, a0[2], (Float.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_divv: ", 3, a0[3], (Float.MAX_VALUE/VALUE));
+      errn += verify("test_divv: ", 4, a0[4], (Float.MIN_VALUE/VALUE));
+      errn += verify("test_divv: ", 5, a0[5], (Float.MIN_NORMAL/VALUE));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      errn += verify("test_diva: ", 0, a0[0], (Float.NaN/VALUE));
+      errn += verify("test_diva: ", 1, a0[1], (Float.POSITIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 2, a0[2], (Float.NEGATIVE_INFINITY/VALUE));
+      errn += verify("test_diva: ", 3, a0[3], (Float.MAX_VALUE/VALUE));
+      errn += verify("test_diva: ", 4, a0[4], (Float.MIN_VALUE/VALUE));
+      errn += verify("test_diva: ", 5, a0[5], (Float.MIN_NORMAL/VALUE));
+      errn += verify("test_diva: ", 6, a0[6], ((ADD_INIT+6)/Float.NaN));
+      errn += verify("test_diva: ", 7, a0[7], ((ADD_INIT+7)/Float.POSITIVE_INFINITY));
+      errn += verify("test_diva: ", 8, a0[8], ((ADD_INIT+8)/Float.NEGATIVE_INFINITY));
+      errn += verify("test_diva: ", 9, a0[9], ((ADD_INIT+9)/Float.MAX_VALUE));
+      errn += verify("test_diva: ", 10, a0[10], ((ADD_INIT+10)/Float.MIN_VALUE));
+      errn += verify("test_diva: ", 11, a0[11], ((ADD_INIT+11)/Float.MIN_NORMAL));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], ((ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      errn += verify("test_mulc_n: ", 0, a0[0], (Float.NaN*(-VALUE)));
+      errn += verify("test_mulc_n: ", 1, a0[1], (Float.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulc_n: ", 3, a0[3], (Float.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 4, a0[4], (Float.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulc_n: ", 5, a0[5], (Float.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, -VALUE);
+      errn += verify("test_mulv_n: ", 0, a0[0], (Float.NaN*(-VALUE)));
+      errn += verify("test_mulv_n: ", 1, a0[1], (Float.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mulv_n: ", 3, a0[3], (Float.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 4, a0[4], (Float.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mulv_n: ", 5, a0[5], (Float.MIN_NORMAL*(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      errn += verify("test_mula_n: ", 0, a0[0], (Float.NaN*(-VALUE)));
+      errn += verify("test_mula_n: ", 1, a0[1], (Float.POSITIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY*(-VALUE)));
+      errn += verify("test_mula_n: ", 3, a0[3], (Float.MAX_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 4, a0[4], (Float.MIN_VALUE*(-VALUE)));
+      errn += verify("test_mula_n: ", 5, a0[5], (Float.MIN_NORMAL*(-VALUE)));
+      errn += verify("test_mula_n: ", 6, a0[6], ((ADD_INIT+6)*(-Float.NaN)));
+      errn += verify("test_mula_n: ", 7, a0[7], ((ADD_INIT+7)*(-Float.POSITIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 8, a0[8], ((ADD_INIT+8)*(-Float.NEGATIVE_INFINITY)));
+      errn += verify("test_mula_n: ", 9, a0[9], ((ADD_INIT+9)*(-Float.MAX_VALUE)));
+      errn += verify("test_mula_n: ", 10, a0[10], ((ADD_INIT+10)*(-Float.MIN_VALUE)));
+      errn += verify("test_mula_n: ", 11, a0[11], ((ADD_INIT+11)*(-Float.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], ((ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      errn += verify("test_divc_n: ", 0, a0[0], (Float.NaN/(-VALUE)));
+      errn += verify("test_divc_n: ", 1, a0[1], (Float.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divc_n: ", 3, a0[3], (Float.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 4, a0[4], (Float.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divc_n: ", 5, a0[5], (Float.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, -VALUE);
+      errn += verify("test_divv_n: ", 0, a0[0], (Float.NaN/(-VALUE)));
+      errn += verify("test_divv_n: ", 1, a0[1], (Float.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_divv_n: ", 3, a0[3], (Float.MAX_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 4, a0[4], (Float.MIN_VALUE/(-VALUE)));
+      errn += verify("test_divv_n: ", 5, a0[5], (Float.MIN_NORMAL/(-VALUE)));
+      for (int i=6; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      errn += verify("test_diva_n: ", 0, a0[0], (Float.NaN/(-VALUE)));
+      errn += verify("test_diva_n: ", 1, a0[1], (Float.POSITIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 2, a0[2], (Float.NEGATIVE_INFINITY/(-VALUE)));
+      errn += verify("test_diva_n: ", 3, a0[3], (Float.MAX_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 4, a0[4], (Float.MIN_VALUE/(-VALUE)));
+      errn += verify("test_diva_n: ", 5, a0[5], (Float.MIN_NORMAL/(-VALUE)));
+      errn += verify("test_diva_n: ", 6, a0[6], ((ADD_INIT+6)/(-Float.NaN)));
+      errn += verify("test_diva_n: ", 7, a0[7], ((ADD_INIT+7)/(-Float.POSITIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 8, a0[8], ((ADD_INIT+8)/(-Float.NEGATIVE_INFINITY)));
+      errn += verify("test_diva_n: ", 9, a0[9], ((ADD_INIT+9)/(-Float.MAX_VALUE)));
+      errn += verify("test_diva_n: ", 10, a0[10], ((ADD_INIT+10)/(-Float.MIN_VALUE)));
+      errn += verify("test_diva_n: ", 11, a0[11], ((ADD_INIT+11)/(-Float.MIN_NORMAL)));
+      for (int i=12; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    return errn;
+  }
+
+  static float test_sum(float[] a1) {
+    float sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+VALUE);
+    }
+  }
+  static void test_addv(float[] a0, float[] a1, float b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+b);
+    }
+  }
+  static void test_adda(float[] a0, float[] a1, float[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-VALUE);
+    }
+  }
+  static void test_subv(float[] a0, float[] a1, float b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-b);
+    }
+  }
+  static void test_suba(float[] a0, float[] a1, float[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(float[] a0, float[] a1, float b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*b);
+    }
+  }
+  static void test_mula(float[] a0, float[] a1, float[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(float[] a0, float[] a1, float b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/b);
+    }
+  }
+  static void test_diva(float[] a0, float[] a1, float[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (a1[i]/a2[i]);
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) {
+    if (elem != val && !(Float.isNaN(elem) && Float.isNaN(val))) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/6340864/TestIntVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,1012 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestIntVect
+ */
+
+public class TestIntVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int ADD_INIT = Integer.MAX_VALUE-500;
+  private static final int BIT_MASK = 0xEC80F731;
+  private static final int VALUE = 15;
+  private static final int SHIFT = 32;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Integer vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    int[] a0 = new int[ARRLEN];
+    int[] a1 = new int[ARRLEN];
+    int[] a2 = new int[ARRLEN];
+    int[] a3 = new int[ARRLEN];
+    int[] a4 = new int[ARRLEN];
+    long[] p2 = new long[ARRLEN/2];
+    // Initialize
+    int gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      int val = (int)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (int)VALUE;
+      a3[i] = (int)-VALUE;
+      a4[i] = (int)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (int)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (int)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (int)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (int)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (int)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (int)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (int)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (int)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (int)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+      test_pack2(p2, a1);
+      test_unpack2(a0, p2);
+      test_pack2_swap(p2, a1);
+      test_unpack2_swap(a0, p2);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      int sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (int)((int)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (int)((int)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (int)((int)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (int)((int)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (int)((int)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (int)((int)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (int)((int)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (int)((int)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (int)((int)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (int)((int)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (int)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (int)((int)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (int)((int)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (int)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (int)((int)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (int)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (int)((int)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (int)((int)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (int)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (int)((int)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (int)((int)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (int)((int)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (int)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (int)((int)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (int)((int)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (int)((int)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (int)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (int)((int)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (int)((int)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (int)((int)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (int)((int)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (int)((int)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (int)((int)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (int)((int)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (int)((int)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (int)((int)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (int)((int)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+      test_pack2(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2: ", i, p2[i], ((long)(ADD_INIT+2*i) & 0xFFFFFFFFl) | ((long)(ADD_INIT+2*i+1) << 32));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2: ", i, a0[i], (ADD_INIT+i));
+      }
+
+      test_pack2_swap(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2_swap: ", i, p2[i], ((long)(ADD_INIT+2*i+1) & 0xFFFFFFFFl) | ((long)(ADD_INIT+2*i) << 32));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2_swap(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2_swap: ", i, a0[i], (ADD_INIT+i));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (int)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (int)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (int)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (int)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (int)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (int)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2_swap(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2_swap(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2_swap: " + (end - start));
+
+    return errn;
+  }
+
+  static int test_sum(int[] a1) {
+    int sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]+VALUE);
+    }
+  }
+  static void test_addv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]+b);
+    }
+  }
+  static void test_adda(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]-VALUE);
+    }
+  }
+  static void test_subv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]-b);
+    }
+  }
+  static void test_suba(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*b);
+    }
+  }
+  static void test_mula(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/b);
+    }
+  }
+  static void test_diva(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]/a2[i]);
+    }
+  }
+
+  static void test_andc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]&BIT_MASK);
+    }
+  }
+  static void test_andv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]&b);
+    }
+  }
+  static void test_anda(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]&a2[i]);
+    }
+  }
+
+  static void test_orc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]|BIT_MASK);
+    }
+  }
+  static void test_orv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]|b);
+    }
+  }
+  static void test_ora(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]|a2[i]);
+    }
+  }
+
+  static void test_xorc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]^BIT_MASK);
+    }
+  }
+  static void test_xorv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]^b);
+    }
+  }
+  static void test_xora(int[] a0, int[] a1, int[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]^a2[i]);
+    }
+  }
+
+  static void test_sllc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<VALUE);
+    }
+  }
+  static void test_sllc_n(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<(-VALUE));
+    }
+  }
+  static void test_sllc_o(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<SHIFT);
+    }
+  }
+  static void test_sllc_on(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<(-SHIFT));
+    }
+  }
+  static void test_sllv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]<<b);
+    }
+  }
+
+  static void test_srlc(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>VALUE);
+    }
+  }
+  static void test_srlc_n(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>(-VALUE));
+    }
+  }
+  static void test_srlc_o(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>SHIFT);
+    }
+  }
+  static void test_srlc_on(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>(-SHIFT));
+    }
+  }
+  static void test_srlv(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>>b);
+    }
+  }
+
+  static void test_srac(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>VALUE);
+    }
+  }
+  static void test_srac_n(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>(-VALUE));
+    }
+  }
+  static void test_srac_o(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>SHIFT);
+    }
+  }
+  static void test_srac_on(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>(-SHIFT));
+    }
+  }
+  static void test_srav(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)(a1[i]>>b);
+    }
+  }
+
+  static void test_pack2(long[] p2, int[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      long l0 = (long)a1[i*2+0];
+      long l1 = (long)a1[i*2+1];
+      p2[i] = (l1 << 32) | (l0 & 0xFFFFFFFFl);
+    }
+  }
+  static void test_unpack2(int[] a0, long[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      long l = p2[i];
+      a0[i*2+0] = (int)(l & 0xFFFFFFFFl);
+      a0[i*2+1] = (int)(l >> 32);
+    }
+  }
+  static void test_pack2_swap(long[] p2, int[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      long l0 = (long)a1[i*2+0];
+      long l1 = (long)a1[i*2+1];
+      p2[i] = (l0 << 32) | (l1 & 0xFFFFFFFFl);
+    }
+  }
+  static void test_unpack2_swap(int[] a0, long[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      long l = p2[i];
+      a0[i*2+0] = (int)(l >> 32);
+      a0[i*2+1] = (int)(l & 0xFFFFFFFFl);
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val));
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/6340864/TestLongVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestLongVect
+ */
+
+public class TestLongVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final long ADD_INIT = Long.MAX_VALUE-500;
+  private static final long BIT_MASK = 0xEC80F731EC80F731L;
+  private static final int VALUE = 31;
+  private static final int SHIFT = 64;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Long vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    long[] a0 = new long[ARRLEN];
+    long[] a1 = new long[ARRLEN];
+    long[] a2 = new long[ARRLEN];
+    long[] a3 = new long[ARRLEN];
+    long[] a4 = new long[ARRLEN];
+    // Initialize
+    long gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      long val = (long)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (long)VALUE;
+      a3[i] = (long)-VALUE;
+      a4[i] = (long)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (long)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (long)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (long)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (long)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (long)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (long)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (long)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (long)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (long)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      long sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (long)((long)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (long)((long)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (long)((long)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (long)((long)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (long)((long)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (long)((long)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (long)((long)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (long)((long)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (long)((long)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (long)((long)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (long)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (long)((long)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (long)((long)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (long)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (long)((long)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (long)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (long)((long)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (long)((long)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (long)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (long)((long)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (long)((long)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (long)((long)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (long)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (long)((long)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (long)((long)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (long)((long)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (long)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (long)((long)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (long)((long)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (long)((long)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (long)((long)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (long)((long)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (long)((long)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (long)((long)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (long)((long)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (long)((long)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (long)((long)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (long)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (long)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (long)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (long)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (long)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (long)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    return errn;
+  }
+
+  static long test_sum(long[] a1) {
+    long sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]+VALUE);
+    }
+  }
+  static void test_addv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]+b);
+    }
+  }
+  static void test_adda(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]-VALUE);
+    }
+  }
+  static void test_subv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]-b);
+    }
+  }
+  static void test_suba(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]*b);
+    }
+  }
+  static void test_mula(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]/b);
+    }
+  }
+  static void test_diva(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]/a2[i]);
+    }
+  }
+
+  static void test_andc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]&BIT_MASK);
+    }
+  }
+  static void test_andv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]&b);
+    }
+  }
+  static void test_anda(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]&a2[i]);
+    }
+  }
+
+  static void test_orc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]|BIT_MASK);
+    }
+  }
+  static void test_orv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]|b);
+    }
+  }
+  static void test_ora(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]|a2[i]);
+    }
+  }
+
+  static void test_xorc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]^BIT_MASK);
+    }
+  }
+  static void test_xorv(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]^b);
+    }
+  }
+  static void test_xora(long[] a0, long[] a1, long[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]^a2[i]);
+    }
+  }
+
+  static void test_sllc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]<<VALUE);
+    }
+  }
+  static void test_sllc_n(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]<<(-VALUE));
+    }
+  }
+  static void test_sllc_o(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]<<SHIFT);
+    }
+  }
+  static void test_sllc_on(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]<<(-SHIFT));
+    }
+  }
+  static void test_sllv(long[] a0, long[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]<<b);
+    }
+  }
+
+  static void test_srlc(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>>VALUE);
+    }
+  }
+  static void test_srlc_n(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>>(-VALUE));
+    }
+  }
+  static void test_srlc_o(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>>SHIFT);
+    }
+  }
+  static void test_srlc_on(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>>(-SHIFT));
+    }
+  }
+  static void test_srlv(long[] a0, long[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>>b);
+    }
+  }
+
+  static void test_srac(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>VALUE);
+    }
+  }
+  static void test_srac_n(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>(-VALUE));
+    }
+  }
+  static void test_srac_o(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>SHIFT);
+    }
+  }
+  static void test_srac_on(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>(-SHIFT));
+    }
+  }
+  static void test_srav(long[] a0, long[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]>>b);
+    }
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/6340864/TestShortVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,1127 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 6340864
+ * @summary Implement vectorization optimizations in hotspot-server
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestShortVect
+ */
+
+public class TestShortVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int ADD_INIT = Short.MAX_VALUE-500;
+  private static final int BIT_MASK = 0xB731;
+  private static final int VALUE = 7;
+  private static final int SHIFT = 16;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Short vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    short[] a0 = new short[ARRLEN];
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    short[] a3 = new short[ARRLEN];
+    short[] a4 = new short[ARRLEN];
+     int[] p2 = new  int[ARRLEN/2];
+    long[] p4 = new long[ARRLEN/4];
+    // Initialize
+    int gold_sum = 0;
+    for (int i=0; i<ARRLEN; i++) {
+      short val = (short)(ADD_INIT+i);
+      gold_sum += val;
+      a1[i] = val;
+      a2[i] = (short)VALUE;
+      a3[i] = (short)-VALUE;
+      a4[i] = (short)BIT_MASK;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+      test_addc(a0, a1);
+      test_addv(a0, a1, (short)VALUE);
+      test_adda(a0, a1, a2);
+      test_subc(a0, a1);
+      test_subv(a0, a1, (short)VALUE);
+      test_suba(a0, a1, a2);
+      test_mulc(a0, a1);
+      test_mulv(a0, a1, (short)VALUE);
+      test_mula(a0, a1, a2);
+      test_divc(a0, a1);
+      test_divv(a0, a1, (short)VALUE);
+      test_diva(a0, a1, a2);
+      test_mulc_n(a0, a1);
+      test_mulv(a0, a1, (short)-VALUE);
+      test_mula(a0, a1, a3);
+      test_divc_n(a0, a1);
+      test_divv(a0, a1, (short)-VALUE);
+      test_diva(a0, a1, a3);
+      test_andc(a0, a1);
+      test_andv(a0, a1, (short)BIT_MASK);
+      test_anda(a0, a1, a4);
+      test_orc(a0, a1);
+      test_orv(a0, a1, (short)BIT_MASK);
+      test_ora(a0, a1, a4);
+      test_xorc(a0, a1);
+      test_xorv(a0, a1, (short)BIT_MASK);
+      test_xora(a0, a1, a4);
+      test_sllc(a0, a1);
+      test_sllv(a0, a1, VALUE);
+      test_srlc(a0, a1);
+      test_srlv(a0, a1, VALUE);
+      test_srac(a0, a1);
+      test_srav(a0, a1, VALUE);
+      test_sllc_n(a0, a1);
+      test_sllv(a0, a1, -VALUE);
+      test_srlc_n(a0, a1);
+      test_srlv(a0, a1, -VALUE);
+      test_srac_n(a0, a1);
+      test_srav(a0, a1, -VALUE);
+      test_sllc_o(a0, a1);
+      test_sllv(a0, a1, SHIFT);
+      test_srlc_o(a0, a1);
+      test_srlv(a0, a1, SHIFT);
+      test_srac_o(a0, a1);
+      test_srav(a0, a1, SHIFT);
+      test_sllc_on(a0, a1);
+      test_sllv(a0, a1, -SHIFT);
+      test_srlc_on(a0, a1);
+      test_srlv(a0, a1, -SHIFT);
+      test_srac_on(a0, a1);
+      test_srav(a0, a1, -SHIFT);
+      test_pack2(p2, a1);
+      test_unpack2(a0, p2);
+      test_pack2_swap(p2, a1);
+      test_unpack2_swap(a0, p2);
+      test_pack4(p4, a1);
+      test_unpack4(a0, p4);
+      test_pack4_swap(p4, a1);
+      test_unpack4_swap(a0, p4);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      int sum = test_sum(a1);
+      if (sum != gold_sum) {
+        System.err.println("test_sum:  " + sum + " != " + gold_sum);
+        errn++;
+      }
+
+      test_addc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addc: ", i, a0[i], (short)((short)(ADD_INIT+i)+VALUE));
+      }
+      test_addv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addv: ", i, a0[i], (short)((short)(ADD_INIT+i)+VALUE));
+      }
+      test_adda(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_adda: ", i, a0[i], (short)((short)(ADD_INIT+i)+VALUE));
+      }
+
+      test_subc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subc: ", i, a0[i], (short)((short)(ADD_INIT+i)-VALUE));
+      }
+      test_subv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_subv: ", i, a0[i], (short)((short)(ADD_INIT+i)-VALUE));
+      }
+      test_suba(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_suba: ", i, a0[i], (short)((short)(ADD_INIT+i)-VALUE));
+      }
+
+      test_mulc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc: ", i, a0[i], (short)((short)(ADD_INIT+i)*VALUE));
+      }
+      test_mulv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv: ", i, a0[i], (short)((short)(ADD_INIT+i)*VALUE));
+      }
+      test_mula(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula: ", i, a0[i], (short)((short)(ADD_INIT+i)*VALUE));
+      }
+
+      test_divc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc: ", i, a0[i], (short)((short)(ADD_INIT+i)/VALUE));
+      }
+      test_divv(a0, a1, (short)VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv: ", i, a0[i], (short)((short)(ADD_INIT+i)/VALUE));
+      }
+      test_diva(a0, a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva: ", i, a0[i], (short)((short)(ADD_INIT+i)/VALUE));
+      }
+
+      test_mulc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mulv(a0, a1, (short)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mulv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)*(-VALUE)));
+      }
+      test_mula(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_mula_n: ", i, a0[i], (short)((short)(ADD_INIT+i)*(-VALUE)));
+      }
+
+      test_divc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_divv(a0, a1, (short)-VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)/(-VALUE)));
+      }
+      test_diva(a0, a1, a3);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_diva_n: ", i, a0[i], (short)((short)(ADD_INIT+i)/(-VALUE)));
+      }
+
+      test_andc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andc: ", i, a0[i], (short)((short)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_andv(a0, a1, (short)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_andv: ", i, a0[i], (short)((short)(ADD_INIT+i)&BIT_MASK));
+      }
+      test_anda(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_anda: ", i, a0[i], (short)((short)(ADD_INIT+i)&BIT_MASK));
+      }
+
+      test_orc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orc: ", i, a0[i], (short)((short)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_orv(a0, a1, (short)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_orv: ", i, a0[i], (short)((short)(ADD_INIT+i)|BIT_MASK));
+      }
+      test_ora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ora: ", i, a0[i], (short)((short)(ADD_INIT+i)|BIT_MASK));
+      }
+
+      test_xorc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorc: ", i, a0[i], (short)((short)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xorv(a0, a1, (short)BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xorv: ", i, a0[i], (short)((short)(ADD_INIT+i)^BIT_MASK));
+      }
+      test_xora(a0, a1, a4);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_xora: ", i, a0[i], (short)((short)(ADD_INIT+i)^BIT_MASK));
+      }
+
+      test_sllc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc: ", i, a0[i], (short)((short)(ADD_INIT+i)<<VALUE));
+      }
+      test_sllv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv: ", i, a0[i], (short)((short)(ADD_INIT+i)<<VALUE));
+      }
+
+      test_srlc(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>VALUE));
+      }
+      test_srlv(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>VALUE));
+      }
+
+      test_srac(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac: ", i, a0[i], (short)((short)(ADD_INIT+i)>>VALUE));
+      }
+      test_srav(a0, a1, VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav: ", i, a0[i], (short)((short)(ADD_INIT+i)>>VALUE));
+      }
+
+      test_sllc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-VALUE)));
+      }
+      test_sllv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-VALUE)));
+      }
+
+      test_srlc_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-VALUE)));
+      }
+      test_srlv(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-VALUE)));
+      }
+
+      test_srac_n(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-VALUE)));
+      }
+      test_srav(a0, a1, -VALUE);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_n: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-VALUE)));
+      }
+
+      test_sllc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_o: ", i, a0[i], (short)((short)(ADD_INIT+i)<<SHIFT));
+      }
+      test_sllv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_o: ", i, a0[i], (short)((short)(ADD_INIT+i)<<SHIFT));
+      }
+
+      test_srlc_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>SHIFT));
+      }
+      test_srlv(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>SHIFT));
+      }
+
+      test_srac_o(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>SHIFT));
+      }
+      test_srav(a0, a1, SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_o: ", i, a0[i], (short)((short)(ADD_INIT+i)>>SHIFT));
+      }
+
+      test_sllc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_on: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-SHIFT)));
+      }
+      test_sllv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_on: ", i, a0[i], (short)((short)(ADD_INIT+i)<<(-SHIFT)));
+      }
+
+      test_srlc_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+      test_srlv(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>>(-SHIFT)));
+      }
+
+      test_srac_on(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-SHIFT)));
+      }
+      test_srav(a0, a1, -SHIFT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-SHIFT)));
+      }
+
+      test_pack2(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2: ", i, p2[i], ((int)(ADD_INIT+2*i) & 0xFFFF) | ((int)(ADD_INIT+2*i+1) << 16));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+      test_pack2_swap(p2, a1);
+      for (int i=0; i<ARRLEN/2; i++) {
+        errn += verify("test_pack2_swap: ", i, p2[i], ((int)(ADD_INIT+2*i+1) & 0xFFFF) | ((int)(ADD_INIT+2*i) << 16));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack2_swap(a0, p2);
+      for (int i=0; i<(ARRLEN&(-2)); i++) {
+        errn += verify("test_unpack2_swap: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+      test_pack4(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4: ", i, p4[i],  ((long)(ADD_INIT+4*i+0) & 0xFFFFl) |
+                                                 (((long)(ADD_INIT+4*i+1) & 0xFFFFl) << 16)  |
+                                                 (((long)(ADD_INIT+4*i+2) & 0xFFFFl) << 32)  |
+                                                 (((long)(ADD_INIT+4*i+3) & 0xFFFFl) << 48));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+      test_pack4_swap(p4, a1);
+      for (int i=0; i<ARRLEN/4; i++) {
+        errn += verify("test_pack4_swap: ", i, p4[i],  ((long)(ADD_INIT+4*i+3) & 0xFFFFl) |
+                                                      (((long)(ADD_INIT+4*i+2) & 0xFFFFl) << 16)  |
+                                                      (((long)(ADD_INIT+4*i+1) & 0xFFFFl) << 32)  |
+                                                      (((long)(ADD_INIT+4*i+0) & 0xFFFFl) << 48));
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a0[i] = -1;
+      }
+      test_unpack4_swap(a0, p4);
+      for (int i=0; i<(ARRLEN&(-4)); i++) {
+        errn += verify("test_unpack4_swap: ", i, a0[i], (short)(ADD_INIT+i));
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sum(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sum: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_adda(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_adda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_subv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_subv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_suba(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_suba: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (short)VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mulv(a0, a1, (short)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mulv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_mula(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_mula_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divv(a0, a1, (short)-VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divv_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_diva(a0, a1, a3);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_diva_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_andv(a0, a1, (short)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_andv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_anda(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_anda: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_orv(a0, a1, (short)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_orv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xorv(a0, a1, (short)BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xorv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_xora(a0, a1, a4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_xora: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_n(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_n: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -VALUE);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_n: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_o(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_o: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_o: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_on(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_on: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav(a0, a1, -SHIFT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_on: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack2_swap(p2, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack2_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack2_swap(a0, p2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack2_swap: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack4(p4, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack4: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack4(a0, p4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack4: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_pack4_swap(p4, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_pack4_swap: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unpack4_swap(a0, p4);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unpack4_swap: " + (end - start));
+
+    return errn;
+  }
+
+  static int test_sum(short[] a1) {
+    int sum = 0;
+    for (int i = 0; i < a1.length; i+=1) {
+      sum += a1[i];
+    }
+    return sum;
+  }
+
+  static void test_addc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]+VALUE);
+    }
+  }
+  static void test_addv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]+b);
+    }
+  }
+  static void test_adda(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]+a2[i]);
+    }
+  }
+
+  static void test_subc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]-VALUE);
+    }
+  }
+  static void test_subv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]-b);
+    }
+  }
+  static void test_suba(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]-a2[i]);
+    }
+  }
+
+  static void test_mulc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]*VALUE);
+    }
+  }
+  static void test_mulc_n(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]*(-VALUE));
+    }
+  }
+  static void test_mulv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]*b);
+    }
+  }
+  static void test_mula(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]*a2[i]);
+    }
+  }
+
+  static void test_divc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]/VALUE);
+    }
+  }
+  static void test_divc_n(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]/(-VALUE));
+    }
+  }
+  static void test_divv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]/b);
+    }
+  }
+  static void test_diva(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]/a2[i]);
+    }
+  }
+
+  static void test_andc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]&BIT_MASK);
+    }
+  }
+  static void test_andv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]&b);
+    }
+  }
+  static void test_anda(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]&a2[i]);
+    }
+  }
+
+  static void test_orc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]|BIT_MASK);
+    }
+  }
+  static void test_orv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]|b);
+    }
+  }
+  static void test_ora(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]|a2[i]);
+    }
+  }
+
+  static void test_xorc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]^BIT_MASK);
+    }
+  }
+  static void test_xorv(short[] a0, short[] a1, short b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]^b);
+    }
+  }
+  static void test_xora(short[] a0, short[] a1, short[] a2) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]^a2[i]);
+    }
+  }
+
+  static void test_sllc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]<<VALUE);
+    }
+  }
+  static void test_sllc_n(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]<<(-VALUE));
+    }
+  }
+  static void test_sllc_o(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]<<SHIFT);
+    }
+  }
+  static void test_sllc_on(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]<<(-SHIFT));
+    }
+  }
+  static void test_sllv(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]<<b);
+    }
+  }
+
+  static void test_srlc(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>>VALUE);
+    }
+  }
+  static void test_srlc_n(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>>(-VALUE));
+    }
+  }
+  static void test_srlc_o(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>>SHIFT);
+    }
+  }
+  static void test_srlc_on(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>>(-SHIFT));
+    }
+  }
+  static void test_srlv(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>>b);
+    }
+  }
+
+  static void test_srac(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>VALUE);
+    }
+  }
+  static void test_srac_n(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>(-VALUE));
+    }
+  }
+  static void test_srac_o(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>SHIFT);
+    }
+  }
+  static void test_srac_on(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>(-SHIFT));
+    }
+  }
+  static void test_srav(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]>>b);
+    }
+  }
+
+  static void test_pack2(int[] p2, short[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      int l0 = (int)a1[i*2+0];
+      int l1 = (int)a1[i*2+1];
+      p2[i] = (l1 << 16) | (l0 & 0xFFFF);
+    }
+  }
+  static void test_unpack2(short[] a0, int[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      int l = p2[i];
+      a0[i*2+0] = (short)(l & 0xFFFF);
+      a0[i*2+1] = (short)(l >> 16);
+    }
+  }
+  static void test_pack2_swap(int[] p2, short[] a1) {
+    if (p2.length*2 > a1.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      int l0 = (int)a1[i*2+0];
+      int l1 = (int)a1[i*2+1];
+      p2[i] = (l0 << 16) | (l1 & 0xFFFF);
+    }
+  }
+  static void test_unpack2_swap(short[] a0, int[] p2) {
+    if (p2.length*2 > a0.length) return;
+    for (int i = 0; i < p2.length; i+=1) {
+      int l = p2[i];
+      a0[i*2+0] = (short)(l >> 16);
+      a0[i*2+1] = (short)(l & 0xFFFF);
+    }
+  }
+
+  static void test_pack4(long[] p4, short[] a1) {
+    if (p4.length*4 > a1.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      long l0 = (long)a1[i*4+0];
+      long l1 = (long)a1[i*4+1];
+      long l2 = (long)a1[i*4+2];
+      long l3 = (long)a1[i*4+3];
+      p4[i] = (l0 & 0xFFFFl) |
+             ((l1 & 0xFFFFl) << 16) |
+             ((l2 & 0xFFFFl) << 32) |
+             ((l3 & 0xFFFFl) << 48);
+    }
+  }
+  static void test_unpack4(short[] a0, long[] p4) {
+    if (p4.length*4 > a0.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      long l = p4[i];
+      a0[i*4+0] = (short)(l & 0xFFFFl);
+      a0[i*4+1] = (short)(l >> 16);
+      a0[i*4+2] = (short)(l >> 32);
+      a0[i*4+3] = (short)(l >> 48);
+    }
+  }
+  static void test_pack4_swap(long[] p4, short[] a1) {
+    if (p4.length*4 > a1.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      long l0 = (long)a1[i*4+0];
+      long l1 = (long)a1[i*4+1];
+      long l2 = (long)a1[i*4+2];
+      long l3 = (long)a1[i*4+3];
+      p4[i] = (l3 & 0xFFFFl) |
+             ((l2 & 0xFFFFl) << 16) |
+             ((l1 & 0xFFFFl) << 32) |
+             ((l0 & 0xFFFFl) << 48);
+    }
+  }
+  static void test_unpack4_swap(short[] a0, long[] p4) {
+    if (p4.length*4 > a0.length) return;
+    for (int i = 0; i < p4.length; i+=1) {
+      long l = p4[i];
+      a0[i*4+0] = (short)(l >> 48);
+      a0[i*4+1] = (short)(l >> 32);
+      a0[i*4+2] = (short)(l >> 16);
+      a0[i*4+3] = (short)(l & 0xFFFFl);
+    }
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + Integer.toHexString(elem) + " != " + Integer.toHexString(val));
+      return 1;
+    }
+    return 0;
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val));
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7190310/Test7190310.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/*
+ * Manual test
+ */
+
+import java.lang.ref.*;
+
+public class Test7190310 {
+  private static Object str = new Object() {
+    public String toString() {
+      return "The Object";
+    }
+
+    protected void finalize() throws Throwable {
+      System.out.println("The Object is being finalized");
+      super.finalize();
+    }
+  };
+  private final static ReferenceQueue<Object> rq =
+      new ReferenceQueue<Object>();
+  private final static WeakReference<Object> wr =
+      new WeakReference<Object>(str, rq);
+
+  public static void main(String[] args)
+      throws InterruptedException {
+    Thread reader = new Thread() {
+      public void run() {
+        while (wr.get() != null) {
+        }
+        System.out.println("wr.get() returned null");
+      }
+    };
+
+    Thread queueReader = new Thread() {
+      public void run() {
+        try {
+          Reference<? extends Object> ref = rq.remove();
+          System.out.println(ref);
+          System.out.println("queueReader returned, ref==wr is "
+              + (ref == wr));
+        } catch (InterruptedException e) {
+          System.err.println("Sleep interrupted - exiting");
+        }
+      }
+    };
+
+    reader.start();
+    queueReader.start();
+
+    Thread.sleep(1000);
+    str = null;
+    System.gc();
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7190310/Test7190310_unsafe.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/*
+ * @test
+ * @bug 7190310
+ * @summary Inlining WeakReference.get(), and hoisting $referent may lead to non-terminating loops
+ * @run main/othervm -Xbatch Test7190310_unsafe
+ */
+
+import java.lang.ref.*;
+import java.lang.reflect.*;
+import sun.misc.Unsafe;
+
+public class Test7190310_unsafe {
+
+  static class TestObject {
+    public String toString() {
+      return "TestObject";
+    }
+  };
+
+  private static TestObject str = new TestObject();
+  private static final WeakReference ref = new WeakReference(str);
+
+  private TestObject obj;
+
+  public static void main(String[] args) throws Exception {
+    Class c = Test7190310_unsafe.class.getClassLoader().loadClass("sun.misc.Unsafe");
+    Field f = c.getDeclaredField("theUnsafe");
+    f.setAccessible(true);
+    Unsafe unsafe = (Unsafe)f.get(c);
+
+    f = Reference.class.getDeclaredField("referent");
+    f.setAccessible(true);
+    long referent_offset = unsafe.objectFieldOffset(f);
+
+    Test7190310_unsafe t = new Test7190310_unsafe();
+    TestObject o = new TestObject();
+    t.obj = o;
+
+    // Warmup (compile methods)
+    System.err.println("Warmup");
+    Object obj = null;
+    for (int i = 0; i < 11000; i++) {
+      obj = getRef0(ref);
+    }
+    for (int i = 0; i < 11000; i++) {
+      obj = getRef1(unsafe, ref, referent_offset);
+    }
+    for (int i = 0; i < 11000; i++) {
+      obj = getRef2(unsafe, ref, referent_offset);
+    }
+    for (int i = 0; i < 11000; i++) {
+      obj = getRef3(unsafe, ref, referent_offset);
+    }
+    for (int i = 0; i < 11000; i++) {
+      obj = getRef4(unsafe, t, referent_offset);
+    }
+
+    // Access verification
+    System.err.println("Verification");
+    if (!verifyGet(referent_offset, unsafe)) {
+      System.exit(97);
+    }
+
+    obj = getRef3(unsafe, t, referent_offset);
+    if (obj != o) {
+      System.out.println("FAILED: unsafe.getObject(Object, " + referent_offset + ") " + obj + " != " + o);
+      System.exit(97);
+    }
+    obj = getRef4(unsafe, t, referent_offset);
+    if (obj != o) {
+      System.out.println("FAILED: unsafe.getObject(Test7190310, " + referent_offset + ") " + obj + " != " + o);
+      System.exit(97);
+    }
+  }
+
+  static boolean verifyGet(long referent_offset, Unsafe unsafe) throws Exception {
+    // Access verification
+    System.out.println("referent: " + str);
+    Object obj = getRef0(ref);
+    if (obj != str) {
+      System.out.println("FAILED: weakRef.get() " + obj + " != " + str);
+      return false;
+    }
+    obj = getRef1(unsafe, ref, referent_offset);
+    if (obj != str) {
+      System.out.println("FAILED: unsafe.getObject(weakRef, " + referent_offset + ") " + obj + " != " + str);
+      return false;
+    }
+    obj = getRef2(unsafe, ref, referent_offset);
+    if (obj != str) {
+      System.out.println("FAILED: unsafe.getObject(abstRef, " + referent_offset + ") " + obj + " != " + str);
+      return false;
+    }
+    obj = getRef3(unsafe, ref, referent_offset);
+    if (obj != str) {
+      System.out.println("FAILED: unsafe.getObject(Object, " + referent_offset + ") " + obj + " != " + str);
+      return false;
+    }
+    return true;
+  }
+
+  static Object getRef0(WeakReference ref) throws Exception {
+    return ref.get();
+  }
+  static Object getRef1(Unsafe unsafe, WeakReference ref, long referent_offset) throws Exception {
+    return unsafe.getObject(ref, referent_offset);
+  }
+  static Object getRef2(Unsafe unsafe, Reference ref, long referent_offset) throws Exception {
+    return unsafe.getObject(ref, referent_offset);
+  }
+  static Object getRef3(Unsafe unsafe, Object ref, long referent_offset) throws Exception {
+    return unsafe.getObject(ref, referent_offset);
+  }
+  static Object getRef4(Unsafe unsafe, Test7190310_unsafe ref, long referent_offset) throws Exception {
+    return unsafe.getObject(ref, referent_offset);
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7192963/TestByteVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7192963
+ * @summary assert(_in[req-1] == this) failed: Must pass arg count to 'new'
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestByteVect
+ */
+
+public class TestByteVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  public static void main(String args[]) {
+    System.out.println("Testing Byte vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    byte[] a0 = new byte[ARRLEN];
+    byte[] a1 = new byte[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = (byte)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+      test_addi(a0, a1);
+      test_lsai(a0, a1);
+      test_unrl_init(a0);
+      test_unrl_addi(a0, a1);
+      test_unrl_lsai(a0, a1);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_init: ", i, a0[i], (byte)(i&3));
+      }
+      test_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addi: ", i, a0[i], (byte)(i+(i&3)));
+      }
+      test_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_lsai: ", i, a0[i], (byte)(i<<(i&3)));
+      }
+      test_unrl_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_init: ", i, a0[i], (byte)(i&3));
+      }
+      test_unrl_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_addi: ", i, a0[i], (byte)(i+(i&3)));
+      }
+      test_unrl_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_lsai: ", i, a0[i], (byte)(i<<(i&3)));
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_lsai: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_lsai: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_init(byte[] a0) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(i&3);
+    }
+  }
+  static void test_addi(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]+(i&3));
+    }
+  }
+  static void test_lsai(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)(a1[i]<<(i&3));
+    }
+  }
+  static void test_unrl_init(byte[] a0) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = 0;
+      a0[i+1] = 1;
+      a0[i+2] = 2;
+      a0[i+3] = 3;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (byte)(i&3);
+    }
+  }
+  static void test_unrl_addi(byte[] a0, byte[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = (byte)(a1[i+0]+0);
+      a0[i+1] = (byte)(a1[i+1]+1);
+      a0[i+2] = (byte)(a1[i+2]+2);
+      a0[i+3] = (byte)(a1[i+3]+3);
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (byte)(a1[i]+(i&3));
+    }
+  }
+  static void test_unrl_lsai(byte[] a0, byte[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = (byte)(a1[i+0]<<0);
+      a0[i+1] = (byte)(a1[i+1]<<1);
+      a0[i+2] = (byte)(a1[i+2]<<2);
+      a0[i+3] = (byte)(a1[i+3]<<3);
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (byte)(a1[i]<<(i&3));
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7192963/TestDoubleVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7192963
+ * @summary assert(_in[req-1] == this) failed: Must pass arg count to 'new'
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestDoubleVect
+ */
+
+public class TestDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  public static void main(String args[]) {
+    System.out.println("Testing Double vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    double[] a0 = new double[ARRLEN];
+    double[] a1 = new double[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = (double)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+      test_addi(a0, a1);
+      test_divi(a0, a1);
+      test_unrl_init(a0);
+      test_unrl_addi(a0, a1);
+      test_unrl_divi(a0, a1);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_init: ", i, a0[i], (double)(i&3));
+      }
+      test_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addi: ", i, a0[i], (double)(i+(i&3)));
+      }
+      test_divi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divi: ", i, a0[i], (double)i/(double)((i&3)+1));
+      }
+      test_unrl_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_init: ", i, a0[i], (double)(i&3));
+      }
+      test_unrl_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_addi: ", i, a0[i], (double)(i+(i&3)));
+      }
+      test_unrl_divi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_divi: ", i, a0[i], (double)i/(double)((i&3)+1));
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_divi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_divi: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_init(double[] a0) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (double)(i&3);
+    }
+  }
+  static void test_addi(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = a1[i]+(double)(i&3);
+    }
+  }
+  static void test_divi(double[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = a1[i]/(double)((i&3)+1);
+    }
+  }
+  static void test_unrl_init(double[] a0) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = 0.;
+      a0[i+1] = 1.;
+      a0[i+2] = 2.;
+      a0[i+3] = 3.;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (double)(i&3);
+    }
+  }
+  static void test_unrl_addi(double[] a0, double[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = a1[i+0]+0.;
+      a0[i+1] = a1[i+1]+1.;
+      a0[i+2] = a1[i+2]+2.;
+      a0[i+3] = a1[i+3]+3.;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = a1[i]+(double)(i&3);
+    }
+  }
+  static void test_unrl_divi(double[] a0, double[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = a1[i+0]/1.;
+      a0[i+1] = a1[i+1]/2.;
+      a0[i+2] = a1[i+2]/3.;
+      a0[i+3] = a1[i+3]/4.;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = a1[i]/(double)((i&3)+1);
+    }
+  }
+
+  static int verify(String text, int i, double elem, double val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7192963/TestFloatVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7192963
+ * @summary assert(_in[req-1] == this) failed: Must pass arg count to 'new'
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestFloatVect
+ */
+
+public class TestFloatVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  public static void main(String args[]) {
+    System.out.println("Testing Float vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    float[] a0 = new float[ARRLEN];
+    float[] a1 = new float[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = (float)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+      test_addi(a0, a1);
+      test_divi(a0, a1);
+      test_unrl_init(a0);
+      test_unrl_addi(a0, a1);
+      test_unrl_divi(a0, a1);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_init: ", i, a0[i], (float)(i&3));
+      }
+      test_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addi: ", i, a0[i], (float)(i+(i&3)));
+      }
+      test_divi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_divi: ", i, a0[i], (float)i/(float)((i&3)+1));
+      }
+      test_unrl_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_init: ", i, a0[i], (float)(i&3));
+      }
+      test_unrl_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_addi: ", i, a0[i], (float)(i+(i&3)));
+      }
+      test_unrl_divi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_divi: ", i, a0[i], (float)i/(float)((i&3)+1));
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_divi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_divi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_divi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_divi: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_init(float[] a0) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (float)(i&3);
+    }
+  }
+  static void test_addi(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = a1[i]+(float)(i&3);
+    }
+  }
+  static void test_divi(float[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = a1[i]/(float)((i&3)+1);
+    }
+  }
+  static void test_unrl_init(float[] a0) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = 0.f;
+      a0[i+1] = 1.f;
+      a0[i+2] = 2.f;
+      a0[i+3] = 3.f;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (float)(i&3);
+    }
+  }
+  static void test_unrl_addi(float[] a0, float[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = a1[i+0]+0.f;
+      a0[i+1] = a1[i+1]+1.f;
+      a0[i+2] = a1[i+2]+2.f;
+      a0[i+3] = a1[i+3]+3.f;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = a1[i]+(float)(i&3);
+    }
+  }
+  static void test_unrl_divi(float[] a0, float[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = a1[i+0]/1.f;
+      a0[i+1] = a1[i+1]/2.f;
+      a0[i+2] = a1[i+2]/3.f;
+      a0[i+3] = a1[i+3]/4.f;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = a1[i]/(float)((i&3)+1);
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7192963/TestIntVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7192963
+ * @summary assert(_in[req-1] == this) failed: Must pass arg count to 'new'
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestIntVect
+ */
+
+public class TestIntVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  public static void main(String args[]) {
+    System.out.println("Testing Integer vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    int[] a0 = new int[ARRLEN];
+    int[] a1 = new int[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+      test_addi(a0, a1);
+      test_lsai(a0, a1);
+      test_unrl_init(a0);
+      test_unrl_addi(a0, a1);
+      test_unrl_lsai(a0, a1);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_init: ", i, a0[i], (i&3));
+      }
+      test_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addi: ", i, a0[i], (i+(i&3)));
+      }
+      test_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_lsai: ", i, a0[i], (i<<(i&3)));
+      }
+      test_unrl_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_init: ", i, a0[i], (i&3));
+      }
+      test_unrl_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_addi: ", i, a0[i], (i+(i&3)));
+      }
+      test_unrl_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_lsai: ", i, a0[i], (i<<(i&3)));
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_lsai: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_lsai: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_init(int[] a0) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (i&3);
+    }
+  }
+  static void test_addi(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = a1[i]+(i&3);
+    }
+  }
+  static void test_lsai(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = a1[i]<<(i&3);
+    }
+  }
+  static void test_unrl_init(int[] a0) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = 0;
+      a0[i+1] = 1;
+      a0[i+2] = 2;
+      a0[i+3] = 3;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (i&3);
+    }
+  }
+  static void test_unrl_addi(int[] a0, int[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = a1[i+0]+0;
+      a0[i+1] = a1[i+1]+1;
+      a0[i+2] = a1[i+2]+2;
+      a0[i+3] = a1[i+3]+3;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = a1[i]+(i&3);
+    }
+  }
+  static void test_unrl_lsai(int[] a0, int[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = a1[i+0]<<0;
+      a0[i+1] = a1[i+1]<<1;
+      a0[i+2] = a1[i+2]<<2;
+      a0[i+3] = a1[i+3]<<3;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = a1[i]<<(i&3);
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7192963/TestLongVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7192963
+ * @summary assert(_in[req-1] == this) failed: Must pass arg count to 'new'
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestLongVect
+ */
+
+public class TestLongVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  public static void main(String args[]) {
+    System.out.println("Testing Long vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    long[] a0 = new long[ARRLEN];
+    long[] a1 = new long[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = (long)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+      test_addi(a0, a1);
+      test_lsai(a0, a1);
+      test_unrl_init(a0);
+      test_unrl_addi(a0, a1);
+      test_unrl_lsai(a0, a1);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_init: ", i, a0[i], (long)(i&3));
+      }
+      test_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addi: ", i, a0[i], (long)(i+(i&3)));
+      }
+      test_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_lsai: ", i, a0[i], (long)(i<<(i&3)));
+      }
+      test_unrl_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_init: ", i, a0[i], (long)(i&3));
+      }
+      test_unrl_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_addi: ", i, a0[i], (long)(i+(i&3)));
+      }
+      test_unrl_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_lsai: ", i, a0[i], (long)(i<<(i&3)));
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_lsai: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_lsai: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_init(long[] a0) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(i&3);
+    }
+  }
+  static void test_addi(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]+(i&3));
+    }
+  }
+  static void test_lsai(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)(a1[i]<<(i&3));
+    }
+  }
+  static void test_unrl_init(long[] a0) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = 0;
+      a0[i+1] = 1;
+      a0[i+2] = 2;
+      a0[i+3] = 3;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (long)(i&3);
+    }
+  }
+  static void test_unrl_addi(long[] a0, long[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = (long)(a1[i+0]+0);
+      a0[i+1] = (long)(a1[i+1]+1);
+      a0[i+2] = (long)(a1[i+2]+2);
+      a0[i+3] = (long)(a1[i+3]+3);
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (long)(a1[i]+(i&3));
+    }
+  }
+  static void test_unrl_lsai(long[] a0, long[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = (long)(a1[i+0]<<0);
+      a0[i+1] = (long)(a1[i+1]<<1);
+      a0[i+2] = (long)(a1[i+2]<<2);
+      a0[i+3] = (long)(a1[i+3]<<3);
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (long)(a1[i]<<(i&3));
+    }
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/7192963/TestShortVect.java	Fri Aug 24 19:45:42 2012 -0700
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7192963
+ * @summary assert(_in[req-1] == this) failed: Must pass arg count to 'new'
+ *
+ * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestShortVect
+ */
+
+public class TestShortVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  public static void main(String args[]) {
+    System.out.println("Testing Short vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    short[] a0 = new short[ARRLEN];
+    short[] a1 = new short[ARRLEN];
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = (short)i;
+    }
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+      test_addi(a0, a1);
+      test_lsai(a0, a1);
+      test_unrl_init(a0);
+      test_unrl_addi(a0, a1);
+      test_unrl_lsai(a0, a1);
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_init: ", i, a0[i], (short)(i&3));
+      }
+      test_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_addi: ", i, a0[i], (short)(i+(i&3)));
+      }
+      test_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_lsai: ", i, a0[i], (short)(i<<(i&3)));
+      }
+      test_unrl_init(a0);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_init: ", i, a0[i], (short)(i&3));
+      }
+      test_unrl_addi(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_addi: ", i, a0[i], (short)(i+(i&3)));
+      }
+      test_unrl_lsai(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_unrl_lsai: ", i, a0[i], (short)(i<<(i&3)));
+      }
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_lsai: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_init(a0);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_init: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_addi(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_addi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_unrl_lsai(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_unrl_lsai: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_init(short[] a0) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(i&3);
+    }
+  }
+  static void test_addi(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]+(i&3));
+    }
+  }
+  static void test_lsai(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)(a1[i]<<(i&3));
+    }
+  }
+  static void test_unrl_init(short[] a0) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = 0;
+      a0[i+1] = 1;
+      a0[i+2] = 2;
+      a0[i+3] = 3;
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (short)(i&3);
+    }
+  }
+  static void test_unrl_addi(short[] a0, short[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = (short)(a1[i+0]+0);
+      a0[i+1] = (short)(a1[i+1]+1);
+      a0[i+2] = (short)(a1[i+2]+2);
+      a0[i+3] = (short)(a1[i+3]+3);
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (short)(a1[i]+(i&3));
+    }
+  }
+  static void test_unrl_lsai(short[] a0, short[] a1) {
+    int i = 0;
+    for (; i < a0.length-4; i+=4) {
+      a0[i+0] = (short)(a1[i+0]<<0);
+      a0[i+1] = (short)(a1[i+1]<<1);
+      a0[i+2] = (short)(a1[i+2]<<2);
+      a0[i+3] = (short)(a1[i+3]<<3);
+    }
+    for (; i < a0.length; i++) {
+      a0[i] = (short)(a1[i]<<(i&3));
+    }
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
+