# HG changeset patch
# User Doug Simon <doug.simon@oracle.com>
# Date 1349209326 -7200
# Node ID 85c1b84f8fd98cd05389040eaca021c969725928
# Parent  dc409418cc2c9217b21a8972349c1dba22728f8d
moved AMD64-specific assembler code into separate project

diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.compiler/src/com/oracle/graal/compiler/target/amd64/AMD64DeoptimizationStub.java
--- a/graal/com.oracle.graal.compiler/src/com/oracle/graal/compiler/target/amd64/AMD64DeoptimizationStub.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.compiler/src/com/oracle/graal/compiler/target/amd64/AMD64DeoptimizationStub.java	Tue Oct 02 22:22:06 2012 +0200
@@ -31,7 +31,7 @@
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 public class AMD64DeoptimizationStub extends AMD64Code {
     public final Label label = new Label();
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.compiler/src/com/oracle/graal/compiler/target/amd64/AMD64LIRGenerator.java
--- a/graal/com.oracle.graal.compiler/src/com/oracle/graal/compiler/target/amd64/AMD64LIRGenerator.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.compiler/src/com/oracle/graal/compiler/target/amd64/AMD64LIRGenerator.java	Tue Oct 02 22:22:06 2012 +0200
@@ -67,8 +67,8 @@
 import com.oracle.graal.nodes.extended.*;
 import com.oracle.graal.nodes.java.*;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
-import com.oracle.max.asm.target.amd64.AMD64Assembler.ConditionFlag;
+import com.oracle.max.asm.amd64.*;
+import com.oracle.max.asm.amd64.AMD64Assembler.*;
 
 /**
  * This class implements the X86-specific portion of the LIR generator.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotGraalRuntime.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotGraalRuntime.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotGraalRuntime.java	Tue Oct 02 22:22:06 2012 +0200
@@ -35,7 +35,7 @@
 import com.oracle.graal.hotspot.meta.*;
 import com.oracle.graal.hotspot.target.amd64.*;
 import com.oracle.graal.nodes.spi.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * Singleton class holding the instance of the GraalCompiler.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java	Tue Oct 02 22:22:06 2012 +0200
@@ -23,7 +23,7 @@
 package com.oracle.graal.hotspot;
 
 import com.oracle.graal.api.code.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * Used to communicate configuration details, runtime offsets, etc. to graal upon compileMethod.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotRegisterConfig.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotRegisterConfig.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/meta/HotSpotRegisterConfig.java	Tue Oct 02 22:22:06 2012 +0200
@@ -22,11 +22,11 @@
  */
 package com.oracle.graal.hotspot.meta;
 
-import static com.oracle.max.asm.target.amd64.AMD64.*;
+import static com.oracle.max.asm.amd64.AMD64.*;
 
 import java.util.*;
 
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 import com.oracle.graal.api.code.*;
 import com.oracle.graal.api.code.CallingConvention.*;
 import com.oracle.graal.api.code.Register.*;
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/snippets/HotSpotSnippetUtils.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/snippets/HotSpotSnippetUtils.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/snippets/HotSpotSnippetUtils.java	Tue Oct 02 22:22:06 2012 +0200
@@ -32,7 +32,7 @@
 import com.oracle.graal.nodes.extended.*;
 import com.oracle.graal.snippets.Snippet.Fold;
 import com.oracle.graal.snippets.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 //JaCoCo Exclude
 
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64BreakpointOp.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64BreakpointOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64BreakpointOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -28,7 +28,7 @@
 import com.oracle.graal.lir.LIRInstruction.Opcode;
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * Emits a breakpoint.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64DirectCallOp.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64DirectCallOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64DirectCallOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -34,7 +34,7 @@
 import com.oracle.graal.lir.asm.*;
 import com.oracle.graal.nodes.java.MethodCallTargetNode.InvokeKind;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * A direct call that complies with the conventions for such calls in HotSpot.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64IndirectCallOp.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64IndirectCallOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64IndirectCallOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -33,7 +33,7 @@
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.amd64.AMD64Call.IndirectCallOp;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * A register indirect call that complies with the extra conventions for such calls in HotSpot.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64SafepointOp.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64SafepointOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64SafepointOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -22,7 +22,7 @@
  */
 package com.oracle.graal.hotspot.target.amd64;
 
-import static com.oracle.max.asm.target.amd64.AMD64.*;
+import static com.oracle.max.asm.amd64.AMD64.*;
 
 import com.oracle.graal.api.code.*;
 import com.oracle.graal.hotspot.*;
@@ -31,7 +31,7 @@
 import com.oracle.graal.lir.LIRInstruction.Opcode;
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * Emits a safepoint poll.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64TailcallOp.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64TailcallOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/AMD64TailcallOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -29,7 +29,7 @@
 import com.oracle.graal.lir.LIRInstruction.Opcode;
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * Performs a hard-coded tail call to the specified target, which normally should be an {@link InstalledCode} instance.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/HotSpotAMD64Backend.java
--- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/HotSpotAMD64Backend.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/target/amd64/HotSpotAMD64Backend.java	Tue Oct 02 22:22:06 2012 +0200
@@ -25,7 +25,7 @@
 import static com.oracle.graal.api.code.CallingConvention.Type.*;
 import static com.oracle.graal.api.code.ValueUtil.*;
 import static com.oracle.graal.api.meta.Value.*;
-import static com.oracle.max.asm.target.amd64.AMD64.*;
+import static com.oracle.max.asm.amd64.AMD64.*;
 
 import java.lang.reflect.*;
 
@@ -48,8 +48,8 @@
 import com.oracle.graal.nodes.*;
 import com.oracle.graal.nodes.java.*;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
-import com.oracle.max.asm.target.amd64.AMD64Assembler.ConditionFlag;
+import com.oracle.max.asm.amd64.*;
+import com.oracle.max.asm.amd64.AMD64Assembler.*;
 
 /**
  * HotSpot AMD64 specific backend.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Arithmetic.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Arithmetic.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Arithmetic.java	Tue Oct 02 22:22:06 2012 +0200
@@ -30,8 +30,8 @@
 import com.oracle.graal.lir.*;
 import com.oracle.graal.lir.asm.*;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
-import com.oracle.max.asm.target.amd64.AMD64Assembler.ConditionFlag;
+import com.oracle.max.asm.amd64.*;
+import com.oracle.max.asm.amd64.AMD64Assembler.*;
 
 public enum AMD64Arithmetic {
     IADD, ISUB, IMUL, IDIV, IREM, IUDIV, IUREM, IAND, IOR, IXOR, ISHL, ISHR, IUSHR,
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Call.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Call.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Call.java	Tue Oct 02 22:22:06 2012 +0200
@@ -30,7 +30,7 @@
 import com.oracle.graal.lir.*;
 import com.oracle.graal.lir.LIRInstruction.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 public class AMD64Call {
 
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Code.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Code.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Code.java	Tue Oct 02 22:22:06 2012 +0200
@@ -22,7 +22,7 @@
  */
 package com.oracle.graal.lir.amd64;
 
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 import com.oracle.graal.lir.*;
 import com.oracle.graal.lir.asm.*;
 
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Compare.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Compare.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Compare.java	Tue Oct 02 22:22:06 2012 +0200
@@ -28,7 +28,7 @@
 import com.oracle.graal.api.meta.*;
 import com.oracle.graal.graph.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 public enum AMD64Compare {
     ICMP, LCMP, ACMP, FCMP, DCMP;
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64ControlFlow.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64ControlFlow.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64ControlFlow.java	Tue Oct 02 22:22:06 2012 +0200
@@ -36,8 +36,8 @@
 import com.oracle.graal.lir.asm.*;
 import com.oracle.graal.nodes.calc.*;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
-import com.oracle.max.asm.target.amd64.AMD64Assembler.ConditionFlag;
+import com.oracle.max.asm.amd64.*;
+import com.oracle.max.asm.amd64.AMD64Assembler.*;
 
 public class AMD64ControlFlow {
 
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64LIRInstruction.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64LIRInstruction.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64LIRInstruction.java	Tue Oct 02 22:22:06 2012 +0200
@@ -24,7 +24,7 @@
 
 import com.oracle.graal.lir.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 /**
  * Convenience class to provide AMD64MacroAssembler for the {@link #emitCode} method.
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Move.java
--- a/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Move.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.lir.amd64/src/com/oracle/graal/lir/amd64/AMD64Move.java	Tue Oct 02 22:22:06 2012 +0200
@@ -35,7 +35,7 @@
 import com.oracle.graal.lir.StandardOp.MoveOp;
 import com.oracle.graal.lir.asm.*;
 import com.oracle.max.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 public class AMD64Move {
 
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64BitScanOp.java
--- a/graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64BitScanOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64BitScanOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -26,7 +26,7 @@
 import com.oracle.graal.api.meta.*;
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 
 public class AMD64BitScanOp extends AMD64LIRInstruction {
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64ByteSwapOp.java
--- a/graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64ByteSwapOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64ByteSwapOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -27,7 +27,7 @@
 import com.oracle.graal.lir.LIRInstruction.*;
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 @Opcode("BSWAP")
 public class AMD64ByteSwapOp extends AMD64LIRInstruction {
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64MathIntrinsicOp.java
--- a/graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64MathIntrinsicOp.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.graal.snippets/src/com/oracle/graal/snippets/target/amd64/AMD64MathIntrinsicOp.java	Tue Oct 02 22:22:06 2012 +0200
@@ -28,7 +28,7 @@
 import com.oracle.graal.graph.*;
 import com.oracle.graal.lir.amd64.*;
 import com.oracle.graal.lir.asm.*;
-import com.oracle.max.asm.target.amd64.*;
+import com.oracle.max.asm.amd64.*;
 
 public class AMD64MathIntrinsicOp extends AMD64LIRInstruction {
     public enum IntrinsicOpcode  {
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64.java	Tue Oct 02 22:22:06 2012 +0200
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.max.asm.amd64;
+
+import static com.oracle.graal.api.code.Register.RegisterFlag.*;
+import static com.oracle.graal.api.meta.Kind.*;
+import static com.oracle.max.criutils.MemoryBarriers.*;
+
+import com.oracle.graal.api.code.*;
+import com.oracle.graal.api.code.Register.*;
+
+/**
+ * Represents the AMD64 architecture: declares the general purpose, XMM and {@code rip} registers and the register tables built from them.
+ */
+public class AMD64 extends Architecture {
+
+    // General purpose CPU registers: each is flagged CPU and Byte, and its first two constructor arguments are equal (0..15)
+    public static final Register rax = new Register(0, 0, 8, "rax", CPU, RegisterFlag.Byte);
+    public static final Register rcx = new Register(1, 1, 8, "rcx", CPU, RegisterFlag.Byte);
+    public static final Register rdx = new Register(2, 2, 8, "rdx", CPU, RegisterFlag.Byte);
+    public static final Register rbx = new Register(3, 3, 8, "rbx", CPU, RegisterFlag.Byte);
+    public static final Register rsp = new Register(4, 4, 8, "rsp", CPU, RegisterFlag.Byte);
+    public static final Register rbp = new Register(5, 5, 8, "rbp", CPU, RegisterFlag.Byte);
+    public static final Register rsi = new Register(6, 6, 8, "rsi", CPU, RegisterFlag.Byte);
+    public static final Register rdi = new Register(7, 7, 8, "rdi", CPU, RegisterFlag.Byte);
+
+    public static final Register r8  = new Register(8,  8,  8, "r8", CPU, RegisterFlag.Byte);
+    public static final Register r9  = new Register(9,  9,  8, "r9", CPU, RegisterFlag.Byte);
+    public static final Register r10 = new Register(10, 10, 8, "r10", CPU, RegisterFlag.Byte);
+    public static final Register r11 = new Register(11, 11, 8, "r11", CPU, RegisterFlag.Byte);
+    public static final Register r12 = new Register(12, 12, 8, "r12", CPU, RegisterFlag.Byte);
+    public static final Register r13 = new Register(13, 13, 8, "r13", CPU, RegisterFlag.Byte);
+    public static final Register r14 = new Register(14, 14, 8, "r14", CPU, RegisterFlag.Byte);
+    public static final Register r15 = new Register(15, 15, 8, "r15", CPU, RegisterFlag.Byte);
+
+    public static final Register[] cpuRegisters = { // the 16 general purpose registers, in declaration order
+        rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
+        r8, r9, r10, r11, r12, r13, r14, r15
+    };
+
+    // XMM registers: flagged FPU; the first constructor argument continues at 16..31 while the second restarts at 0..15
+    public static final Register xmm0 = new Register(16, 0, 8, "xmm0", FPU);
+    public static final Register xmm1 = new Register(17, 1, 8, "xmm1", FPU);
+    public static final Register xmm2 = new Register(18, 2, 8, "xmm2", FPU);
+    public static final Register xmm3 = new Register(19, 3, 8, "xmm3", FPU);
+    public static final Register xmm4 = new Register(20, 4, 8, "xmm4", FPU);
+    public static final Register xmm5 = new Register(21, 5, 8, "xmm5", FPU);
+    public static final Register xmm6 = new Register(22, 6, 8, "xmm6", FPU);
+    public static final Register xmm7 = new Register(23, 7, 8, "xmm7", FPU);
+
+    public static final Register xmm8 =  new Register(24,  8, 8, "xmm8",  FPU);
+    public static final Register xmm9 =  new Register(25,  9, 8, "xmm9",  FPU);
+    public static final Register xmm10 = new Register(26, 10, 8, "xmm10", FPU);
+    public static final Register xmm11 = new Register(27, 11, 8, "xmm11", FPU);
+    public static final Register xmm12 = new Register(28, 12, 8, "xmm12", FPU);
+    public static final Register xmm13 = new Register(29, 13, 8, "xmm13", FPU);
+    public static final Register xmm14 = new Register(30, 14, 8, "xmm14", FPU);
+    public static final Register xmm15 = new Register(31, 15, 8, "xmm15", FPU);
+
+    public static final Register[] xmmRegisters = { // the 16 XMM registers, in declaration order
+        xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
+        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+    };
+
+    public static final Register[] cpuxmmRegisters = { // general purpose registers followed by XMM registers (no rip)
+        rax,  rcx,  rdx,   rbx,   rsp,   rbp,   rsi,   rdi,
+        r8,   r9,   r10,   r11,   r12,   r13,   r14,   r15,
+        xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
+        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+    };
+
+    /**
+     * Register used to construct an instruction-relative address; declared without any register flags.
+     */
+    public static final Register rip = new Register(32, -1, 0, "rip");
+
+    public static final Register[] allRegisters = { // every register declared above, rip last; handed to the Architecture constructor
+        rax,  rcx,  rdx,   rbx,   rsp,   rbp,   rsi,   rdi,
+        r8,   r9,   r10,   r11,   r12,   r13,   r14,   r15,
+        xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
+        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
+        rip
+    };
+
+    public static final RegisterValue RSP = rsp.asValue(Long); // rsp as a value of kind Long
+
+    public AMD64() { // NOTE(review): the meaning of each positional argument is defined by Architecture, not visible here — confirm
+        super("AMD64",
+              8,
+              ByteOrder.LittleEndian,
+              allRegisters,
+              LOAD_STORE | STORE_STORE, // barrier constants statically imported from MemoryBarriers
+              1,
+              r15.encoding + 1, // presumably 16, given r15 is constructed with (15, 15, ...) — confirm
+              8);
+    }
+
+    @Override
+    public boolean isX86() {
+        return true; // always true for this architecture
+    }
+
+    @Override
+    public boolean twoOperandMode() {
+        return true; // always true for this architecture
+    }
+
+}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64AsmOptions.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64AsmOptions.java	Tue Oct 02 22:22:06 2012 +0200
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.max.asm.amd64;
+
+public class AMD64AsmOptions { // holder of mutable static option fields; AMD64Assembler static-imports all of them
+    public static int     Atomics                       = 0; // NOTE(review): meaning of this int is not evident here — confirm against its readers
+    public static boolean UseNormalNop                  = true;
+    public static boolean UseAddressNop                 = true;
+    public static boolean UseIncDec                     = false;
+    public static boolean UseXmmLoadAndClearUpper       = true;
+    public static boolean UseXmmRegToRegMoveAll         = false;
+} // NOTE(review): fields are public and non-final, so any code can change them at runtime — confirm that is intended
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64Assembler.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64Assembler.java	Tue Oct 02 22:22:06 2012 +0200
@@ -0,0 +1,3034 @@
+/*
+ * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.max.asm.amd64;
+
+import static com.oracle.graal.api.code.ValueUtil.*;
+import static com.oracle.max.asm.NumUtil.*;
+import static com.oracle.max.asm.amd64.AMD64.*;
+import static com.oracle.max.asm.amd64.AMD64AsmOptions.*;
+import static com.oracle.max.criutils.MemoryBarriers.*;
+
+import com.oracle.graal.api.code.*;
+import com.oracle.graal.api.meta.*;
+import com.oracle.max.asm.*;
+
+/**
+ * This class implements an assembler that can encode most X86 instructions.
+ */
+public class AMD64Assembler extends AbstractAssembler {
+    /**
+     * The kind for pointers and raw registers.  Since we know we are 64 bit here, we can hardcode it.
+     */
+    private static final Kind Word = Kind.Long;
+
+    private static final int MinEncodingNeedsRex = 8;
+
+    /**
+     * The x86 condition codes used for conditional jumps/moves; each constant pairs a code value with a printable operator.
+     */
+    public enum ConditionFlag {
+        zero(0x4, "|zero|"),
+        notZero(0x5, "|nzero|"),
+        equal(0x4, "="), // same value as zero
+        notEqual(0x5, "!="), // same value as notZero
+        less(0xc, "<"),
+        lessEqual(0xe, "<="),
+        greater(0xf, ">"),
+        greaterEqual(0xd, ">="),
+        below(0x2, "|<|"),
+        belowEqual(0x6, "|<=|"),
+        above(0x7, "|>|"),
+        aboveEqual(0x3, "|>=|"),
+        overflow(0x0, "|of|"),
+        noOverflow(0x1, "|nof|"),
+        carrySet(0x2, "|carry|"), // same value as below
+        carryClear(0x3, "|ncarry|"), // same value as aboveEqual
+        negative(0x8, "|neg|"),
+        positive(0x9, "|pos|"),
+        parity(0xa, "|par|"),
+        noParity(0xb, "|npar|");
+
+        public final int value; // numeric condition code
+        public final String operator; // printable form of the condition
+
+        private ConditionFlag(int value, String operator) {
+            this.value = value;
+            this.operator = operator;
+        }
+
+        public ConditionFlag negate() { // returns the opposite condition; the result's value differs from this.value only in bit 0
+            switch(this) {
+                case zero: return notZero;
+                case notZero: return zero;
+                case equal: return notEqual;
+                case notEqual: return equal;
+                case less: return greaterEqual;
+                case lessEqual: return greater;
+                case greater: return lessEqual;
+                case greaterEqual: return less;
+                case below: return aboveEqual;
+                case belowEqual: return above;
+                case above: return belowEqual;
+                case aboveEqual: return below;
+                case overflow: return noOverflow;
+                case noOverflow: return overflow;
+                case carrySet: return carryClear;
+                case carryClear: return carrySet;
+                case negative: return positive;
+                case positive: return negative;
+                case parity: return noParity;
+                case noParity: return parity;
+            }
+            throw new IllegalArgumentException(); // only reachable if a constant is added without a case above
+        }
+    }
+
+    /**
+     * Constants for X86 prefix bytes: the sixteen REX values 0x40-0x4F, named after the W (0x8), R (0x4), X (0x2) and B (0x1) bits they set.
+     */
+    private static class Prefix {
+        private static final int REX = 0x40; // base value with none of W, R, X, B set
+        private static final int REXB = 0x41;
+        private static final int REXX = 0x42;
+        private static final int REXXB = 0x43;
+        private static final int REXR = 0x44;
+        private static final int REXRB = 0x45;
+        private static final int REXRX = 0x46;
+        private static final int REXRXB = 0x47;
+        private static final int REXW = 0x48;
+        private static final int REXWB = 0x49;
+        private static final int REXWX = 0x4A;
+        private static final int REXWXB = 0x4B;
+        private static final int REXWR = 0x4C;
+        private static final int REXWRB = 0x4D;
+        private static final int REXWRX = 0x4E;
+        private static final int REXWRXB = 0x4F; // all four of W, R, X, B set
+    }
+
+    /**
+     * The register to which {@link Register#Frame} and {@link Register#CallerFrame} are bound.
+     */
+    public final Register frameRegister;
+
+    /**
+     * Constructs an assembler for the AMD64 architecture.
+     * @param target the target description, forwarded unchanged to the superclass constructor
+     * @param registerConfig the register configuration used to bind {@link Register#Frame} and
+     *            {@link Register#CallerFrame} to physical registers. This value can be null if this assembler
+     *            instance will not be used to assemble instructions using these logical registers.
+     */
+    public AMD64Assembler(TargetDescription target, RegisterConfig registerConfig) {
+        super(target);
+        this.frameRegister = registerConfig == null ? null : registerConfig.getFrameRegister(); // a null config leaves frameRegister null
+    }
+
+    private static int encode(Register r) { // returns the low three bits of the register's encoding
+        assert r.encoding < 16 && r.encoding >= 0 : "encoding out of range: " + r.encoding; // only encodings 0..15 are accepted
+        return r.encoding & 0x7; // bit 3 of the encoding is dropped here, not returned
+    }
+
+    private void emitArithB(int op1, int op2, Register dst, int imm8) { // emits three bytes: op1, op2 combined with dst, then imm8
+        assert dst.isByte() : "must have byte register";
+        assert isUByte(op1) && isUByte(op2) : "wrong opcode";
+        assert isUByte(imm8) : "not a byte";
+        assert (op1 & 0x01) == 0 : "should be 8bit operation"; // bit 0 of op1 must be clear for this form
+        emitByte(op1);
+        emitByte(op2 | encode(dst)); // low three bits of dst's encoding or-ed into op2
+        emitByte(imm8);
+    }
+
+    private void emitArith(int op1, int op2, Register dst, int imm32) { // emits op1, op2 combined with dst, then a 1- or 4-byte immediate
+        assert isUByte(op1) && isUByte(op2) : "wrong opcode";
+        assert (op1 & 0x01) == 1 : "should be 32bit operation"; // bit 0 of op1 must be set for this form
+        assert (op1 & 0x02) == 0 : "sign-extension bit should not be set"; // bit 1 is set below only for the short form
+        if (isByte(imm32)) { // short form; isByte presumably tests the signed 8-bit range — confirm in NumUtil
+            emitByte(op1 | 0x02); // set sign-extension bit: a single immediate byte follows
+            emitByte(op2 | encode(dst));
+            emitByte(imm32 & 0xFF); // only the low byte of the immediate is written
+        } else {
+            emitByte(op1);
+            emitByte(op2 | encode(dst));
+            emitInt(imm32); // full immediate written via emitInt
+        }
+    }
+
+    // immediate-to-memory forms: emits op1, the operand bytes for (rm, adr), then a 1- or 4-byte immediate
+    private void emitArithOperand(int op1, Register rm, Address adr, int imm32) {
+        assert (op1 & 0x01) == 1 : "should be 32bit operation"; // bit 0 of op1 must be set for this form
+        assert (op1 & 0x02) == 0 : "sign-extension bit should not be set"; // bit 1 is set below only for the short form
+        if (isByte(imm32)) { // short form; isByte presumably tests the signed 8-bit range — confirm in NumUtil
+            emitByte(op1 | 0x02); // set sign-extension bit: a single immediate byte follows
+            emitOperandHelper(rm, adr);
+            emitByte(imm32 & 0xFF); // only the low byte of the immediate is written
+        } else {
+            emitByte(op1);
+            emitOperandHelper(rm, adr);
+            emitInt(imm32); // full immediate written via emitInt
+        }
+    }
+
+    private void emitArith(int op1, int op2, Register dst, Register src) { // emits two bytes: op1, then op2 combined with both registers
+        assert isUByte(op1) && isUByte(op2) : "wrong opcode";
+        emitByte(op1);
+        emitByte(op2 | encode(dst) << 3 | encode(src)); // dst lands in bits 3-5, src in bits 0-2 (<< binds tighter than |)
+    }
+
+    /**
+     * Emits the ModRM byte, and where needed the SIB byte and displacement, for a memory
+     * operand. {@code reg} fills the ModRM reg field (bits 5..3); callers that encode an
+     * opcode extension pass a register purely for its encoding. The shortest displacement
+     * form is chosen: none, 8-bit or 32-bit. Only the low three bits of each register
+     * encoding are written here (see {@code encode}), and no prefix is emitted.
+     */
+    private void emitOperandHelper(Register reg, Address addr) {
+        Register base = isLegal(addr.getBase()) ? asRegister(addr.getBase()) : Register.None;
+        Register index = isLegal(addr.getIndex()) ? asRegister(addr.getIndex()) : Register.None;
+
+        Address.Scale scale = addr.getScale();
+        int disp = addr.getDisplacement();
+
+        // The symbolic frame register is replaced by the configured physical one.
+        if (base == Register.Frame) {
+            assert frameRegister != null : "cannot use register " + Register.Frame + " in assembler with null register configuration";
+            base = frameRegister;
+//        } else if (base == Register.CallerFrame) {
+//            assert frameRegister != null : "cannot use register " + Register.Frame + " in assembler with null register configuration";
+//            base = frameRegister;
+//            disp += targetMethod.frameSize() + 8;
+        }
+
+        // Encode the registers as needed in the fields they are used in
+
+        assert reg != Register.None;
+        int regenc = encode(reg) << 3;
+
+        if (base == AMD64.rip) {
+            // [00 000 101] disp32
+            emitByte(0x05 | regenc);
+            emitInt(disp);
+        } else if (addr == Address.Placeholder) {
+            // Same encoding as the rip case, with a zero displacement emitted in place of disp.
+            // [00 000 101] disp32
+            emitByte(0x05 | regenc);
+            emitInt(0);
+
+        } else if (base.isValid()) {
+            // base.isValid() is already known to be true in this branch, so this is encode(base).
+            int baseenc = base.isValid() ? encode(base) : 0;
+            if (index.isValid()) {
+                int indexenc = encode(index) << 3;
+                // [base + indexscale + disp]
+                // rbp and r13 as base are excluded from the no-displacement form.
+                if (disp == 0 && base != rbp && (base != r13)) {
+                    // [base + indexscale]
+                    // [00 reg 100][ss index base]
+                    assert index != rsp : "illegal addressing mode";
+                    emitByte(0x04 | regenc);
+                    emitByte(scale.log2 << 6 | indexenc | baseenc);
+                } else if (isByte(disp)) {
+                    // [base + indexscale + imm8]
+                    // [01 reg 100][ss index base] imm8
+                    assert index != rsp : "illegal addressing mode";
+                    emitByte(0x44 | regenc);
+                    emitByte(scale.log2 << 6 | indexenc | baseenc);
+                    emitByte(disp & 0xFF);
+                } else {
+                    // [base + indexscale + disp32]
+                    // [10 reg 100][ss index base] disp32
+                    assert index != rsp : "illegal addressing mode";
+                    emitByte(0x84 | regenc);
+                    emitByte(scale.log2 << 6 | indexenc | baseenc);
+                    emitInt(disp);
+                }
+            } else if (base == rsp || (base == r12)) {
+                // rsp and r12 as base always get a SIB byte (0x24) here.
+                // [rsp + disp]
+                if (disp == 0) {
+                    // [rsp]
+                    // [00 reg 100][00 100 100]
+                    emitByte(0x04 | regenc);
+                    emitByte(0x24);
+                } else if (isByte(disp)) {
+                    // [rsp + imm8]
+                    // [01 reg 100][00 100 100] disp8
+                    emitByte(0x44 | regenc);
+                    emitByte(0x24);
+                    emitByte(disp & 0xFF);
+                } else {
+                    // [rsp + imm32]
+                    // [10 reg 100][00 100 100] disp32
+                    emitByte(0x84 | regenc);
+                    emitByte(0x24);
+                    emitInt(disp);
+                }
+            } else {
+                // [base + disp]
+                assert base != rsp && (base != r12) : "illegal addressing mode";
+                if (disp == 0 && base != rbp && (base != r13)) {
+                    // [base]
+                    // [00 reg base]
+                    emitByte(0x00 | regenc | baseenc);
+                } else if (isByte(disp)) {
+                    // [base + disp8]
+                    // [01 reg base] disp8
+                    emitByte(0x40 | regenc | baseenc);
+                    emitByte(disp & 0xFF);
+                } else {
+                    // [base + disp32]
+                    // [10 reg base] disp32
+                    emitByte(0x80 | regenc | baseenc);
+                    emitInt(disp);
+                }
+            }
+        } else {
+            // No base register: the displacement is always emitted as 32 bits.
+            if (index.isValid()) {
+                int indexenc = encode(index) << 3;
+                // [indexscale + disp]
+                // [00 reg 100][ss index 101] disp32
+                assert index != rsp : "illegal addressing mode";
+                emitByte(0x04 | regenc);
+                emitByte(scale.log2 << 6 | indexenc | 0x05);
+                emitInt(disp);
+            } else {
+                // [disp] ABSOLUTE
+                // [00 reg 100][00 100 101] disp32
+                emitByte(0x04 | regenc);
+                emitByte(0x25);
+                emitInt(disp);
+            }
+        }
+    }
+
+    /**
+     * Emits a 32-bit ADD of an immediate to the memory operand {@code dst} (opcode 0x81, or
+     * the short-immediate variant selected by {@code emitArithOperand}). {@code rax} is not
+     * an operand; it only supplies the ModRM reg field (presumably opcode extension /0).
+     */
+    public final void addl(Address dst, int imm32) {
+        prefix(dst);
+        emitArithOperand(0x81, rax, dst, imm32);
+    }
+
+    /** Emits a 32-bit ADD of register {@code src} into the memory operand {@code dst} (opcode 0x01). */
+    public final void addl(Address dst, Register src) {
+        prefix(dst, src);
+        emitByte(0x01);
+        emitOperandHelper(src, dst);
+    }
+
+    /**
+     * Emits a 32-bit ADD of an immediate to register {@code dst} (opcode 0x81 with ModRM base
+     * 0xC0, or the short-immediate variant selected by {@code emitArith}).
+     */
+    public final void addl(Register dst, int imm32) {
+        prefix(dst);
+        emitArith(0x81, 0xC0, dst, imm32);
+    }
+
+    /** Emits a 32-bit ADD of the memory operand {@code src} into register {@code dst} (opcode 0x03). */
+    public final void addl(Register dst, Address src) {
+        prefix(src, dst);
+        emitByte(0x03);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits a 32-bit ADD of register {@code src} into register {@code dst} (opcode 0x03, register-direct ModRM). */
+    public final void addl(Register dst, Register src) {
+        // Called for its side effect only (presumably emitting any needed prefix); the returned
+        // value is unused because emitArith derives the ModRM register bits itself.
+        prefixAndEncode(dst.encoding, src.encoding);
+        emitArith(0x03, 0xC0, dst, src);
+    }
+
+    /** Emits the 4-byte sequence 0F 1F 40 00: a NOP with a memory operand and an 8-bit zero offset. */
+    private void addrNop4() {
+        // 4 bytes: NOP DWORD PTR [EAX+0]
+        emitByte(0x0F);
+        emitByte(0x1F);
+        emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
+        emitByte(0); // 8-bits offset (1 byte)
+    }
+
+    /** Emits the 5-byte sequence 0F 1F 44 00 00: a NOP with a ModRM+SIB memory operand and an 8-bit zero offset. */
+    private void addrNop5() {
+        // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
+        emitByte(0x0F);
+        emitByte(0x1F);
+        emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
+        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
+        emitByte(0); // 8-bits offset (1 byte)
+    }
+
+    /** Emits a 7-byte NOP: 0F 1F 80 followed by a 32-bit zero offset. */
+    private void addrNop7() {
+        // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
+        emitByte(0x0F);
+        emitByte(0x1F);
+        emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
+        emitInt(0); // 32-bits offset (4 bytes)
+    }
+
+    /** Emits an 8-byte NOP: 0F 1F 84 00 followed by a 32-bit zero offset. */
+    private void addrNop8() {
+        // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
+        emitByte(0x0F);
+        emitByte(0x1F);
+        emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
+        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
+        emitInt(0); // 32-bits offset (4 bytes)
+    }
+
+    /** Emits ADDSD (scalar double add) for two FPU registers: F2, optional prefix, 0F 58, register-direct ModRM. */
+    public final void addsd(Register dst, Register src) {
+        assert dst.isFpu() && src.isFpu();
+        // The mandatory F2 prefix is emitted before whatever prefixAndEncode emits.
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x58);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits ADDSD with a memory source: F2, optional prefix, 0F 58, then the operand bytes for {@code src}. */
+    public final void addsd(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF2);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x58);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits ADDSS (scalar single add) for two FPU registers: F3, optional prefix, 0F 58, register-direct ModRM. */
+    public final void addss(Register dst, Register src) {
+        assert dst.isFpu() && src.isFpu();
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x58);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits ADDSS with a memory source: F3, optional prefix, 0F 58, then the operand bytes for {@code src}. */
+    public final void addss(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF3);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x58);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits a 32-bit AND of an immediate into register {@code dst} (opcode 0x81 with ModRM base
+     * 0xE0, or the short-immediate variant selected by {@code emitArith}).
+     */
+    public final void andl(Register dst, int imm32) {
+        prefix(dst);
+        emitArith(0x81, 0xE0, dst, imm32);
+    }
+
+    /** Emits a 32-bit AND of the memory operand {@code src} into register {@code dst} (opcode 0x23). */
+    public final void andl(Register dst, Address src) {
+        prefix(src, dst);
+        emitByte(0x23);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits a 32-bit AND of register {@code src} into register {@code dst} (opcode 0x23, register-direct ModRM). */
+    public final void andl(Register dst, Register src) {
+        // Return value unused: emitArith derives the ModRM register bits itself.
+        prefixAndEncode(dst.encoding, src.encoding);
+        emitArith(0x23, 0xC0, dst, src);
+    }
+
+    /** Emits a 64-bit BSF (bit scan forward) from register {@code src} into {@code dst}: prefix, 0F BC, register-direct ModRM. */
+    public final void bsfq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xBC);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits a 64-bit BSF (bit scan forward) from the memory operand {@code src} into
+     * {@code dst}: prefix, the two-byte opcode 0F BC, then the operand bytes for {@code src}.
+     */
+    public final void bsfq(Register dst, Address src) {
+        prefixq(src, dst);
+        // BSF is a two-byte opcode (0F BC), as in the register-register overload. Without the
+        // 0x0F escape byte the lone 0xBC would be decoded as a different instruction.
+        emitByte(0x0F);
+        emitByte(0xBC);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits a 64-bit BSR (bit scan reverse) from register {@code src} into {@code dst}: prefix, 0F BD, register-direct ModRM. */
+    public final void bsrq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xBD);
+        emitByte(0xC0 | encode);
+    }
+
+
+    /**
+     * Emits a 64-bit BSR (bit scan reverse) from the memory operand {@code src} into
+     * {@code dst}: prefix, the two-byte opcode 0F BD, then the operand bytes for {@code src}.
+     */
+    public final void bsrq(Register dst, Address src) {
+        prefixq(src, dst);
+        // BSR is a two-byte opcode (0F BD), as in the register-register overload. Without the
+        // 0x0F escape byte the lone 0xBD would be decoded as a different instruction.
+        emitByte(0x0F);
+        emitByte(0xBD);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits a 32-bit BSR (bit scan reverse) from register {@code src} into {@code dst}: prefix, 0F BD, register-direct ModRM. */
+    public final void bsrl(Register dst, Register src) {
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xBD);
+        emitByte(0xC0 | encode);
+    }
+
+
+    /**
+     * Emits a 32-bit BSR (bit scan reverse) from the memory operand {@code src} into
+     * {@code dst}: prefix, the two-byte opcode 0F BD, then the operand bytes for {@code src}.
+     */
+    public final void bsrl(Register dst, Address src) {
+        prefix(src, dst);
+        // BSR is a two-byte opcode (0F BD), as in the register-register overload. Without the
+        // 0x0F escape byte the lone 0xBD would be decoded as a different instruction.
+        emitByte(0x0F);
+        emitByte(0xBD);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits a 32-bit BSWAP (byte swap) of {@code reg}: prefix, 0F, then 0xC8 combined with the register encoding. */
+    public final void bswapl(Register reg) { // bswap
+        int encode = prefixAndEncode(reg.encoding);
+        emitByte(0x0F);
+        emitByte(0xC8 | encode);
+    }
+
+    /**
+     * Emits a bit test of the memory operand {@code src} against the immediate bit index
+     * {@code imm8}: prefix, 0F BA, operand bytes, immediate byte. {@code rsp} is not an
+     * operand; it only supplies the ModRM reg field (presumably opcode extension /4, BT).
+     * NOTE(review): the prefix comes from {@code prefixq}, whose name suggests a 64-bit
+     * operand size, while the method name suggests a 32-bit test; confirm which is intended.
+     */
+    public final void btli(Address src, int imm8) {
+        prefixq(src);
+        emitByte(0x0F);
+        emitByte(0xBA);
+        emitOperandHelper(rsp, src);
+        emitByte(imm8);
+    }
+
+    /** Emits the single byte 0x99 (CDQ: sign-extends EAX into EDX:EAX). */
+    public final void cdql() {
+        emitByte(0x99);
+    }
+
+    /** Emits a 32-bit conditional move between registers: prefix, 0F, 0x40 combined with {@code cc.value}, register-direct ModRM. */
+    public final void cmovl(ConditionFlag cc, Register dst, Register src) {
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x40 | cc.value);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits a 32-bit conditional move from memory: prefix, 0F, 0x40 combined with {@code cc.value}, operand bytes for {@code src}. */
+    public final void cmovl(ConditionFlag cc, Register dst, Address src) {
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x40 | cc.value);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits an 8-bit compare of the memory operand {@code dst} with {@code imm8} (opcode 0x80).
+     * {@code rdi} is not an operand; it only supplies the ModRM reg field (presumably /7, CMP).
+     */
+    public final void cmpb(Address dst, int imm8) {
+        prefix(dst);
+        emitByte(0x80);
+        emitOperandHelper(rdi, dst);
+        emitByte(imm8);
+    }
+
+    /**
+     * Emits a 32-bit compare of the memory operand {@code dst} with {@code imm32} (opcode 0x81).
+     * {@code rdi} only supplies the ModRM reg field (presumably /7, CMP). Unlike the
+     * register form, the immediate is always emitted as a full 32 bits.
+     */
+    public final void cmpl(Address dst, int imm32) {
+        prefix(dst);
+        emitByte(0x81);
+        emitOperandHelper(rdi, dst);
+        emitInt(imm32);
+    }
+
+    /**
+     * Emits a 32-bit compare of register {@code dst} with an immediate (opcode 0x81 with ModRM
+     * base 0xF8, or the short-immediate variant selected by {@code emitArith}).
+     */
+    public final void cmpl(Register dst, int imm32) {
+        prefix(dst);
+        emitArith(0x81, 0xF8, dst, imm32);
+    }
+
+    /** Emits a 32-bit compare of register {@code dst} with register {@code src} (opcode 0x3B, register-direct ModRM). */
+    public final void cmpl(Register dst, Register src) {
+        // Return value unused: emitArith derives the ModRM register bits itself.
+        prefixAndEncode(dst.encoding, src.encoding);
+        emitArith(0x3B, 0xC0, dst, src);
+    }
+
+    /** Emits a 32-bit compare of register {@code dst} with the memory operand {@code src} (opcode 0x3B). */
+    public final void cmpl(Register dst, Address src) {
+        prefix(src, dst);
+        emitByte(0x3B);
+        emitOperandHelper(dst, src);
+    }
+
+    // The 32-bit cmpxchg compares the value at adr with the contents of rax and, if they are
+    // equal, stores reg into adr; otherwise the value at adr is loaded into rax.
+    // The ZF is set if the compared values were equal, and cleared otherwise.
+    // When bit 1 of Atomics is set, a non-atomic instruction sequence is emitted instead.
+    public final void cmpxchgl(Register reg, Address adr) { // cmpxchg
+        if ((Atomics & 2) != 0) {
+            // caveat: no instructionmark, so this isn't relocatable.
+            // Emit a synthetic, non-atomic, CAS equivalent.
+            // Beware. The synthetic form sets all ICCs, not just ZF.
+            // cmpxchg r,[m] is equivalent to rax = CAS(m, rax, r)
+            cmpl(rax, adr);
+            movl(rax, adr);
+            if (reg != rax) {
+                // Skip the store when the comparison above found the values unequal.
+                Label l = new Label();
+                jcc(ConditionFlag.notEqual, l);
+                movl(adr, reg);
+                bind(l);
+            }
+        } else {
+
+            // Real instruction: prefix, 0F B1, operand bytes.
+            prefix(adr, reg);
+            emitByte(0x0F);
+            emitByte(0xB1);
+            emitOperandHelper(reg, adr);
+        }
+    }
+
+    /** Emits COMISD (ordered scalar double compare) with a memory source: a 0x66 prefix followed by the COMISS encoding. */
+    public final void comisd(Register dst, Address src) {
+        assert dst.isFpu();
+        // NOTE: dbx seems to decode this as comiss even though the
+        // 0x66 is there. Strangely ucomisd comes out correct
+        emitByte(0x66);
+        comiss(dst, src);
+    }
+
+    /** Emits COMISS (ordered scalar single compare) with a memory source: prefix, 0F 2F, operand bytes for {@code src}. */
+    public final void comiss(Register dst, Address src) {
+        assert dst.isFpu();
+
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x2F);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits CVTDQ2PD (packed int32 to packed double) for two FPU registers: F3, optional prefix, 0F E6, register-direct ModRM. */
+    public final void cvtdq2pd(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xE6);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits CVTDQ2PS (packed int32 to packed single) for two FPU registers: optional prefix, 0F 5B, register-direct ModRM. */
+    public final void cvtdq2ps(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5B);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits CVTSD2SS (scalar double to scalar single) for two FPU registers: F2, optional prefix, 0F 5A, register-direct ModRM. */
+    public final void cvtsd2ss(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5A);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits CVTSI2SD (32-bit integer to scalar double): F2, optional prefix, 0F 2A,
+     * register-direct ModRM. Only {@code dst} is asserted to be an FPU register.
+     */
+    public final void cvtsi2sdl(Register dst, Register src) {
+        assert dst.isFpu();
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2A);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits CVTSI2SS (32-bit integer to scalar single): F3, optional prefix, 0F 2A,
+     * register-direct ModRM. Only {@code dst} is asserted to be an FPU register.
+     */
+    public final void cvtsi2ssl(Register dst, Register src) {
+        assert dst.isFpu();
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2A);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits CVTSS2SD (scalar single to scalar double) for two FPU registers: F3, optional prefix, 0F 5A, register-direct ModRM. */
+    public final void cvtss2sd(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5A);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits CVTTSD2SI (truncating scalar double to 32-bit integer): F2, optional prefix,
+     * 0F 2C, register-direct ModRM. Only {@code src} is asserted to be an FPU register.
+     */
+    public final void cvttsd2sil(Register dst, Register src) {
+        assert src.isFpu();
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2C);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits CVTTSS2SI (truncating scalar single to 32-bit integer): F3, optional prefix,
+     * 0F 2C, register-direct ModRM. Only {@code src} is asserted to be an FPU register.
+     */
+    public final void cvttss2sil(Register dst, Register src) {
+        assert src.isFpu();
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2C);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits a 32-bit decrement of the memory operand {@code dst} (opcode 0xFF). {@code rcx} is
+     * not an operand; it only supplies the ModRM reg field (presumably /1, DEC).
+     */
+    public final void decl(Address dst) {
+        // Don't use it directly. Use Macrodecrement() instead.
+        prefix(dst);
+        emitByte(0xFF);
+        emitOperandHelper(rcx, dst);
+    }
+
+    /** Emits DIVSD (scalar double divide) with a memory source: F2, optional prefix, 0F 5E, operand bytes for {@code src}. */
+    public final void divsd(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF2);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x5E);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits DIVSD (scalar double divide) for two FPU registers: F2, optional prefix, 0F 5E, register-direct ModRM. */
+    public final void divsd(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5E);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits DIVSS (scalar single divide) with a memory source: F3, optional prefix, 0F 5E, operand bytes for {@code src}. */
+    public final void divss(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF3);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x5E);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits DIVSS (scalar single divide) for two FPU registers: F3, optional prefix, 0F 5E, register-direct ModRM. */
+    public final void divss(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5E);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits the single byte 0xF4 (HLT). */
+    public final void hlt() {
+        emitByte(0xF4);
+    }
+
+    /** Emits a 32-bit signed divide by register {@code src}: prefix, 0xF7, then 0xF8 combined with the register encoding. */
+    public final void idivl(Register src) {
+        int encode = prefixAndEncode(src.encoding);
+        emitByte(0xF7);
+        emitByte(0xF8 | encode);
+    }
+
+    /** Emits a 32-bit unsigned divide by register {@code src}: prefix, 0xF7, then 0xF0 combined with the register encoding. */
+    public final void divl(Register src) {
+        int encode = prefixAndEncode(src.encoding);
+        emitByte(0xF7);
+        emitByte(0xF0 | encode);
+    }
+
+    /** Emits a 32-bit signed multiply of {@code dst} by {@code src}: prefix, 0F AF, register-direct ModRM. */
+    public final void imull(Register dst, Register src) {
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xAF);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits the three-operand 32-bit signed multiply of {@code src} by the constant
+     * {@code value} into {@code dst}. Opcode 0x6B with a one-byte immediate is used when
+     * {@code value} fits in a signed byte, otherwise opcode 0x69 with a 32-bit immediate.
+     */
+    public final void imull(Register dst, Register src, int value) {
+        final int modRmBits = prefixAndEncode(dst.encoding, src.encoding);
+        final boolean shortImmediate = isByte(value);
+        emitByte(shortImmediate ? 0x6B : 0x69);
+        emitByte(0xC0 | modRmBits);
+        if (shortImmediate) {
+            emitByte(value & 0xFF);
+        } else {
+            emitInt(value);
+        }
+    }
+
+    /**
+     * Emits a 32-bit increment of the memory operand {@code dst} (opcode 0xFF). {@code rax} is
+     * not an operand; it only supplies the ModRM reg field (presumably /0, INC).
+     */
+    public final void incl(Address dst) {
+        // Don't use it directly. Use Macroincrement() instead.
+        prefix(dst);
+        emitByte(0xFF);
+        emitOperandHelper(rax, dst);
+    }
+
+    /**
+     * Emits a conditional jump to the known code position {@code jumpTarget}. The 2-byte
+     * short form (0x70 combined with the condition, 8-bit displacement) is used when the
+     * displacement fits and {@code forceDisp32} is false; otherwise the 6-byte long form
+     * (0F, 0x80 combined with the condition, 32-bit displacement) is emitted. Displacements
+     * are measured from the end of the emitted instruction.
+     */
+    public final void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
+        final int shortSize = 2;
+        final int longSize = 6;
+        final long distance = jumpTarget - codeBuffer.position();
+        if (forceDisp32 || !isByte(distance - shortSize)) {
+            // 0000 1111 1000 tttn #32-bit disp
+            assert isInt(distance - longSize) : "must be 32bit offset (call4)";
+            emitByte(0x0F);
+            emitByte(0x80 | cc.value);
+            emitInt((int) (distance - longSize));
+            return;
+        }
+        // 0111 tttn #8-bit disp
+        emitByte(0x70 | cc.value);
+        emitByte((int) ((distance - shortSize) & 0xFF));
+    }
+
+    /**
+     * Emits a conditional jump to {@code l}. A bound label delegates to the position-based
+     * overload, which may pick the short form. An unbound label records the current position
+     * as a patch site and emits the long form with a zero displacement.
+     */
+    public final void jcc(ConditionFlag cc, Label l) {
+        assert (0 <= cc.value) && (cc.value < 16) : "illegal cc";
+        if (l.isBound()) {
+            jcc(cc, l.position(), false);
+        } else {
+            // Note: could eliminate cond. jumps to this jump if condition
+            // is the same however, seems to be rather unlikely case.
+            // Note: use jccb() if label to be bound is very close to get
+            // an 8-bit displacement
+            l.addPatchAt(codeBuffer.position());
+            emitByte(0x0F);
+            emitByte(0x80 | cc.value);
+            emitInt(0);
+        }
+
+    }
+
+    /**
+     * Emits a conditional jump to {@code l} that always uses the 2-byte short form. For a
+     * bound label the displacement, measured from the end of the instruction, is asserted to
+     * fit in a signed byte. An unbound label records a patch site and emits a zero displacement.
+     */
+    public final void jccb(ConditionFlag cc, Label l) {
+        if (l.isBound()) {
+            int shortSize = 2;
+            int entry = l.position();
+            assert isByte(entry - (codeBuffer.position() + shortSize)) : "Dispacement too large for a short jmp";
+            long disp = entry - codeBuffer.position();
+            // 0111 tttn #8-bit disp
+            emitByte(0x70 | cc.value);
+            emitByte((int) ((disp - shortSize) & 0xFF));
+        } else {
+
+            l.addPatchAt(codeBuffer.position());
+            emitByte(0x70 | cc.value);
+            emitByte(0);
+        }
+    }
+
+    /**
+     * Emits an indirect jump through the memory operand {@code adr} (opcode 0xFF). {@code rsp}
+     * is not an operand; it only supplies the ModRM reg field (presumably /4, JMP).
+     */
+    public final void jmp(Address adr) {
+        prefix(adr);
+        emitByte(0xFF);
+        emitOperandHelper(rsp, adr);
+    }
+
+    /**
+     * Emits an unconditional jump to the known code position {@code jumpTarget}. The 2-byte
+     * short form (0xEB, 8-bit displacement) is used when the displacement fits and
+     * {@code forceDisp32} is false; otherwise the 5-byte long form (0xE9, 32-bit
+     * displacement) is emitted. Displacements are measured from the end of the instruction.
+     */
+    public final void jmp(int jumpTarget, boolean forceDisp32) {
+        final int shortSize = 2;
+        final int longSize = 5;
+        final long distance = jumpTarget - codeBuffer.position();
+        if (forceDisp32 || !isByte(distance - shortSize)) {
+            emitByte(0xE9);
+            emitInt((int) (distance - longSize));
+            return;
+        }
+        emitByte(0xEB);
+        emitByte((int) ((distance - shortSize) & 0xFF));
+    }
+
+    /**
+     * Emits an unconditional jump to {@code l}. A bound label delegates to the position-based
+     * overload; an unbound label records a patch site and emits the long form (0xE9) with a
+     * zero displacement.
+     */
+    @Override
+    public final void jmp(Label l) {
+        if (l.isBound()) {
+            jmp(l.position(), false);
+        } else {
+            // By default, forward jumps are always 32-bit displacements, since
+            // we can't yet know where the label will be bound. If you're sure that
+            // the forward jump will not run beyond 256 bytes, use jmpb to
+            // force an 8-bit displacement.
+
+            l.addPatchAt(codeBuffer.position());
+            emitByte(0xE9);
+            emitInt(0);
+        }
+    }
+
+    /** Emits an indirect jump through register {@code entry}: prefix, 0xFF, then 0xE0 combined with the register encoding. */
+    public final void jmp(Register entry) {
+        int encode = prefixAndEncode(entry.encoding);
+        emitByte(0xFF);
+        emitByte(0xE0 | encode);
+    }
+
+    /**
+     * Emits an unconditional jump to {@code l} that always uses the 2-byte short form (0xEB,
+     * 8-bit displacement). For a bound label the displacement, measured from the end of the
+     * instruction, is asserted to fit in a signed byte. An unbound label records a patch site
+     * and emits a zero displacement.
+     */
+    public final void jmpb(Label l) {
+        if (l.isBound()) {
+            int shortSize = 2;
+            int entry = l.position();
+            // The emitted displacement is entry - (position + shortSize), so that is the value
+            // that must fit in a byte (same check as jccb). Adding shortSize instead would let
+            // displacements of -129 and -130 through and silently truncate them.
+            assert isByte(entry - (codeBuffer.position() + shortSize)) : "Dispacement too large for a short jmp";
+            long offs = entry - codeBuffer.position();
+            emitByte(0xEB);
+            emitByte((int) ((offs - shortSize) & 0xFF));
+        } else {
+
+            l.addPatchAt(codeBuffer.position());
+            emitByte(0xEB);
+            emitByte(0);
+        }
+    }
+
+    /** Emits a 64-bit LEA of the address {@code src} into register {@code dst}: prefix, 0x8D, operand bytes. */
+    public final void leaq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x8D);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits ENTER: opcode 0xC8 followed by the 16-bit operand {@code imm16} and the 8-bit operand {@code imm8}. */
+    public final void enter(int imm16, int imm8) {
+        emitByte(0xC8);
+        emitShort(imm16);
+        emitByte(imm8);
+    }
+
+    /** Emits the single byte 0xC9 (LEAVE). */
+    public final void leave() {
+        emitByte(0xC9);
+    }
+
+    /**
+     * Emits the LOCK prefix byte 0xF0, or the NOP byte 0x90 in its place when bit 0 of
+     * {@code Atomics} is set.
+     */
+    public final void lock() {
+        // Emit either nothing, a NOP, or a NOP: prefix
+        final boolean replaceWithNop = (Atomics & 1) != 0;
+        emitByte(replaceWithNop ? 0x90 : 0xF0);
+    }
+
+    // Emit mfence instruction (the three bytes 0F AE F0)
+    public final void mfence() {
+        emitByte(0x0F);
+        emitByte(0xAE);
+        emitByte(0xF0);
+    }
+
+    /** Register-to-register move; simply delegates to {@link #movq(Register, Register)}. */
+    public final void mov(Register dst, Register src) {
+        movq(dst, src);
+    }
+
+    /**
+     * Emits MOVAPD between two FPU registers: 0x66, an optional REX prefix, 0F 28,
+     * register-direct ModRM. The prefix is selected by hand: {@code Prefix.REXR} when only
+     * {@code dst} has an encoding of 8 or more, {@code Prefix.REXB} when only {@code src}
+     * does, {@code Prefix.REXRB} when both do, and none otherwise.
+     */
+    public final void movapd(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        final boolean dstExtended = dst.encoding >= 8;
+        final boolean srcExtended = src.encoding >= 8;
+        emitByte(0x66);
+        if (dstExtended && srcExtended) {
+            emitByte(Prefix.REXRB);
+        } else if (dstExtended) {
+            emitByte(Prefix.REXR);
+        } else if (srcExtended) {
+            emitByte(Prefix.REXB);
+        }
+        // The extension is carried by the prefix, so 8 is subtracted from an extended
+        // encoding before it is placed in the ModRM byte.
+        final int regField = dstExtended ? dst.encoding - 8 : dst.encoding;
+        final int rmField = srcExtended ? src.encoding - 8 : src.encoding;
+        emitByte(0x0F);
+        emitByte(0x28);
+        emitByte(0xC0 | regField << 3 | rmField);
+    }
+
+    /**
+     * Emits MOVAPS between two FPU registers: an optional REX prefix, 0F 28, register-direct
+     * ModRM. The prefix is selected by hand: {@code Prefix.REXR} when only {@code dst} has an
+     * encoding of 8 or more, {@code Prefix.REXB} when only {@code src} does,
+     * {@code Prefix.REXRB} when both do, and none otherwise.
+     */
+    public final void movaps(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        final boolean dstExtended = dst.encoding >= 8;
+        final boolean srcExtended = src.encoding >= 8;
+        if (dstExtended && srcExtended) {
+            emitByte(Prefix.REXRB);
+        } else if (dstExtended) {
+            emitByte(Prefix.REXR);
+        } else if (srcExtended) {
+            emitByte(Prefix.REXB);
+        }
+        // The extension is carried by the prefix, so 8 is subtracted from an extended
+        // encoding before it is placed in the ModRM byte.
+        final int regField = dstExtended ? dst.encoding - 8 : dst.encoding;
+        final int rmField = srcExtended ? src.encoding - 8 : src.encoding;
+        emitByte(0x0F);
+        emitByte(0x28);
+        emitByte(0xC0 | regField << 3 | rmField);
+    }
+
+    /**
+     * Emits an 8-bit load from the memory operand {@code src} into {@code dst} (opcode 0x8A).
+     * NOTE(review): the trailing "{@code , true)}" remnant suggests a byte-instruction flag
+     * was dropped from the prefix call, and unlike the store form there is no
+     * {@code isByte()} assert on the register; confirm byte-register handling is correct.
+     */
+    public final void movb(Register dst, Address src) {
+        prefix(src, dst); // , true)
+        emitByte(0x8A);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits an 8-bit store of the immediate {@code imm8} to the memory operand {@code dst}
+     * (opcode 0xC6). {@code rax} only supplies the ModRM reg field (presumably /0).
+     */
+    public final void movb(Address dst, int imm8) {
+        prefix(dst);
+        emitByte(0xC6);
+        emitOperandHelper(rax, dst);
+        emitByte(imm8);
+    }
+
+    /** Emits an 8-bit store of register {@code src} to the memory operand {@code dst} (opcode 0x88). */
+    public final void movb(Address dst, Register src) {
+        assert src.isByte() : "must have byte register";
+        prefix(dst, src); // , true)
+        emitByte(0x88);
+        emitOperandHelper(src, dst);
+    }
+
+    /**
+     * Emits a 32-bit move between an FPU register and a non-FPU register, in either
+     * direction: 66, optional prefix, 0F 6E when {@code dst} is the FPU register, or 0F 7E
+     * when {@code src} is.
+     * NOTE(review): when neither register is an FPU register nothing is emitted and no
+     * assertion fires; confirm that callers never rely on that case.
+     */
+    public final void movdl(Register dst, Register src) {
+        if (dst.isFpu()) {
+            assert !src.isFpu() : "does this hold?";
+            emitByte(0x66);
+            int encode = prefixAndEncode(dst.encoding, src.encoding);
+            emitByte(0x0F);
+            emitByte(0x6E);
+            emitByte(0xC0 | encode);
+        } else if (src.isFpu()) {
+            assert !dst.isFpu();
+            emitByte(0x66);
+            // swap src/dst to get correct prefix
+            int encode = prefixAndEncode(src.encoding, dst.encoding);
+            emitByte(0x0F);
+            emitByte(0x7E);
+            emitByte(0xC0 | encode);
+        }
+    }
+
+    /** Emits MOVDQA (aligned 128-bit load) from memory: 66, optional prefix, 0F 6F, operand bytes for {@code src}. */
+    public final void movdqa(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0x66);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x6F);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits MOVDQA between registers: 66, prefix, 0F 6F, register-direct ModRM. Only
+     * {@code dst} is asserted to be an FPU register.
+     * NOTE(review): this uses {@code prefixqAndEncode} while the memory forms use the plain
+     * {@code prefix}; confirm the 64-bit variant is intended here.
+     */
+    public final void movdqa(Register dst, Register src) {
+        assert dst.isFpu();
+        emitByte(0x66);
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x6F);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits MOVDQA (aligned 128-bit store) to memory: 66, optional prefix, 0F 7F, operand bytes for {@code dst}. */
+    public final void movdqa(Address dst, Register src) {
+        assert src.isFpu();
+        emitByte(0x66);
+        prefix(dst, src);
+        emitByte(0x0F);
+        emitByte(0x7F);
+        emitOperandHelper(src, dst);
+    }
+
+    /** Emits MOVDQU (unaligned 128-bit load) from memory: F3, optional prefix, 0F 6F, operand bytes for {@code src}. */
+    public final void movdqu(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF3);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x6F);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits MOVDQU between two FPU registers: F3, prefix, 0F 6F, register-direct ModRM.
+     * NOTE(review): this uses {@code prefixqAndEncode} while the memory forms use the plain
+     * {@code prefix}; confirm the 64-bit variant is intended here.
+     */
+    public final void movdqu(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+
+        emitByte(0xF3);
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x6F);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits MOVDQU (unaligned 128-bit store) to memory: F3, optional prefix, 0F 7F, operand bytes for {@code dst}. */
+    public final void movdqu(Address dst, Register src) {
+        assert src.isFpu();
+
+        emitByte(0xF3);
+        prefix(dst, src);
+        emitByte(0x0F);
+        emitByte(0x7F);
+        emitOperandHelper(src, dst);
+    }
+
+    /** Emits a 32-bit load of the immediate {@code imm32} into {@code dst}: prefix, 0xB8 combined with the register encoding, then the immediate. */
+    public final void movl(Register dst, int imm32) {
+        int encode = prefixAndEncode(dst.encoding);
+        emitByte(0xB8 | encode);
+        emitInt(imm32);
+    }
+
+    /** Emits a 32-bit register-to-register move: prefix, 0x8B, register-direct ModRM. */
+    public final void movl(Register dst, Register src) {
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x8B);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits a 32-bit load from the memory operand {@code src} into {@code dst} (opcode 0x8B). */
+    public final void movl(Register dst, Address src) {
+        prefix(src, dst);
+        emitByte(0x8B);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits a 32-bit store of the immediate {@code imm32} to the memory operand {@code dst}
+     * (opcode 0xC7). {@code rax} only supplies the ModRM reg field (presumably /0).
+     */
+    public final void movl(Address dst, int imm32) {
+        prefix(dst);
+        emitByte(0xC7);
+        emitOperandHelper(rax, dst);
+        emitInt(imm32);
+    }
+
+    /** Emits a 32-bit store of register {@code src} to the memory operand {@code dst} (opcode 0x89). */
+    public final void movl(Address dst, Register src) {
+        prefix(dst, src);
+        emitByte(0x89);
+        emitOperandHelper(src, dst);
+    }
+
+    /**
+     * Emits MOVLPD from memory into an FPU register: 66, optional prefix, 0F 12, operand
+     * bytes for {@code src}.
+     * <p>
+     * New CPUs require use of movsd and movss to avoid partial register stall
+     * when loading from memory. But for old Opteron use movlpd instead of movsd.
+     * The selection is done in {@link AMD64MacroAssembler#movdbl(Register, Address)}
+     * and {@link AMD64MacroAssembler#movflt(Register, Register)}.
+     */
+    public final void movlpd(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0x66);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x12);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits MOVLPD from an FPU register to memory: 66, optional prefix, 0F 13, operand bytes for {@code dst}. */
+    public final void movlpd(Address dst, Register src) {
+        assert src.isFpu();
+        emitByte(0x66);
+        prefix(dst, src);
+        emitByte(0x0F);
+        emitByte(0x13);
+        emitOperandHelper(src, dst);
+    }
+
+    /**
+     * Emits a 64-bit load from the memory operand {@code src}. An FPU destination gets
+     * F3, prefix, 0F 7E; any other destination gets prefix, 0x8B.
+     */
+    public final void movq(Register dst, Address src) {
+        if (dst.isFpu()) {
+            emitByte(0xF3);
+            prefixq(src, dst);
+            emitByte(0x0F);
+            emitByte(0x7E);
+            emitOperandHelper(dst, src);
+        } else {
+            prefixq(src, dst);
+            emitByte(0x8B);
+            emitOperandHelper(dst, src);
+        }
+    }
+
+    /**
+     * Emits a 64-bit register-to-register move: prefix, 0x8B, register-direct ModRM. Unlike
+     * the memory overloads there is no separate path for FPU registers.
+     */
+    public final void movq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x8B);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits a 64-bit store to the memory operand {@code dst}. An FPU source gets
+     * 66, prefix, 0F D6; any other source gets prefix, 0x89.
+     */
+    public final void movq(Address dst, Register src) {
+        if (src.isFpu()) {
+            emitByte(0x66);
+            prefixq(dst, src);
+            emitByte(0x0F);
+            emitByte(0xD6);
+            emitOperandHelper(src, dst);
+        } else {
+            prefixq(dst, src);
+            emitByte(0x89);
+            emitOperandHelper(src, dst);
+        }
+    }
+
+    /** Emits a sign-extending load of a byte from memory into {@code dst}: prefix, 0F BE, operand bytes for {@code src}. */
+    public final void movsxb(Register dst, Address src) { // movsxb
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0xBE);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits a sign-extending move of a byte register into {@code dst}: prefix, 0F BE,
+     * register-direct ModRM. The extra {@code true} argument presumably marks this as a
+     * byte instruction for prefix selection.
+     */
+    public final void movsxb(Register dst, Register src) { // movsxb
+        int encode = prefixAndEncode(dst.encoding, src.encoding, true);
+        emitByte(0x0F);
+        emitByte(0xBE);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits MOVSD (scalar double move) between two FPU registers: F2, optional prefix, 0F 10, register-direct ModRM. */
+    public final void movsd(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x10);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits MOVSD (scalar double load) from memory: F2, optional prefix, 0F 10, operand bytes for {@code src}. */
+    public final void movsd(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF2);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x10);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits MOVSD (scalar double store) to memory: F2, optional prefix, 0F 11, operand bytes for {@code dst}. */
+    public final void movsd(Address dst, Register src) {
+        assert src.isFpu();
+        emitByte(0xF2);
+        prefix(dst, src);
+        emitByte(0x0F);
+        emitByte(0x11);
+        emitOperandHelper(src, dst);
+    }
+
+    /** Emits MOVSS (scalar single move) between two FPU registers: F3, optional prefix, 0F 10, register-direct ModRM. */
+    public final void movss(Register dst, Register src) {
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF3);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x10);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits MOVSS (scalar single load) from memory: F3, optional prefix, 0F 10, operand bytes for {@code src}. */
+    public final void movss(Register dst, Address src) {
+        assert dst.isFpu();
+        emitByte(0xF3);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x10);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits MOVSS (scalar single store) to memory: F3, optional prefix, 0F 11, operand bytes for {@code dst}. */
+    public final void movss(Address dst, Register src) {
+        assert src.isFpu();
+        emitByte(0xF3);
+        prefix(dst, src);
+        emitByte(0x0F);
+        emitByte(0x11);
+        emitOperandHelper(src, dst);
+    }
+
+    /**
+     * Emits a sign-extending load of a 16-bit value from memory into {@code dst}: prefix,
+     * 0F BF, operand bytes. The body is identical to {@code movsxw(Register, Address)}.
+     */
+    public final void movswl(Register dst, Address src) {
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0xBF);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits a sign-extending move of a 16-bit register value into {@code dst}: prefix, 0F BF, register-direct ModRM. */
+    public final void movsxw(Register dst, Register src) { // movsxw
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xBF);
+        emitByte(0xC0 | encode);
+    }
+
+    /** Emits a sign-extending load of a 16-bit value from memory into {@code dst}: prefix, 0F BF, operand bytes for {@code src}. */
+    public final void movsxw(Register dst, Address src) { // movsxw
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0xBF);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits opcode 0x63 with a register-direct ModRM, preceded by the plain (non-64-bit)
+     * prefix.
+     * NOTE(review): 0x63 is the MOVSXD opcode in 64-bit mode; without a 64-bit operand-size
+     * prefix it presumably acts as a 32-bit move that clears the upper half, which would
+     * match the "zx" in the method name. Confirm against the instruction reference.
+     */
+    public final void movzxd(Register dst, Register src) { // movzxd
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x63);
+        emitByte(0xC0 | encode);
+    }
+
+    /**
+     * Emits opcode 0x63 with the memory operand {@code src}, preceded by the plain
+     * (non-64-bit) prefix.
+     * NOTE(review): 0x63 is the MOVSXD opcode in 64-bit mode; without a 64-bit operand-size
+     * prefix it presumably acts as a zero-extending 32-bit load. Confirm.
+     */
+    public final void movzxd(Register dst, Address src) { // movzxd
+        prefix(src, dst);
+        emitByte(0x63);
+        emitOperandHelper(dst, src);
+    }
+
+    /**
+     * Emits a 16-bit store of the immediate {@code imm16} to the memory operand {@code dst}:
+     * 0x66 operand-size prefix, optional prefix, 0xC7, operand bytes, 16-bit immediate.
+     * {@code rax} only supplies the ModRM reg field (presumably /0).
+     */
+    public final void movw(Address dst, int imm16) {
+        emitByte(0x66); // switch to 16-bit mode
+        prefix(dst);
+        emitByte(0xC7);
+        emitOperandHelper(rax, dst);
+        emitShort(imm16);
+    }
+
+    /** Emits a 16-bit load from memory into {@code dst}: 0x66 operand-size prefix, optional prefix, 0x8B, operand bytes. */
+    public final void movw(Register dst, Address src) {
+        emitByte(0x66);
+        prefix(src, dst);
+        emitByte(0x8B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movw(Address dst, Register src) { // MOV m16, r16 (66 89 /r)
+        emitByte(0x66); // operand-size prefix selecting the 16-bit form
+        prefix(dst, src);
+        emitByte(0x89);
+        emitOperandHelper(src, dst);
+    }
+
+    public final void movzxb(Register dst, Address src) { // MOVZX r32, m8 (0F B6 /r)
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0xB6);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movzxb(Register dst, Register src) { // MOVZX r32, r8 (0F B6 /r)
+        int encode = prefixAndEncode(dst.encoding, src.encoding, true); // byteinst: src encodings 4-7 get a REX prefix
+        emitByte(0x0F);
+        emitByte(0xB6);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void movzxl(Register dst, Address src) { // MOVZX r32, m16 (0F B7 /r), i.e. movzxw
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0xB7);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movzxl(Register dst, Register src) { // MOVZX r32, r16 (0F B7 /r), i.e. movzxw
+        int encode = prefixAndEncode(dst.encoding, src.encoding); // emits REX if needed, returns the reg/rm bits
+        emitByte(0x0F);
+        emitByte(0xB7);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void mull(Address src) { // MUL m32 (F7 /4)
+        prefix(src);
+        emitByte(0xF7);
+        emitOperandHelper(rsp, src); // rsp stands in for the /4 opcode extension
+    }
+
+    public final void mulsd(Register dst, Address src) { // MULSD xmm, m64 (F2 0F 59 /r)
+        assert dst.isFpu();
+        emitByte(0xF2); // mandatory prefix, emitted ahead of any REX byte
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x59);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void mulsd(Register dst, Register src) { // MULSD xmm, xmm (F2 0F 59 /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x59);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void mulss(Register dst, Address src) { // MULSS xmm, m32 (F3 0F 59 /r)
+        assert dst.isFpu();
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0xF3);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x59);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void mulss(Register dst, Register src) { // MULSS xmm, xmm (F3 0F 59 /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF3); // mandatory prefix, emitted ahead of any REX byte
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x59);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void negl(Register dst) { // NEG r32 (F7 /3)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xF7);
+        emitByte(0xD8 | encode); // 0xD8 = mod 11, opcode extension /3
+    }
+
+    public final void ensureUniquePC() { // advances the code position by emitting a single nop
+        nop();
+    }
+
+    public final void nop() { // emits one byte of padding via nop(int)
+        nop(1);
+    }
+
+    public void nop(int count) { // emits count bytes of no-op padding; the byte pattern depends on UseNormalNop / UseAddressNop
+        int i = count;
+        if (UseNormalNop) {
+            assert i > 0 : " ";
+            // The fancy nops aren't currently recognized by debuggers, making it a
+            // pain to disassemble code while debugging. If asserts are on, clearly
+            // speed is not an issue, so simply use the single byte traditional nop
+            // to do alignment.
+
+            for (; i > 0; i--) {
+                emitByte(0x90);
+            }
+            return;
+        }
+
+        if (UseAddressNop) {
+            //
+            // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
+            // 1: 0x90
+            // 2: 0x66 0x90
+            // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
+            // 4: 0x0F 0x1F 0x40 0x00
+            // 5: 0x0F 0x1F 0x44 0x00 0x00
+            // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
+            // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
+            // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+            // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+            // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+            // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+
+            // The rest of the coding is AMD specific - use consecutive Address nops
+
+            // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
+            // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
+            // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
+            // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
+            // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
+            // Size prefixes (0x66) are added for larger sizes
+
+            while (i >= 22) { // peel off 11-byte nops until at most 21 bytes remain
+                i -= 11;
+                emitByte(0x66); // size prefix
+                emitByte(0x66); // size prefix
+                emitByte(0x66); // size prefix
+                addrNop8();
+            }
+            // Generate first nop for size between 21-12
+            switch (i) {
+                case 21:
+                    i -= 1;
+                    emitByte(0x66); // size prefix
+                    // fall through
+                case 20:
+                    // fall through
+                case 19:
+                    i -= 1;
+                    emitByte(0x66); // size prefix
+                    // fall through
+                case 18:
+                    // fall through
+                case 17:
+                    i -= 1;
+                    emitByte(0x66); // size prefix
+                    // fall through
+                case 16:
+                    // fall through
+                case 15:
+                    i -= 8;
+                    addrNop8();
+                    break;
+                case 14:
+                case 13:
+                    i -= 7;
+                    addrNop7();
+                    break;
+                case 12:
+                    i -= 6;
+                    emitByte(0x66); // size prefix
+                    addrNop5();
+                    break;
+                default:
+                    assert i < 12;
+            }
+
+            // Generate second nop for size between 11-1
+            switch (i) {
+                case 11:
+                    emitByte(0x66); // size prefix
+                    emitByte(0x66); // size prefix
+                    emitByte(0x66); // size prefix
+                    addrNop8();
+                    break;
+                case 10:
+                    emitByte(0x66); // size prefix
+                    emitByte(0x66); // size prefix
+                    addrNop8();
+                    break;
+                case 9:
+                    emitByte(0x66); // size prefix
+                    addrNop8();
+                    break;
+                case 8:
+                    addrNop8();
+                    break;
+                case 7:
+                    addrNop7();
+                    break;
+                case 6:
+                    emitByte(0x66); // size prefix
+                    addrNop5();
+                    break;
+                case 5:
+                    addrNop5();
+                    break;
+                case 4:
+                    addrNop4();
+                    break;
+                case 3:
+                    // Don't use "0x0F 0x1F 0x00" - need patching safe padding
+                    emitByte(0x66); // size prefix
+                    emitByte(0x66); // size prefix
+                    emitByte(0x90); // nop
+                    break;
+                case 2:
+                    emitByte(0x66); // size prefix
+                    emitByte(0x90); // nop
+                    break;
+                case 1:
+                    emitByte(0x90); // nop
+                    break;
+                default:
+                    assert i == 0;
+            }
+            return;
+        }
+
+        // Using nops with size prefixes "0x66 0x90".
+        // From AMD Optimization Guide:
+        // 1: 0x90
+        // 2: 0x66 0x90
+        // 3: 0x66 0x66 0x90
+        // 4: 0x66 0x66 0x66 0x90
+        // 5: 0x66 0x66 0x90 0x66 0x90
+        // 6: 0x66 0x66 0x90 0x66 0x66 0x90
+        // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
+        // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
+        // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
+        // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
+        //
+        while (i > 12) { // peel off 4-byte nops until at most 12 bytes remain
+            i -= 4;
+            emitByte(0x66); // size prefix
+            emitByte(0x66);
+            emitByte(0x66);
+            emitByte(0x90); // nop
+        }
+        // 1 - 12 nops
+        if (i > 8) {
+            if (i > 9) {
+                i -= 1;
+                emitByte(0x66);
+            }
+            i -= 3;
+            emitByte(0x66);
+            emitByte(0x66);
+            emitByte(0x90);
+        }
+        // 1 - 8 nops
+        if (i > 4) {
+            if (i > 6) {
+                i -= 1;
+                emitByte(0x66);
+            }
+            i -= 3;
+            emitByte(0x66);
+            emitByte(0x66);
+            emitByte(0x90);
+        }
+        switch (i) { // at most 4 bytes remain here: finish with a single prefixed nop
+            case 4:
+                emitByte(0x66);
+                emitByte(0x66);
+                emitByte(0x66);
+                emitByte(0x90);
+                break;
+            case 3:
+                emitByte(0x66);
+                emitByte(0x66);
+                emitByte(0x90);
+                break;
+            case 2:
+                emitByte(0x66);
+                emitByte(0x90);
+                break;
+            case 1:
+                emitByte(0x90);
+                break;
+            default:
+                assert i == 0;
+        }
+    }
+
+    public final void notl(Register dst) { // NOT r32 (F7 /2)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xF7);
+        emitByte(0xD0 | encode); // 0xD0 = mod 11, opcode extension /2
+    }
+
+    public final void orl(Address dst, int imm32) { // OR m32, imm32 (81 /1 id)
+        prefix(dst);
+        emitByte(0x81);
+        emitOperandHelper(rcx, dst); // rcx stands in for the /1 opcode extension
+        emitInt(imm32);
+    }
+
+    public final void orl(Register dst, int imm32) { // OR r32, imm (opcode 81, ModRM base C8 = /1)
+        prefix(dst); // REX.B when dst has an encoding of 8 or above
+        emitArith(0x81, 0xC8, dst, imm32); // emitArith writes the opcode, ModRM and immediate
+    }
+
+    public final void orl(Register dst, Address src) { // OR r32, m32 (0B /r)
+        prefix(src, dst);
+        emitByte(0x0B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void orl(Register dst, Register src) { // OR r32, r32 (0B /r)
+        prefixAndEncode(dst.encoding, src.encoding); // result unused: called only to emit the REX prefix
+        emitArith(0x0B, 0xC0, dst, src);
+    }
+
+    // generic
+    public final void pop(Register dst) {
+        // POP r64 (58+r): the argument is evaluated first, so any REX prefix precedes the opcode byte
+        emitByte(0x58 | prefixAndEncode(dst.encoding));
+    }
+
+    public final void prefetchPrefix(Address src) { // common head of the prefetch encoders below
+        prefix(src); // REX prefix if the address registers need one
+        emitByte(0x0F); // two-byte opcode escape
+    }
+
+    public final void prefetchnta(Address src) { // PREFETCHNTA m8 (0F 18 /0)
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(rax, src); // 0, src
+    }
+
+    public final void prefetchr(Address src) { // PREFETCH m8 (0F 0D /0)
+        prefetchPrefix(src);
+        emitByte(0x0D);
+        emitOperandHelper(rax, src); // 0, src
+    }
+
+    public final void prefetcht0(Address src) { // PREFETCHT0 m8 (0F 18 /1)
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(rcx, src); // 1, src
+
+    }
+
+    public final void prefetcht1(Address src) { // PREFETCHT1 m8 (0F 18 /2)
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(rdx, src); // 2, src
+    }
+
+    public final void prefetcht2(Address src) { // PREFETCHT2 m8 (0F 18 /3)
+        prefetchPrefix(src);
+        emitByte(0x18);
+        emitOperandHelper(rbx, src); // 3, src
+    }
+
+    public final void prefetchw(Address src) { // PREFETCHW m8 (0F 0D /1)
+        prefetchPrefix(src);
+        emitByte(0x0D);
+        emitOperandHelper(rcx, src); // 1, src
+    }
+
+    public final void pshufd(Register dst, Register src, int mode) { // PSHUFD xmm, xmm, imm8 (66 0F 70 /r ib)
+        assert dst.isFpu();
+        assert src.isFpu();
+        assert isUByte(mode) : "invalid value";
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0x66);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x70);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+        emitByte(mode & 0xFF); // shuffle-control immediate
+    }
+
+    public final void pshufd(Register dst, Address src, int mode) { // PSHUFD xmm, m128, imm8 (66 0F 70 /r ib)
+        assert dst.isFpu();
+        assert isUByte(mode) : "invalid value";
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0x66);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x70);
+        emitOperandHelper(dst, src);
+        emitByte(mode & 0xFF); // shuffle-control immediate
+
+    }
+
+    public final void pshuflw(Register dst, Register src, int mode) { // PSHUFLW xmm, xmm, imm8 (F2 0F 70 /r ib)
+        assert dst.isFpu();
+        assert src.isFpu();
+        assert isUByte(mode) : "invalid value";
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0xF2);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x70);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+        emitByte(mode & 0xFF); // shuffle-control immediate
+    }
+
+    public final void pshuflw(Register dst, Address src, int mode) { // PSHUFLW xmm, m128, imm8 (F2 0F 70 /r ib)
+        assert dst.isFpu();
+        assert isUByte(mode) : "invalid value";
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0xF2);
+        prefix(src, dst); // QQ new
+        emitByte(0x0F);
+        emitByte(0x70);
+        emitOperandHelper(dst, src);
+        emitByte(mode & 0xFF); // shuffle-control immediate
+    }
+
+    public final void psrlq(Register dst, int shift) { // PSRLQ xmm, imm8 (66 0F 73 /2 ib)
+        assert dst.isFpu();
+        // HMM Table D-1 says sse2 or mmx
+        // 0x66 must precede REX: a REX byte not directly before the opcode is ignored (breaks xmm8-15).
+        emitByte(0x66);
+        int encode = prefixqAndEncode(xmm2.encoding, dst.encoding); // xmm2 stands in for the /2 opcode extension
+        emitByte(0x0F);
+        emitByte(0x73);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+        emitByte(shift); // shift count immediate
+    }
+
+    public final void punpcklbw(Register dst, Register src) { // PUNPCKLBW xmm, xmm (66 0F 60 /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0x66); // mandatory prefix, emitted ahead of any REX byte
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x60);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void push(int imm32) { // PUSH imm32 (68 id)
+        // in 64bits we push 64bits onto the stack but only
+        // take a 32bit immediate
+        emitByte(0x68);
+        emitInt(imm32);
+    }
+
+    public final void push(Register src) {
+        // PUSH r64 (50+r): the argument is evaluated first, so any REX prefix precedes the opcode byte
+        emitByte(0x50 | prefixAndEncode(src.encoding));
+    }
+
+    public final void pushf() { // PUSHF (9C): pushes the flags register
+        emitByte(0x9C);
+    }
+
+    public final void pxor(Register dst, Address src) { // PXOR xmm, m128 (66 0F EF /r)
+        assert dst.isFpu();
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0x66);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0xEF);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void pxor(Register dst, Register src) { // PXOR xmm, xmm (66 0F EF /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0x66);
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xEF);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+
+    }
+
+    public final void rcll(Register dst, int imm8) {
+        assert isShiftCount(imm8) : "illegal shift count";
+        // RCL r/m32: D1 /2 rotates by one, C1 /2 ib rotates by an immediate count.
+        final boolean byOne = imm8 == 1;
+        final int modRM = 0xD0 | prefixAndEncode(dst.encoding);
+        emitByte(byOne ? 0xD1 : 0xC1);
+        emitByte(modRM);
+        if (!byOne) {
+            // the immediate count is only present in the C1 form
+            emitByte(imm8);
+        }
+    }
+
+    public final void pause() { // PAUSE (F3 90)
+        emitByte(0xF3);
+        emitByte(0x90);
+    }
+
+    // Copies data from [X86.rsi] to [X86.rdi] using X86.rcx heap words.
+    public final void repeatMoveWords() { // REP MOVSQ (F3 REX.W A5)
+        emitByte(0xF3); // REP prefix
+        emitByte(Prefix.REXW);
+        emitByte(0xA5);
+    }
+
+    // Copies data from [X86.rsi] to [X86.rdi] using X86.rcx bytes.
+    public final void repeatMoveBytes() { // REP MOVSB (F3 REX.W A4)
+        emitByte(0xF3); // REP prefix
+        emitByte(Prefix.REXW);
+        emitByte(0xA4);
+    }
+
+    // sets X86.rcx pointer sized words with X86.rax, value at [edi]
+    // generic
+    public final void repSet() { // REP STOSQ (F3 REX.W AB)
+        emitByte(0xF3); // REP prefix
+        // STOSQ
+        emitByte(Prefix.REXW);
+        emitByte(0xAB);
+    }
+
+    // scans X86.rcx pointer sized words at [edi] for an occurrence of X86.rax,
+    // generic
+    public final void repneScan() { // REPNE SCASQ (F2 REX.W AF)
+        emitByte(0xF2); // REPNE prefix
+        // SCASQ
+        emitByte(Prefix.REXW);
+        emitByte(0xAF);
+    }
+
+    // scans X86.rcx 4 byte words at [edi] for an occurrence of X86.rax,
+    // generic
+    public final void repneScanl() { // REPNE SCASD (F2 AF): 32-bit variant of repneScan
+        emitByte(0xF2); // REPNE prefix
+        // SCASL
+        emitByte(0xAF);
+    }
+
+    public final void ret(int imm16) { // RET (C3), or RET imm16 (C2 iw) for a non-zero operand
+        if (imm16 != 0) {
+            emitByte(0xC2);
+            emitShort(imm16);
+            return;
+        }
+        emitByte(0xC3);
+    }
+
+    public final void sarl(Register dst, int imm8) {
+        // SAR r/m32: D1 /7 shifts by one, C1 /7 ib shifts by an immediate count.
+        final int modRM = 0xF8 | prefixAndEncode(dst.encoding);
+        assert isShiftCount(imm8) : "illegal shift count";
+        final boolean byOne = imm8 == 1;
+        emitByte(byOne ? 0xD1 : 0xC1);
+        emitByte(modRM);
+        if (!byOne) {
+            // the immediate count is only present in the C1 form
+            emitByte(imm8);
+        }
+    }
+
+    public final void sarl(Register dst) { // SAR r32, CL (D3 /7)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xD3);
+        emitByte(0xF8 | encode); // 0xF8 = mod 11, opcode extension /7
+    }
+
+    public final void sbbl(Address dst, int imm32) { // SBB m32, imm (opcode 81 /3)
+        prefix(dst);
+        emitArithOperand(0x81, rbx, dst, imm32); // rbx stands in for the /3 opcode extension
+    }
+
+    public final void sbbl(Register dst, int imm32) { // SBB r32, imm (opcode 81, ModRM base D8 = /3)
+        prefix(dst); // REX.B when dst has an encoding of 8 or above
+        emitArith(0x81, 0xD8, dst, imm32);
+    }
+
+    public final void sbbl(Register dst, Address src) { // SBB r32, m32 (1B /r)
+        prefix(src, dst);
+        emitByte(0x1B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void sbbl(Register dst, Register src) { // SBB r32, r32 (1B /r)
+        prefixAndEncode(dst.encoding, src.encoding); // result unused: called only to emit the REX prefix
+        emitArith(0x1B, 0xC0, dst, src);
+    }
+
+    public final void setb(ConditionFlag cc, Register dst) { // SETcc r8 (0F 90+cc)
+        assert 0 <= cc.value && cc.value < 16 : "illegal cc";
+        int encode = prefixAndEncode(dst.encoding, true); // byteinst: encodings 4-7 get a REX prefix
+        emitByte(0x0F);
+        emitByte(0x90 | cc.value); // condition code selects the opcode within 90..9F
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void shll(Register dst, int imm8) {
+        assert isShiftCount(imm8) : "illegal shift count";
+        // SHL r/m32: D1 /4 shifts by one, C1 /4 ib shifts by an immediate count.
+        final boolean byOne = imm8 == 1;
+        final int modRM = 0xE0 | prefixAndEncode(dst.encoding);
+        emitByte(byOne ? 0xD1 : 0xC1);
+        emitByte(modRM);
+        if (!byOne) {
+            // the immediate count is only present in the C1 form
+            emitByte(imm8);
+        }
+    }
+
+    public final void shll(Register dst) { // SHL r32, CL (D3 /4)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xD3);
+        emitByte(0xE0 | encode); // 0xE0 = mod 11, opcode extension /4
+    }
+
+    public final void shrl(Register dst, int imm8) { // SHR r32, imm8 (C1 /5 ib); no D1 short form is used for a count of 1
+        assert isShiftCount(imm8) : "illegal shift count";
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xC1);
+        emitByte(0xE8 | encode); // 0xE8 = mod 11, opcode extension /5
+        emitByte(imm8);
+    }
+
+    public final void shrl(Register dst) { // SHR r32, CL (D3 /5)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xD3);
+        emitByte(0xE8 | encode); // 0xE8 = mod 11, opcode extension /5
+    }
+
+    // copies a single word from [esi] to [edi]
+    public final void smovl() { // MOVSD string move (A5), no REP prefix
+        emitByte(0xA5);
+    }
+
+    public final void sqrtsd(Register dst, Register src) { // SQRTSD xmm, xmm (F2 0F 51 /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        // HMM Table D-1 says sse2
+        // assert is64 || target.supportsSSE();
+        emitByte(0xF2); // mandatory prefix, emitted ahead of any REX byte
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x51);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void subl(Address dst, int imm32) {
+        // SUB m32, imm: 83 /5 ib when isByte(imm32) holds, otherwise 81 /5 id.
+        prefix(dst);
+        final boolean shortForm = isByte(imm32);
+        emitByte(shortForm ? 0x83 : 0x81);
+        emitOperandHelper(rbp, dst); // rbp stands in for the /5 opcode extension
+        if (shortForm) {
+            emitByte(imm32 & 0xFF);
+        } else {
+            emitInt(imm32);
+        }
+    }
+
+    public final void subl(Register dst, int imm32) { // SUB r32, imm (opcode 81, ModRM base E8 = /5)
+        prefix(dst); // REX.B when dst has an encoding of 8 or above
+        emitArith(0x81, 0xE8, dst, imm32);
+    }
+
+    public final void subl(Address dst, Register src) { // SUB m32, r32 (29 /r)
+        prefix(dst, src);
+        emitByte(0x29);
+        emitOperandHelper(src, dst);
+    }
+
+    public final void subl(Register dst, Address src) { // SUB r32, m32 (2B /r)
+        prefix(src, dst);
+        emitByte(0x2B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void subl(Register dst, Register src) { // SUB r32, r32 (2B /r)
+        prefixAndEncode(dst.encoding, src.encoding); // result unused: called only to emit the REX prefix
+        emitArith(0x2B, 0xC0, dst, src);
+    }
+
+    public final void subsd(Register dst, Register src) { // SUBSD xmm, xmm (F2 0F 5C /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF2); // mandatory prefix, emitted ahead of any REX byte
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5C);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void subsd(Register dst, Address src) { // SUBSD xmm, m64 (F2 0F 5C /r)
+        assert dst.isFpu();
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0xF2);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x5C);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void subss(Register dst, Register src) { // SUBSS xmm, xmm (F3 0F 5C /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0xF3); // mandatory prefix, emitted ahead of any REX byte
+        int encode = prefixAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x5C);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void subss(Register dst, Address src) { // SUBSS xmm, m32 (F3 0F 5C /r)
+        assert dst.isFpu();
+        // the mandatory prefix goes first so that any REX byte directly precedes 0F
+        emitByte(0xF3);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x5C);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void testb(Register dst, int imm8) { // TEST r8, imm8 (opcode F6, ModRM base C0 = /0)
+        prefixAndEncode(dst.encoding, true); // byteinst: encodings 4-7 get a REX prefix; result unused
+        emitArithB(0xF6, 0xC0, dst, imm8);
+    }
+
+    public final void testl(Register dst, int imm32) {
+        // not using emitArith because test doesn't support
+        // sign-extension of 8bit operands
+        if (dst.encoding != 0) {
+            // general form: TEST r/m32, imm32 (F7 /0 id)
+            final int rm = prefixAndEncode(dst.encoding);
+            emitByte(0xF7);
+            emitByte(0xC0 | rm);
+        } else {
+            // encoding 0 has a dedicated short form (A9 id) without a ModRM byte
+            emitByte(0xA9);
+        }
+        emitInt(imm32);
+    }
+
+    public final void testl(Register dst, Register src) { // TEST r32, r32 (opcode 85)
+        prefixAndEncode(dst.encoding, src.encoding); // result unused: called only to emit the REX prefix
+        emitArith(0x85, 0xC0, dst, src);
+    }
+
+    public final void testl(Register dst, Address src) { // TEST m32, r32 (85 /r)
+        prefix(src, dst);
+        emitByte(0x85);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void ucomisd(Register dst, Address src) { // UCOMISD xmm, m64 (66 0F 2E /r)
+        assert dst.isFpu();
+        emitByte(0x66); // the 66 prefix turns the ucomiss encoding into the double-precision form
+        ucomiss(dst, src);
+    }
+
+    public final void ucomisd(Register dst, Register src) { // UCOMISD xmm, xmm (66 0F 2E /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        emitByte(0x66); // the 66 prefix turns the ucomiss encoding into the double-precision form
+        ucomiss(dst, src);
+    }
+
+    public final void ucomiss(Register dst, Address src) { // UCOMISS xmm, m32 (0F 2E /r)
+        assert dst.isFpu();
+        // no mandatory prefix here; ucomisd relies on that and emits 0x66 before delegating
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x2E);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void ucomiss(Register dst, Register src) { // UCOMISS xmm, xmm (0F 2E /r)
+        assert dst.isFpu();
+        assert src.isFpu();
+        int encode = prefixAndEncode(dst.encoding, src.encoding); // emits REX if needed, returns the reg/rm bits
+        emitByte(0x0F);
+        emitByte(0x2E);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void xaddl(Address dst, Register src) { // XADD m32, r32 (0F C1 /r)
+        // XADD exchanges with a general purpose register, so an XMM register is never valid here.
+        assert !src.isFpu() : "xaddl requires a general purpose register";
+        prefix(dst, src);
+        emitByte(0x0F);
+        emitByte(0xC1);
+        emitOperandHelper(src, dst);
+    }
+
+    public final void xchgl(Register dst, Address src) { // XCHG m32, r32 (87 /r)
+        prefix(src, dst);
+        emitByte(0x87);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void xchgl(Register dst, Register src) { // XCHG r32, r32 (87 /r)
+        int encode = prefixAndEncode(dst.encoding, src.encoding); // emits REX if needed, returns the reg/rm bits
+        emitByte(0x87);
+        emitByte(0xc0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void xorl(Register dst, int imm32) { // XOR r32, imm (opcode 81, ModRM base F0 = /6)
+        prefix(dst); // REX.B when dst has an encoding of 8 or above
+        emitArith(0x81, 0xF0, dst, imm32);
+    }
+
+    public final void xorl(Register dst, Address src) { // XOR r32, m32 (33 /r)
+        prefix(src, dst);
+        emitByte(0x33);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void xorl(Register dst, Register src) { // XOR r32, r32 (33 /r)
+        prefixAndEncode(dst.encoding, src.encoding); // result unused: called only to emit the REX prefix
+        emitArith(0x33, 0xC0, dst, src);
+    }
+
+    public final void andpd(Register dst, Register src) { // ANDPD xmm, xmm (66 0F 54 /r)
+        emitByte(0x66); // the 66 prefix turns the andps encoding into the packed-double form
+        andps(dst, src);
+    }
+
+    public final void andpd(Register dst, Address src) { // ANDPD xmm, m128 (66 0F 54 /r)
+        emitByte(0x66); // the 66 prefix turns the andps encoding into the packed-double form
+        andps(dst, src);
+    }
+
+    public final void andps(Register dst, Register src) { // ANDPS xmm, xmm (0F 54 /r)
+        assert dst.isFpu() && src.isFpu();
+        int encode = prefixAndEncode(dst.encoding, src.encoding); // emits REX if needed, returns the reg/rm bits
+        emitByte(0x0F);
+        emitByte(0x54);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void andps(Register dst, Address src) { // ANDPS xmm, m128 (0F 54 /r)
+        assert dst.isFpu();
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x54);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void orpd(Register dst, Register src) { // ORPD xmm, xmm (66 0F 56 /r)
+        emitByte(0x66); // the 66 prefix turns the orps encoding into the packed-double form
+        orps(dst, src);
+    }
+
+    public final void orpd(Register dst, Address src) { // ORPD xmm, m128 (66 0F 56 /r)
+        emitByte(0x66); // the 66 prefix turns the orps encoding into the packed-double form
+        orps(dst, src);
+    }
+
+    public final void orps(Register dst, Register src) { // ORPS xmm, xmm (0F 56 /r)
+        assert dst.isFpu() && src.isFpu();
+        int encode = prefixAndEncode(dst.encoding, src.encoding); // emits REX if needed, returns the reg/rm bits
+        emitByte(0x0F);
+        emitByte(0x56);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void orps(Register dst, Address src) { // ORPS xmm, m128 (0F 56 /r)
+        assert dst.isFpu();
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x56);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void xorpd(Register dst, Register src) { // XORPD xmm, xmm (66 0F 57 /r)
+        emitByte(0x66); // the 66 prefix turns the xorps encoding into the packed-double form
+        xorps(dst, src);
+    }
+
+    public final void xorpd(Register dst, Address src) { // XORPD xmm, m128 (66 0F 57 /r)
+        emitByte(0x66); // the 66 prefix turns the xorps encoding into the packed-double form
+        xorps(dst, src);
+    }
+
+    public final void xorps(Register dst, Register src) { // XORPS xmm, xmm (0F 57 /r)
+        assert dst.isFpu() && src.isFpu();
+        int encode = prefixAndEncode(dst.encoding, src.encoding); // emits REX if needed, returns the reg/rm bits
+        emitByte(0x0F);
+        emitByte(0x57);
+        emitByte(0xC0 | encode); // ModRM with mod=11 (register-direct)
+    }
+
+    public final void xorps(Register dst, Address src) { // XORPS xmm, m128 (0F 57 /r)
+        assert dst.isFpu();
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x57);
+        emitOperandHelper(dst, src);
+    }
+
+    // 32bit only pieces of the assembler
+
+    public final void decl(Register dst) { // DEC r32 (FF /1)
+        // Don't use it directly. Use Macrodecrementl() instead.
+        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xFF);
+        emitByte(0xC8 | encode); // 0xC8 = mod 11, opcode extension /1
+    }
+
+    public final void incl(Register dst) { // INC r32 (FF /0)
+        // Don't use it directly. Use Macroincrementl() instead.
+        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
+        int encode = prefixAndEncode(dst.encoding); // emits REX.B if needed, returns the low 3 bits
+        emitByte(0xFF);
+        emitByte(0xC0 | encode); // 0xC0 = mod 11, opcode extension /0
+    }
+
+    int prefixAndEncode(int regEnc) { // non-byte-instruction form of prefixAndEncode(int, boolean)
+        return prefixAndEncode(regEnc, false);
+    }
+
+    int prefixAndEncode(int regEnc, boolean byteinst) {
+        final boolean extended = regEnc >= 8;
+        if (extended) {
+            emitByte(Prefix.REXB);
+        } else if (byteinst && regEnc >= 4) {
+            emitByte(Prefix.REX); // bare REX for a byte instruction on encodings 4-7
+        }
+        return extended ? regEnc - 8 : regEnc;
+    }
+
+    int prefixqAndEncode(int regEnc) {
+        // A REXW-based prefix is always emitted; the B variant is used for encodings 8 and above.
+        final boolean extended = regEnc >= 8;
+        emitByte(extended ? Prefix.REXWB : Prefix.REXW);
+        if (extended) {
+            return regEnc - 8;
+        }
+        return regEnc;
+    }
+
+    int prefixAndEncode(int dstEnc, int srcEnc) { // non-byte-instruction form of the three-argument overload
+        return prefixAndEncode(dstEnc, srcEnc, false);
+    }
+
+    int prefixAndEncode(int dstEncoding, int srcEncoding, boolean byteinst) {
+        // Emits the REX variant implied by which operand encodings are 8 or above and
+        // returns the two low 3-bit encodings packed as (dst << 3) | src.
+        final boolean dstExt = dstEncoding >= 8;
+        final boolean srcExt = srcEncoding >= 8;
+        if (dstExt && srcExt) {
+            emitByte(Prefix.REXRB);
+        } else if (dstExt) {
+            emitByte(Prefix.REXR);
+        } else if (srcExt) {
+            emitByte(Prefix.REXB);
+        } else if (byteinst && srcEncoding >= 4) {
+            // byte instruction with a src encoding of 4-7: a bare REX is emitted
+            emitByte(Prefix.REX);
+        }
+
+        // The bit carried by the prefix is removed from each encoding.
+        final int reg = dstExt ? dstEncoding - 8 : dstEncoding;
+        final int rm = srcExt ? srcEncoding - 8 : srcEncoding;
+        return reg << 3 | rm;
+    }
+
+    /**
+     * Creates prefix and the encoding of the lower 6 bits of the ModRM-Byte. It emits an operand prefix. If the given
+     * operands exceed 3 bits, the 4th bit is encoded in the prefix.
+     *
+     * @param regEncoding the encoding of the register part of the ModRM-Byte
+     * @param rmEncoding the encoding of the r/m part of the ModRM-Byte
+     * @return the lower 6 bits of the ModRM-Byte that should be emitted
+     */
+    private int prefixqAndEncode(int regEncoding, int rmEncoding) {
+        // A REXW-based prefix is emitted in every case; the R and B variants are chosen
+        // when the reg and r/m encodings respectively are 8 or above.
+        final boolean regExt = regEncoding >= 8;
+        final boolean rmExt = rmEncoding >= 8;
+        if (regExt && rmExt) {
+            emitByte(Prefix.REXWRB);
+        } else if (regExt) {
+            emitByte(Prefix.REXWR);
+        } else if (rmExt) {
+            emitByte(Prefix.REXWB);
+        } else {
+            emitByte(Prefix.REXW);
+        }
+
+        // The bit carried by the prefix is removed so that each encoding fits in
+        // three bits of the returned value.
+        final int reg = regExt ? regEncoding - 8 : regEncoding;
+        final int rm = rmExt ? rmEncoding - 8 : rmEncoding;
+        return reg << 3 | rm;
+    }
+
+    private void prefix(Register reg) { // emits REXB when reg has an encoding of 8 or above, otherwise nothing
+        if (reg.encoding >= 8) {
+            emitByte(Prefix.REXB);
+        }
+    }
+
+    private static boolean needsRex(Value value) { // true only for a register value whose encoding is at least MinEncodingNeedsRex
+        return isRegister(value) && asRegister(value).encoding >= MinEncodingNeedsRex;
+    }
+
+
+    private void prefix(Address adr) {
+        // Emits a REX prefix only if the base and/or index of the address need one.
+        final boolean baseExt = needsRex(adr.getBase());
+        final boolean indexExt = needsRex(adr.getIndex());
+        if (baseExt && indexExt) {
+            emitByte(Prefix.REXXB);
+        } else if (baseExt) {
+            emitByte(Prefix.REXB);
+        } else if (indexExt) {
+            emitByte(Prefix.REXX);
+        }
+        // neither register needs a REX bit: nothing is emitted
+    }
+
+    private void prefixq(Address adr) {
+        // A REXW-based prefix is always emitted; the X and B variants are selected when
+        // the index and base of the address respectively need a REX bit (see needsRex).
+        final boolean baseExt = needsRex(adr.getBase());
+        final boolean indexExt = needsRex(adr.getIndex());
+        if (baseExt && indexExt) {
+            emitByte(Prefix.REXWXB);
+        } else if (baseExt) {
+            emitByte(Prefix.REXWB);
+        } else if (indexExt) {
+            emitByte(Prefix.REXWX);
+        } else {
+            emitByte(Prefix.REXW);
+        }
+    }
+
+    private void prefix(Address adr, Register reg) {
+        // Selects the REX variant from three independent facts: whether reg has an
+        // encoding of 8 or above (R), and whether the index (X) and base (B) of the
+        // address need a REX bit according to needsRex.
+        final boolean regExt = reg.encoding >= 8;
+        final boolean baseExt = needsRex(adr.getBase());
+        final boolean indexExt = needsRex(adr.getIndex());
+
+        if (regExt) {
+            // reg contributes the R bit, so a prefix is emitted in every sub-case
+            if (baseExt) {
+                emitByte(indexExt ? Prefix.REXRXB : Prefix.REXRB);
+            } else {
+                emitByte(indexExt ? Prefix.REXRX : Prefix.REXR);
+            }
+            return;
+        }
+
+        if (baseExt && indexExt) {
+            emitByte(Prefix.REXXB);
+        } else if (baseExt) {
+            emitByte(Prefix.REXB);
+        } else if (indexExt) {
+            emitByte(Prefix.REXX);
+        } else if (reg.encoding >= 4) {
+            // No extension bit is needed, but encodings 4-7 still get a bare REX
+            // here regardless of operand size (this overload has no byteinst
+            // flag).
+            emitByte(Prefix.REX);
+        }
+    }
+
+    private void prefixq(Address adr, Register src) {
+        // Always emits a REXW-based prefix. The variant is selected from three
+        // independent facts: whether src has an encoding of 8 or above (R), and
+        // whether the index (X) and base (B) of the address need a REX bit
+        // according to needsRex.
+        final boolean srcExt = src.encoding >= 8;
+        final boolean baseExt = needsRex(adr.getBase());
+        final boolean indexExt = needsRex(adr.getIndex());
+
+        if (srcExt) {
+            // variants that include the R bit for the register operand
+            if (baseExt) {
+                emitByte(indexExt ? Prefix.REXWRXB : Prefix.REXWRB);
+            } else {
+                emitByte(indexExt ? Prefix.REXWRX : Prefix.REXWR);
+            }
+            return;
+        }
+
+        // src encoding is below 8: only the address registers matter
+        if (baseExt && indexExt) {
+            emitByte(Prefix.REXWXB);
+        } else if (baseExt) {
+            emitByte(Prefix.REXWB);
+        } else if (indexExt) {
+            emitByte(Prefix.REXWX);
+        } else {
+            // no extension bits at all: the plain W prefix
+            emitByte(Prefix.REXW);
+        }
+    }
+
+    public final void addq(Address dst, int imm32) {
+        prefixq(dst);
+        emitArithOperand(0x81, rax, dst, imm32);
+    }
+
+    public final void addq(Address dst, Register src) {
+        prefixq(dst, src);
+        emitByte(0x01);
+        emitOperandHelper(src, dst);
+    }
+
+    public final void addq(Register dst, int imm32) {
+        prefixqAndEncode(dst.encoding);
+        emitArith(0x81, 0xC0, dst, imm32);
+    }
+
+    public final void addq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x03);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void addq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x03, 0xC0, dst, src);
+    }
+
+    public final void andq(Register dst, int imm32) {
+        prefixqAndEncode(dst.encoding);
+        emitArith(0x81, 0xE0, dst, imm32);
+    }
+
+    public final void andq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x23);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void andq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x23, 0xC0, dst, src);
+    }
+
+    public final void bswapq(Register reg) {
+        int encode = prefixqAndEncode(reg.encoding);
+        emitByte(0x0F);
+        emitByte(0xC8 | encode);
+    }
+
+    public final void cdqq() {
+        emitByte(Prefix.REXW);
+        emitByte(0x99);
+    }
+
+    public final void cmovq(ConditionFlag cc, Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x40 | cc.value);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void cmovq(ConditionFlag cc, Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x0F);
+        emitByte(0x40 | cc.value);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void cmpq(Address dst, int imm32) {
+        prefixq(dst);
+        emitByte(0x81);
+        emitOperandHelper(rdi, dst);
+        emitInt(imm32);
+    }
+
+    public final void cmpq(Register dst, int imm32) {
+        prefixqAndEncode(dst.encoding);
+        emitArith(0x81, 0xF8, dst, imm32);
+    }
+
+    public final void cmpq(Address dst, Register src) {
+        prefixq(dst, src);
+        emitByte(0x39); // CMP r/m64, r64; 0x3B is the reversed form already used by cmpq(Register, Address)
+        emitOperandHelper(src, dst);
+    }
+
+    public final void cmpq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x3B, 0xC0, dst, src);
+    }
+
+    public final void cmpq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x3B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void cmpxchgq(Register reg, Address adr) {
+        prefixq(adr, reg);
+        emitByte(0x0F);
+        emitByte(0xB1);
+        emitOperandHelper(reg, adr);
+    }
+
+    public final void cvtsi2sdq(Register dst, Register src) {
+        assert dst.isFpu();
+        emitByte(0xF2);
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2A);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void cvtsi2ssq(Register dst, Register src) {
+        assert dst.isFpu();
+        emitByte(0xF3);
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2A);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void cvttsd2siq(Register dst, Register src) {
+        assert src.isFpu();
+        emitByte(0xF2);
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2C);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void cvttss2siq(Register dst, Register src) {
+        assert src.isFpu();
+        emitByte(0xF3);
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0x2C);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void decq(Register dst) {
+        // Don't use it directly. Use AMD64MacroAssembler.decrementq() instead.
+        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xFF);
+        emitByte(0xC8 | encode);
+    }
+
+    public final void decq(Address dst) {
+        // Don't use it directly. Use Macrodecrementq() instead.
+        prefixq(dst);
+        emitByte(0xFF);
+        emitOperandHelper(rcx, dst);
+    }
+
+    public final void divq(Register src) {
+        int encode = prefixqAndEncode(src.encoding);
+        emitByte(0xF7);
+        emitByte(0xF0 | encode);
+    }
+
+    public final void idivq(Register src) {
+        int encode = prefixqAndEncode(src.encoding);
+        emitByte(0xF7);
+        emitByte(0xF8 | encode);
+    }
+
+    public final void imulq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xAF);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void imulq(Register dst, Register src, int value) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        if (isByte(value)) {
+            emitByte(0x6B); // short form: one-byte immediate
+            emitByte(0xC0 | encode);
+            emitByte(value & 0xFF); // mask to the low byte, as subq(Address, int) does for its imm8
+        } else {
+            emitByte(0x69); // long form: four-byte immediate
+            emitByte(0xC0 | encode);
+            emitInt(value);
+        }
+    }
+
+    public final void incq(Register dst) {
+        // Don't use it directly. Use AMD64MacroAssembler.incrementq() instead.
+        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xFF);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void incq(Address dst) {
+        // Don't use it directly. Use Macroincrementq() instead.
+        prefixq(dst);
+        emitByte(0xFF);
+        emitOperandHelper(rax, dst);
+    }
+
+    public final void movq(Register dst, long imm64) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xB8 | encode);
+        emitLong(imm64);
+    }
+
+    public final void movdq(Register dst, Register src) {
+        // Reject unsupported operands before anything is emitted: one side must be an FPU register.
+        if (!dst.isFpu() && !src.isFpu()) {
+            throw new InternalError("should not reach here");
+        }
+
+        // table D-1 says MMX/SSE2
+        emitByte(0x66);
+
+        if (dst.isFpu()) {
+            int encode = prefixqAndEncode(dst.encoding, src.encoding);
+            emitByte(0x0F);
+            emitByte(0x6E);
+            emitByte(0xC0 | encode);
+        } else {
+            // src is the FPU register here: swap src/dst to get correct prefix
+            int encode = prefixqAndEncode(src.encoding, dst.encoding);
+            emitByte(0x0F);
+            emitByte(0x7E);
+            emitByte(0xC0 | encode);
+        }
+    }
+
+    public final void movsbq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x0F);
+        emitByte(0xBE);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movsbq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xBE);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void movslq(Register dst, int imm32) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xC7 | encode); // NOTE(review): the low three bits of 0xC7 are already set, so if encode is 0..7 this always emits 0xC7 and dst is not encoded - confirm the intended bytes
+        emitInt(imm32);
+        // dbx shows movslq(X86.rcx, 3) as movq $0x0000000049000000,(%X86.rbx)
+        // and movslq(X86.r8, 3); as movl $0x0000000048000000,(%X86.rbx)
+        // as a result we shouldn't use until tested at runtime...
+        throw new InternalError("untested");
+    }
+
+    public final void movslq(Address dst, int imm32) {
+        prefixq(dst);
+        emitByte(0xC7);
+        emitOperandHelper(rax, dst);
+        emitInt(imm32);
+    }
+
+    public final void movslq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x63);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movslq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x63);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void movswq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x0F);
+        emitByte(0xBF);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movswq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xBF);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void movzbq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x0F);
+        emitByte(0xB6);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movzbq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xB6);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void movzwq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x0F);
+        emitByte(0xB7);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void movzwq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x0F);
+        emitByte(0xB7);
+        emitByte(0xC0 | encode);
+    }
+
+    public final void negq(Register dst) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xF7);
+        emitByte(0xD8 | encode);
+    }
+
+    public final void notq(Register dst) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xF7);
+        emitByte(0xD0 | encode);
+    }
+
+    public final void orq(Address dst, int imm32) {
+        prefixq(dst);
+        emitByte(0x81);
+        emitOperandHelper(rcx, dst);
+        emitInt(imm32);
+    }
+
+    public final void orq(Register dst, int imm32) {
+        prefixqAndEncode(dst.encoding);
+        emitArith(0x81, 0xC8, dst, imm32);
+    }
+
+    public final void orq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x0B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void orq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x0B, 0xC0, dst, src);
+    }
+
+    public final void popq(Address dst) {
+        prefixq(dst);
+        emitByte(0x8F);
+        emitOperandHelper(rax, dst);
+    }
+
+    public final void pushq(Address src) {
+        prefixq(src);
+        emitByte(0xFF);
+        emitOperandHelper(rsi, src);
+    }
+
+    public final void rclq(Register dst, int imm8) {
+        assert isShiftCount(imm8 >> 1) : "illegal shift count";
+        int encode = prefixqAndEncode(dst.encoding);
+        if (imm8 == 1) {
+            emitByte(0xD1);
+            emitByte(0xD0 | encode);
+        } else {
+            emitByte(0xC1);
+            emitByte(0xD0 | encode);
+            emitByte(imm8);
+        }
+    }
+
+    public final void sarq(Register dst, int imm8) {
+        assert isShiftCount(imm8 >> 1) : "illegal shift count";
+        int encode = prefixqAndEncode(dst.encoding);
+        if (imm8 == 1) {
+            emitByte(0xD1);
+            emitByte(0xF8 | encode);
+        } else {
+            emitByte(0xC1);
+            emitByte(0xF8 | encode);
+            emitByte(imm8);
+        }
+    }
+
+    public final void sarq(Register dst) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xD3);
+        emitByte(0xF8 | encode);
+    }
+
+    public final void shlq(Register dst, int imm8) {
+        assert isShiftCount(imm8 >> 1) : "illegal shift count";
+        int encode = prefixqAndEncode(dst.encoding);
+        if (imm8 == 1) {
+            emitByte(0xD1);
+            emitByte(0xE0 | encode);
+        } else {
+            emitByte(0xC1);
+            emitByte(0xE0 | encode);
+            emitByte(imm8);
+        }
+    }
+
+    public final void shlq(Register dst) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xD3);
+        emitByte(0xE0 | encode);
+    }
+
+    public final void shrq(Register dst, int imm8) {
+        assert isShiftCount(imm8 >> 1) : "illegal shift count";
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xC1);
+        emitByte(0xE8 | encode);
+        emitByte(imm8);
+    }
+
+    public final void shrq(Register dst) {
+        int encode = prefixqAndEncode(dst.encoding);
+        emitByte(0xD3);
+        emitByte(0xE8 | encode);
+    }
+
+    public final void sqrtsd(Register dst, Address src) {
+        assert dst.isFpu();
+
+        emitByte(0xF2);
+        prefix(src, dst);
+        emitByte(0x0F);
+        emitByte(0x51);
+        emitOperandHelper(dst, src);
+    }
+
+    /** Emits opcode 0x83 with a one-byte immediate when {@code isByte(imm32)}, else 0x81 with a four-byte immediate. */
+    public final void subq(Address dst, int imm32) {
+        prefixq(dst);
+        final boolean fitsInByte = isByte(imm32);
+        emitByte(fitsInByte ? 0x83 : 0x81);
+        emitOperandHelper(rbp, dst);
+        if (fitsInByte) {
+            emitByte(imm32 & 0xFF);
+        } else {
+            emitInt(imm32);
+        }
+    }
+
+    public final void subq(Register dst, int imm32) {
+        prefixqAndEncode(dst.encoding);
+        emitArith(0x81, 0xE8, dst, imm32);
+    }
+
+    public final void subq(Address dst, Register src) {
+        prefixq(dst, src);
+        emitByte(0x29);
+        emitOperandHelper(src, dst);
+    }
+
+    public final void subq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x2B);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void subq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x2B, 0xC0, dst, src);
+    }
+
+    public final void testq(Register dst, int imm32) {
+        // not using emitArith because test
+        // doesn't support sign-extension of
+        // 8bit operands
+        int encode = dst.encoding;
+        if (encode == 0) {
+            emitByte(Prefix.REXW);
+            emitByte(0xA9);
+        } else {
+            encode = prefixqAndEncode(encode);
+            emitByte(0xF7);
+            emitByte(0xC0 | encode);
+        }
+        emitInt(imm32);
+    }
+
+    public final void testq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x85, 0xC0, dst, src);
+    }
+
+    public final void xaddq(Address dst, Register src) {
+        prefixq(dst, src);
+        emitByte(0x0F);
+        emitByte(0xC1);
+        emitOperandHelper(src, dst);
+    }
+
+    public final void xchgq(Register dst, Address src) {
+        prefixq(src, dst);
+        emitByte(0x87);
+        emitOperandHelper(dst, src);
+    }
+
+    public final void xchgq(Register dst, Register src) {
+        int encode = prefixqAndEncode(dst.encoding, src.encoding);
+        emitByte(0x87);
+        emitByte(0xc0 | encode);
+    }
+
+    public final void xorq(Register dst, int imm32) {
+        prefixqAndEncode(dst.encoding);
+        emitArith(0x81, 0xF0, dst, imm32);
+    }
+
+    public final void xorq(Register dst, Register src) {
+        prefixqAndEncode(dst.encoding, src.encoding);
+        emitArith(0x33, 0xC0, dst, src);
+    }
+
+    public final void xorq(Register dst, Address src) {
+
+        prefixq(src, dst);
+        emitByte(0x33);
+        emitOperandHelper(dst, src);
+
+    }
+
+    public final void membar(int barriers) {
+        // Only StoreLoad needs an explicit barrier, and only when target.isMP is set.
+        if (!target.isMP || (barriers & STORE_LOAD) == 0) {
+            return;
+        }
+        // All usable chips support "locked" instructions which suffice
+        // as barriers, and are much faster than the alternative of
+        // using cpuid instruction. We use here a locked add [rsp],0.
+        // This is conveniently otherwise a no-op except for blowing
+        // flags.
+        // Any change to this code may need to revisit other places in
+        // the code where this idiom is used, in particular the
+        // orderAccess code.
+        lock();
+        // Assert the lock# signal here
+        addl(new Address(Word, RSP, 0), 0);
+    }
+
+    @Override
+    protected final void patchJumpTarget(int branch, int branchTarget) {
+        int op = codeBuffer.getByte(branch);
+        assert op == 0xE8 // call
+            || op == 0x00 // jump table entry
+            || op == 0xE9 // jmp
+            || op == 0xEB // short jmp
+            || (op & 0xF0) == 0x70 // short jcc
+            || op == 0x0F && (codeBuffer.getByte(branch + 1) & 0xF0) == 0x80 // jcc
+        : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
+
+        if (op == 0x00) {
+            int offsetToJumpTableBase = codeBuffer.getShort(branch + 1);
+            int jumpTableBase = branch - offsetToJumpTableBase;
+            int imm32 = branchTarget - jumpTableBase;
+            codeBuffer.emitInt(imm32, branch);
+        } else if (op == 0xEB || (op & 0xF0) == 0x70) {
+            // short offset operators (jmp and jcc): the displacement is a single byte relative to
+            // the end of the two-byte instruction, so an out-of-range target must not be truncated
+            int imm8 = branchTarget - (branch + 2);
+            assert isByte(imm8) : "short branch displacement out of range: branch=" + branch + ", branchTarget=" + branchTarget;
+            codeBuffer.emitByte(imm8, branch + 1);
+        } else {
+            // the displacement follows a one-byte opcode, or a two-byte one when it starts with 0x0F
+            int off = 1;
+            if (op == 0x0F) {
+                off = 2;
+            }
+            // the 32-bit displacement is relative to the end of the instruction
+            int imm32 = branchTarget - (branch + 4 + off);
+            codeBuffer.emitInt(imm32, branch + off);
+        }
+    }
+
+    public void nullCheck(Register r) {
+        testl(AMD64.rax, new Address(Word, r.asValue(Word), 0));
+    }
+
+    @Override
+    public void align(int modulus) {
+        if (codeBuffer.position() % modulus != 0) {
+            nop(modulus - (codeBuffer.position() % modulus));
+        }
+    }
+
+    public void pushfq() {
+        emitByte(0x9c);
+    }
+
+    public void popfq() {
+        emitByte(0x9D);
+    }
+
+    /**
+     * Makes sure that a subsequent {@linkplain #call} does not fail the alignment check.
+     */
+    public final void alignForPatchableDirectCall() {
+        int dispStart = codeBuffer.position() + 1;
+        int mask = target.wordSize - 1;
+        if ((dispStart & ~mask) != ((dispStart + 3) & ~mask)) {
+            nop(target.wordSize - (dispStart & mask));
+            assert ((codeBuffer.position() + 1) & mask) == 0;
+        }
+    }
+
+    /**
+     * Emits a direct call instruction. Note that the actual call target is not specified, because all calls
+     * need patching anyway. Therefore, 0 is emitted as the call target, and the user is responsible
+     * to add the call address to the appropriate patching tables.
+     */
+    public final void call() {
+        emitByte(0xE8);
+        emitInt(0);
+    }
+
+    public final void call(Register src) {
+        int encode = prefixAndEncode(src.encoding);
+        emitByte(0xFF);
+        emitByte(0xD0 | encode);
+    }
+
+    public void int3() {
+        emitByte(0xCC);
+    }
+
+    public void enter(short imm16, byte imm8) {
+        emitByte(0xC8);
+        // operands: low byte of imm16, high byte of imm16, then imm8
+        emitByte(imm16 & 0xff);
+        emitByte((imm16 >> 8) & 0xff);
+        emitByte(imm8 & 0xff); // mask like the imm16 bytes so a negative byte is not sign-extended
+    }
+
+    private void emitx87(int b1, int b2, int i) {
+        assert 0 <= i && i < 8 : "illegal stack offset";
+        emitByte(b1);
+        emitByte(b2 + i);
+    }
+
+    public void fld(Address src) {
+        emitByte(0xDD);
+        emitOperandHelper(rax, src);
+    }
+
+    public void fld(int i) {
+        emitx87(0xD9, 0xC0, i);
+    }
+
+    public void fldln2() {
+        emitByte(0xD9);
+        emitByte(0xED);
+    }
+
+    public void fldlg2() {
+        emitByte(0xD9);
+        emitByte(0xEC);
+    }
+
+    public void fyl2x() {
+        emitByte(0xD9);
+        emitByte(0xF1);
+    }
+
+    public void fstp(Address src) {
+        emitByte(0xDD);
+        emitOperandHelper(rbx, src);
+    }
+
+    public void fsin() {
+        emitByte(0xD9);
+        emitByte(0xFE);
+    }
+
+    public void fcos() {
+        emitByte(0xD9);
+        emitByte(0xFF);
+    }
+
+    public void fptan() {
+        emitByte(0xD9);
+        emitByte(0xF2);
+    }
+
+    public void fstp(int i) {
+        emitx87(0xDD, 0xD8, i);
+    }
+
+    @Override
+    public void bangStack(int disp) {
+        movq(new Address(target.wordKind, AMD64.RSP, -disp), AMD64.rax);
+    }
+}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64MacroAssembler.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/AMD64MacroAssembler.java	Tue Oct 02 22:22:06 2012 +0200
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.max.asm.amd64;
+
+import static com.oracle.max.asm.amd64.AMD64AsmOptions.*;
+
+import com.oracle.graal.api.code.*;
+import com.oracle.graal.api.meta.*;
+import com.oracle.max.asm.*;
+
+/**
+ * This class implements commonly used X86 code patterns.
+ */
+public class AMD64MacroAssembler extends AMD64Assembler {
+
+    public AMD64MacroAssembler(TargetDescription target, RegisterConfig registerConfig) {
+        super(target, registerConfig);
+    }
+
+    public void pushptr(Address src) {
+        pushq(src);
+    }
+
+    public void popptr(Address src) {
+        popq(src);
+    }
+
+    public void xorptr(Register dst, Register src) {
+        xorq(dst, src);
+    }
+
+    public void xorptr(Register dst, Address src) {
+        xorq(dst, src);
+    }
+
+    // 64 bit versions
+
+
+    /**
+     * Emits a decrement of {@code reg} by {@code value} using the {@code q} instructions.
+     * Nothing is emitted for 0; a negative value is delegated to
+     * {@link #incrementq(Register, int)} with its negation; {@code decq} is used for 1 when
+     * {@code UseIncDec} is set, and {@code subq} in all remaining cases.
+     */
+    public void decrementq(Register reg, int value) {
+        if (value == Integer.MIN_VALUE) {
+            // cannot be negated, so it is passed through unchanged
+            subq(reg, value);
+        } else if (value < 0) {
+            incrementq(reg, -value);
+        } else if (value == 1 && UseIncDec) {
+            decq(reg);
+        } else if (value != 0) {
+            subq(reg, value);
+        }
+    }
+
+    /**
+     * Emits an increment of {@code reg} by {@code value} using the {@code q} instructions.
+     * Nothing is emitted for 0; a negative value is delegated to
+     * {@link #decrementq(Register, int)} with its negation; {@code incq} is used for 1 when
+     * {@code UseIncDec} is set, and {@code addq} in all remaining cases.
+     */
+    public void incrementq(Register reg, int value) {
+        if (value == Integer.MIN_VALUE) {
+            // cannot be negated, so it is passed through unchanged
+            addq(reg, value);
+        } else if (value < 0) {
+            decrementq(reg, -value);
+        } else if (value == 1 && UseIncDec) {
+            incq(reg);
+        } else if (value != 0) {
+            addq(reg, value);
+        }
+    }
+
+    // These are mostly for initializing null
+    public void movptr(Address dst, int src) {
+        movslq(dst, src);
+    }
+
+    public final void cmp32(Register src1, int imm) {
+        cmpl(src1, imm);
+    }
+
+    public final void cmp32(Register src1, Address src2) {
+        cmpl(src1, src2);
+    }
+
+    public void cmpsd2int(Register opr1, Register opr2, Register dst, boolean unorderedIsLess) {
+        assert opr1.isFpu() && opr2.isFpu();
+        ucomisd(opr1, opr2);
+
+        Label l = new Label();
+        if (unorderedIsLess) {
+            movl(dst, -1);
+            jcc(AMD64Assembler.ConditionFlag.parity, l);
+            jcc(AMD64Assembler.ConditionFlag.below, l);
+            movl(dst, 0);
+            jcc(AMD64Assembler.ConditionFlag.equal, l);
+            incrementl(dst, 1);
+        } else { // unordered is greater
+            movl(dst, 1);
+            jcc(AMD64Assembler.ConditionFlag.parity, l);
+            jcc(AMD64Assembler.ConditionFlag.above, l);
+            movl(dst, 0);
+            jcc(AMD64Assembler.ConditionFlag.equal, l);
+            decrementl(dst, 1);
+        }
+        bind(l);
+    }
+
+    public void cmpss2int(Register opr1, Register opr2, Register dst, boolean unorderedIsLess) {
+        assert opr1.isFpu();
+        assert opr2.isFpu();
+        ucomiss(opr1, opr2);
+
+        Label l = new Label();
+        if (unorderedIsLess) {
+            movl(dst, -1);
+            jcc(AMD64Assembler.ConditionFlag.parity, l);
+            jcc(AMD64Assembler.ConditionFlag.below, l);
+            movl(dst, 0);
+            jcc(AMD64Assembler.ConditionFlag.equal, l);
+            incrementl(dst, 1);
+        } else { // unordered is greater
+            movl(dst, 1);
+            jcc(AMD64Assembler.ConditionFlag.parity, l);
+            jcc(AMD64Assembler.ConditionFlag.above, l);
+            movl(dst, 0);
+            jcc(AMD64Assembler.ConditionFlag.equal, l);
+            decrementl(dst, 1);
+        }
+        bind(l);
+    }
+
+    public void cmpptr(Register src1, Register src2) {
+        cmpq(src1, src2);
+    }
+
+    public void cmpptr(Register src1, Address src2) {
+        cmpq(src1, src2);
+    }
+
+    public void cmpptr(Register src1, int src2) {
+        cmpq(src1, src2);
+    }
+
+    public void cmpptr(Address src1, int src2) {
+        cmpq(src1, src2);
+    }
+
+    /**
+     * Emits a decrement of {@code reg} by {@code value} using the {@code l} instructions.
+     * Nothing is emitted for 0; a negative value is delegated to
+     * {@link #incrementl(Register, int)} with its negation; {@code decl} is used for 1 when
+     * {@code UseIncDec} is set, and {@code subl} in all remaining cases.
+     */
+    public void decrementl(Register reg, int value) {
+        if (value == Integer.MIN_VALUE) {
+            // cannot be negated, so it is passed through unchanged
+            subl(reg, value);
+        } else if (value < 0) {
+            incrementl(reg, -value);
+        } else if (value == 1 && UseIncDec) {
+            decl(reg);
+        } else if (value != 0) {
+            subl(reg, value);
+        }
+    }
+
+    /**
+     * Emits a decrement of the memory operand {@code dst} by {@code value} using the {@code l}
+     * instructions. Nothing is emitted for 0; a negative value is delegated to
+     * {@link #incrementl(Address, int)} with its negation; {@code decl} is used for 1 when
+     * {@code UseIncDec} is set, and {@code subl} in all remaining cases.
+     */
+    public void decrementl(Address dst, int value) {
+        if (value == Integer.MIN_VALUE) {
+            // cannot be negated, so it is passed through unchanged
+            subl(dst, value);
+        } else if (value < 0) {
+            incrementl(dst, -value);
+        } else if (value == 1 && UseIncDec) {
+            decl(dst);
+        } else if (value != 0) {
+            subl(dst, value);
+        }
+    }
+
+    /**
+     * Emits an increment of {@code reg} by {@code value} using the {@code l} instructions.
+     * Nothing is emitted for 0; a negative value is delegated to
+     * {@link #decrementl(Register, int)} with its negation; {@code incl} is used for 1 when
+     * {@code UseIncDec} is set, and {@code addl} in all remaining cases.
+     */
+    public void incrementl(Register reg, int value) {
+        if (value == Integer.MIN_VALUE) {
+            // cannot be negated, so it is passed through unchanged
+            addl(reg, value);
+        } else if (value < 0) {
+            decrementl(reg, -value);
+        } else if (value == 1 && UseIncDec) {
+            incl(reg);
+        } else if (value != 0) {
+            addl(reg, value);
+        }
+    }
+
+    /**
+     * Emits an increment of the memory operand {@code dst} by {@code value} using the {@code l}
+     * instructions. Nothing is emitted for 0; a negative value is delegated to
+     * {@link #decrementl(Address, int)} with its negation; {@code incl} is used for 1 when
+     * {@code UseIncDec} is set, and {@code addl} in all remaining cases.
+     */
+    public void incrementl(Address dst, int value) {
+        if (value == Integer.MIN_VALUE) {
+            // cannot be negated, so it is passed through unchanged
+            addl(dst, value);
+        } else if (value < 0) {
+            decrementl(dst, -value);
+        } else if (value == 1 && UseIncDec) {
+            incl(dst);
+        } else if (value != 0) {
+            addl(dst, value);
+        }
+    }
+
+    public void signExtendByte(Register reg) {
+        if (reg.isByte()) {
+            movsxb(reg, reg); // movsxb
+        } else {
+            shll(reg, 24);
+            sarl(reg, 24);
+        }
+    }
+
+    public void signExtendShort(Register reg) {
+        movsxw(reg, reg); // movsxw
+    }
+
+    // Support optimal SSE move instructions.
+    public void movflt(Register dst, Register src) {
+        assert dst.isFpu() && src.isFpu();
+        if (UseXmmRegToRegMoveAll) {
+            movaps(dst, src);
+        } else {
+            movss(dst, src);
+        }
+    }
+
+    public void movflt(Register dst, Address src) {
+        assert dst.isFpu();
+        movss(dst, src);
+    }
+
+    public void movflt(Address dst, Register src) {
+        assert src.isFpu();
+        movss(dst, src);
+    }
+
+    public void movdbl(Register dst, Register src) {
+        assert dst.isFpu() && src.isFpu();
+        if (UseXmmRegToRegMoveAll) {
+            movapd(dst, src);
+        } else {
+            movsd(dst, src);
+        }
+    }
+
+    public void movdbl(Register dst, Address src) {
+        assert dst.isFpu();
+        if (UseXmmLoadAndClearUpper) {
+            movsd(dst, src);
+        } else {
+            movlpd(dst, src);
+        }
+    }
+
+    public void movdbl(Address dst, Register src) {
+        assert src.isFpu();
+        movsd(dst, src);
+    }
+
+    /**
+     * Non-atomic write of a 64-bit constant to memory, emitted as two {@code movl} stores (low
+     * half at {@code dst}, high half at displacement + 4). Do not use if the address might be a volatile field!
+     */
+    public void movlong(Address dst, long src) {
+        Address high = new Address(dst.getKind(), dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
+        movl(dst, (int) src); // low 32 bits: the narrowing cast alone truncates, no mask needed
+        movl(high, (int) (src >>> 32)); // high 32 bits
+    }
+
+    public void xchgptr(Register src1, Register src2) {
+        xchgq(src1, src2);
+    }
+
+    public void flog(Register dest, Register value, boolean base10) {
+        assert value.spillSlotSize == dest.spillSlotSize;
+
+        Address tmp = new Address(Kind.Double, AMD64.RSP);
+        if (base10) {
+            fldlg2();
+        } else {
+            fldln2();
+        }
+        subq(AMD64.rsp, value.spillSlotSize);
+        movsd(tmp, value);
+        fld(tmp);
+        fyl2x();
+        fstp(tmp);
+        movsd(dest, tmp);
+        addq(AMD64.rsp, dest.spillSlotSize);
+    }
+
+    public void fsin(Register dest, Register value) {
+        ftrig(dest, value, 's');
+    }
+
+    public void fcos(Register dest, Register value) {
+        ftrig(dest, value, 'c');
+    }
+
+    public void ftan(Register dest, Register value) {
+        ftrig(dest, value, 't');
+    }
+
+    private void ftrig(Register dest, Register value, char op) {
+        assert value.spillSlotSize == dest.spillSlotSize;
+
+        Address tmp = new Address(Kind.Double, AMD64.RSP);
+        subq(AMD64.rsp, value.spillSlotSize);
+        movsd(tmp, value);
+        fld(tmp);
+        if (op == 's') {
+            fsin();
+        } else if (op == 'c') {
+            fcos();
+        } else if (op == 't') {
+            fptan();
+            fstp(0); // fptan pushes 1.0 in addition to the actual result, pop it
+        } else {
+            throw new InternalError("should not reach here");
+        }
+        fstp(tmp);
+        movsd(dest, tmp);
+        addq(AMD64.rsp, dest.spillSlotSize);
+    }
+
+    /**
+     * Emit code to save a given set of callee save registers in the
+     * {@linkplain CalleeSaveLayout CSA} within the frame.
+     *
+     * @param csl the description of the CSA
+     * @param frameToCSA offset from the frame pointer to the CSA
+     */
+    public void save(CalleeSaveLayout csl, int frameToCSA) {
+        final RegisterValue base = frameRegister.asValue();
+        for (Register saved : csl.registers) {
+            movq(new Address(target.wordKind, base, frameToCSA + csl.offsetOf(saved)), saved);
+        }
+    }
+
+    /** Emit code to reload the registers of {@code csl} from the CSA at offset {@code frameToCSA} from the frame pointer. */
+    public void restore(CalleeSaveLayout csl, int frameToCSA) {
+        final RegisterValue base = frameRegister.asValue();
+        for (Register saved : csl.registers) {
+            movq(saved, new Address(target.wordKind, base, frameToCSA + csl.offsetOf(saved)));
+        }
+    }
+}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/X86InstructionDecoder.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.max.asm.amd64/src/com/oracle/max/asm/amd64/X86InstructionDecoder.java	Tue Oct 02 22:22:06 2012 +0200
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package com.oracle.max.asm.amd64;
+
+
+public final class X86InstructionDecoder {
+
+    private boolean targetIs64Bit;
+    private byte[] code;
+    private int currentEndOfInstruction;
+    private int currentDisplacementPosition;
+
+    private static class Prefix {
+
+        // segment override prefix bytes
+        public static final int CSSegment = 0x2e;
+        public static final int SSSegment = 0x36;
+        public static final int DSSegment = 0x3e;
+        public static final int ESSegment = 0x26;
+        public static final int FSSegment = 0x64;
+        public static final int GSSegment = 0x65;
+        public static final int REX = 0x40; // REX prefix bytes, 0x40-0x4F
+        public static final int REXB = 0x41;
+        public static final int REXX = 0x42;
+        public static final int REXXB = 0x43;
+        public static final int REXR = 0x44;
+        public static final int REXRB = 0x45;
+        public static final int REXRX = 0x46;
+        public static final int REXRXB = 0x47;
+        public static final int REXW = 0x48; // REXW* (0x48-0x4F): decodePosition sets is64bit when it sees one of these
+        public static final int REXWB = 0x49;
+        public static final int REXWX = 0x4A;
+        public static final int REXWXB = 0x4B;
+        public static final int REXWR = 0x4C;
+        public static final int REXWRB = 0x4D;
+        public static final int REXWRX = 0x4E;
+        public static final int REXWRXB = 0x4F;
+    }
+
+    private X86InstructionDecoder(byte[] code, boolean targetIs64Bit) {
+        this.code = code; // kept by reference (no copy)
+        this.targetIs64Bit = targetIs64Bit; // within this class only consulted by the prefix asserts in decodePosition
+    }
+
+    public int currentEndOfInstruction() {
+        return currentEndOfInstruction; // index just past the instruction decoded by the last decodePosition call
+    }
+
+    public int currentDisplacementPosition() {
+        return currentDisplacementPosition; // NOTE: decodePosition leaves this unchanged when it finds no 32-bit field
+    }
+
+    public void decodePosition(int inst) {
+
+        assert inst >= 0 && inst < code.length;
+
+        // Decode the instruction starting at code[inst] and record the position of
+        // an embedded 32-bit operand word in currentDisplacementPosition.
+
+        // There is no "which" selector parameter; the position recorded depends on the instruction:
+        // - for a memory operand, it is the position of the disp32, if the addressing mode has one;
+        // - for movl/q r, #imm (0xB8-0xBF) and pushq #32, it is the position of the immediate;
+        // - for call/jmp/jcc rdisp32, it is the position of the relative displacement.
+        // Caller is responsible for ensuring that there is such an operand,
+        // and that it is 32/64 bits wide.
+
+        // In all cases currentEndOfInstruction is set to the index just past the instruction.
+
+        int ip = inst;
+        boolean is64bit = false; // set when one of the Prefix.REXW* prefixes is seen
+
+        boolean hasDisp32 = false;
+        int tailSize = 0; // other random bytes (#32, #16, etc.) at end of insn
+
+        boolean againAfterPrefix = true;
+
+        while (againAfterPrefix) {
+            againAfterPrefix = false;
+            switch (0xFF & code[ip++]) {
+
+                // Prefix bytes: note the prefix, then loop to decode the byte that follows it.
+
+                case Prefix.CSSegment:
+                case Prefix.SSSegment:
+                case Prefix.DSSegment:
+                case Prefix.ESSegment:
+                case Prefix.FSSegment:
+                case Prefix.GSSegment:
+                    // Seems dubious: segment overrides are asserted to be absent on 64-bit targets
+                    assert !targetIs64Bit : "shouldn't have that prefix";
+                    assert ip == inst + 1 : "only one prefix allowed";
+                    againAfterPrefix = true;
+                    break;
+
+                case 0x67:
+                case Prefix.REX:
+                case Prefix.REXB:
+                case Prefix.REXX:
+                case Prefix.REXXB:
+                case Prefix.REXR:
+                case Prefix.REXRB:
+                case Prefix.REXRX:
+                case Prefix.REXRXB:
+                    assert targetIs64Bit : "64bit prefixes";
+                    againAfterPrefix = true;
+                    break;
+
+                case Prefix.REXW:
+                case Prefix.REXWB:
+                case Prefix.REXWX:
+                case Prefix.REXWXB:
+                case Prefix.REXWR:
+                case Prefix.REXWRB:
+                case Prefix.REXWRX:
+                case Prefix.REXWRXB:
+                    assert targetIs64Bit : "64bit prefixes";
+                    is64bit = true;
+                    againAfterPrefix = true;
+                    break;
+
+                case 0xFF: // pushq a; decl a; incl a; call a; jmp a
+                case 0x88: // movb a, r
+                case 0x89: // movl a, r
+                case 0x8A: // movb r, a
+                case 0x8B: // movl r, a
+                case 0x8F: // popl a
+                    hasDisp32 = true;
+                    break;
+
+                case 0x68: // pushq #32
+                    currentEndOfInstruction = ip + 4;
+                    currentDisplacementPosition = ip;
+                    return; // not produced by emitOperand
+
+                case 0x66: // movw ... (size prefix)
+                    boolean againAfterSizePrefix2 = true;
+                    while (againAfterSizePrefix2) {
+                        againAfterSizePrefix2 = false;
+                        switch (0xFF & code[ip++]) {
+                            case Prefix.REX:
+                            case Prefix.REXB:
+                            case Prefix.REXX:
+                            case Prefix.REXXB:
+                            case Prefix.REXR:
+                            case Prefix.REXRB:
+                            case Prefix.REXRX:
+                            case Prefix.REXRXB:
+                            case Prefix.REXW:
+                            case Prefix.REXWB:
+                            case Prefix.REXWX:
+                            case Prefix.REXWXB:
+                            case Prefix.REXWR:
+                            case Prefix.REXWRB:
+                            case Prefix.REXWRX:
+                            case Prefix.REXWRXB:
+                                assert targetIs64Bit : "64bit prefix found";
+                                againAfterSizePrefix2 = true;
+                                break;
+                            case 0x8B: // movw r, a
+                            case 0x89: // movw a, r
+                                hasDisp32 = true;
+                                break;
+                            case 0xC7: // movw a, #16
+                                hasDisp32 = true;
+                                tailSize = 2; // the imm16
+                                break;
+                            case 0x0F: // several SSE/SSE2 variants
+                                ip--; // reparse the 0x0F
+                                againAfterPrefix = true;
+                                break;
+                            default:
+                                throw new InternalError("should not reach here");
+                        }
+                    }
+                    break;
+
+                case 0xB8: // movl/q r, #32/#64(oop?)
+                case 0xB9:
+                case 0xBA:
+                case 0xBB:
+                case 0xBC:
+                case 0xBD:
+                case 0xBE:
+                case 0xBF:
+                    currentEndOfInstruction = ip + (is64bit ? 8 : 4);
+                    currentDisplacementPosition = ip;
+                    return;
+
+                case 0x69: // imul r, a, #32
+                case 0xC7: // movl a, #32(oop?)
+                    tailSize = 4;
+                    hasDisp32 = true; // has both kinds of operands!
+                    break;
+
+                case 0x0F: // movx..., etc.
+                    switch (0xFF & code[ip++]) {
+                        case 0x12: // movlps
+                        case 0x28: // movaps
+                        case 0x2E: // ucomiss
+                        case 0x2F: // comiss
+                        case 0x54: // andps
+                        case 0x55: // andnps
+                        case 0x56: // orps
+                        case 0x57: // xorps
+                        case 0x6E: // movd
+                        case 0x7E: // movd
+                        case 0xAE: // ldmxcsr a
+                            // the 64-bit side says these have both kinds of operands, but that doesn't
+                            // appear to be true
+                            hasDisp32 = true;
+                            break;
+
+                        case 0xAD: // shrd r, a, %cl
+                        case 0xAF: // imul r, a
+                        case 0xBE: // movsbl r, a (movsxb)
+                        case 0xBF: // movswl r, a (movsxw)
+                        case 0xB6: // movzbl r, a (movzxb)
+                        case 0xB7: // movzwl r, a (movzxw)
+                        case 0x40: // cmovl cc, r, a
+                        case 0x41:
+                        case 0x42:
+                        case 0x43:
+                        case 0x44:
+                        case 0x45:
+                        case 0x46:
+                        case 0x47:
+                        case 0x48:
+                        case 0x49:
+                        case 0x4A:
+                        case 0x4B:
+                        case 0x4C:
+                        case 0x4D:
+                        case 0x4E:
+                        case 0x4F:
+                        case 0xB0: // cmpxchgb
+                        case 0xB1: // cmpxchg
+                        case 0xC1: // xaddl
+                        case 0xC7: // cmpxchg8
+                        case 0x90: // setcc a
+                        case 0x91:
+                        case 0x92:
+                        case 0x93:
+                        case 0x94:
+                        case 0x95:
+                        case 0x96:
+                        case 0x97:
+                        case 0x98:
+                        case 0x99:
+                        case 0x9A:
+                        case 0x9B:
+                        case 0x9C:
+                        case 0x9D:
+                        case 0x9E:
+                        case 0x9F:
+                            hasDisp32 = true;
+                            // fall out of the switch to decode the Pointer
+                            break;
+
+                        case 0xAC: // shrd r, a, #8
+                            hasDisp32 = true;
+                            tailSize = 1; // the imm8
+                            break;
+
+                        case 0x80: // jcc rdisp32
+                        case 0x81:
+                        case 0x82:
+                        case 0x83:
+                        case 0x84:
+                        case 0x85:
+                        case 0x86:
+                        case 0x87:
+                        case 0x88:
+                        case 0x89:
+                        case 0x8A:
+                        case 0x8B:
+                        case 0x8C:
+                        case 0x8D:
+                        case 0x8E:
+                        case 0x8F:
+                            currentEndOfInstruction = ip + 4;
+                            currentDisplacementPosition = ip;
+                            return;
+                        default:
+                            throw new InternalError("should not reach here");
+                    }
+                    break;
+
+                case 0x81: // addl a, #32; addl r, #32
+                    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
+                    // on 32bit in the case of cmpl, the imm might be an oop
+                    tailSize = 4;
+                    hasDisp32 = true; // has both kinds of operands!
+                    break;
+
+                case 0x83: // addl a, #8; addl r, #8
+                    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
+                    hasDisp32 = true; // has both kinds of operands!
+                    tailSize = 1;
+                    break;
+
+                case 0x9B:
+                    switch (0xFF & code[ip++]) {
+                        case 0xD9: // fnstcw a
+                            hasDisp32 = true;
+                            break;
+                        default:
+                            throw new InternalError("should not reach here");
+                    }
+                    break;
+
+                case 0x00: // addb a, r; addl a, r; addb r, a; addl r, a
+                case 0x01:
+                case 0x02:
+                case 0x03:
+                case 0x10: // adc...
+                case 0x11:
+                case 0x12:
+                case 0x13:
+                case 0x20: // and...
+                case 0x21:
+                case 0x22:
+                case 0x23:
+                case 0x30: // xor...
+                case 0x31:
+                case 0x32:
+                case 0x33:
+                case 0x08: // or...
+                case 0x09:
+                case 0x0a:
+                case 0x0b:
+                case 0x18: // sbb...
+                case 0x19:
+                case 0x1a:
+                case 0x1b:
+                case 0x28: // sub...
+                case 0x29:
+                case 0x2a:
+                case 0x2b:
+                case 0xF7: // mull a
+                case 0x8D: // lea r, a
+                case 0x87: // xchg r, a
+                case 0x38: // cmp...
+                case 0x39:
+                case 0x3a:
+                case 0x3b:
+                case 0x85: // test r, a
+                    hasDisp32 = true; // has both kinds of operands!
+                    break;
+
+                case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
+                case 0xC6: // movb a, #8
+                case 0x80: // cmpb a, #8
+                case 0x6B: // imul r, a, #8
+                    hasDisp32 = true; // has both kinds of operands!
+                    tailSize = 1; // the imm8
+                    break;
+
+                case 0xE8: // call rdisp32
+                case 0xE9: // jmp rdisp32
+                    currentEndOfInstruction = ip + 4;
+                    currentDisplacementPosition = ip;
+                    return;
+
+                case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
+                case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
+                case 0xD9: // fldS a; fstS a; fstpS a; fldcw a
+                case 0xDD: // fldD a; fstD a; fstpD a
+                case 0xDB: // fildS a; fistpS a; fldX a; fstpX a
+                case 0xDF: // fildD a; fistpD a
+                case 0xD8: // faddS a; fsubrS a; fmulS a; fdivrS a; fcompS a
+                case 0xDC: // faddD a; fsubrD a; fmulD a; fdivrD a; fcompD a
+                case 0xDE: // faddpD a; fsubrpD a; fmulpD a; fdivrpD a; fcomppD a
+                    hasDisp32 = true;
+                    break;
+
+                case 0xF0: // Lock
+                    againAfterPrefix = true;
+                    break;
+
+                case 0xF3: // For SSE
+                case 0xF2: // For SSE2
+                    switch (0xFF & code[ip++]) {
+                        case Prefix.REX:
+                        case Prefix.REXB:
+                        case Prefix.REXX:
+                        case Prefix.REXXB:
+                        case Prefix.REXR:
+                        case Prefix.REXRB:
+                        case Prefix.REXRX:
+                        case Prefix.REXRXB:
+                        case Prefix.REXW:
+                        case Prefix.REXWB:
+                        case Prefix.REXWX:
+                        case Prefix.REXWXB:
+                        case Prefix.REXWR:
+                        case Prefix.REXWRB:
+                        case Prefix.REXWRX:
+                        case Prefix.REXWRXB:
+                            assert targetIs64Bit : "found 64bit prefix";
+                            ip++; // skip the byte after the REX prefix (presumably the 0x0F escape - TODO confirm)
+                            // fall through
+                        default:
+                            ip++; // skip one more byte; the byte after it is parsed below as the operand byte
+                    }
+                    hasDisp32 = true; // has both kinds of operands!
+                    break;
+
+                default:
+                    throw new InternalError("should not reach here");
+            }
+        }
+
+        assert hasDisp32 : "(thomaswue) not sure if this holds: instruction has no disp32 field";
+
+        // parse the output of emitOperand: an addressing-mode byte, optionally followed by a second byte
+        int op2 = 0xFF & code[ip++];
+        int base = op2 & 0x07;
+        int op3 = -1;
+        int b100 = 4; // low three bits == 100: a second addressing byte follows (unless the top two bits are 11)
+        int b101 = 5; // base == 101 with top two bits 00: a disp32 follows
+        if (base == b100 && (op2 >> 6) != 3) {
+            op3 = 0xFF & code[ip++];
+            base = op3 & 0x07; // refetch the base
+        }
+        // now ip points at the disp (if any)
+
+        switch (op2 >> 6) {
+            case 0:
+                // [00 reg 100][ss index base]
+                // [00 reg 100][00 100 esp]
+                // [00 reg base]
+                // [00 reg 100][ss index 101][disp32]
+                // [00 reg 101] [disp32]
+
+                if (base == b101) {
+
+                    currentDisplacementPosition = ip;
+                    ip += 4; // skip the disp32
+                }
+                break;
+
+            case 1:
+                // [01 reg 100][ss index base][disp8]
+                // [01 reg 100][00 100 esp][disp8]
+                // [01 reg base] [disp8]
+                ip += 1; // skip the disp8 (no disp32 here, so currentDisplacementPosition is not updated)
+                break;
+
+            case 2:
+                // [10 reg 100][ss index base][disp32]
+                // [10 reg 100][00 100 esp][disp32]
+                // [10 reg base] [disp32]
+                currentDisplacementPosition = ip;
+                ip += 4; // skip the disp32
+                break;
+
+            case 3:
+                // [11 reg base] (not a memory addressing mode)
+                break;
+        }
+
+        currentEndOfInstruction = ip + tailSize;
+    }
+
+    public static void patchRelativeInstruction(byte[] code, int codePos, int relative) {
+        X86InstructionDecoder decoder = new X86InstructionDecoder(code, true); // always decodes as a 64-bit target
+        decoder.decodePosition(codePos);
+        int patchPos = decoder.currentDisplacementPosition();
+        int endOfInstruction = decoder.currentEndOfInstruction();
+        int offset = relative - endOfInstruction + codePos; // i.e. relative minus the length of the instruction at codePos
+        patchDisp32(code, patchPos, offset);
+    }
+
+    private static void patchDisp32(byte[] code, int pos, int offset) {
+        assert pos + 4 <= code.length;
+
+        assert code[pos] == 0; // the four bytes about to be patched are expected to still be zero
+        assert code[pos + 1] == 0;
+        assert code[pos + 2] == 0;
+        assert code[pos + 3] == 0;
+
+        code[pos] = (byte) (offset & 0xFF); // write offset into code[pos..pos+3], least significant byte first
+        code[pos + 1] = (byte) ((offset >> 8) & 0xFF);
+        code[pos + 2] = (byte) ((offset >> 16) & 0xFF);
+        code[pos + 3] = (byte) ((offset >> 24) & 0xFF);
+    }
+}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm/src/com/oracle/max/asm/AsmOptions.java
--- a/graal/com.oracle.max.asm/src/com/oracle/max/asm/AsmOptions.java	Tue Oct 02 22:06:37 2012 +0200
+++ b/graal/com.oracle.max.asm/src/com/oracle/max/asm/AsmOptions.java	Tue Oct 02 22:22:06 2012 +0200
@@ -24,10 +24,4 @@
 
 public class AsmOptions {
     public static int     InitialCodeBufferSize         = 232;
-    public static int     Atomics                       = 0;
-    public static boolean UseNormalNop                  = true;
-    public static boolean UseAddressNop                 = true;
-    public static boolean UseIncDec                     = false;
-    public static boolean UseXmmLoadAndClearUpper       = true;
-    public static boolean UseXmmRegToRegMoveAll         = false;
 }
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/AMD64.java
--- a/graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/AMD64.java	Tue Oct 02 22:06:37 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-package com.oracle.max.asm.target.amd64;
-
-import static com.oracle.graal.api.code.Register.RegisterFlag.*;
-import static com.oracle.graal.api.meta.Kind.*;
-import static com.oracle.max.criutils.MemoryBarriers.*;
-
-import com.oracle.graal.api.code.*;
-import com.oracle.graal.api.code.Register.*;
-
-/**
- * Represents the AMD64 architecture.
- */
-public class AMD64 extends Architecture {
-
-    // General purpose CPU registers
-    public static final Register rax = new Register(0, 0, 8, "rax", CPU, RegisterFlag.Byte);
-    public static final Register rcx = new Register(1, 1, 8, "rcx", CPU, RegisterFlag.Byte);
-    public static final Register rdx = new Register(2, 2, 8, "rdx", CPU, RegisterFlag.Byte);
-    public static final Register rbx = new Register(3, 3, 8, "rbx", CPU, RegisterFlag.Byte);
-    public static final Register rsp = new Register(4, 4, 8, "rsp", CPU, RegisterFlag.Byte);
-    public static final Register rbp = new Register(5, 5, 8, "rbp", CPU, RegisterFlag.Byte);
-    public static final Register rsi = new Register(6, 6, 8, "rsi", CPU, RegisterFlag.Byte);
-    public static final Register rdi = new Register(7, 7, 8, "rdi", CPU, RegisterFlag.Byte);
-
-    public static final Register r8  = new Register(8,  8,  8, "r8", CPU, RegisterFlag.Byte);
-    public static final Register r9  = new Register(9,  9,  8, "r9", CPU, RegisterFlag.Byte);
-    public static final Register r10 = new Register(10, 10, 8, "r10", CPU, RegisterFlag.Byte);
-    public static final Register r11 = new Register(11, 11, 8, "r11", CPU, RegisterFlag.Byte);
-    public static final Register r12 = new Register(12, 12, 8, "r12", CPU, RegisterFlag.Byte);
-    public static final Register r13 = new Register(13, 13, 8, "r13", CPU, RegisterFlag.Byte);
-    public static final Register r14 = new Register(14, 14, 8, "r14", CPU, RegisterFlag.Byte);
-    public static final Register r15 = new Register(15, 15, 8, "r15", CPU, RegisterFlag.Byte);
-
-    public static final Register[] cpuRegisters = {
-        rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
-        r8, r9, r10, r11, r12, r13, r14, r15
-    };
-
-    // XMM registers
-    public static final Register xmm0 = new Register(16, 0, 8, "xmm0", FPU);
-    public static final Register xmm1 = new Register(17, 1, 8, "xmm1", FPU);
-    public static final Register xmm2 = new Register(18, 2, 8, "xmm2", FPU);
-    public static final Register xmm3 = new Register(19, 3, 8, "xmm3", FPU);
-    public static final Register xmm4 = new Register(20, 4, 8, "xmm4", FPU);
-    public static final Register xmm5 = new Register(21, 5, 8, "xmm5", FPU);
-    public static final Register xmm6 = new Register(22, 6, 8, "xmm6", FPU);
-    public static final Register xmm7 = new Register(23, 7, 8, "xmm7", FPU);
-
-    public static final Register xmm8 =  new Register(24,  8, 8, "xmm8",  FPU);
-    public static final Register xmm9 =  new Register(25,  9, 8, "xmm9",  FPU);
-    public static final Register xmm10 = new Register(26, 10, 8, "xmm10", FPU);
-    public static final Register xmm11 = new Register(27, 11, 8, "xmm11", FPU);
-    public static final Register xmm12 = new Register(28, 12, 8, "xmm12", FPU);
-    public static final Register xmm13 = new Register(29, 13, 8, "xmm13", FPU);
-    public static final Register xmm14 = new Register(30, 14, 8, "xmm14", FPU);
-    public static final Register xmm15 = new Register(31, 15, 8, "xmm15", FPU);
-
-    public static final Register[] xmmRegisters = {
-        xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
-        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
-    };
-
-    public static final Register[] cpuxmmRegisters = {
-        rax,  rcx,  rdx,   rbx,   rsp,   rbp,   rsi,   rdi,
-        r8,   r9,   r10,   r11,   r12,   r13,   r14,   r15,
-        xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
-        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
-    };
-
-    /**
-     * Register used to construct an instruction-relative address.
-     */
-    public static final Register rip = new Register(32, -1, 0, "rip");
-
-    public static final Register[] allRegisters = {
-        rax,  rcx,  rdx,   rbx,   rsp,   rbp,   rsi,   rdi,
-        r8,   r9,   r10,   r11,   r12,   r13,   r14,   r15,
-        xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
-        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
-        rip
-    };
-
-    public static final RegisterValue RSP = rsp.asValue(Long);
-
-    public AMD64() {
-        super("AMD64",
-              8,
-              ByteOrder.LittleEndian,
-              allRegisters,
-              LOAD_STORE | STORE_STORE,
-              1,
-              r15.encoding + 1,
-              8);
-    }
-
-    @Override
-    public boolean isX86() {
-        return true;
-    }
-
-    @Override
-    public boolean twoOperandMode() {
-        return true;
-    }
-
-}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/AMD64Assembler.java
--- a/graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/AMD64Assembler.java	Tue Oct 02 22:06:37 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3033 +0,0 @@
-/*
- * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-package com.oracle.max.asm.target.amd64;
-
-import static com.oracle.graal.api.code.ValueUtil.*;
-import static com.oracle.max.asm.NumUtil.*;
-import static com.oracle.max.asm.target.amd64.AMD64.*;
-import static com.oracle.max.criutils.MemoryBarriers.*;
-
-import com.oracle.graal.api.code.*;
-import com.oracle.graal.api.meta.*;
-import com.oracle.max.asm.*;
-
-/**
- * This class implements an assembler that can encode most X86 instructions.
- */
-public class AMD64Assembler extends AbstractAssembler {
-    /**
-     * The kind for pointers and raw registers.  Since we know we are 64 bit here, we can hardcode it.
-     */
-    private static final Kind Word = Kind.Long;
-
-    private static final int MinEncodingNeedsRex = 8;
-
-    /**
-     * The x86 condition codes used for conditional jumps/moves.
-     */
-    public enum ConditionFlag {
-        zero(0x4, "|zero|"),
-        notZero(0x5, "|nzero|"),
-        equal(0x4, "="),
-        notEqual(0x5, "!="),
-        less(0xc, "<"),
-        lessEqual(0xe, "<="),
-        greater(0xf, ">"),
-        greaterEqual(0xd, ">="),
-        below(0x2, "|<|"),
-        belowEqual(0x6, "|<=|"),
-        above(0x7, "|>|"),
-        aboveEqual(0x3, "|>=|"),
-        overflow(0x0, "|of|"),
-        noOverflow(0x1, "|nof|"),
-        carrySet(0x2, "|carry|"),
-        carryClear(0x3, "|ncarry|"),
-        negative(0x8, "|neg|"),
-        positive(0x9, "|pos|"),
-        parity(0xa, "|par|"),
-        noParity(0xb, "|npar|");
-
-        public final int value;
-        public final String operator;
-
-        private ConditionFlag(int value, String operator) {
-            this.value = value;
-            this.operator = operator;
-        }
-
-        public ConditionFlag negate() {
-            switch(this) {
-                case zero: return notZero;
-                case notZero: return zero;
-                case equal: return notEqual;
-                case notEqual: return equal;
-                case less: return greaterEqual;
-                case lessEqual: return greater;
-                case greater: return lessEqual;
-                case greaterEqual: return less;
-                case below: return aboveEqual;
-                case belowEqual: return above;
-                case above: return belowEqual;
-                case aboveEqual: return below;
-                case overflow: return noOverflow;
-                case noOverflow: return overflow;
-                case carrySet: return carryClear;
-                case carryClear: return carrySet;
-                case negative: return positive;
-                case positive: return negative;
-                case parity: return noParity;
-                case noParity: return parity;
-            }
-            throw new IllegalArgumentException();
-        }
-    }
-
-    /**
-     * Constants for X86 prefix bytes.
-     */
-    private static class Prefix {
-        private static final int REX = 0x40;
-        private static final int REXB = 0x41;
-        private static final int REXX = 0x42;
-        private static final int REXXB = 0x43;
-        private static final int REXR = 0x44;
-        private static final int REXRB = 0x45;
-        private static final int REXRX = 0x46;
-        private static final int REXRXB = 0x47;
-        private static final int REXW = 0x48;
-        private static final int REXWB = 0x49;
-        private static final int REXWX = 0x4A;
-        private static final int REXWXB = 0x4B;
-        private static final int REXWR = 0x4C;
-        private static final int REXWRB = 0x4D;
-        private static final int REXWRX = 0x4E;
-        private static final int REXWRXB = 0x4F;
-    }
-
-    /**
-     * The register to which {@link Register#Frame} and {@link Register#CallerFrame} are bound.
-     */
-    public final Register frameRegister;
-
-    /**
-     * Constructs an assembler for the AMD64 architecture.
-     *
-     * @param registerConfig the register configuration used to bind {@link Register#Frame} and
-     *            {@link Register#CallerFrame} to physical registers. This value can be null if this assembler
-     *            instance will not be used to assemble instructions using these logical registers.
-     */
-    public AMD64Assembler(TargetDescription target, RegisterConfig registerConfig) {
-        super(target);
-        this.frameRegister = registerConfig == null ? null : registerConfig.getFrameRegister();
-    }
-
-    private static int encode(Register r) {
-        assert r.encoding < 16 && r.encoding >= 0 : "encoding out of range: " + r.encoding;
-        return r.encoding & 0x7;
-    }
-
-    private void emitArithB(int op1, int op2, Register dst, int imm8) {
-        assert dst.isByte() : "must have byte register";
-        assert isUByte(op1) && isUByte(op2) : "wrong opcode";
-        assert isUByte(imm8) : "not a byte";
-        assert (op1 & 0x01) == 0 : "should be 8bit operation";
-        emitByte(op1);
-        emitByte(op2 | encode(dst));
-        emitByte(imm8);
-    }
-
-    private void emitArith(int op1, int op2, Register dst, int imm32) {
-        assert isUByte(op1) && isUByte(op2) : "wrong opcode";
-        assert (op1 & 0x01) == 1 : "should be 32bit operation";
-        assert (op1 & 0x02) == 0 : "sign-extension bit should not be set";
-        if (isByte(imm32)) {
-            emitByte(op1 | 0x02); // set sign bit
-            emitByte(op2 | encode(dst));
-            emitByte(imm32 & 0xFF);
-        } else {
-            emitByte(op1);
-            emitByte(op2 | encode(dst));
-            emitInt(imm32);
-        }
-    }
-
-    // immediate-to-memory forms
-    private void emitArithOperand(int op1, Register rm, Address adr, int imm32) {
-        assert (op1 & 0x01) == 1 : "should be 32bit operation";
-        assert (op1 & 0x02) == 0 : "sign-extension bit should not be set";
-        if (isByte(imm32)) {
-            emitByte(op1 | 0x02); // set sign bit
-            emitOperandHelper(rm, adr);
-            emitByte(imm32 & 0xFF);
-        } else {
-            emitByte(op1);
-            emitOperandHelper(rm, adr);
-            emitInt(imm32);
-        }
-    }
-
-    private void emitArith(int op1, int op2, Register dst, Register src) {
-        assert isUByte(op1) && isUByte(op2) : "wrong opcode";
-        emitByte(op1);
-        emitByte(op2 | encode(dst) << 3 | encode(src));
-    }
-
-    private void emitOperandHelper(Register reg, Address addr) {
-        Register base = isLegal(addr.getBase()) ? asRegister(addr.getBase()) : Register.None;
-        Register index = isLegal(addr.getIndex()) ? asRegister(addr.getIndex()) : Register.None;
-
-        Address.Scale scale = addr.getScale();
-        int disp = addr.getDisplacement();
-
-        if (base == Register.Frame) {
-            assert frameRegister != null : "cannot use register " + Register.Frame + " in assembler with null register configuration";
-            base = frameRegister;
-//        } else if (base == Register.CallerFrame) {
-//            assert frameRegister != null : "cannot use register " + Register.Frame + " in assembler with null register configuration";
-//            base = frameRegister;
-//            disp += targetMethod.frameSize() + 8;
-        }
-
-        // Encode the registers as needed in the fields they are used in
-
-        assert reg != Register.None;
-        int regenc = encode(reg) << 3;
-
-        if (base == AMD64.rip) {
-            // [00 000 101] disp32
-            emitByte(0x05 | regenc);
-            emitInt(disp);
-        } else if (addr == Address.Placeholder) {
-            // [00 000 101] disp32
-            emitByte(0x05 | regenc);
-            emitInt(0);
-
-        } else if (base.isValid()) {
-            int baseenc = base.isValid() ? encode(base) : 0;
-            if (index.isValid()) {
-                int indexenc = encode(index) << 3;
-                // [base + indexscale + disp]
-                if (disp == 0 && base != rbp && (base != r13)) {
-                    // [base + indexscale]
-                    // [00 reg 100][ss index base]
-                    assert index != rsp : "illegal addressing mode";
-                    emitByte(0x04 | regenc);
-                    emitByte(scale.log2 << 6 | indexenc | baseenc);
-                } else if (isByte(disp)) {
-                    // [base + indexscale + imm8]
-                    // [01 reg 100][ss index base] imm8
-                    assert index != rsp : "illegal addressing mode";
-                    emitByte(0x44 | regenc);
-                    emitByte(scale.log2 << 6 | indexenc | baseenc);
-                    emitByte(disp & 0xFF);
-                } else {
-                    // [base + indexscale + disp32]
-                    // [10 reg 100][ss index base] disp32
-                    assert index != rsp : "illegal addressing mode";
-                    emitByte(0x84 | regenc);
-                    emitByte(scale.log2 << 6 | indexenc | baseenc);
-                    emitInt(disp);
-                }
-            } else if (base == rsp || (base == r12)) {
-                // [rsp + disp]
-                if (disp == 0) {
-                    // [rsp]
-                    // [00 reg 100][00 100 100]
-                    emitByte(0x04 | regenc);
-                    emitByte(0x24);
-                } else if (isByte(disp)) {
-                    // [rsp + imm8]
-                    // [01 reg 100][00 100 100] disp8
-                    emitByte(0x44 | regenc);
-                    emitByte(0x24);
-                    emitByte(disp & 0xFF);
-                } else {
-                    // [rsp + imm32]
-                    // [10 reg 100][00 100 100] disp32
-                    emitByte(0x84 | regenc);
-                    emitByte(0x24);
-                    emitInt(disp);
-                }
-            } else {
-                // [base + disp]
-                assert base != rsp && (base != r12) : "illegal addressing mode";
-                if (disp == 0 && base != rbp && (base != r13)) {
-                    // [base]
-                    // [00 reg base]
-                    emitByte(0x00 | regenc | baseenc);
-                } else if (isByte(disp)) {
-                    // [base + disp8]
-                    // [01 reg base] disp8
-                    emitByte(0x40 | regenc | baseenc);
-                    emitByte(disp & 0xFF);
-                } else {
-                    // [base + disp32]
-                    // [10 reg base] disp32
-                    emitByte(0x80 | regenc | baseenc);
-                    emitInt(disp);
-                }
-            }
-        } else {
-            if (index.isValid()) {
-                int indexenc = encode(index) << 3;
-                // [indexscale + disp]
-                // [00 reg 100][ss index 101] disp32
-                assert index != rsp : "illegal addressing mode";
-                emitByte(0x04 | regenc);
-                emitByte(scale.log2 << 6 | indexenc | 0x05);
-                emitInt(disp);
-            } else {
-                // [disp] ABSOLUTE
-                // [00 reg 100][00 100 101] disp32
-                emitByte(0x04 | regenc);
-                emitByte(0x25);
-                emitInt(disp);
-            }
-        }
-    }
-
-    public final void addl(Address dst, int imm32) {
-        prefix(dst);
-        emitArithOperand(0x81, rax, dst, imm32);
-    }
-
-    public final void addl(Address dst, Register src) {
-        prefix(dst, src);
-        emitByte(0x01);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void addl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xC0, dst, imm32);
-    }
-
-    public final void addl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x03);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void addl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x03, 0xC0, dst, src);
-    }
-
-    private void addrNop4() {
-        // 4 bytes: NOP DWORD PTR [EAX+0]
-        emitByte(0x0F);
-        emitByte(0x1F);
-        emitByte(0x40); // emitRm(cbuf, 0x1, EAXEnc, EAXEnc);
-        emitByte(0); // 8-bits offset (1 byte)
-    }
-
-    private void addrNop5() {
-        // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
-        emitByte(0x0F);
-        emitByte(0x1F);
-        emitByte(0x44); // emitRm(cbuf, 0x1, EAXEnc, 0x4);
-        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
-        emitByte(0); // 8-bits offset (1 byte)
-    }
-
-    private void addrNop7() {
-        // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
-        emitByte(0x0F);
-        emitByte(0x1F);
-        emitByte(0x80); // emitRm(cbuf, 0x2, EAXEnc, EAXEnc);
-        emitInt(0); // 32-bits offset (4 bytes)
-    }
-
-    private void addrNop8() {
-        // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
-        emitByte(0x0F);
-        emitByte(0x1F);
-        emitByte(0x84); // emitRm(cbuf, 0x2, EAXEnc, 0x4);
-        emitByte(0x00); // emitRm(cbuf, 0x0, EAXEnc, EAXEnc);
-        emitInt(0); // 32-bits offset (4 bytes)
-    }
-
-    public final void addsd(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x58);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void addsd(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF2);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x58);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void addss(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x58);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void addss(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF3);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x58);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void andl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xE0, dst, imm32);
-    }
-
-    public final void andl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x23);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void andl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x23, 0xC0, dst, src);
-    }
-
-    public final void bsfq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xBC);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void bsfq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0xBC);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void bsrq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xBD);
-        emitByte(0xC0 | encode);
-    }
-
-
-    public final void bsrq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0xBD);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void bsrl(Register dst, Register src) {
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xBD);
-        emitByte(0xC0 | encode);
-    }
-
-
-    public final void bsrl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0xBD);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void bswapl(Register reg) { // bswap
-        int encode = prefixAndEncode(reg.encoding);
-        emitByte(0x0F);
-        emitByte(0xC8 | encode);
-    }
-
-    public final void btli(Address src, int imm8) {
-        prefixq(src);
-        emitByte(0x0F);
-        emitByte(0xBA);
-        emitOperandHelper(rsp, src);
-        emitByte(imm8);
-    }
-
-    public final void cdql() {
-        emitByte(0x99);
-    }
-
-    public final void cmovl(ConditionFlag cc, Register dst, Register src) {
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x40 | cc.value);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cmovl(ConditionFlag cc, Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x40 | cc.value);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void cmpb(Address dst, int imm8) {
-        prefix(dst);
-        emitByte(0x80);
-        emitOperandHelper(rdi, dst);
-        emitByte(imm8);
-    }
-
-    public final void cmpl(Address dst, int imm32) {
-        prefix(dst);
-        emitByte(0x81);
-        emitOperandHelper(rdi, dst);
-        emitInt(imm32);
-    }
-
-    public final void cmpl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xF8, dst, imm32);
-    }
-
-    public final void cmpl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x3B, 0xC0, dst, src);
-    }
-
-    public final void cmpl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x3B);
-        emitOperandHelper(dst, src);
-    }
-
-    // The 32-bit cmpxchg compares the value at adr with the contents of X86.rax,
-    // and stores reg into adr if so; otherwise, the value at adr is loaded into X86.rax,.
-    // The ZF is set if the compared values were equal, and cleared otherwise.
-    public final void cmpxchgl(Register reg, Address adr) { // cmpxchg
-        if ((AsmOptions.Atomics & 2) != 0) {
-            // caveat: no instructionmark, so this isn't relocatable.
-            // Emit a synthetic, non-atomic, CAS equivalent.
-            // Beware. The synthetic form sets all ICCs, not just ZF.
-            // cmpxchg r,[m] is equivalent to X86.rax, = CAS (m, X86.rax, r)
-            cmpl(rax, adr);
-            movl(rax, adr);
-            if (reg != rax) {
-                Label l = new Label();
-                jcc(ConditionFlag.notEqual, l);
-                movl(adr, reg);
-                bind(l);
-            }
-        } else {
-
-            prefix(adr, reg);
-            emitByte(0x0F);
-            emitByte(0xB1);
-            emitOperandHelper(reg, adr);
-        }
-    }
-
-    public final void comisd(Register dst, Address src) {
-        assert dst.isFpu();
-        // NOTE: dbx seems to decode this as comiss even though the
-        // 0x66 is there. Strangly ucomisd comes out correct
-        emitByte(0x66);
-        comiss(dst, src);
-    }
-
-    public final void comiss(Register dst, Address src) {
-        assert dst.isFpu();
-
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x2F);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void cvtdq2pd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xE6);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvtdq2ps(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5B);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvtsd2ss(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5A);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvtsi2sdl(Register dst, Register src) {
-        assert dst.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2A);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvtsi2ssl(Register dst, Register src) {
-        assert dst.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2A);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvtss2sd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5A);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvttsd2sil(Register dst, Register src) {
-        assert src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2C);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvttss2sil(Register dst, Register src) {
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2C);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void decl(Address dst) {
-        // Don't use it directly. Use Macrodecrement() instead.
-        prefix(dst);
-        emitByte(0xFF);
-        emitOperandHelper(rcx, dst);
-    }
-
-    public final void divsd(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF2);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x5E);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void divsd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5E);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void divss(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF3);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x5E);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void divss(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5E);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void hlt() {
-        emitByte(0xF4);
-    }
-
-    public final void idivl(Register src) {
-        int encode = prefixAndEncode(src.encoding);
-        emitByte(0xF7);
-        emitByte(0xF8 | encode);
-    }
-
-    public final void divl(Register src) {
-        int encode = prefixAndEncode(src.encoding);
-        emitByte(0xF7);
-        emitByte(0xF0 | encode);
-    }
-
-    public final void imull(Register dst, Register src) {
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xAF);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void imull(Register dst, Register src, int value) {
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        if (isByte(value)) {
-            emitByte(0x6B);
-            emitByte(0xC0 | encode);
-            emitByte(value & 0xFF);
-        } else {
-            emitByte(0x69);
-            emitByte(0xC0 | encode);
-            emitInt(value);
-        }
-    }
-
-    public final void incl(Address dst) {
-        // Don't use it directly. Use Macroincrement() instead.
-        prefix(dst);
-        emitByte(0xFF);
-        emitOperandHelper(rax, dst);
-    }
-
-    public final void jcc(ConditionFlag cc, int jumpTarget, boolean forceDisp32) {
-        int shortSize = 2;
-        int longSize = 6;
-        long disp = jumpTarget - codeBuffer.position();
-        if (!forceDisp32 && isByte(disp - shortSize)) {
-            // 0111 tttn #8-bit disp
-            emitByte(0x70 | cc.value);
-            emitByte((int) ((disp - shortSize) & 0xFF));
-        } else {
-            // 0000 1111 1000 tttn #32-bit disp
-            assert isInt(disp - longSize) : "must be 32bit offset (call4)";
-            emitByte(0x0F);
-            emitByte(0x80 | cc.value);
-            emitInt((int) (disp - longSize));
-        }
-    }
-
-    public final void jcc(ConditionFlag cc, Label l) {
-        assert (0 <= cc.value) && (cc.value < 16) : "illegal cc";
-        if (l.isBound()) {
-            jcc(cc, l.position(), false);
-        } else {
-            // Note: could eliminate cond. jumps to this jump if condition
-            // is the same however, seems to be rather unlikely case.
-            // Note: use jccb() if label to be bound is very close to get
-            // an 8-bit displacement
-            l.addPatchAt(codeBuffer.position());
-            emitByte(0x0F);
-            emitByte(0x80 | cc.value);
-            emitInt(0);
-        }
-
-    }
-
-    public final void jccb(ConditionFlag cc, Label l) {
-        if (l.isBound()) {
-            int shortSize = 2;
-            int entry = l.position();
-            assert isByte(entry - (codeBuffer.position() + shortSize)) : "Dispacement too large for a short jmp";
-            long disp = entry - codeBuffer.position();
-            // 0111 tttn #8-bit disp
-            emitByte(0x70 | cc.value);
-            emitByte((int) ((disp - shortSize) & 0xFF));
-        } else {
-
-            l.addPatchAt(codeBuffer.position());
-            emitByte(0x70 | cc.value);
-            emitByte(0);
-        }
-    }
-
-    public final void jmp(Address adr) {
-        prefix(adr);
-        emitByte(0xFF);
-        emitOperandHelper(rsp, adr);
-    }
-
-    public final void jmp(int jumpTarget, boolean forceDisp32) {
-        int shortSize = 2;
-        int longSize = 5;
-        long disp = jumpTarget - codeBuffer.position();
-        if (!forceDisp32 && isByte(disp - shortSize)) {
-            emitByte(0xEB);
-            emitByte((int) ((disp - shortSize) & 0xFF));
-        } else {
-            emitByte(0xE9);
-            emitInt((int) (disp - longSize));
-        }
-    }
-
-    @Override
-    public final void jmp(Label l) {
-        if (l.isBound()) {
-            jmp(l.position(), false);
-        } else {
-            // By default, forward jumps are always 32-bit displacements, since
-            // we can't yet know where the label will be bound. If you're sure that
-            // the forward jump will not run beyond 256 bytes, use jmpb to
-            // force an 8-bit displacement.
-
-            l.addPatchAt(codeBuffer.position());
-            emitByte(0xE9);
-            emitInt(0);
-        }
-    }
-
-    public final void jmp(Register entry) {
-        int encode = prefixAndEncode(entry.encoding);
-        emitByte(0xFF);
-        emitByte(0xE0 | encode);
-    }
-
-    public final void jmpb(Label l) {
-        if (l.isBound()) {
-            int shortSize = 2;
-            int entry = l.position();
-            assert isByte((entry - codeBuffer.position()) + shortSize) : "Dispacement too large for a short jmp";
-            long offs = entry - codeBuffer.position();
-            emitByte(0xEB);
-            emitByte((int) ((offs - shortSize) & 0xFF));
-        } else {
-
-            l.addPatchAt(codeBuffer.position());
-            emitByte(0xEB);
-            emitByte(0);
-        }
-    }
-
-    public final void leaq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x8D);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void enter(int imm16, int imm8) {
-        emitByte(0xC8);
-        emitShort(imm16);
-        emitByte(imm8);
-    }
-
-    public final void leave() {
-        emitByte(0xC9);
-    }
-
-    public final void lock() {
-        if ((AsmOptions.Atomics & 1) != 0) {
-            // Emit either nothing, a NOP, or a NOP: prefix
-            emitByte(0x90);
-        } else {
-            emitByte(0xF0);
-        }
-    }
-
-    // Emit mfence instruction
-    public final void mfence() {
-        emitByte(0x0F);
-        emitByte(0xAE);
-        emitByte(0xF0);
-    }
-
-    public final void mov(Register dst, Register src) {
-        movq(dst, src);
-    }
-
-    public final void movapd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        int dstenc = dst.encoding;
-        int srcenc = src.encoding;
-        emitByte(0x66);
-        if (dstenc < 8) {
-            if (srcenc >= 8) {
-                emitByte(Prefix.REXB);
-                srcenc -= 8;
-            }
-        } else {
-            if (srcenc < 8) {
-                emitByte(Prefix.REXR);
-            } else {
-                emitByte(Prefix.REXRB);
-                srcenc -= 8;
-            }
-            dstenc -= 8;
-        }
-        emitByte(0x0F);
-        emitByte(0x28);
-        emitByte(0xC0 | dstenc << 3 | srcenc);
-    }
-
-    public final void movaps(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        int dstenc = dst.encoding;
-        int srcenc = src.encoding;
-        if (dstenc < 8) {
-            if (srcenc >= 8) {
-                emitByte(Prefix.REXB);
-                srcenc -= 8;
-            }
-        } else {
-            if (srcenc < 8) {
-                emitByte(Prefix.REXR);
-            } else {
-                emitByte(Prefix.REXRB);
-                srcenc -= 8;
-            }
-            dstenc -= 8;
-        }
-        emitByte(0x0F);
-        emitByte(0x28);
-        emitByte(0xC0 | dstenc << 3 | srcenc);
-    }
-
-    public final void movb(Register dst, Address src) {
-        prefix(src, dst); // , true)
-        emitByte(0x8A);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movb(Address dst, int imm8) {
-        prefix(dst);
-        emitByte(0xC6);
-        emitOperandHelper(rax, dst);
-        emitByte(imm8);
-    }
-
-    public final void movb(Address dst, Register src) {
-        assert src.isByte() : "must have byte register";
-        prefix(dst, src); // , true)
-        emitByte(0x88);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movdl(Register dst, Register src) {
-        if (dst.isFpu()) {
-            assert !src.isFpu() : "does this hold?";
-            emitByte(0x66);
-            int encode = prefixAndEncode(dst.encoding, src.encoding);
-            emitByte(0x0F);
-            emitByte(0x6E);
-            emitByte(0xC0 | encode);
-        } else if (src.isFpu()) {
-            assert !dst.isFpu();
-            emitByte(0x66);
-            // swap src/dst to get correct prefix
-            int encode = prefixAndEncode(src.encoding, dst.encoding);
-            emitByte(0x0F);
-            emitByte(0x7E);
-            emitByte(0xC0 | encode);
-        }
-    }
-
-    public final void movdqa(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0x66);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x6F);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movdqa(Register dst, Register src) {
-        assert dst.isFpu();
-        emitByte(0x66);
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x6F);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movdqa(Address dst, Register src) {
-        assert src.isFpu();
-        emitByte(0x66);
-        prefix(dst, src);
-        emitByte(0x0F);
-        emitByte(0x7F);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movdqu(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF3);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x6F);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movdqu(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-
-        emitByte(0xF3);
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x6F);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movdqu(Address dst, Register src) {
-        assert src.isFpu();
-
-        emitByte(0xF3);
-        prefix(dst, src);
-        emitByte(0x0F);
-        emitByte(0x7F);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movl(Register dst, int imm32) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xB8 | encode);
-        emitInt(imm32);
-    }
-
-    public final void movl(Register dst, Register src) {
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x8B);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x8B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movl(Address dst, int imm32) {
-        prefix(dst);
-        emitByte(0xC7);
-        emitOperandHelper(rax, dst);
-        emitInt(imm32);
-    }
-
-    public final void movl(Address dst, Register src) {
-        prefix(dst, src);
-        emitByte(0x89);
-        emitOperandHelper(src, dst);
-    }
-
-    /**
-     * New CPUs require use of movsd and movss to avoid partial register stall
-     * when loading from memory. But for old Opteron use movlpd instead of movsd.
-     * The selection is done in {@link AMD64MacroAssembler#movdbl(Register, Address)}
-     * and {@link AMD64MacroAssembler#movflt(Register, Register)}.
-     */
-    public final void movlpd(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0x66);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x12);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movlpd(Address dst, Register src) {
-        assert src.isFpu();
-        emitByte(0x66);
-        prefix(dst, src);
-        emitByte(0x0F);
-        emitByte(0x13);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movq(Register dst, Address src) {
-        if (dst.isFpu()) {
-            emitByte(0xF3);
-            prefixq(src, dst);
-            emitByte(0x0F);
-            emitByte(0x7E);
-            emitOperandHelper(dst, src);
-        } else {
-            prefixq(src, dst);
-            emitByte(0x8B);
-            emitOperandHelper(dst, src);
-        }
-    }
-
-    public final void movq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x8B);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movq(Address dst, Register src) {
-        if (src.isFpu()) {
-            emitByte(0x66);
-            prefixq(dst, src);
-            emitByte(0x0F);
-            emitByte(0xD6);
-            emitOperandHelper(src, dst);
-        } else {
-            prefixq(dst, src);
-            emitByte(0x89);
-            emitOperandHelper(src, dst);
-        }
-    }
-
-    public final void movsxb(Register dst, Address src) { // movsxb
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0xBE);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movsxb(Register dst, Register src) { // movsxb
-        int encode = prefixAndEncode(dst.encoding, src.encoding, true);
-        emitByte(0x0F);
-        emitByte(0xBE);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movsd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x10);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movsd(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF2);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x10);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movsd(Address dst, Register src) {
-        assert src.isFpu();
-        emitByte(0xF2);
-        prefix(dst, src);
-        emitByte(0x0F);
-        emitByte(0x11);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movss(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x10);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movss(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF3);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x10);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movss(Address dst, Register src) {
-        assert src.isFpu();
-        emitByte(0xF3);
-        prefix(dst, src);
-        emitByte(0x0F);
-        emitByte(0x11);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movswl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0xBF);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movsxw(Register dst, Register src) { // movsxw
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xBF);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movsxw(Register dst, Address src) { // movsxw
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0xBF);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movzxd(Register dst, Register src) { // movzxd
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x63);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movzxd(Register dst, Address src) { // movzxd
-        prefix(src, dst);
-        emitByte(0x63);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movw(Address dst, int imm16) {
-        emitByte(0x66); // switch to 16-bit mode
-        prefix(dst);
-        emitByte(0xC7);
-        emitOperandHelper(rax, dst);
-        emitShort(imm16);
-    }
-
-    public final void movw(Register dst, Address src) {
-        emitByte(0x66);
-        prefix(src, dst);
-        emitByte(0x8B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movw(Address dst, Register src) {
-        emitByte(0x66);
-        prefix(dst, src);
-        emitByte(0x89);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void movzxb(Register dst, Address src) { // movzxb
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0xB6);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movzxb(Register dst, Register src) { // movzxb
-        int encode = prefixAndEncode(dst.encoding, src.encoding, true);
-        emitByte(0x0F);
-        emitByte(0xB6);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movzxl(Register dst, Address src) { // movzxw
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0xB7);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movzxl(Register dst, Register src) { // movzxw
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xB7);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void mull(Address src) {
-        prefix(src);
-        emitByte(0xF7);
-        emitOperandHelper(rsp, src);
-    }
-
-    public final void mulsd(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0xF2);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x59);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void mulsd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x59);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void mulss(Register dst, Address src) {
-        assert dst.isFpu();
-
-        emitByte(0xF3);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x59);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void mulss(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x59);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void negl(Register dst) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xF7);
-        emitByte(0xD8 | encode);
-    }
-
-    public final void ensureUniquePC() {
-        nop();
-    }
-
-    public final void nop() {
-        nop(1);
-    }
-
-    public void nop(int count) {
-        int i = count;
-        if (AsmOptions.UseNormalNop) {
-            assert i > 0 : " ";
-            // The fancy nops aren't currently recognized by debuggers making it a
-            // pain to disassemble code while debugging. If assert are on clearly
-            // speed is not an issue so simply use the single byte traditional nop
-            // to do alignment.
-
-            for (; i > 0; i--) {
-                emitByte(0x90);
-            }
-            return;
-        }
-
-        if (AsmOptions.UseAddressNop) {
-            //
-            // Using multi-bytes nops "0x0F 0x1F [Address]" for AMD.
-            // 1: 0x90
-            // 2: 0x66 0x90
-            // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
-            // 4: 0x0F 0x1F 0x40 0x00
-            // 5: 0x0F 0x1F 0x44 0x00 0x00
-            // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00
-            // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
-            // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
-            // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
-            // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
-            // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
-
-            // The rest coding is AMD specific - use consecutive Address nops
-
-            // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
-            // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
-            // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
-            // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
-            // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
-            // Size prefixes (0x66) are added for larger sizes
-
-            while (i >= 22) {
-                i -= 11;
-                emitByte(0x66); // size prefix
-                emitByte(0x66); // size prefix
-                emitByte(0x66); // size prefix
-                addrNop8();
-            }
-            // Generate first nop for size between 21-12
-            switch (i) {
-                case 21:
-                    i -= 1;
-                    emitByte(0x66); // size prefix
-                    // fall through
-                case 20:
-                    // fall through
-                case 19:
-                    i -= 1;
-                    emitByte(0x66); // size prefix
-                    // fall through
-                case 18:
-                    // fall through
-                case 17:
-                    i -= 1;
-                    emitByte(0x66); // size prefix
-                    // fall through
-                case 16:
-                    // fall through
-                case 15:
-                    i -= 8;
-                    addrNop8();
-                    break;
-                case 14:
-                case 13:
-                    i -= 7;
-                    addrNop7();
-                    break;
-                case 12:
-                    i -= 6;
-                    emitByte(0x66); // size prefix
-                    addrNop5();
-                    break;
-                default:
-                    assert i < 12;
-            }
-
-            // Generate second nop for size between 11-1
-            switch (i) {
-                case 11:
-                    emitByte(0x66); // size prefix
-                    emitByte(0x66); // size prefix
-                    emitByte(0x66); // size prefix
-                    addrNop8();
-                    break;
-                case 10:
-                    emitByte(0x66); // size prefix
-                    emitByte(0x66); // size prefix
-                    addrNop8();
-                    break;
-                case 9:
-                    emitByte(0x66); // size prefix
-                    addrNop8();
-                    break;
-                case 8:
-                    addrNop8();
-                    break;
-                case 7:
-                    addrNop7();
-                    break;
-                case 6:
-                    emitByte(0x66); // size prefix
-                    addrNop5();
-                    break;
-                case 5:
-                    addrNop5();
-                    break;
-                case 4:
-                    addrNop4();
-                    break;
-                case 3:
-                    // Don't use "0x0F 0x1F 0x00" - need patching safe padding
-                    emitByte(0x66); // size prefix
-                    emitByte(0x66); // size prefix
-                    emitByte(0x90); // nop
-                    break;
-                case 2:
-                    emitByte(0x66); // size prefix
-                    emitByte(0x90); // nop
-                    break;
-                case 1:
-                    emitByte(0x90); // nop
-                    break;
-                default:
-                    assert i == 0;
-            }
-            return;
-        }
-
-        // Using nops with size prefixes "0x66 0x90".
-        // From AMD Optimization Guide:
-        // 1: 0x90
-        // 2: 0x66 0x90
-        // 3: 0x66 0x66 0x90
-        // 4: 0x66 0x66 0x66 0x90
-        // 5: 0x66 0x66 0x90 0x66 0x90
-        // 6: 0x66 0x66 0x90 0x66 0x66 0x90
-        // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
-        // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
-        // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
-        // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
-        //
-        while (i > 12) {
-            i -= 4;
-            emitByte(0x66); // size prefix
-            emitByte(0x66);
-            emitByte(0x66);
-            emitByte(0x90); // nop
-        }
-        // 1 - 12 nops
-        if (i > 8) {
-            if (i > 9) {
-                i -= 1;
-                emitByte(0x66);
-            }
-            i -= 3;
-            emitByte(0x66);
-            emitByte(0x66);
-            emitByte(0x90);
-        }
-        // 1 - 8 nops
-        if (i > 4) {
-            if (i > 6) {
-                i -= 1;
-                emitByte(0x66);
-            }
-            i -= 3;
-            emitByte(0x66);
-            emitByte(0x66);
-            emitByte(0x90);
-        }
-        switch (i) {
-            case 4:
-                emitByte(0x66);
-                emitByte(0x66);
-                emitByte(0x66);
-                emitByte(0x90);
-                break;
-            case 3:
-                emitByte(0x66);
-                emitByte(0x66);
-                emitByte(0x90);
-                break;
-            case 2:
-                emitByte(0x66);
-                emitByte(0x90);
-                break;
-            case 1:
-                emitByte(0x90);
-                break;
-            default:
-                assert i == 0;
-        }
-    }
-
-    public final void notl(Register dst) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xF7);
-        emitByte(0xD0 | encode);
-    }
-
-    public final void orl(Address dst, int imm32) {
-        prefix(dst);
-        emitByte(0x81);
-        emitOperandHelper(rcx, dst);
-        emitInt(imm32);
-    }
-
-    public final void orl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xC8, dst, imm32);
-    }
-
-    public final void orl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x0B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void orl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x0B, 0xC0, dst, src);
-    }
-
-    // generic
-    public final void pop(Register dst) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0x58 | encode);
-    }
-
-    public final void prefetchPrefix(Address src) {
-        prefix(src);
-        emitByte(0x0F);
-    }
-
-    public final void prefetchnta(Address src) {
-        prefetchPrefix(src);
-        emitByte(0x18);
-        emitOperandHelper(rax, src); // 0, src
-    }
-
-    public final void prefetchr(Address src) {
-        prefetchPrefix(src);
-        emitByte(0x0D);
-        emitOperandHelper(rax, src); // 0, src
-    }
-
-    public final void prefetcht0(Address src) {
-        prefetchPrefix(src);
-        emitByte(0x18);
-        emitOperandHelper(rcx, src); // 1, src
-
-    }
-
-    public final void prefetcht1(Address src) {
-        prefetchPrefix(src);
-        emitByte(0x18);
-        emitOperandHelper(rdx, src); // 2, src
-    }
-
-    public final void prefetcht2(Address src) {
-        prefetchPrefix(src);
-        emitByte(0x18);
-        emitOperandHelper(rbx, src); // 3, src
-    }
-
-    public final void prefetchw(Address src) {
-        prefetchPrefix(src);
-        emitByte(0x0D);
-        emitOperandHelper(rcx, src); // 1, src
-    }
-
-    public final void pshufd(Register dst, Register src, int mode) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        assert isUByte(mode) : "invalid value";
-
-        emitByte(0x66);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x70);
-        emitByte(0xC0 | encode);
-        emitByte(mode & 0xFF);
-    }
-
-    public final void pshufd(Register dst, Address src, int mode) {
-        assert dst.isFpu();
-        assert isUByte(mode) : "invalid value";
-
-        emitByte(0x66);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x70);
-        emitOperandHelper(dst, src);
-        emitByte(mode & 0xFF);
-
-    }
-
-    public final void pshuflw(Register dst, Register src, int mode) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        assert isUByte(mode) : "invalid value";
-
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x70);
-        emitByte(0xC0 | encode);
-        emitByte(mode & 0xFF);
-    }
-
-    public final void pshuflw(Register dst, Address src, int mode) {
-        assert dst.isFpu();
-        assert isUByte(mode) : "invalid value";
-
-        emitByte(0xF2);
-        prefix(src, dst); // QQ new
-        emitByte(0x0F);
-        emitByte(0x70);
-        emitOperandHelper(dst, src);
-        emitByte(mode & 0xFF);
-    }
-
-    public final void psrlq(Register dst, int shift) {
-        assert dst.isFpu();
-        // HMM Table D-1 says sse2 or mmx
-
-        int encode = prefixqAndEncode(xmm2.encoding, dst.encoding);
-        emitByte(0x66);
-        emitByte(0x0F);
-        emitByte(0x73);
-        emitByte(0xC0 | encode);
-        emitByte(shift);
-    }
-
-    public final void punpcklbw(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0x66);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x60);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void push(int imm32) {
-        // in 64bits we push 64bits onto the stack but only
-        // take a 32bit immediate
-        emitByte(0x68);
-        emitInt(imm32);
-    }
-
-    public final void push(Register src) {
-        int encode = prefixAndEncode(src.encoding);
-        emitByte(0x50 | encode);
-    }
-
-    public final void pushf() {
-        emitByte(0x9C);
-    }
-
-    public final void pxor(Register dst, Address src) {
-        assert dst.isFpu();
-
-        emitByte(0x66);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0xEF);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void pxor(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-
-        emitByte(0x66);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xEF);
-        emitByte(0xC0 | encode);
-
-    }
-
-    public final void rcll(Register dst, int imm8) {
-        assert isShiftCount(imm8) : "illegal shift count";
-        int encode = prefixAndEncode(dst.encoding);
-        if (imm8 == 1) {
-            emitByte(0xD1);
-            emitByte(0xD0 | encode);
-        } else {
-            emitByte(0xC1);
-            emitByte(0xD0 | encode);
-            emitByte(imm8);
-        }
-    }
-
-    public final void pause() {
-        emitByte(0xF3);
-        emitByte(0x90);
-    }
-
-    // Copies data from [X86.rsi] to [X86.rdi] using X86.rcx heap words.
-    public final void repeatMoveWords() {
-        emitByte(0xF3);
-        emitByte(Prefix.REXW);
-        emitByte(0xA5);
-    }
-
-    // Copies data from [X86.rsi] to [X86.rdi] using X86.rcx bytes.
-    public final void repeatMoveBytes() {
-        emitByte(0xF3);
-        emitByte(Prefix.REXW);
-        emitByte(0xA4);
-    }
-
-    // sets X86.rcx pointer sized words with X86.rax, value at [edi]
-    // generic
-    public final void repSet() { // repSet
-        emitByte(0xF3);
-        // STOSQ
-        emitByte(Prefix.REXW);
-        emitByte(0xAB);
-    }
-
-    // scans X86.rcx pointer sized words at [edi] for occurance of X86.rax,
-    // generic
-    public final void repneScan() { // repneScan
-        emitByte(0xF2);
-        // SCASQ
-        emitByte(Prefix.REXW);
-        emitByte(0xAF);
-    }
-
-    // scans X86.rcx 4 byte words at [edi] for occurance of X86.rax,
-    // generic
-    public final void repneScanl() { // repneScan
-        emitByte(0xF2);
-        // SCASL
-        emitByte(0xAF);
-    }
-
-    public final void ret(int imm16) {
-        if (imm16 == 0) {
-            emitByte(0xC3);
-        } else {
-            emitByte(0xC2);
-            emitShort(imm16);
-        }
-    }
-
-    public final void sarl(Register dst, int imm8) {
-        int encode = prefixAndEncode(dst.encoding);
-        assert isShiftCount(imm8) : "illegal shift count";
-        if (imm8 == 1) {
-            emitByte(0xD1);
-            emitByte(0xF8 | encode);
-        } else {
-            emitByte(0xC1);
-            emitByte(0xF8 | encode);
-            emitByte(imm8);
-        }
-    }
-
-    public final void sarl(Register dst) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xD3);
-        emitByte(0xF8 | encode);
-    }
-
-    public final void sbbl(Address dst, int imm32) {
-        prefix(dst);
-        emitArithOperand(0x81, rbx, dst, imm32);
-    }
-
-    public final void sbbl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xD8, dst, imm32);
-    }
-
-    public final void sbbl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x1B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void sbbl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x1B, 0xC0, dst, src);
-    }
-
-    public final void setb(ConditionFlag cc, Register dst) {
-        assert 0 <= cc.value && cc.value < 16 : "illegal cc";
-        int encode = prefixAndEncode(dst.encoding, true);
-        emitByte(0x0F);
-        emitByte(0x90 | cc.value);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void shll(Register dst, int imm8) {
-        assert isShiftCount(imm8) : "illegal shift count";
-        int encode = prefixAndEncode(dst.encoding);
-        if (imm8 == 1) {
-            emitByte(0xD1);
-            emitByte(0xE0 | encode);
-        } else {
-            emitByte(0xC1);
-            emitByte(0xE0 | encode);
-            emitByte(imm8);
-        }
-    }
-
-    public final void shll(Register dst) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xD3);
-        emitByte(0xE0 | encode);
-    }
-
-    public final void shrl(Register dst, int imm8) {
-        assert isShiftCount(imm8) : "illegal shift count";
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xC1);
-        emitByte(0xE8 | encode);
-        emitByte(imm8);
-    }
-
-    public final void shrl(Register dst) {
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xD3);
-        emitByte(0xE8 | encode);
-    }
-
-    // copies a single word from [esi] to [edi]
-    public final void smovl() {
-        emitByte(0xA5);
-    }
-
-    public final void sqrtsd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        // HMM Table D-1 says sse2
-        // assert is64 || target.supportsSSE();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x51);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void subl(Address dst, int imm32) {
-        prefix(dst);
-        if (isByte(imm32)) {
-            emitByte(0x83);
-            emitOperandHelper(rbp, dst);
-            emitByte(imm32 & 0xFF);
-        } else {
-            emitByte(0x81);
-            emitOperandHelper(rbp, dst);
-            emitInt(imm32);
-        }
-    }
-
-    public final void subl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xE8, dst, imm32);
-    }
-
-    public final void subl(Address dst, Register src) {
-        prefix(dst, src);
-        emitByte(0x29);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void subl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x2B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void subl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x2B, 0xC0, dst, src);
-    }
-
-    public final void subsd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5C);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void subsd(Register dst, Address src) {
-        assert dst.isFpu();
-
-        emitByte(0xF2);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x5C);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void subss(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x5C);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void subss(Register dst, Address src) {
-        assert dst.isFpu();
-
-        emitByte(0xF3);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x5C);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void testb(Register dst, int imm8) {
-        prefixAndEncode(dst.encoding, true);
-        emitArithB(0xF6, 0xC0, dst, imm8);
-    }
-
-    public final void testl(Register dst, int imm32) {
-        // not using emitArith because test
-        // doesn't support sign-extension of
-        // 8bit operands
-        int encode = dst.encoding;
-        if (encode == 0) {
-            emitByte(0xA9);
-        } else {
-            encode = prefixAndEncode(encode);
-            emitByte(0xF7);
-            emitByte(0xC0 | encode);
-        }
-        emitInt(imm32);
-    }
-
-    public final void testl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x85, 0xC0, dst, src);
-    }
-
-    public final void testl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x85);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void ucomisd(Register dst, Address src) {
-        assert dst.isFpu();
-        emitByte(0x66);
-        ucomiss(dst, src);
-    }
-
-    public final void ucomisd(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        emitByte(0x66);
-        ucomiss(dst, src);
-    }
-
-    public final void ucomiss(Register dst, Address src) {
-        assert dst.isFpu();
-
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x2E);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void ucomiss(Register dst, Register src) {
-        assert dst.isFpu();
-        assert src.isFpu();
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2E);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void xaddl(Address dst, Register src) {
-        assert src.isFpu();
-
-        prefix(dst, src);
-        emitByte(0x0F);
-        emitByte(0xC1);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void xchgl(Register dst, Address src) { // xchg
-        prefix(src, dst);
-        emitByte(0x87);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void xchgl(Register dst, Register src) {
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x87);
-        emitByte(0xc0 | encode);
-    }
-
-    public final void xorl(Register dst, int imm32) {
-        prefix(dst);
-        emitArith(0x81, 0xF0, dst, imm32);
-    }
-
-    public final void xorl(Register dst, Address src) {
-        prefix(src, dst);
-        emitByte(0x33);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void xorl(Register dst, Register src) {
-        prefixAndEncode(dst.encoding, src.encoding);
-        emitArith(0x33, 0xC0, dst, src);
-    }
-
-    public final void andpd(Register dst, Register src) {
-        emitByte(0x66);
-        andps(dst, src);
-    }
-
-    public final void andpd(Register dst, Address src) {
-        emitByte(0x66);
-        andps(dst, src);
-    }
-
-    public final void andps(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x54);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void andps(Register dst, Address src) {
-        assert dst.isFpu();
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x54);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void orpd(Register dst, Register src) {
-        emitByte(0x66);
-        orps(dst, src);
-    }
-
-    public final void orpd(Register dst, Address src) {
-        emitByte(0x66);
-        orps(dst, src);
-    }
-
-    public final void orps(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x56);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void orps(Register dst, Address src) {
-        assert dst.isFpu();
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x56);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void xorpd(Register dst, Register src) {
-        emitByte(0x66);
-        xorps(dst, src);
-    }
-
-    public final void xorpd(Register dst, Address src) {
-        emitByte(0x66);
-        xorps(dst, src);
-    }
-
-    public final void xorps(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        int encode = prefixAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x57);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void xorps(Register dst, Address src) {
-        assert dst.isFpu();
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x57);
-        emitOperandHelper(dst, src);
-    }
-
-    // 32bit only pieces of the assembler
-
-    public final void decl(Register dst) {
-        // Don't use it directly. Use Macrodecrementl() instead.
-        // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xFF);
-        emitByte(0xC8 | encode);
-    }
-
-    public final void incl(Register dst) {
-        // Don't use it directly. Use Macroincrementl() instead.
-        // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
-        int encode = prefixAndEncode(dst.encoding);
-        emitByte(0xFF);
-        emitByte(0xC0 | encode);
-    }
-
-    int prefixAndEncode(int regEnc) {
-        return prefixAndEncode(regEnc, false);
-    }
-
-    int prefixAndEncode(int regEnc, boolean byteinst) {
-        if (regEnc >= 8) {
-            emitByte(Prefix.REXB);
-            return regEnc - 8;
-        } else if (byteinst && regEnc >= 4) {
-            emitByte(Prefix.REX);
-        }
-        return regEnc;
-    }
-
-    int prefixqAndEncode(int regEnc) {
-        if (regEnc < 8) {
-            emitByte(Prefix.REXW);
-            return regEnc;
-        } else {
-            emitByte(Prefix.REXWB);
-            return regEnc - 8;
-        }
-    }
-
-    int prefixAndEncode(int dstEnc, int srcEnc) {
-        return prefixAndEncode(dstEnc, srcEnc, false);
-    }
-
-    int prefixAndEncode(int dstEncoding, int srcEncoding, boolean byteinst) {
-        int srcEnc = srcEncoding;
-        int dstEnc = dstEncoding;
-        if (dstEnc < 8) {
-            if (srcEnc >= 8) {
-                emitByte(Prefix.REXB);
-                srcEnc -= 8;
-            } else if (byteinst && srcEnc >= 4) {
-                emitByte(Prefix.REX);
-            }
-        } else {
-            if (srcEnc < 8) {
-                emitByte(Prefix.REXR);
-            } else {
-                emitByte(Prefix.REXRB);
-                srcEnc -= 8;
-            }
-            dstEnc -= 8;
-        }
-        return dstEnc << 3 | srcEnc;
-    }
-
-    /**
-     * Creates prefix and the encoding of the lower 6 bits of the ModRM-Byte. It emits an operand prefix. If the given
-     * operands exceed 3 bits, the 4th bit is encoded in the prefix.
-     *
-     * @param regEncoding the encoding of the register part of the ModRM-Byte
-     * @param rmEncoding the encoding of the r/m part of the ModRM-Byte
-     * @return the lower 6 bits of the ModRM-Byte that should be emitted
-     */
-    private int prefixqAndEncode(int regEncoding, int rmEncoding) {
-        int rmEnc = rmEncoding;
-        int regEnc = regEncoding;
-        if (regEnc < 8) {
-            if (rmEnc < 8) {
-                emitByte(Prefix.REXW);
-            } else {
-                emitByte(Prefix.REXWB);
-                rmEnc -= 8;
-            }
-        } else {
-            if (rmEnc < 8) {
-                emitByte(Prefix.REXWR);
-            } else {
-                emitByte(Prefix.REXWRB);
-                rmEnc -= 8;
-            }
-            regEnc -= 8;
-        }
-        return regEnc << 3 | rmEnc;
-    }
-
-    private void prefix(Register reg) {
-        if (reg.encoding >= 8) {
-            emitByte(Prefix.REXB);
-        }
-    }
-
-    private static boolean needsRex(Value value) {
-        return isRegister(value) && asRegister(value).encoding >= MinEncodingNeedsRex;
-    }
-
-
-    private void prefix(Address adr) {
-        if (needsRex(adr.getBase())) {
-            if (needsRex(adr.getIndex())) {
-                emitByte(Prefix.REXXB);
-            } else {
-                emitByte(Prefix.REXB);
-            }
-        } else {
-            if (needsRex(adr.getIndex())) {
-                emitByte(Prefix.REXX);
-            }
-        }
-    }
-
-    private void prefixq(Address adr) {
-        if (needsRex(adr.getBase())) {
-            if (needsRex(adr.getIndex())) {
-                emitByte(Prefix.REXWXB);
-            } else {
-                emitByte(Prefix.REXWB);
-            }
-        } else {
-            if (needsRex(adr.getIndex())) {
-                emitByte(Prefix.REXWX);
-            } else {
-                emitByte(Prefix.REXW);
-            }
-        }
-    }
-
-    private void prefix(Address adr, Register reg) {
-        if (reg.encoding < 8) {
-            if (needsRex(adr.getBase())) {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXXB);
-                } else {
-                    emitByte(Prefix.REXB);
-                }
-            } else {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXX);
-                } else if (reg.encoding >= 4) {
-                    emitByte(Prefix.REX);
-                }
-            }
-        } else {
-            if (needsRex(adr.getBase())) {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXRXB);
-                } else {
-                    emitByte(Prefix.REXRB);
-                }
-            } else {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXRX);
-                } else {
-                    emitByte(Prefix.REXR);
-                }
-            }
-        }
-    }
-
-    private void prefixq(Address adr, Register src) {
-        if (src.encoding < 8) {
-            if (needsRex(adr.getBase())) {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXWXB);
-                } else {
-                    emitByte(Prefix.REXWB);
-                }
-            } else {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXWX);
-                } else {
-                    emitByte(Prefix.REXW);
-                }
-            }
-        } else {
-            if (needsRex(adr.getBase())) {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXWRXB);
-                } else {
-                    emitByte(Prefix.REXWRB);
-                }
-            } else {
-                if (needsRex(adr.getIndex())) {
-                    emitByte(Prefix.REXWRX);
-                } else {
-                    emitByte(Prefix.REXWR);
-                }
-            }
-        }
-    }
-
-    public final void addq(Address dst, int imm32) {
-        prefixq(dst);
-        emitArithOperand(0x81, rax, dst, imm32);
-    }
-
-    public final void addq(Address dst, Register src) {
-        prefixq(dst, src);
-        emitByte(0x01);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void addq(Register dst, int imm32) {
-        prefixqAndEncode(dst.encoding);
-        emitArith(0x81, 0xC0, dst, imm32);
-    }
-
-    public final void addq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x03);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void addq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x03, 0xC0, dst, src);
-    }
-
-    public final void andq(Register dst, int imm32) {
-        prefixqAndEncode(dst.encoding);
-        emitArith(0x81, 0xE0, dst, imm32);
-    }
-
-    public final void andq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x23);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void andq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x23, 0xC0, dst, src);
-    }
-
-    public final void bswapq(Register reg) {
-        int encode = prefixqAndEncode(reg.encoding);
-        emitByte(0x0F);
-        emitByte(0xC8 | encode);
-    }
-
-    public final void cdqq() {
-        emitByte(Prefix.REXW);
-        emitByte(0x99);
-    }
-
-    public final void cmovq(ConditionFlag cc, Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x40 | cc.value);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cmovq(ConditionFlag cc, Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x0F);
-        emitByte(0x40 | cc.value);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void cmpq(Address dst, int imm32) {
-        prefixq(dst);
-        emitByte(0x81);
-        emitOperandHelper(rdi, dst);
-        emitInt(imm32);
-    }
-
-    public final void cmpq(Register dst, int imm32) {
-        prefixqAndEncode(dst.encoding);
-        emitArith(0x81, 0xF8, dst, imm32);
-    }
-
-    public final void cmpq(Address dst, Register src) {
-        prefixq(dst, src);
-        emitByte(0x3B);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void cmpq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x3B, 0xC0, dst, src);
-    }
-
-    public final void cmpq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x3B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void cmpxchgq(Register reg, Address adr) {
-        prefixq(adr, reg);
-        emitByte(0x0F);
-        emitByte(0xB1);
-        emitOperandHelper(reg, adr);
-    }
-
-    public final void cvtsi2sdq(Register dst, Register src) {
-        assert dst.isFpu();
-        emitByte(0xF2);
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2A);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvtsi2ssq(Register dst, Register src) {
-        assert dst.isFpu();
-        emitByte(0xF3);
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2A);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvttsd2siq(Register dst, Register src) {
-        assert src.isFpu();
-        emitByte(0xF2);
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2C);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void cvttss2siq(Register dst, Register src) {
-        assert src.isFpu();
-        emitByte(0xF3);
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0x2C);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void decq(Register dst) {
-        // Don't use it directly. Use Macrodecrementq() instead.
-        // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xFF);
-        emitByte(0xC8 | encode);
-    }
-
-    public final void decq(Address dst) {
-        // Don't use it directly. Use Macrodecrementq() instead.
-        prefixq(dst);
-        emitByte(0xFF);
-        emitOperandHelper(rcx, dst);
-    }
-
-    public final void divq(Register src) {
-        int encode = prefixqAndEncode(src.encoding);
-        emitByte(0xF7);
-        emitByte(0xF0 | encode);
-    }
-
-    public final void idivq(Register src) {
-        int encode = prefixqAndEncode(src.encoding);
-        emitByte(0xF7);
-        emitByte(0xF8 | encode);
-    }
-
-    public final void imulq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xAF);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void imulq(Register dst, Register src, int value) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        if (isByte(value)) {
-            emitByte(0x6B);
-            emitByte(0xC0 | encode);
-            emitByte(value);
-        } else {
-            emitByte(0x69);
-            emitByte(0xC0 | encode);
-            emitInt(value);
-        }
-    }
-
-    public final void incq(Register dst) {
-        // Don't use it directly. Use Macroincrementq() instead.
-        // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xFF);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void incq(Address dst) {
-        // Don't use it directly. Use Macroincrementq() instead.
-        prefixq(dst);
-        emitByte(0xFF);
-        emitOperandHelper(rax, dst);
-    }
-
-    public final void movq(Register dst, long imm64) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xB8 | encode);
-        emitLong(imm64);
-    }
-
-    public final void movdq(Register dst, Register src) {
-
-        // table D-1 says MMX/SSE2
-        emitByte(0x66);
-
-        if (dst.isFpu()) {
-            assert dst.isFpu();
-            int encode = prefixqAndEncode(dst.encoding, src.encoding);
-            emitByte(0x0F);
-            emitByte(0x6E);
-            emitByte(0xC0 | encode);
-        } else if (src.isFpu()) {
-
-            // swap src/dst to get correct prefix
-            int encode = prefixqAndEncode(src.encoding, dst.encoding);
-            emitByte(0x0F);
-            emitByte(0x7E);
-            emitByte(0xC0 | encode);
-        } else {
-            throw new InternalError("should not reach here");
-        }
-    }
-
-    public final void movsbq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x0F);
-        emitByte(0xBE);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movsbq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xBE);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movslq(Register dst, int imm32) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xC7 | encode);
-        emitInt(imm32);
-        // dbx shows movslq(X86.rcx, 3) as movq $0x0000000049000000,(%X86.rbx)
-        // and movslq(X86.r8, 3); as movl $0x0000000048000000,(%X86.rbx)
-        // as a result we shouldn't use until tested at runtime...
-        throw new InternalError("untested");
-    }
-
-    public final void movslq(Address dst, int imm32) {
-        prefixq(dst);
-        emitByte(0xC7);
-        emitOperandHelper(rax, dst);
-        emitInt(imm32);
-    }
-
-    public final void movslq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x63);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movslq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x63);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movswq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x0F);
-        emitByte(0xBF);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movswq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xBF);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movzbq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x0F);
-        emitByte(0xB6);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movzbq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xB6);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void movzwq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x0F);
-        emitByte(0xB7);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void movzwq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x0F);
-        emitByte(0xB7);
-        emitByte(0xC0 | encode);
-    }
-
-    public final void negq(Register dst) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xF7);
-        emitByte(0xD8 | encode);
-    }
-
-    public final void notq(Register dst) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xF7);
-        emitByte(0xD0 | encode);
-    }
-
-    public final void orq(Address dst, int imm32) {
-        prefixq(dst);
-        emitByte(0x81);
-        emitOperandHelper(rcx, dst);
-        emitInt(imm32);
-    }
-
-    public final void orq(Register dst, int imm32) {
-        prefixqAndEncode(dst.encoding);
-        emitArith(0x81, 0xC8, dst, imm32);
-    }
-
-    public final void orq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x0B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void orq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x0B, 0xC0, dst, src);
-    }
-
-    public final void popq(Address dst) {
-        prefixq(dst);
-        emitByte(0x8F);
-        emitOperandHelper(rax, dst);
-    }
-
-    public final void pushq(Address src) {
-        prefixq(src);
-        emitByte(0xFF);
-        emitOperandHelper(rsi, src);
-    }
-
-    public final void rclq(Register dst, int imm8) {
-        assert isShiftCount(imm8 >> 1) : "illegal shift count";
-        int encode = prefixqAndEncode(dst.encoding);
-        if (imm8 == 1) {
-            emitByte(0xD1);
-            emitByte(0xD0 | encode);
-        } else {
-            emitByte(0xC1);
-            emitByte(0xD0 | encode);
-            emitByte(imm8);
-        }
-    }
-
-    public final void sarq(Register dst, int imm8) {
-        assert isShiftCount(imm8 >> 1) : "illegal shift count";
-        int encode = prefixqAndEncode(dst.encoding);
-        if (imm8 == 1) {
-            emitByte(0xD1);
-            emitByte(0xF8 | encode);
-        } else {
-            emitByte(0xC1);
-            emitByte(0xF8 | encode);
-            emitByte(imm8);
-        }
-    }
-
-    public final void sarq(Register dst) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xD3);
-        emitByte(0xF8 | encode);
-    }
-
-    public final void shlq(Register dst, int imm8) {
-        assert isShiftCount(imm8 >> 1) : "illegal shift count";
-        int encode = prefixqAndEncode(dst.encoding);
-        if (imm8 == 1) {
-            emitByte(0xD1);
-            emitByte(0xE0 | encode);
-        } else {
-            emitByte(0xC1);
-            emitByte(0xE0 | encode);
-            emitByte(imm8);
-        }
-    }
-
-    public final void shlq(Register dst) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xD3);
-        emitByte(0xE0 | encode);
-    }
-
-    public final void shrq(Register dst, int imm8) {
-        assert isShiftCount(imm8 >> 1) : "illegal shift count";
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xC1);
-        emitByte(0xE8 | encode);
-        emitByte(imm8);
-    }
-
-    public final void shrq(Register dst) {
-        int encode = prefixqAndEncode(dst.encoding);
-        emitByte(0xD3);
-        emitByte(0xE8 | encode);
-    }
-
-    public final void sqrtsd(Register dst, Address src) {
-        assert dst.isFpu();
-
-        emitByte(0xF2);
-        prefix(src, dst);
-        emitByte(0x0F);
-        emitByte(0x51);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void subq(Address dst, int imm32) {
-        prefixq(dst);
-        if (isByte(imm32)) {
-            emitByte(0x83);
-            emitOperandHelper(rbp, dst);
-            emitByte(imm32 & 0xFF);
-        } else {
-            emitByte(0x81);
-            emitOperandHelper(rbp, dst);
-            emitInt(imm32);
-        }
-    }
-
-    public final void subq(Register dst, int imm32) {
-        prefixqAndEncode(dst.encoding);
-        emitArith(0x81, 0xE8, dst, imm32);
-    }
-
-    public final void subq(Address dst, Register src) {
-        prefixq(dst, src);
-        emitByte(0x29);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void subq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x2B);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void subq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x2B, 0xC0, dst, src);
-    }
-
-    public final void testq(Register dst, int imm32) {
-        // not using emitArith because test
-        // doesn't support sign-extension of
-        // 8bit operands
-        int encode = dst.encoding;
-        if (encode == 0) {
-            emitByte(Prefix.REXW);
-            emitByte(0xA9);
-        } else {
-            encode = prefixqAndEncode(encode);
-            emitByte(0xF7);
-            emitByte(0xC0 | encode);
-        }
-        emitInt(imm32);
-    }
-
-    public final void testq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x85, 0xC0, dst, src);
-    }
-
-    public final void xaddq(Address dst, Register src) {
-        prefixq(dst, src);
-        emitByte(0x0F);
-        emitByte(0xC1);
-        emitOperandHelper(src, dst);
-    }
-
-    public final void xchgq(Register dst, Address src) {
-        prefixq(src, dst);
-        emitByte(0x87);
-        emitOperandHelper(dst, src);
-    }
-
-    public final void xchgq(Register dst, Register src) {
-        int encode = prefixqAndEncode(dst.encoding, src.encoding);
-        emitByte(0x87);
-        emitByte(0xc0 | encode);
-    }
-
-    public final void xorq(Register dst, int imm32) {
-        prefixqAndEncode(dst.encoding);
-        emitArith(0x81, 0xF0, dst, imm32);
-    }
-
-    public final void xorq(Register dst, Register src) {
-        prefixqAndEncode(dst.encoding, src.encoding);
-        emitArith(0x33, 0xC0, dst, src);
-    }
-
-    public final void xorq(Register dst, Address src) {
-
-        prefixq(src, dst);
-        emitByte(0x33);
-        emitOperandHelper(dst, src);
-
-    }
-
-    public final void membar(int barriers) {
-        if (target.isMP) {
-            // We only have to handle StoreLoad
-            if ((barriers & STORE_LOAD) != 0) {
-                // All usable chips support "locked" instructions which suffice
-                // as barriers, and are much faster than the alternative of
-                // using cpuid instruction. We use here a locked add [rsp],0.
-                // This is conveniently otherwise a no-op except for blowing
-                // flags.
-                // Any change to this code may need to revisit other places in
-                // the code where this idiom is used, in particular the
-                // orderAccess code.
-                lock();
-                addl(new Address(Word, RSP, 0), 0); // Assert the lock# signal here
-            }
-        }
-    }
-
-    @Override
-    protected final void patchJumpTarget(int branch, int branchTarget) {
-        int op = codeBuffer.getByte(branch);
-        assert op == 0xE8 // call
-            || op == 0x00 // jump table entry
-            || op == 0xE9 // jmp
-            || op == 0xEB // short jmp
-            || (op & 0xF0) == 0x70 // short jcc
-            || op == 0x0F && (codeBuffer.getByte(branch + 1) & 0xF0) == 0x80 // jcc
-        : "Invalid opcode at patch point branch=" + branch + ", branchTarget=" + branchTarget + ", op=" + op;
-
-        if (op == 0x00) {
-            int offsetToJumpTableBase = codeBuffer.getShort(branch + 1);
-            int jumpTableBase = branch - offsetToJumpTableBase;
-            int imm32 = branchTarget - jumpTableBase;
-            codeBuffer.emitInt(imm32, branch);
-        } else if (op == 0xEB || (op & 0xF0) == 0x70) {
-
-            // short offset operators (jmp and jcc)
-            int imm8 = branchTarget - (branch + 2);
-            codeBuffer.emitByte(imm8, branch + 1);
-
-        } else {
-
-            int off = 1;
-            if (op == 0x0F) {
-                off = 2;
-            }
-
-            int imm32 = branchTarget - (branch + 4 + off);
-            codeBuffer.emitInt(imm32, branch + off);
-        }
-    }
-
-    public void nullCheck(Register r) {
-        testl(AMD64.rax, new Address(Word, r.asValue(Word), 0));
-    }
-
-    @Override
-    public void align(int modulus) {
-        if (codeBuffer.position() % modulus != 0) {
-            nop(modulus - (codeBuffer.position() % modulus));
-        }
-    }
-
-    public void pushfq() {
-        emitByte(0x9c);
-    }
-
-    public void popfq() {
-        emitByte(0x9D);
-    }
-
-    /**
-     * Makes sure that a subsequent {@linkplain #call} does not fail the alignment check.
-     */
-    public final void alignForPatchableDirectCall() {
-        int dispStart = codeBuffer.position() + 1;
-        int mask = target.wordSize - 1;
-        if ((dispStart & ~mask) != ((dispStart + 3) & ~mask)) {
-            nop(target.wordSize - (dispStart & mask));
-            assert ((codeBuffer.position() + 1) & mask) == 0;
-        }
-    }
-
-    /**
-     * Emits a direct call instruction. Note that the actual call target is not specified, because all calls
-     * need patching anyway. Therefore, 0 is emitted as the call target, and the user is responsible
-     * to add the call address to the appropriate patching tables.
-     */
-    public final void call() {
-        emitByte(0xE8);
-        emitInt(0);
-    }
-
-    public final void call(Register src) {
-        int encode = prefixAndEncode(src.encoding);
-        emitByte(0xFF);
-        emitByte(0xD0 | encode);
-    }
-
-    public void int3() {
-        emitByte(0xCC);
-    }
-
-    public void enter(short imm16, byte imm8) {
-        emitByte(0xC8);
-        // appended:
-        emitByte(imm16 & 0xff);
-        emitByte((imm16 >> 8) & 0xff);
-        emitByte(imm8);
-    }
-
-    private void emitx87(int b1, int b2, int i) {
-        assert 0 <= i && i < 8 : "illegal stack offset";
-        emitByte(b1);
-        emitByte(b2 + i);
-    }
-
-    public void fld(Address src) {
-        emitByte(0xDD);
-        emitOperandHelper(rax, src);
-    }
-
-    public void fld(int i) {
-        emitx87(0xD9, 0xC0, i);
-    }
-
-    public void fldln2() {
-        emitByte(0xD9);
-        emitByte(0xED);
-    }
-
-    public void fldlg2() {
-        emitByte(0xD9);
-        emitByte(0xEC);
-    }
-
-    public void fyl2x() {
-        emitByte(0xD9);
-        emitByte(0xF1);
-    }
-
-    public void fstp(Address src) {
-        emitByte(0xDD);
-        emitOperandHelper(rbx, src);
-    }
-
-    public void fsin() {
-        emitByte(0xD9);
-        emitByte(0xFE);
-    }
-
-    public void fcos() {
-        emitByte(0xD9);
-        emitByte(0xFF);
-    }
-
-    public void fptan() {
-        emitByte(0xD9);
-        emitByte(0xF2);
-    }
-
-    public void fstp(int i) {
-        emitx87(0xDD, 0xD8, i);
-    }
-
-    @Override
-    public void bangStack(int disp) {
-        movq(new Address(target.wordKind, AMD64.RSP, -disp), AMD64.rax);
-    }
-}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/AMD64MacroAssembler.java
--- a/graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/AMD64MacroAssembler.java	Tue Oct 02 22:06:37 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,390 +0,0 @@
-/*
- * Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-package com.oracle.max.asm.target.amd64;
-
-import com.oracle.graal.api.code.*;
-import com.oracle.graal.api.meta.*;
-import com.oracle.max.asm.*;
-
-/**
- * This class implements commonly used X86 code patterns.
- */
-public class AMD64MacroAssembler extends AMD64Assembler {
-
-    public AMD64MacroAssembler(TargetDescription target, RegisterConfig registerConfig) {
-        super(target, registerConfig);
-    }
-
-    public void pushptr(Address src) {
-        pushq(src);
-    }
-
-    public void popptr(Address src) {
-        popq(src);
-    }
-
-    public void xorptr(Register dst, Register src) {
-        xorq(dst, src);
-    }
-
-    public void xorptr(Register dst, Address src) {
-        xorq(dst, src);
-    }
-
-    // 64 bit versions
-
-
-    public void decrementq(Register reg, int value) {
-        if (value == Integer.MIN_VALUE) {
-            subq(reg, value);
-            return;
-        }
-        if (value < 0) {
-            incrementq(reg, -value);
-            return;
-        }
-        if (value == 0) {
-            return;
-        }
-        if (value == 1 && AsmOptions.UseIncDec) {
-            decq(reg);
-        } else {
-            subq(reg, value);
-        }
-    }
-
-    public void incrementq(Register reg, int value) {
-        if (value == Integer.MIN_VALUE) {
-            addq(reg, value);
-            return;
-        }
-        if (value < 0) {
-            decrementq(reg, -value);
-            return;
-        }
-        if (value == 0) {
-            return;
-        }
-        if (value == 1 && AsmOptions.UseIncDec) {
-            incq(reg);
-        } else {
-            addq(reg, value);
-        }
-    }
-
-    // These are mostly for initializing null
-    public void movptr(Address dst, int src) {
-        movslq(dst, src);
-    }
-
-    public final void cmp32(Register src1, int imm) {
-        cmpl(src1, imm);
-    }
-
-    public final void cmp32(Register src1, Address src2) {
-        cmpl(src1, src2);
-    }
-
-    public void cmpsd2int(Register opr1, Register opr2, Register dst, boolean unorderedIsLess) {
-        assert opr1.isFpu() && opr2.isFpu();
-        ucomisd(opr1, opr2);
-
-        Label l = new Label();
-        if (unorderedIsLess) {
-            movl(dst, -1);
-            jcc(AMD64Assembler.ConditionFlag.parity, l);
-            jcc(AMD64Assembler.ConditionFlag.below, l);
-            movl(dst, 0);
-            jcc(AMD64Assembler.ConditionFlag.equal, l);
-            incrementl(dst, 1);
-        } else { // unordered is greater
-            movl(dst, 1);
-            jcc(AMD64Assembler.ConditionFlag.parity, l);
-            jcc(AMD64Assembler.ConditionFlag.above, l);
-            movl(dst, 0);
-            jcc(AMD64Assembler.ConditionFlag.equal, l);
-            decrementl(dst, 1);
-        }
-        bind(l);
-    }
-
-    public void cmpss2int(Register opr1, Register opr2, Register dst, boolean unorderedIsLess) {
-        assert opr1.isFpu();
-        assert opr2.isFpu();
-        ucomiss(opr1, opr2);
-
-        Label l = new Label();
-        if (unorderedIsLess) {
-            movl(dst, -1);
-            jcc(AMD64Assembler.ConditionFlag.parity, l);
-            jcc(AMD64Assembler.ConditionFlag.below, l);
-            movl(dst, 0);
-            jcc(AMD64Assembler.ConditionFlag.equal, l);
-            incrementl(dst, 1);
-        } else { // unordered is greater
-            movl(dst, 1);
-            jcc(AMD64Assembler.ConditionFlag.parity, l);
-            jcc(AMD64Assembler.ConditionFlag.above, l);
-            movl(dst, 0);
-            jcc(AMD64Assembler.ConditionFlag.equal, l);
-            decrementl(dst, 1);
-        }
-        bind(l);
-    }
-
-    public void cmpptr(Register src1, Register src2) {
-        cmpq(src1, src2);
-    }
-
-    public void cmpptr(Register src1, Address src2) {
-        cmpq(src1, src2);
-    }
-
-    public void cmpptr(Register src1, int src2) {
-        cmpq(src1, src2);
-    }
-
-    public void cmpptr(Address src1, int src2) {
-        cmpq(src1, src2);
-    }
-
-    public void decrementl(Register reg, int value) {
-        if (value == Integer.MIN_VALUE) {
-            subl(reg, value);
-            return;
-        }
-        if (value < 0) {
-            incrementl(reg, -value);
-            return;
-        }
-        if (value == 0) {
-            return;
-        }
-        if (value == 1 && AsmOptions.UseIncDec) {
-            decl(reg);
-        } else {
-            subl(reg, value);
-        }
-    }
-
-    public void decrementl(Address dst, int value) {
-        if (value == Integer.MIN_VALUE) {
-            subl(dst, value);
-            return;
-        }
-        if (value < 0) {
-            incrementl(dst, -value);
-            return;
-        }
-        if (value == 0) {
-            return;
-        }
-        if (value == 1 && AsmOptions.UseIncDec) {
-            decl(dst);
-        } else {
-            subl(dst, value);
-        }
-    }
-
-    public void incrementl(Register reg, int value) {
-        if (value == Integer.MIN_VALUE) {
-            addl(reg, value);
-            return;
-        }
-        if (value < 0) {
-            decrementl(reg, -value);
-            return;
-        }
-        if (value == 0) {
-            return;
-        }
-        if (value == 1 && AsmOptions.UseIncDec) {
-            incl(reg);
-        } else {
-            addl(reg, value);
-        }
-    }
-
-    public void incrementl(Address dst, int value) {
-        if (value == Integer.MIN_VALUE) {
-            addl(dst, value);
-            return;
-        }
-        if (value < 0) {
-            decrementl(dst, -value);
-            return;
-        }
-        if (value == 0) {
-            return;
-        }
-        if (value == 1 && AsmOptions.UseIncDec) {
-            incl(dst);
-        } else {
-            addl(dst, value);
-        }
-    }
-
-    public void signExtendByte(Register reg) {
-        if (reg.isByte()) {
-            movsxb(reg, reg); // movsxb
-        } else {
-            shll(reg, 24);
-            sarl(reg, 24);
-        }
-    }
-
-    public void signExtendShort(Register reg) {
-        movsxw(reg, reg); // movsxw
-    }
-
-    // Support optimal SSE move instructions.
-    public void movflt(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        if (AsmOptions.UseXmmRegToRegMoveAll) {
-            movaps(dst, src);
-        } else {
-            movss(dst, src);
-        }
-    }
-
-    public void movflt(Register dst, Address src) {
-        assert dst.isFpu();
-        movss(dst, src);
-    }
-
-    public void movflt(Address dst, Register src) {
-        assert src.isFpu();
-        movss(dst, src);
-    }
-
-    public void movdbl(Register dst, Register src) {
-        assert dst.isFpu() && src.isFpu();
-        if (AsmOptions.UseXmmRegToRegMoveAll) {
-            movapd(dst, src);
-        } else {
-            movsd(dst, src);
-        }
-    }
-
-    public void movdbl(Register dst, Address src) {
-        assert dst.isFpu();
-        if (AsmOptions.UseXmmLoadAndClearUpper) {
-            movsd(dst, src);
-        } else {
-            movlpd(dst, src);
-        }
-    }
-
-    public void movdbl(Address dst, Register src) {
-        assert src.isFpu();
-        movsd(dst, src);
-    }
-
-    /**
-     * Non-atomic write of a 64-bit constant to memory. Do not use
-     * if the address might be a volatile field!
-     */
-    public void movlong(Address dst, long src) {
-        Address high = new Address(dst.getKind(), dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
-        movl(dst, (int) (src & 0xFFFFFFFF));
-        movl(high, (int) (src >> 32));
-    }
-
-    public void xchgptr(Register src1, Register src2) {
-        xchgq(src1, src2);
-    }
-
-    public void flog(Register dest, Register value, boolean base10) {
-        assert value.spillSlotSize == dest.spillSlotSize;
-
-        Address tmp = new Address(Kind.Double, AMD64.RSP);
-        if (base10) {
-            fldlg2();
-        } else {
-            fldln2();
-        }
-        subq(AMD64.rsp, value.spillSlotSize);
-        movsd(tmp, value);
-        fld(tmp);
-        fyl2x();
-        fstp(tmp);
-        movsd(dest, tmp);
-        addq(AMD64.rsp, dest.spillSlotSize);
-    }
-
-    public void fsin(Register dest, Register value) {
-        ftrig(dest, value, 's');
-    }
-
-    public void fcos(Register dest, Register value) {
-        ftrig(dest, value, 'c');
-    }
-
-    public void ftan(Register dest, Register value) {
-        ftrig(dest, value, 't');
-    }
-
-    private void ftrig(Register dest, Register value, char op) {
-        assert value.spillSlotSize == dest.spillSlotSize;
-
-        Address tmp = new Address(Kind.Double, AMD64.RSP);
-        subq(AMD64.rsp, value.spillSlotSize);
-        movsd(tmp, value);
-        fld(tmp);
-        if (op == 's') {
-            fsin();
-        } else if (op == 'c') {
-            fcos();
-        } else if (op == 't') {
-            fptan();
-            fstp(0); // ftan pushes 1.0 in addition to the actual result, pop
-        } else {
-            throw new InternalError("should not reach here");
-        }
-        fstp(tmp);
-        movsd(dest, tmp);
-        addq(AMD64.rsp, dest.spillSlotSize);
-    }
-
-    /**
-     * Emit code to save a given set of callee save registers in the
-     * {@linkplain CalleeSaveLayout CSA} within the frame.
-     * @param csl the description of the CSA
-     * @param frameToCSA offset from the frame pointer to the CSA
-     */
-    public void save(CalleeSaveLayout csl, int frameToCSA) {
-        RegisterValue frame = frameRegister.asValue();
-        for (Register r : csl.registers) {
-            int offset = csl.offsetOf(r);
-            movq(new Address(target.wordKind, frame, frameToCSA + offset), r);
-        }
-    }
-
-    public void restore(CalleeSaveLayout csl, int frameToCSA) {
-        RegisterValue frame = frameRegister.asValue();
-        for (Register r : csl.registers) {
-            int offset = csl.offsetOf(r);
-            movq(r, new Address(target.wordKind, frame, frameToCSA + offset));
-        }
-    }
-}
diff -r dc409418cc2c -r 85c1b84f8fd9 graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/X86InstructionDecoder.java
--- a/graal/com.oracle.max.asm/src/com/oracle/max/asm/target/amd64/X86InstructionDecoder.java	Tue Oct 02 22:06:37 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,507 +0,0 @@
-/*
- * Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-package com.oracle.max.asm.target.amd64;
-
-
-public final class X86InstructionDecoder {
-
-    private boolean targetIs64Bit;
-    private byte[] code;
-    private int currentEndOfInstruction;
-    private int currentDisplacementPosition;
-
-    private static class Prefix {
-
-        // segment overrides
-        public static final int CSSegment = 0x2e;
-        public static final int SSSegment = 0x36;
-        public static final int DSSegment = 0x3e;
-        public static final int ESSegment = 0x26;
-        public static final int FSSegment = 0x64;
-        public static final int GSSegment = 0x65;
-        public static final int REX = 0x40;
-        public static final int REXB = 0x41;
-        public static final int REXX = 0x42;
-        public static final int REXXB = 0x43;
-        public static final int REXR = 0x44;
-        public static final int REXRB = 0x45;
-        public static final int REXRX = 0x46;
-        public static final int REXRXB = 0x47;
-        public static final int REXW = 0x48;
-        public static final int REXWB = 0x49;
-        public static final int REXWX = 0x4A;
-        public static final int REXWXB = 0x4B;
-        public static final int REXWR = 0x4C;
-        public static final int REXWRB = 0x4D;
-        public static final int REXWRX = 0x4E;
-        public static final int REXWRXB = 0x4F;
-    }
-
-    private X86InstructionDecoder(byte[] code, boolean targetIs64Bit) {
-        this.code = code;
-        this.targetIs64Bit = targetIs64Bit;
-    }
-
-    public int currentEndOfInstruction() {
-        return currentEndOfInstruction;
-    }
-
-    public int currentDisplacementPosition() {
-        return currentDisplacementPosition;
-    }
-
-    public void decodePosition(int inst) {
-
-        assert inst >= 0 && inst < code.length;
-
-        // Decode the given instruction, and return the Pointer of
-        // an embedded 32-bit operand word.
-
-        // If "which" is WhichOperand.disp32operand, selects the displacement portion
-        // of an effective Pointer specifier.
-        // If "which" is imm64Operand, selects the trailing immediate constant.
-        // If "which" is WhichOperand.call32operand, selects the displacement of a call or jump.
-        // Caller is responsible for ensuring that there is such an operand,
-        // and that it is 32/64 bits wide.
-
-        // If "which" is endPcOperand, find the end of the instruction.
-
-        int ip = inst;
-        boolean is64bit = false;
-
-        boolean hasDisp32 = false;
-        int tailSize = 0; // other random bytes (#32, #16, etc.) at end of insn
-
-        boolean againAfterPrefix = true;
-
-        while (againAfterPrefix) {
-            againAfterPrefix = false;
-            switch (0xFF & code[ip++]) {
-
-                // These convenience macros generate groups of "case" labels for the switch.
-
-                case Prefix.CSSegment:
-                case Prefix.SSSegment:
-                case Prefix.DSSegment:
-                case Prefix.ESSegment:
-                case Prefix.FSSegment:
-                case Prefix.GSSegment:
-                    // Seems dubious
-                    assert !targetIs64Bit : "shouldn't have that prefix";
-                    assert ip == inst + 1 : "only one prefix allowed";
-                    againAfterPrefix = true;
-                    break;
-
-                case 0x67:
-                case Prefix.REX:
-                case Prefix.REXB:
-                case Prefix.REXX:
-                case Prefix.REXXB:
-                case Prefix.REXR:
-                case Prefix.REXRB:
-                case Prefix.REXRX:
-                case Prefix.REXRXB:
-                    assert targetIs64Bit : "64bit prefixes";
-                    againAfterPrefix = true;
-                    break;
-
-                case Prefix.REXW:
-                case Prefix.REXWB:
-                case Prefix.REXWX:
-                case Prefix.REXWXB:
-                case Prefix.REXWR:
-                case Prefix.REXWRB:
-                case Prefix.REXWRX:
-                case Prefix.REXWRXB:
-                    assert targetIs64Bit : "64bit prefixes";
-                    is64bit = true;
-                    againAfterPrefix = true;
-                    break;
-
-                case 0xFF: // pushq a; decl a; incl a; call a; jmp a
-                case 0x88: // movb a, r
-                case 0x89: // movl a, r
-                case 0x8A: // movb r, a
-                case 0x8B: // movl r, a
-                case 0x8F: // popl a
-                    hasDisp32 = true;
-                    break;
-
-                case 0x68: // pushq #32
-                    currentEndOfInstruction = ip + 4;
-                    currentDisplacementPosition = ip;
-                    return; // not produced by emitOperand
-
-                case 0x66: // movw ... (size prefix)
-                    boolean againAfterSizePrefix2 = true;
-                    while (againAfterSizePrefix2) {
-                        againAfterSizePrefix2 = false;
-                        switch (0xFF & code[ip++]) {
-                            case Prefix.REX:
-                            case Prefix.REXB:
-                            case Prefix.REXX:
-                            case Prefix.REXXB:
-                            case Prefix.REXR:
-                            case Prefix.REXRB:
-                            case Prefix.REXRX:
-                            case Prefix.REXRXB:
-                            case Prefix.REXW:
-                            case Prefix.REXWB:
-                            case Prefix.REXWX:
-                            case Prefix.REXWXB:
-                            case Prefix.REXWR:
-                            case Prefix.REXWRB:
-                            case Prefix.REXWRX:
-                            case Prefix.REXWRXB:
-                                assert targetIs64Bit : "64bit prefix found";
-                                againAfterSizePrefix2 = true;
-                                break;
-                            case 0x8B: // movw r, a
-                            case 0x89: // movw a, r
-                                hasDisp32 = true;
-                                break;
-                            case 0xC7: // movw a, #16
-                                hasDisp32 = true;
-                                tailSize = 2; // the imm16
-                                break;
-                            case 0x0F: // several SSE/SSE2 variants
-                                ip--; // reparse the 0x0F
-                                againAfterPrefix = true;
-                                break;
-                            default:
-                                throw new InternalError("should not reach here");
-                        }
-                    }
-                    break;
-
-                case 0xB8: // movl/q r, #32/#64(oop?)
-                case 0xB9:
-                case 0xBA:
-                case 0xBB:
-                case 0xBC:
-                case 0xBD:
-                case 0xBE:
-                case 0xBF:
-                    currentEndOfInstruction = ip + (is64bit ? 8 : 4);
-                    currentDisplacementPosition = ip;
-                    return;
-
-                case 0x69: // imul r, a, #32
-                case 0xC7: // movl a, #32(oop?)
-                    tailSize = 4;
-                    hasDisp32 = true; // has both kinds of operands!
-                    break;
-
-                case 0x0F: // movx..., etc.
-                    switch (0xFF & code[ip++]) {
-                        case 0x12: // movlps
-                        case 0x28: // movaps
-                        case 0x2E: // ucomiss
-                        case 0x2F: // comiss
-                        case 0x54: // andps
-                        case 0x55: // andnps
-                        case 0x56: // orps
-                        case 0x57: // xorps
-                        case 0x6E: // movd
-                        case 0x7E: // movd
-                        case 0xAE: // ldmxcsr a
-                            // 64bit side says it these have both operands but that doesn't
-                            // appear to be true
-                            hasDisp32 = true;
-                            break;
-
-                        case 0xAD: // shrd r, a, %cl
-                        case 0xAF: // imul r, a
-                        case 0xBE: // movsbl r, a (movsxb)
-                        case 0xBF: // movswl r, a (movsxw)
-                        case 0xB6: // movzbl r, a (movzxb)
-                        case 0xB7: // movzwl r, a (movzxw)
-                        case 0x40: // cmovl cc, r, a
-                        case 0x41:
-                        case 0x42:
-                        case 0x43:
-                        case 0x44:
-                        case 0x45:
-                        case 0x46:
-                        case 0x47:
-                        case 0x48:
-                        case 0x49:
-                        case 0x4A:
-                        case 0x4B:
-                        case 0x4C:
-                        case 0x4D:
-                        case 0x4E:
-                        case 0x4F:
-                        case 0xB0: // cmpxchgb
-                        case 0xB1: // cmpxchg
-                        case 0xC1: // xaddl
-                        case 0xC7: // cmpxchg8
-                        case 0x90: // setcc a
-                        case 0x91:
-                        case 0x92:
-                        case 0x93:
-                        case 0x94:
-                        case 0x95:
-                        case 0x96:
-                        case 0x97:
-                        case 0x98:
-                        case 0x99:
-                        case 0x9A:
-                        case 0x9B:
-                        case 0x9C:
-                        case 0x9D:
-                        case 0x9E:
-                        case 0x9F:
-                            hasDisp32 = true;
-                            // fall out of the switch to decode the Pointer
-                            break;
-
-                        case 0xAC: // shrd r, a, #8
-                            hasDisp32 = true;
-                            tailSize = 1; // the imm8
-                            break;
-
-                        case 0x80: // jcc rdisp32
-                        case 0x81:
-                        case 0x82:
-                        case 0x83:
-                        case 0x84:
-                        case 0x85:
-                        case 0x86:
-                        case 0x87:
-                        case 0x88:
-                        case 0x89:
-                        case 0x8A:
-                        case 0x8B:
-                        case 0x8C:
-                        case 0x8D:
-                        case 0x8E:
-                        case 0x8F:
-                            currentEndOfInstruction = ip + 4;
-                            currentDisplacementPosition = ip;
-                            return;
-                        default:
-                            throw new InternalError("should not reach here");
-                    }
-                    break;
-
-                case 0x81: // addl a, #32; addl r, #32
-                    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
-                    // on 32bit in the case of cmpl, the imm might be an oop
-                    tailSize = 4;
-                    hasDisp32 = true; // has both kinds of operands!
-                    break;
-
-                case 0x83: // addl a, #8; addl r, #8
-                    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
-                    hasDisp32 = true; // has both kinds of operands!
-                    tailSize = 1;
-                    break;
-
-                case 0x9B:
-                    switch (0xFF & code[ip++]) {
-                        case 0xD9: // fnstcw a
-                            hasDisp32 = true;
-                            break;
-                        default:
-                            throw new InternalError("should not reach here");
-                    }
-                    break;
-
-                case 0x00: // addb a, r; addl a, r; addb r, a; addl r, a
-                case 0x01:
-                case 0x02:
-                case 0x03:
-                case 0x10: // adc...
-                case 0x11:
-                case 0x12:
-                case 0x13:
-                case 0x20: // and...
-                case 0x21:
-                case 0x22:
-                case 0x23:
-                case 0x30: // xor...
-                case 0x31:
-                case 0x32:
-                case 0x33:
-                case 0x08: // or...
-                case 0x09:
-                case 0x0a:
-                case 0x0b:
-                case 0x18: // sbb...
-                case 0x19:
-                case 0x1a:
-                case 0x1b:
-                case 0x28: // sub...
-                case 0x29:
-                case 0x2a:
-                case 0x2b:
-                case 0xF7: // mull a
-                case 0x8D: // lea r, a
-                case 0x87: // xchg r, a
-                case 0x38: // cmp...
-                case 0x39:
-                case 0x3a:
-                case 0x3b:
-                case 0x85: // test r, a
-                    hasDisp32 = true; // has both kinds of operands!
-                    break;
-
-                case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
-                case 0xC6: // movb a, #8
-                case 0x80: // cmpb a, #8
-                case 0x6B: // imul r, a, #8
-                    hasDisp32 = true; // has both kinds of operands!
-                    tailSize = 1; // the imm8
-                    break;
-
-                case 0xE8: // call rdisp32
-                case 0xE9: // jmp rdisp32
-                    currentEndOfInstruction = ip + 4;
-                    currentDisplacementPosition = ip;
-                    return;
-
-                case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
-                case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
-                case 0xD9: // fldS a; fstS a; fstpS a; fldcw a
-                case 0xDD: // fldD a; fstD a; fstpD a
-                case 0xDB: // fildS a; fistpS a; fldX a; fstpX a
-                case 0xDF: // fildD a; fistpD a
-                case 0xD8: // faddS a; fsubrS a; fmulS a; fdivrS a; fcompS a
-                case 0xDC: // faddD a; fsubrD a; fmulD a; fdivrD a; fcompD a
-                case 0xDE: // faddpD a; fsubrpD a; fmulpD a; fdivrpD a; fcomppD a
-                    hasDisp32 = true;
-                    break;
-
-                case 0xF0: // Lock
-                    againAfterPrefix = true;
-                    break;
-
-                case 0xF3: // For SSE
-                case 0xF2: // For SSE2
-                    switch (0xFF & code[ip++]) {
-                        case Prefix.REX:
-                        case Prefix.REXB:
-                        case Prefix.REXX:
-                        case Prefix.REXXB:
-                        case Prefix.REXR:
-                        case Prefix.REXRB:
-                        case Prefix.REXRX:
-                        case Prefix.REXRXB:
-                        case Prefix.REXW:
-                        case Prefix.REXWB:
-                        case Prefix.REXWX:
-                        case Prefix.REXWXB:
-                        case Prefix.REXWR:
-                        case Prefix.REXWRB:
-                        case Prefix.REXWRX:
-                        case Prefix.REXWRXB:
-                            assert targetIs64Bit : "found 64bit prefix";
-                            ip++;
-                            // fall through
-                        default:
-                            ip++;
-                    }
-                    hasDisp32 = true; // has both kinds of operands!
-                    break;
-
-                default:
-                    throw new InternalError("should not reach here");
-            }
-        }
-
-        assert hasDisp32 : "(thomaswue) not sure if this holds: instruction has no disp32 field";
-
-        // parse the output of emitOperand
-        int op2 = 0xFF & code[ip++];
-        int base = op2 & 0x07;
-        int op3 = -1;
-        int b100 = 4;
-        int b101 = 5;
-        if (base == b100 && (op2 >> 6) != 3) {
-            op3 = 0xFF & code[ip++];
-            base = op3 & 0x07; // refetch the base
-        }
-        // now ip points at the disp (if any)
-
-        switch (op2 >> 6) {
-            case 0:
-                // [00 reg 100][ss index base]
-                // [00 reg 100][00 100 esp]
-                // [00 reg base]
-                // [00 reg 100][ss index 101][disp32]
-                // [00 reg 101] [disp32]
-
-                if (base == b101) {
-
-                    currentDisplacementPosition = ip;
-                    ip += 4; // skip the disp32
-                }
-                break;
-
-            case 1:
-                // [01 reg 100][ss index base][disp8]
-                // [01 reg 100][00 100 esp][disp8]
-                // [01 reg base] [disp8]
-                ip += 1; // skip the disp8
-                break;
-
-            case 2:
-                // [10 reg 100][ss index base][disp32]
-                // [10 reg 100][00 100 esp][disp32]
-                // [10 reg base] [disp32]
-                currentDisplacementPosition = ip;
-                ip += 4; // skip the disp32
-                break;
-
-            case 3:
-                // [11 reg base] (not a memory addressing mode)
-                break;
-        }
-
-        currentEndOfInstruction = ip + tailSize;
-    }
-
-    public static void patchRelativeInstruction(byte[] code, int codePos, int relative) {
-        X86InstructionDecoder decoder = new X86InstructionDecoder(code, true);
-        decoder.decodePosition(codePos);
-        int patchPos = decoder.currentDisplacementPosition();
-        int endOfInstruction = decoder.currentEndOfInstruction();
-        int offset = relative - endOfInstruction + codePos;
-        patchDisp32(code, patchPos, offset);
-    }
-
-    private static void patchDisp32(byte[] code, int pos, int offset) {
-        assert pos + 4 <= code.length;
-
-        assert code[pos] == 0;
-        assert code[pos + 1] == 0;
-        assert code[pos + 2] == 0;
-        assert code[pos + 3] == 0;
-
-        code[pos] = (byte) (offset & 0xFF);
-        code[pos + 1] = (byte) ((offset >> 8) & 0xFF);
-        code[pos + 2] = (byte) ((offset >> 16) & 0xFF);
-        code[pos + 3] = (byte) ((offset >> 24) & 0xFF);
-    }
-}
diff -r dc409418cc2c -r 85c1b84f8fd9 mx/projects
--- a/mx/projects	Tue Oct 02 22:06:37 2012 +0200
+++ b/mx/projects	Tue Oct 02 22:22:06 2012 +0200
@@ -98,7 +98,7 @@
 # graal.lir.amd64
 project@com.oracle.graal.lir.amd64@subDir=graal
 project@com.oracle.graal.lir.amd64@sourceDirs=src
-project@com.oracle.graal.lir.amd64@dependencies=com.oracle.graal.lir
+project@com.oracle.graal.lir.amd64@dependencies=com.oracle.graal.lir,com.oracle.max.asm.amd64
 project@com.oracle.graal.lir.amd64@checkstyle=com.oracle.graal.graph
 project@com.oracle.graal.lir.amd64@javaCompliance=1.7
 
@@ -220,6 +220,13 @@
 project@com.oracle.max.asm@checkstyle=com.oracle.graal.graph
 project@com.oracle.max.asm@javaCompliance=1.7
 
+# max.asm.amd64
+project@com.oracle.max.asm.amd64@subDir=graal
+project@com.oracle.max.asm.amd64@sourceDirs=src
+project@com.oracle.max.asm.amd64@dependencies=com.oracle.max.asm
+project@com.oracle.max.asm.amd64@checkstyle=com.oracle.graal.graph
+project@com.oracle.max.asm.amd64@javaCompliance=1.7
+
 # max.criutils
 project@com.oracle.max.criutils@subDir=graal
 project@com.oracle.max.criutils@sourceDirs=src
diff -r dc409418cc2c -r 85c1b84f8fd9 src/share/vm/runtime/arguments.cpp
--- a/src/share/vm/runtime/arguments.cpp	Tue Oct 02 22:06:37 2012 +0200
+++ b/src/share/vm/runtime/arguments.cpp	Tue Oct 02 22:22:06 2012 +0200
@@ -2154,6 +2154,7 @@
         "com.oracle.max.criutils",
         "com.oracle.graal.hotspot",
         "com.oracle.max.asm",
+        "com.oracle.max.asm.amd64",
         "com.oracle.graal.alloc",
         "com.oracle.graal.snippets",
         "com.oracle.graal.compiler",