diff src/cpu/x86/vm/x86_64.ad @ 681:fbde8ec322d0

6761600: Use sse 4.2 in intrinsics Summary: Use SSE 4.2 in intrinsics for String.{compareTo/equals/indexOf} and Arrays.equals. Reviewed-by: kvn, never, jrose
author cfang
date Tue, 31 Mar 2009 14:07:08 -0700
parents d0994e5bebce
children 93c14e5562c4
line wrap: on
line diff
--- a/src/cpu/x86/vm/x86_64.ad	Tue Mar 31 10:02:01 2009 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Tue Mar 31 14:07:08 2009 -0700
@@ -3694,13 +3694,16 @@
     }
   %}
 
-  enc_class enc_String_Compare()
-  %{
+  enc_class enc_String_Compare(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2,
+                        rax_RegI tmp3, rbx_RegI tmp4, rcx_RegI result) %{
     Label RCX_GOOD_LABEL, LENGTH_DIFF_LABEL,
           POP_LABEL, DONE_LABEL, CONT_LABEL,
           WHILE_HEAD_LABEL;
     MacroAssembler masm(&cbuf);
 
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+
     // Get the first character position in both strings
     //         [8] char array, [12] offset, [16] count
     int value_offset  = java_lang_String::value_offset_in_bytes();
@@ -3718,6 +3721,7 @@
     // Compute the minimum of the string lengths(rsi) and the
     // difference of the string lengths (stack)
 
+    // do the conditional move stuff
     masm.movl(rdi, Address(rdi, count_offset));
     masm.movl(rsi, Address(rsi, count_offset));
     masm.movl(rcx, rdi);
@@ -3745,7 +3749,7 @@
       Label LSkip2;
       // Check if the strings start at same location
       masm.cmpptr(rbx, rax);
-      masm.jcc(Assembler::notEqual, LSkip2);
+      masm.jccb(Assembler::notEqual, LSkip2);
 
       // Check if the length difference is zero (from stack)
       masm.cmpl(Address(rsp, 0), 0x0);
@@ -3755,9 +3759,52 @@
       masm.bind(LSkip2);
     }
 
+    // Advance to next character
+    masm.addptr(rax, 2);
+    masm.addptr(rbx, 2);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
+      // Setup to compare 16-byte vectors
+      masm.movl(rdi, rsi);
+      masm.andl(rsi, 0xfffffff8); // rsi holds the vector count
+      masm.andl(rdi, 0x00000007); // rdi holds the tail count
+      masm.testl(rsi, rsi);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+
+      masm.lea(rax, Address(rax, rsi, Address::times_2));
+      masm.lea(rbx, Address(rbx, rsi, Address::times_2));
+      masm.negptr(rsi);
+
+      masm.bind(COMPARE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(rax, rsi, Address::times_2));
+      masm.movdqu(tmp2Reg, Address(rbx, rsi, Address::times_2));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+      masm.jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+      masm.addptr(rsi, 8);
+      masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+      masm.jmpb(COMPARE_TAIL);
+
+      // Mismatched characters in the vectors
+      masm.bind(VECTOR_NOT_EQUAL);
+      masm.lea(rax, Address(rax, rsi, Address::times_2));
+      masm.lea(rbx, Address(rbx, rsi, Address::times_2));
+      masm.movl(rdi, 8);
+
+      // Compare tail (< 8 chars), or rescan last vectors to
+      // find 1st mismatched characters
+      masm.bind(COMPARE_TAIL);
+      masm.testl(rdi, rdi);
+      masm.jccb(Assembler::zero, LENGTH_DIFF_LABEL);
+      masm.movl(rsi, rdi);
+      // Fallthru to tail compare
+    }
+
     // Shift RAX and RBX to the end of the arrays, negate min
-    masm.lea(rax, Address(rax, rsi, Address::times_2, 2));
-    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 2));
+    masm.lea(rax, Address(rax, rsi, Address::times_2, 0));
+    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 0));
     masm.negptr(rsi);
 
     // Compare the rest of the characters
@@ -3765,93 +3812,329 @@
     masm.load_unsigned_short(rcx, Address(rbx, rsi, Address::times_2, 0));
     masm.load_unsigned_short(rdi, Address(rax, rsi, Address::times_2, 0));
     masm.subl(rcx, rdi);
-    masm.jcc(Assembler::notZero, POP_LABEL);
+    masm.jccb(Assembler::notZero, POP_LABEL);
     masm.increment(rsi);
     masm.jcc(Assembler::notZero, WHILE_HEAD_LABEL);
 
     // Strings are equal up to min length.  Return the length difference.
     masm.bind(LENGTH_DIFF_LABEL);
     masm.pop(rcx);
-    masm.jmp(DONE_LABEL);
+    masm.jmpb(DONE_LABEL);
 
     // Discard the stored length difference
     masm.bind(POP_LABEL);
     masm.addptr(rsp, 8);
-       
+
     // That's it
     masm.bind(DONE_LABEL);
   %}
 
-  enc_class enc_Array_Equals(rdi_RegP ary1, rsi_RegP ary2, rax_RegI tmp1, rbx_RegI tmp2, rcx_RegI result) %{
-    Label TRUE_LABEL, FALSE_LABEL, DONE_LABEL, COMPARE_LOOP_HDR, COMPARE_LOOP;
+ enc_class enc_String_IndexOf(rsi_RegP str1, rdi_RegP str2, regD tmp1, rax_RegI tmp2,
+                        rcx_RegI tmp3, rdx_RegI tmp4, rbx_RegI result) %{
+    // SSE4.2 version
+    Label LOAD_SUBSTR, PREP_FOR_SCAN, SCAN_TO_SUBSTR,
+          SCAN_SUBSTR, RET_NEG_ONE, RET_NOT_FOUND, CLEANUP, DONE;
     MacroAssembler masm(&cbuf);
 
-    Register ary1Reg   = as_Register($ary1$$reg);
-    Register ary2Reg   = as_Register($ary2$$reg);
-    Register tmp1Reg   = as_Register($tmp1$$reg);
-    Register tmp2Reg   = as_Register($tmp2$$reg);
-    Register resultReg = as_Register($result$$reg);
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+
+    // Get the first character position in both strings
+    //         [8] char array, [12] offset, [16] count
+    int value_offset  = java_lang_String::value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int count_offset  = java_lang_String::count_offset_in_bytes();
+    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // Get counts for string and substr
+    masm.movl(rdx, Address(rsi, count_offset));
+    masm.movl(rax, Address(rdi, count_offset));
+    // Check for substr count > string count
+    masm.cmpl(rax, rdx);
+    masm.jcc(Assembler::greater, RET_NEG_ONE);
+
+    // Start the indexOf operation
+    // Get start addr of string
+    masm.load_heap_oop(rbx, Address(rsi, value_offset));
+    masm.movl(rcx, Address(rsi, offset_offset));
+    masm.lea(rsi, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.push(rsi);
+
+    // Get start addr of substr
+    masm.load_heap_oop(rbx, Address(rdi, value_offset));
+    masm.movl(rcx, Address(rdi, offset_offset));
+    masm.lea(rdi, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.push(rdi);
+    masm.push(rax);
+    masm.jmpb(PREP_FOR_SCAN);
+
+    // Substr count saved at sp
+    // Substr saved at sp+8
+    // String saved at sp+16
+
+    // Prep to load substr for scan
+    masm.bind(LOAD_SUBSTR);
+    masm.movptr(rdi, Address(rsp, 8));
+    masm.movl(rax, Address(rsp, 0));
+
+    // Load substr
+    masm.bind(PREP_FOR_SCAN);
+    masm.movdqu(tmp1Reg, Address(rdi, 0));
+    masm.addq(rdx, 8);    // prime the loop
+    masm.subptr(rsi, 16);
+
+    // Scan string for substr in 16-byte vectors
+    masm.bind(SCAN_TO_SUBSTR);
+    masm.subq(rdx, 8);
+    masm.addptr(rsi, 16);
+    masm.pcmpestri(tmp1Reg, Address(rsi, 0), 0x0d);
+    masm.jcc(Assembler::above, SCAN_TO_SUBSTR);
+    masm.jccb(Assembler::aboveEqual, RET_NOT_FOUND);
+
+    // Fallthru: found a potential substr
+
+    //Make sure string is still long enough
+    masm.subl(rdx, rcx);
+    masm.cmpl(rdx, rax);
+    masm.jccb(Assembler::negative, RET_NOT_FOUND);
+    // Compute start addr of substr
+    masm.lea(rsi, Address(rsi, rcx, Address::times_2));
+    masm.movptr(rbx, rsi);
+
+    // Compare potential substr
+    masm.addq(rdx, 8);        // prime the loop
+    masm.addq(rax, 8);
+    masm.subptr(rsi, 16);
+    masm.subptr(rdi, 16);
+
+    // Scan 16-byte vectors of string and substr
+    masm.bind(SCAN_SUBSTR);
+    masm.subq(rax, 8);
+    masm.subq(rdx, 8);
+    masm.addptr(rsi, 16);
+    masm.addptr(rdi, 16);
+    masm.movdqu(tmp1Reg, Address(rdi, 0));
+    masm.pcmpestri(tmp1Reg, Address(rsi, 0), 0x0d);
+    masm.jcc(Assembler::noOverflow, LOAD_SUBSTR);   // OF == 0
+    masm.jcc(Assembler::positive, SCAN_SUBSTR);     // SF == 0
+
+    // Compute substr offset
+    masm.movptr(rsi, Address(rsp, 16));
+    masm.subptr(rbx, rsi);
+    masm.shrl(rbx, 1);
+    masm.jmpb(CLEANUP);
+
+    masm.bind(RET_NEG_ONE);
+    masm.movl(rbx, -1);
+    masm.jmpb(DONE);
+
+    masm.bind(RET_NOT_FOUND);
+    masm.movl(rbx, -1);
+
+    masm.bind(CLEANUP);
+    masm.addptr(rsp, 24);
+
+    masm.bind(DONE);
+  %}
+
+  enc_class enc_String_Equals(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2,
+                              rbx_RegI tmp3, rcx_RegI tmp2, rax_RegI result) %{
+    Label RET_TRUE, RET_FALSE, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+    MacroAssembler masm(&cbuf);
+
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+
+    int value_offset  = java_lang_String::value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int count_offset  = java_lang_String::count_offset_in_bytes();
+    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // does source == target string?
+    masm.cmpptr(rdi, rsi);
+    masm.jcc(Assembler::equal, RET_TRUE);
+
+    // get and compare counts
+    masm.movl(rcx, Address(rdi, count_offset));
+    masm.movl(rax, Address(rsi, count_offset));
+    masm.cmpl(rcx, rax);
+    masm.jcc(Assembler::notEqual, RET_FALSE);
+    masm.testl(rax, rax);
+    masm.jcc(Assembler::zero, RET_TRUE);
+
+    // get source string offset and value
+    masm.load_heap_oop(rbx, Address(rsi, value_offset));
+    masm.movl(rax, Address(rsi, offset_offset));
+    masm.lea(rsi, Address(rbx, rax, Address::times_2, base_offset));
+
+    // get compare string offset and value
+    masm.load_heap_oop(rbx, Address(rdi, value_offset));
+    masm.movl(rax, Address(rdi, offset_offset));
+    masm.lea(rdi, Address(rbx, rax, Address::times_2, base_offset));
+
+    // Set byte count
+    masm.shll(rcx, 1);
+    masm.movl(rax, rcx);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+      // Compare 16-byte vectors
+      masm.andl(rcx, 0xfffffff0);  // vector count (in bytes)
+      masm.andl(rax, 0x0000000e);  // tail count (in bytes)
+      masm.testl(rcx, rcx);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+      masm.lea(rdi, Address(rdi, rcx, Address::times_1));
+      masm.lea(rsi, Address(rsi, rcx, Address::times_1));
+      masm.negptr(rcx);
+
+      masm.bind(COMPARE_WIDE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(rdi, rcx, Address::times_1));
+      masm.movdqu(tmp2Reg, Address(rsi, rcx, Address::times_1));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+      masm.jccb(Assembler::notZero, RET_FALSE);
+      masm.addptr(rcx, 16);
+      masm.jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+      masm.bind(COMPARE_TAIL);
+      masm.movl(rcx, rax);
+      // Fallthru to tail compare
+    }
+
+    // Compare 4-byte vectors
+    masm.andl(rcx, 0xfffffffc);  // vector count (in bytes)
+    masm.andl(rax, 0x00000002);  // tail char (in bytes)
+    masm.testl(rcx, rcx);
+    masm.jccb(Assembler::zero, COMPARE_CHAR);
+    masm.lea(rdi, Address(rdi, rcx, Address::times_1));
+    masm.lea(rsi, Address(rsi, rcx, Address::times_1));
+    masm.negptr(rcx);
+
+    masm.bind(COMPARE_VECTORS);
+    masm.movl(rbx, Address(rdi, rcx, Address::times_1));
+    masm.cmpl(rbx, Address(rsi, rcx, Address::times_1));
+    masm.jccb(Assembler::notEqual, RET_FALSE);
+    masm.addptr(rcx, 4);
+    masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+
+    // Compare trailing char (final 2 bytes), if any
+    masm.bind(COMPARE_CHAR);
+    masm.testl(rax, rax);
+    masm.jccb(Assembler::zero, RET_TRUE);
+    masm.load_unsigned_short(rbx, Address(rdi, 0));
+    masm.load_unsigned_short(rcx, Address(rsi, 0));
+    masm.cmpl(rbx, rcx);
+    masm.jccb(Assembler::notEqual, RET_FALSE);
+
+    masm.bind(RET_TRUE);
+    masm.movl(rax, 1);   // return true
+    masm.jmpb(DONE);
+
+    masm.bind(RET_FALSE);
+    masm.xorl(rax, rax); // return false
+
+    masm.bind(DONE);
+  %}
+
+  enc_class enc_Array_Equals(rdi_RegP ary1, rsi_RegP ary2, regD tmp1, regD tmp2,
+                             rax_RegI tmp3, rbx_RegI tmp4, rcx_RegI result) %{
+    Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+    MacroAssembler masm(&cbuf);
+
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+    Register ary1Reg      = as_Register($ary1$$reg);
+    Register ary2Reg      = as_Register($ary2$$reg);
+    Register tmp3Reg      = as_Register($tmp3$$reg);
+    Register tmp4Reg      = as_Register($tmp4$$reg);
+    Register resultReg    = as_Register($result$$reg);
 
     int length_offset  = arrayOopDesc::length_offset_in_bytes();
     int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
 
     // Check the input args
-    masm.cmpq(ary1Reg, ary2Reg);                        
+    masm.cmpq(ary1Reg, ary2Reg);
     masm.jcc(Assembler::equal, TRUE_LABEL);
-    masm.testq(ary1Reg, ary1Reg);                       
+    masm.testq(ary1Reg, ary1Reg);
     masm.jcc(Assembler::zero, FALSE_LABEL);
-    masm.testq(ary2Reg, ary2Reg);                       
+    masm.testq(ary2Reg, ary2Reg);
     masm.jcc(Assembler::zero, FALSE_LABEL);
 
     // Check the lengths
-    masm.movl(tmp2Reg, Address(ary1Reg, length_offset));
+    masm.movl(tmp4Reg, Address(ary1Reg, length_offset));
     masm.movl(resultReg, Address(ary2Reg, length_offset));
-    masm.cmpl(tmp2Reg, resultReg);
+    masm.cmpl(tmp4Reg, resultReg);
     masm.jcc(Assembler::notEqual, FALSE_LABEL);
     masm.testl(resultReg, resultReg);
     masm.jcc(Assembler::zero, TRUE_LABEL);
 
-    // Get the number of 4 byte vectors to compare
-    masm.shrl(resultReg, 1);
-
-    // Check for odd-length arrays
-    masm.andl(tmp2Reg, 1);
-    masm.testl(tmp2Reg, tmp2Reg);
-    masm.jcc(Assembler::zero, COMPARE_LOOP_HDR);
-
-    // Compare 2-byte "tail" at end of arrays
-    masm.load_unsigned_short(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
-    masm.load_unsigned_short(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
-    masm.cmpl(tmp1Reg, tmp2Reg);
-    masm.jcc(Assembler::notEqual, FALSE_LABEL);
+    //load array address
+    masm.lea(ary1Reg, Address(ary1Reg, base_offset));
+    masm.lea(ary2Reg, Address(ary2Reg, base_offset));
+
+    //set byte count
+    masm.shll(tmp4Reg, 1);
+    masm.movl(resultReg,tmp4Reg);
+
+    if (UseSSE42Intrinsics){
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+      // Compare 16-byte vectors
+      masm.andl(tmp4Reg, 0xfffffff0);    // vector count (in bytes)
+      masm.andl(resultReg, 0x0000000e);  // tail count (in bytes)
+      masm.testl(tmp4Reg, tmp4Reg);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+      masm.lea(ary1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+      masm.lea(ary2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+      masm.negptr(tmp4Reg);
+
+      masm.bind(COMPARE_WIDE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+      masm.movdqu(tmp2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+
+      masm.jccb(Assembler::notZero, FALSE_LABEL);
+      masm.addptr(tmp4Reg, 16);
+      masm.jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+      masm.bind(COMPARE_TAIL);
+      masm.movl(tmp4Reg, resultReg);
+      // Fallthru to tail compare
+    }
+
+   // Compare 4-byte vectors
+    masm.andl(tmp4Reg, 0xfffffffc);    // vector count (in bytes)
+    masm.andl(resultReg, 0x00000002);  // tail char (in bytes)
+    masm.testl(tmp4Reg, tmp4Reg); //if tmp2 == 0, only compare char
+    masm.jccb(Assembler::zero, COMPARE_CHAR);
+    masm.lea(ary1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+    masm.lea(ary2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+    masm.negptr(tmp4Reg);
+
+    masm.bind(COMPARE_VECTORS);
+    masm.movl(tmp3Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+    masm.cmpl(tmp3Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+    masm.jccb(Assembler::notEqual, FALSE_LABEL);
+    masm.addptr(tmp4Reg, 4);
+    masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+
+    // Compare trailing char (final 2 bytes), if any
+    masm.bind(COMPARE_CHAR);
     masm.testl(resultReg, resultReg);
-    masm.jcc(Assembler::zero, TRUE_LABEL);
-
-    // Setup compare loop
-    masm.bind(COMPARE_LOOP_HDR);
-    // Shift tmp1Reg and tmp2Reg to the last 4-byte boundary of the arrays
-    masm.leaq(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
-    masm.leaq(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
-    masm.negq(resultReg);
-
-    // 4-byte-wide compare loop
-    masm.bind(COMPARE_LOOP);
-    masm.movl(ary1Reg, Address(tmp1Reg, resultReg, Address::times_4, 0));
-    masm.movl(ary2Reg, Address(tmp2Reg, resultReg, Address::times_4, 0));
-    masm.cmpl(ary1Reg, ary2Reg);
-    masm.jcc(Assembler::notEqual, FALSE_LABEL);
-    masm.incrementq(resultReg);
-    masm.jcc(Assembler::notZero, COMPARE_LOOP);
+    masm.jccb(Assembler::zero, TRUE_LABEL);
+    masm.load_unsigned_short(tmp3Reg, Address(ary1Reg, 0));
+    masm.load_unsigned_short(tmp4Reg, Address(ary2Reg, 0));
+    masm.cmpl(tmp3Reg, tmp4Reg);
+    masm.jccb(Assembler::notEqual, FALSE_LABEL);
 
     masm.bind(TRUE_LABEL);
     masm.movl(resultReg, 1);   // return true
-    masm.jmp(DONE_LABEL);
+    masm.jmpb(DONE);
 
     masm.bind(FALSE_LABEL);
     masm.xorl(resultReg, resultReg); // return false
 
     // That's it
-    masm.bind(DONE_LABEL);
+    masm.bind(DONE);
   %}
 
   enc_class enc_rethrow()
@@ -5087,7 +5370,7 @@
 %}
 
 // Double register operands
-operand regD()
+operand regD() 
 %{
   constraint(ALLOC_IN_RC(double_reg));
   match(RegD);
@@ -11540,27 +11823,52 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct string_compare(rdi_RegP str1, rsi_RegP str2, rax_RegI tmp1,
-                        rbx_RegI tmp2, rcx_RegI result, rFlagsReg cr)
+instruct string_compare(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2,
+                        rax_RegI tmp3, rbx_RegI tmp4, rcx_RegI result, rFlagsReg cr)
 %{
   match(Set result (StrComp str1 str2));
-  effect(USE_KILL str1, USE_KILL str2, KILL tmp1, KILL tmp2, KILL cr);
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, KILL tmp3, KILL tmp4, KILL cr);
   //ins_cost(300);
 
   format %{ "String Compare $str1, $str2 -> $result    // XXX KILL RAX, RBX" %}
-  ins_encode( enc_String_Compare() );
+  ins_encode( enc_String_Compare(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct string_indexof(rsi_RegP str1, rdi_RegP str2, regD tmp1, rax_RegI tmp2,
+                        rcx_RegI tmp3, rdx_RegI tmp4, rbx_RegI result, rFlagsReg cr)
+%{
+  predicate(UseSSE42Intrinsics);
+  match(Set result (StrIndexOf str1 str2));
+  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, KILL tmp2, KILL tmp3, KILL tmp4, KILL cr);
+
+  format %{ "String IndexOf $str1,$str2 -> $result   // KILL RAX, RCX, RDX" %}
+  ins_encode( enc_String_IndexOf(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
+  ins_pipe( pipe_slow );
+%}
+
+// fast string equals
+instruct string_equals(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2, rbx_RegI tmp3,
+                       rcx_RegI tmp4, rax_RegI result, rFlagsReg cr)
+%{
+  match(Set result (StrEquals str1 str2));
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, KILL tmp3, KILL tmp4, KILL cr);
+
+  format %{ "String Equals $str1,$str2 -> $result    // KILL RBX, RCX" %}
+  ins_encode( enc_String_Equals(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
   ins_pipe( pipe_slow );
 %}
 
 // fast array equals
-instruct array_equals(rdi_RegP ary1, rsi_RegP ary2, rax_RegI tmp1, 
-                      rbx_RegI tmp2, rcx_RegI result, rFlagsReg cr) %{
+instruct array_equals(rdi_RegP ary1, rsi_RegP ary2, regD tmp1, regD tmp2, rax_RegI tmp3,
+                      rbx_RegI tmp4, rcx_RegI result, rFlagsReg cr)
+%{
   match(Set result (AryEq ary1 ary2));
-  effect(USE_KILL ary1, USE_KILL ary2, KILL tmp1, KILL tmp2, KILL cr);
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr);
   //ins_cost(300);
 
-  format %{ "Array Equals $ary1,$ary2 -> $result    // KILL RAX, RBX" %}
-  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result) );
+  format %{ "Array Equals $ary1,$ary2 -> $result   // KILL RAX, RBX" %}
+  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, tmp3, tmp4, result) );
   ins_pipe( pipe_slow );
 %}