comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 7482:989155e2d07a

Merge with hs25-b15.
author Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
date Wed, 16 Jan 2013 01:34:24 +0100
parents 038dd2875b94
children b30b3c2a0cf2 db9981fd3124
comparison
equal deleted inserted replaced
7381:6761a8f854a4 7482:989155e2d07a
1021 movptr(dst, rscratch1); 1021 movptr(dst, rscratch1);
1022 } 1022 }
1023 1023
1024 void MacroAssembler::leave() { 1024 void MacroAssembler::leave() {
1025 // %%% is this really better? Why not on 32bit too? 1025 // %%% is this really better? Why not on 32bit too?
1026 emit_byte(0xC9); // LEAVE 1026 emit_int8((unsigned char)0xC9); // LEAVE
1027 } 1027 }
1028 1028
1029 void MacroAssembler::lneg(Register hi, Register lo) { 1029 void MacroAssembler::lneg(Register hi, Register lo) {
1030 ShouldNotReachHere(); // 64bit doesn't use two regs 1030 ShouldNotReachHere(); // 64bit doesn't use two regs
1031 negq(lo); 1031 negq(lo);
2110 // A 5 byte nop that is safe for patching (see patch_verified_entry) 2110 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2111 void MacroAssembler::fat_nop() { 2111 void MacroAssembler::fat_nop() {
2112 if (UseAddressNop) { 2112 if (UseAddressNop) {
2113 addr_nop_5(); 2113 addr_nop_5();
2114 } else { 2114 } else {
2115 emit_byte(0x26); // es: 2115 emit_int8(0x26); // es:
2116 emit_byte(0x2e); // cs: 2116 emit_int8(0x2e); // cs:
2117 emit_byte(0x64); // fs: 2117 emit_int8(0x64); // fs:
2118 emit_byte(0x65); // gs: 2118 emit_int8(0x65); // gs:
2119 emit_byte(0x90); 2119 emit_int8((unsigned char)0x90);
2120 } 2120 }
2121 } 2121 }
2122 2122
2123 void MacroAssembler::fcmp(Register tmp) { 2123 void MacroAssembler::fcmp(Register tmp) {
2124 fcmp(tmp, 1, true, true); 2124 fcmp(tmp, 1, true, true);
2532 const int short_size = 2; 2532 const int short_size = 2;
2533 const int long_size = 6; 2533 const int long_size = 6;
2534 int offs = (intptr_t)dst.target() - ((intptr_t)pc()); 2534 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2535 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { 2535 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2536 // 0111 tttn #8-bit disp 2536 // 0111 tttn #8-bit disp
2537 emit_byte(0x70 | cc); 2537 emit_int8(0x70 | cc);
2538 emit_byte((offs - short_size) & 0xFF); 2538 emit_int8((offs - short_size) & 0xFF);
2539 } else { 2539 } else {
2540 // 0000 1111 1000 tttn #32-bit disp 2540 // 0000 1111 1000 tttn #32-bit disp
2541 emit_byte(0x0F); 2541 emit_int8(0x0F);
2542 emit_byte(0x80 | cc); 2542 emit_int8((unsigned char)(0x80 | cc));
2543 emit_long(offs - long_size); 2543 emit_int32(offs - long_size);
2544 } 2544 }
2545 } else { 2545 } else {
2546 #ifdef ASSERT 2546 #ifdef ASSERT
2547 warning("reversing conditional branch"); 2547 warning("reversing conditional branch");
2548 #endif /* ASSERT */ 2548 #endif /* ASSERT */
3083 } 3083 }
3084 } 3084 }
3085 3085
3086 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { 3086 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3087 // Used in sign-bit flipping with aligned address. 3087 // Used in sign-bit flipping with aligned address.
3088 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3088 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3089 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3089 if (reachable(src)) { 3090 if (reachable(src)) {
3090 Assembler::pshufb(dst, as_Address(src)); 3091 Assembler::pshufb(dst, as_Address(src));
3091 } else { 3092 } else {
3092 lea(rscratch1, src); 3093 lea(rscratch1, src);
3093 Assembler::pshufb(dst, Address(rscratch1, 0)); 3094 Assembler::pshufb(dst, Address(rscratch1, 0));
5221 } 5222 }
5222 #endif 5223 #endif
5223 5224
5224 } 5225 }
5225 5226
5227 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
5228 // cnt - number of qwords (8-byte words).
5229 // base - start address, qword aligned.
5230 assert(base==rdi, "base register must be edi for rep stos");
5231 assert(tmp==rax, "tmp register must be eax for rep stos");
5232 assert(cnt==rcx, "cnt register must be ecx for rep stos");
5233
5234 xorptr(tmp, tmp);
5235 if (UseFastStosb) {
5236 shlptr(cnt,3); // convert to number of bytes
5237 rep_stosb();
5238 } else {
5239 NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
5240 rep_stos();
5241 }
5242 }
5226 5243
5227 // IndexOf for constant substrings with size >= 8 chars 5244 // IndexOf for constant substrings with size >= 8 chars
5228 // which don't need to be loaded through stack. 5245 // which don't need to be loaded through stack.
5229 void MacroAssembler::string_indexofC8(Register str1, Register str2, 5246 void MacroAssembler::string_indexofC8(Register str1, Register str2,
5230 Register cnt1, Register cnt2, 5247 Register cnt1, Register cnt2,
5656 5673
5657 // Is the minimum length zero? 5674 // Is the minimum length zero?
5658 testl(cnt2, cnt2); 5675 testl(cnt2, cnt2);
5659 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 5676 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
5660 5677
5661 // Load first characters 5678 // Compare first characters
5662 load_unsigned_short(result, Address(str1, 0)); 5679 load_unsigned_short(result, Address(str1, 0));
5663 load_unsigned_short(cnt1, Address(str2, 0)); 5680 load_unsigned_short(cnt1, Address(str2, 0));
5664
5665 // Compare first characters
5666 subl(result, cnt1); 5681 subl(result, cnt1);
5667 jcc(Assembler::notZero, POP_LABEL); 5682 jcc(Assembler::notZero, POP_LABEL);
5668 decrementl(cnt2); 5683 cmpl(cnt2, 1);
5669 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 5684 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
5670 5685
5671 { 5686 // Check if the strings start at the same location.
5672 // Check after comparing first character to see if strings are equivalent 5687 cmpptr(str1, str2);
5673 Label LSkip2; 5688 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
5674 // Check if the strings start at same location
5675 cmpptr(str1, str2);
5676 jccb(Assembler::notEqual, LSkip2);
5677
5678 // Check if the length difference is zero (from stack)
5679 cmpl(Address(rsp, 0), 0x0);
5680 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
5681
5682 // Strings might not be equivalent
5683 bind(LSkip2);
5684 }
5685 5689
5686 Address::ScaleFactor scale = Address::times_2; 5690 Address::ScaleFactor scale = Address::times_2;
5687 int stride = 8; 5691 int stride = 8;
5688 5692
5689 // Advance to next element 5693 if (UseAVX >= 2) {
5690 addptr(str1, 16/stride); 5694 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
5691 addptr(str2, 16/stride); 5695 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
5692 5696 Label COMPARE_TAIL_LONG;
5693 if (UseSSE42Intrinsics) { 5697 int pcmpmask = 0x19;
5698
5699 // Setup to compare 16-chars (32-bytes) vectors,
5700 // start from first character again because it has aligned address.
5701 int stride2 = 16;
5702 int adr_stride = stride << scale;
5703 int adr_stride2 = stride2 << scale;
5704
5705 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
5706 // rax and rdx are used by pcmpestri as elements counters
5707 movl(result, cnt2);
5708 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
5709 jcc(Assembler::zero, COMPARE_TAIL_LONG);
5710
5711 // fast path : compare first 2 8-char vectors.
5712 bind(COMPARE_16_CHARS);
5713 movdqu(vec1, Address(str1, 0));
5714 pcmpestri(vec1, Address(str2, 0), pcmpmask);
5715 jccb(Assembler::below, COMPARE_INDEX_CHAR);
5716
5717 movdqu(vec1, Address(str1, adr_stride));
5718 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
5719 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
5720 addl(cnt1, stride);
5721
5722 // Compare the characters at index in cnt1
5723 bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
5724 load_unsigned_short(result, Address(str1, cnt1, scale));
5725 load_unsigned_short(cnt2, Address(str2, cnt1, scale));
5726 subl(result, cnt2);
5727 jmp(POP_LABEL);
5728
5729 // Setup the registers to start vector comparison loop
5730 bind(COMPARE_WIDE_VECTORS);
5731 lea(str1, Address(str1, result, scale));
5732 lea(str2, Address(str2, result, scale));
5733 subl(result, stride2);
5734 subl(cnt2, stride2);
5735 jccb(Assembler::zero, COMPARE_WIDE_TAIL);
5736 negptr(result);
5737
5738 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
5739 bind(COMPARE_WIDE_VECTORS_LOOP);
5740 vmovdqu(vec1, Address(str1, result, scale));
5741 vpxor(vec1, Address(str2, result, scale));
5742 vptest(vec1, vec1);
5743 jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
5744 addptr(result, stride2);
5745 subl(cnt2, stride2);
5746 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
5747
5748 // compare wide vectors tail
5749 bind(COMPARE_WIDE_TAIL);
5750 testptr(result, result);
5751 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
5752
5753 movl(result, stride2);
5754 movl(cnt2, result);
5755 negptr(result);
5756 jmpb(COMPARE_WIDE_VECTORS_LOOP);
5757
5758 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
5759 bind(VECTOR_NOT_EQUAL);
5760 lea(str1, Address(str1, result, scale));
5761 lea(str2, Address(str2, result, scale));
5762 jmp(COMPARE_16_CHARS);
5763
5764 // Compare tail chars, length between 1 to 15 chars
5765 bind(COMPARE_TAIL_LONG);
5766 movl(cnt2, result);
5767 cmpl(cnt2, stride);
5768 jccb(Assembler::less, COMPARE_SMALL_STR);
5769
5770 movdqu(vec1, Address(str1, 0));
5771 pcmpestri(vec1, Address(str2, 0), pcmpmask);
5772 jcc(Assembler::below, COMPARE_INDEX_CHAR);
5773 subptr(cnt2, stride);
5774 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
5775 lea(str1, Address(str1, result, scale));
5776 lea(str2, Address(str2, result, scale));
5777 negptr(cnt2);
5778 jmpb(WHILE_HEAD_LABEL);
5779
5780 bind(COMPARE_SMALL_STR);
5781 } else if (UseSSE42Intrinsics) {
5694 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 5782 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
5695 int pcmpmask = 0x19; 5783 int pcmpmask = 0x19;
5696 // Setup to compare 16-byte vectors 5784 // Setup to compare 8-char (16-byte) vectors,
5785 // start from first character again because it has aligned address.
5697 movl(result, cnt2); 5786 movl(result, cnt2);
5698 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 5787 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
5699 jccb(Assembler::zero, COMPARE_TAIL); 5788 jccb(Assembler::zero, COMPARE_TAIL);
5700 5789
5701 lea(str1, Address(str1, result, scale)); 5790 lea(str1, Address(str1, result, scale));
5723 addptr(result, stride); 5812 addptr(result, stride);
5724 subptr(cnt2, stride); 5813 subptr(cnt2, stride);
5725 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 5814 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
5726 5815
5727 // compare wide vectors tail 5816 // compare wide vectors tail
5728 testl(result, result); 5817 testptr(result, result);
5729 jccb(Assembler::zero, LENGTH_DIFF_LABEL); 5818 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
5730 5819
5731 movl(cnt2, stride); 5820 movl(cnt2, stride);
5732 movl(result, stride); 5821 movl(result, stride);
5733 negptr(result); 5822 negptr(result);
5735 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 5824 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
5736 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 5825 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
5737 5826
5738 // Mismatched characters in the vectors 5827 // Mismatched characters in the vectors
5739 bind(VECTOR_NOT_EQUAL); 5828 bind(VECTOR_NOT_EQUAL);
5740 addptr(result, cnt1); 5829 addptr(cnt1, result);
5741 movptr(cnt2, result); 5830 load_unsigned_short(result, Address(str1, cnt1, scale));
5742 load_unsigned_short(result, Address(str1, cnt2, scale)); 5831 load_unsigned_short(cnt2, Address(str2, cnt1, scale));
5743 load_unsigned_short(cnt1, Address(str2, cnt2, scale)); 5832 subl(result, cnt2);
5744 subl(result, cnt1);
5745 jmpb(POP_LABEL); 5833 jmpb(POP_LABEL);
5746 5834
5747 bind(COMPARE_TAIL); // limit is zero 5835 bind(COMPARE_TAIL); // limit is zero
5748 movl(cnt2, result); 5836 movl(cnt2, result);
5749 // Fallthru to tail compare 5837 // Fallthru to tail compare
5750 } 5838 }
5751
5752 // Shift str2 and str1 to the end of the arrays, negate min 5839 // Shift str2 and str1 to the end of the arrays, negate min
5753 lea(str1, Address(str1, cnt2, scale, 0)); 5840 lea(str1, Address(str1, cnt2, scale));
5754 lea(str2, Address(str2, cnt2, scale, 0)); 5841 lea(str2, Address(str2, cnt2, scale));
5842 decrementl(cnt2); // first character was compared already
5755 negptr(cnt2); 5843 negptr(cnt2);
5756 5844
5757 // Compare the rest of the elements 5845 // Compare the rest of the elements
5758 bind(WHILE_HEAD_LABEL); 5846 bind(WHILE_HEAD_LABEL);
5759 load_unsigned_short(result, Address(str1, cnt2, scale, 0)); 5847 load_unsigned_short(result, Address(str1, cnt2, scale, 0));
5814 } 5902 }
5815 5903
5816 shll(limit, 1); // byte count != 0 5904 shll(limit, 1); // byte count != 0
5817 movl(result, limit); // copy 5905 movl(result, limit); // copy
5818 5906
5819 if (UseSSE42Intrinsics) { 5907 if (UseAVX >= 2) {
5908 // With AVX2, use 32-byte vector compare
5909 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
5910
5911 // Compare 32-byte vectors
5912 andl(result, 0x0000001e); // tail count (in bytes)
5913 andl(limit, 0xffffffe0); // vector count (in bytes)
5914 jccb(Assembler::zero, COMPARE_TAIL);
5915
5916 lea(ary1, Address(ary1, limit, Address::times_1));
5917 lea(ary2, Address(ary2, limit, Address::times_1));
5918 negptr(limit);
5919
5920 bind(COMPARE_WIDE_VECTORS);
5921 vmovdqu(vec1, Address(ary1, limit, Address::times_1));
5922 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
5923 vpxor(vec1, vec2);
5924
5925 vptest(vec1, vec1);
5926 jccb(Assembler::notZero, FALSE_LABEL);
5927 addptr(limit, 32);
5928 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
5929
5930 testl(result, result);
5931 jccb(Assembler::zero, TRUE_LABEL);
5932
5933 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
5934 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
5935 vpxor(vec1, vec2);
5936
5937 vptest(vec1, vec1);
5938 jccb(Assembler::notZero, FALSE_LABEL);
5939 jmpb(TRUE_LABEL);
5940
5941 bind(COMPARE_TAIL); // limit is zero
5942 movl(limit, result);
5943 // Fallthru to tail compare
5944 } else if (UseSSE42Intrinsics) {
5820 // With SSE4.2, use double quad vector compare 5945 // With SSE4.2, use double quad vector compare
5821 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 5946 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
5822 5947
5823 // Compare 16-byte vectors 5948 // Compare 16-byte vectors
5824 andl(result, 0x0000000e); // tail count (in bytes) 5949 andl(result, 0x0000000e); // tail count (in bytes)
5992 } 6117 }
5993 BIND(L_fill_32_bytes); 6118 BIND(L_fill_32_bytes);
5994 { 6119 {
5995 assert( UseSSE >= 2, "supported cpu only" ); 6120 assert( UseSSE >= 2, "supported cpu only" );
5996 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 6121 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5997 // Fill 32-byte chunks
5998 movdl(xtmp, value); 6122 movdl(xtmp, value);
5999 pshufd(xtmp, xtmp, 0); 6123 if (UseAVX >= 2 && UseUnalignedLoadStores) {
6000 6124 // Fill 64-byte chunks
6001 subl(count, 8 << shift); 6125 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
6002 jcc(Assembler::less, L_check_fill_8_bytes); 6126 vpbroadcastd(xtmp, xtmp);
6003 align(16); 6127
6004 6128 subl(count, 16 << shift);
6005 BIND(L_fill_32_bytes_loop); 6129 jcc(Assembler::less, L_check_fill_32_bytes);
6006 6130 align(16);
6007 if (UseUnalignedLoadStores) { 6131
6008 movdqu(Address(to, 0), xtmp); 6132 BIND(L_fill_64_bytes_loop);
6009 movdqu(Address(to, 16), xtmp); 6133 vmovdqu(Address(to, 0), xtmp);
6134 vmovdqu(Address(to, 32), xtmp);
6135 addptr(to, 64);
6136 subl(count, 16 << shift);
6137 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6138
6139 BIND(L_check_fill_32_bytes);
6140 addl(count, 8 << shift);
6141 jccb(Assembler::less, L_check_fill_8_bytes);
6142 vmovdqu(Address(to, 0), xtmp);
6143 addptr(to, 32);
6144 subl(count, 8 << shift);
6010 } else { 6145 } else {
6011 movq(Address(to, 0), xtmp); 6146 // Fill 32-byte chunks
6012 movq(Address(to, 8), xtmp); 6147 pshufd(xtmp, xtmp, 0);
6013 movq(Address(to, 16), xtmp); 6148
6014 movq(Address(to, 24), xtmp); 6149 subl(count, 8 << shift);
6150 jcc(Assembler::less, L_check_fill_8_bytes);
6151 align(16);
6152
6153 BIND(L_fill_32_bytes_loop);
6154
6155 if (UseUnalignedLoadStores) {
6156 movdqu(Address(to, 0), xtmp);
6157 movdqu(Address(to, 16), xtmp);
6158 } else {
6159 movq(Address(to, 0), xtmp);
6160 movq(Address(to, 8), xtmp);
6161 movq(Address(to, 16), xtmp);
6162 movq(Address(to, 24), xtmp);
6163 }
6164
6165 addptr(to, 32);
6166 subl(count, 8 << shift);
6167 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6015 } 6168 }
6016
6017 addptr(to, 32);
6018 subl(count, 8 << shift);
6019 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6020 BIND(L_check_fill_8_bytes); 6169 BIND(L_check_fill_8_bytes);
6021 addl(count, 8 << shift); 6170 addl(count, 8 << shift);
6022 jccb(Assembler::zero, L_exit); 6171 jccb(Assembler::zero, L_exit);
6023 jmpb(L_fill_8_bytes); 6172 jmpb(L_fill_8_bytes);
6024 6173