Mercurial > hg > truffle
diff src/cpu/x86/vm/macroAssembler_x86.cpp @ 7482:989155e2d07a
Merge with hs25-b15.
author | Thomas Wuerthinger <thomas.wuerthinger@oracle.com> |
---|---|
date | Wed, 16 Jan 2013 01:34:24 +0100 |
parents | 038dd2875b94 |
children | b30b3c2a0cf2 db9981fd3124 |
line wrap: on
line diff
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Jan 15 18:54:02 2013 +0100 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Jan 16 01:34:24 2013 +0100 @@ -1023,7 +1023,7 @@ void MacroAssembler::leave() { // %%% is this really better? Why not on 32bit too? - emit_byte(0xC9); // LEAVE + emit_int8((unsigned char)0xC9); // LEAVE } void MacroAssembler::lneg(Register hi, Register lo) { @@ -2112,11 +2112,11 @@ if (UseAddressNop) { addr_nop_5(); } else { - emit_byte(0x26); // es: - emit_byte(0x2e); // cs: - emit_byte(0x64); // fs: - emit_byte(0x65); // gs: - emit_byte(0x90); + emit_int8(0x26); // es: + emit_int8(0x2e); // cs: + emit_int8(0x64); // fs: + emit_int8(0x65); // gs: + emit_int8((unsigned char)0x90); } } @@ -2534,13 +2534,13 @@ int offs = (intptr_t)dst.target() - ((intptr_t)pc()); if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { // 0111 tttn #8-bit disp - emit_byte(0x70 | cc); - emit_byte((offs - short_size) & 0xFF); + emit_int8(0x70 | cc); + emit_int8((offs - short_size) & 0xFF); } else { // 0000 1111 1000 tttn #32-bit disp - emit_byte(0x0F); - emit_byte(0x80 | cc); - emit_long(offs - long_size); + emit_int8(0x0F); + emit_int8((unsigned char)(0x80 | cc)); + emit_int32(offs - long_size); } } else { #ifdef ASSERT @@ -3085,7 +3085,8 @@ void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { // Used in sign-bit flipping with aligned address. - assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); + bool aligned_adr = (((intptr_t)src.target() & 15) == 0); + assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::pshufb(dst, as_Address(src)); } else { @@ -5223,6 +5224,22 @@ } +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) { + // cnt - number of qwords (8-byte words). + // base - start address, qword aligned. + assert(base==rdi, "base register must be edi for rep stos"); + assert(tmp==rax, "tmp register must be eax for rep stos"); + assert(cnt==rcx, "cnt register must be ecx for rep stos"); + + xorptr(tmp, tmp); + if (UseFastStosb) { + shlptr(cnt,3); // convert to number of bytes + rep_stosb(); + } else { + NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM + rep_stos(); + } +} // IndexOf for constant substrings with size >= 8 chars // which don't need to be loaded through stack. @@ -5658,42 +5675,114 @@ testl(cnt2, cnt2); jcc(Assembler::zero, LENGTH_DIFF_LABEL); - // Load first characters + // Compare first characters load_unsigned_short(result, Address(str1, 0)); load_unsigned_short(cnt1, Address(str2, 0)); - - // Compare first characters subl(result, cnt1); jcc(Assembler::notZero, POP_LABEL); - decrementl(cnt2); - jcc(Assembler::zero, LENGTH_DIFF_LABEL); - - { - // Check after comparing first character to see if strings are equivalent - Label LSkip2; - // Check if the strings start at same location - cmpptr(str1, str2); - jccb(Assembler::notEqual, LSkip2); - - // Check if the length difference is zero (from stack) - cmpl(Address(rsp, 0), 0x0); - jcc(Assembler::equal, LENGTH_DIFF_LABEL); - - // Strings might not be equivalent - bind(LSkip2); - } + cmpl(cnt2, 1); + jcc(Assembler::equal, LENGTH_DIFF_LABEL); + + // Check if the strings start at the same location. + cmpptr(str1, str2); + jcc(Assembler::equal, LENGTH_DIFF_LABEL); Address::ScaleFactor scale = Address::times_2; int stride = 8; - // Advance to next element - addptr(str1, 16/stride); - addptr(str2, 16/stride); - - if (UseSSE42Intrinsics) { + if (UseAVX >= 2) { + Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; + Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; + Label COMPARE_TAIL_LONG; + int pcmpmask = 0x19; + + // Setup to compare 16-chars (32-bytes) vectors, + // start from first character again because it has aligned address. + int stride2 = 16; + int adr_stride = stride << scale; + int adr_stride2 = stride2 << scale; + + assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); + // rax and rdx are used by pcmpestri as elements counters + movl(result, cnt2); + andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count + jcc(Assembler::zero, COMPARE_TAIL_LONG); + + // fast path : compare first 2 8-char vectors. + bind(COMPARE_16_CHARS); + movdqu(vec1, Address(str1, 0)); + pcmpestri(vec1, Address(str2, 0), pcmpmask); + jccb(Assembler::below, COMPARE_INDEX_CHAR); + + movdqu(vec1, Address(str1, adr_stride)); + pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); + jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); + addl(cnt1, stride); + + // Compare the characters at index in cnt1 + bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character + load_unsigned_short(result, Address(str1, cnt1, scale)); + load_unsigned_short(cnt2, Address(str2, cnt1, scale)); + subl(result, cnt2); + jmp(POP_LABEL); + + // Setup the registers to start vector comparison loop + bind(COMPARE_WIDE_VECTORS); + lea(str1, Address(str1, result, scale)); + lea(str2, Address(str2, result, scale)); + subl(result, stride2); + subl(cnt2, stride2); + jccb(Assembler::zero, COMPARE_WIDE_TAIL); + negptr(result); + + // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) + bind(COMPARE_WIDE_VECTORS_LOOP); + vmovdqu(vec1, Address(str1, result, scale)); + vpxor(vec1, Address(str2, result, scale)); + vptest(vec1, vec1); + jccb(Assembler::notZero, VECTOR_NOT_EQUAL); + addptr(result, stride2); + subl(cnt2, stride2); + jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); + + // compare wide vectors tail + bind(COMPARE_WIDE_TAIL); + testptr(result, result); + jccb(Assembler::zero, LENGTH_DIFF_LABEL); + + movl(result, stride2); + movl(cnt2, result); + negptr(result); + jmpb(COMPARE_WIDE_VECTORS_LOOP); + + // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. + bind(VECTOR_NOT_EQUAL); + lea(str1, Address(str1, result, scale)); + lea(str2, Address(str2, result, scale)); + jmp(COMPARE_16_CHARS); + + // Compare tail chars, length between 1 to 15 chars + bind(COMPARE_TAIL_LONG); + movl(cnt2, result); + cmpl(cnt2, stride); + jccb(Assembler::less, COMPARE_SMALL_STR); + + movdqu(vec1, Address(str1, 0)); + pcmpestri(vec1, Address(str2, 0), pcmpmask); + jcc(Assembler::below, COMPARE_INDEX_CHAR); + subptr(cnt2, stride); + jccb(Assembler::zero, LENGTH_DIFF_LABEL); + lea(str1, Address(str1, result, scale)); + lea(str2, Address(str2, result, scale)); + negptr(cnt2); + jmpb(WHILE_HEAD_LABEL); + + bind(COMPARE_SMALL_STR); + } else if (UseSSE42Intrinsics) { Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; int pcmpmask = 0x19; - // Setup to compare 16-byte vectors + // Setup to compare 8-char (16-byte) vectors, + // start from first character again because it has aligned address. movl(result, cnt2); andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count jccb(Assembler::zero, COMPARE_TAIL); @@ -5725,7 +5814,7 @@ jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); // compare wide vectors tail - testl(result, result); + testptr(result, result); jccb(Assembler::zero, LENGTH_DIFF_LABEL); movl(cnt2, stride); @@ -5737,21 +5826,20 @@ // Mismatched characters in the vectors bind(VECTOR_NOT_EQUAL); - addptr(result, cnt1); - movptr(cnt2, result); - load_unsigned_short(result, Address(str1, cnt2, scale)); - load_unsigned_short(cnt1, Address(str2, cnt2, scale)); - subl(result, cnt1); + addptr(cnt1, result); + load_unsigned_short(result, Address(str1, cnt1, scale)); + load_unsigned_short(cnt2, Address(str2, cnt1, scale)); + subl(result, cnt2); jmpb(POP_LABEL); bind(COMPARE_TAIL); // limit is zero movl(cnt2, result); // Fallthru to tail compare } - // Shift str2 and str1 to the end of the arrays, negate min - lea(str1, Address(str1, cnt2, scale, 0)); - lea(str2, Address(str2, cnt2, scale, 0)); + lea(str1, Address(str1, cnt2, scale)); + lea(str2, Address(str2, cnt2, scale)); + decrementl(cnt2); // first character was compared already negptr(cnt2); // Compare the rest of the elements @@ -5816,7 +5904,44 @@ shll(limit, 1); // byte count != 0 movl(result, limit); // copy - if (UseSSE42Intrinsics) { + if (UseAVX >= 2) { + // With AVX2, use 32-byte vector compare + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + + // Compare 32-byte vectors + andl(result, 0x0000001e); // tail count (in bytes) + andl(limit, 0xffffffe0); // vector count (in bytes) + jccb(Assembler::zero, COMPARE_TAIL); + + lea(ary1, Address(ary1, limit, Address::times_1)); + lea(ary2, Address(ary2, limit, Address::times_1)); + negptr(limit); + + bind(COMPARE_WIDE_VECTORS); + vmovdqu(vec1, Address(ary1, limit, Address::times_1)); + vmovdqu(vec2, Address(ary2, limit, Address::times_1)); + vpxor(vec1, vec2); + + vptest(vec1, vec1); + jccb(Assembler::notZero, FALSE_LABEL); + addptr(limit, 32); + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); + + testl(result, result); + jccb(Assembler::zero, TRUE_LABEL); + + vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); + vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); + vpxor(vec1, vec2); + + vptest(vec1, vec1); + jccb(Assembler::notZero, FALSE_LABEL); + jmpb(TRUE_LABEL); + + bind(COMPARE_TAIL); // limit is zero + movl(limit, result); + // Fallthru to tail compare + } else if (UseSSE42Intrinsics) { // With SSE4.2, use double quad vector compare Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; @@ -5994,29 +6119,53 @@ { assert( UseSSE >= 2, "supported cpu only" ); Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; - // Fill 32-byte chunks movdl(xtmp, value); - pshufd(xtmp, xtmp, 0); - - subl(count, 8 << shift); - jcc(Assembler::less, L_check_fill_8_bytes); - align(16); - - BIND(L_fill_32_bytes_loop); - - if (UseUnalignedLoadStores) { - movdqu(Address(to, 0), xtmp); - movdqu(Address(to, 16), xtmp); + if (UseAVX >= 2 && UseUnalignedLoadStores) { + // Fill 64-byte chunks + Label L_fill_64_bytes_loop, L_check_fill_32_bytes; + vpbroadcastd(xtmp, xtmp); + + subl(count, 16 << shift); + jcc(Assembler::less, L_check_fill_32_bytes); + align(16); + + BIND(L_fill_64_bytes_loop); + vmovdqu(Address(to, 0), xtmp); + vmovdqu(Address(to, 32), xtmp); + addptr(to, 64); + subl(count, 16 << shift); + jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); + + BIND(L_check_fill_32_bytes); + addl(count, 8 << shift); + jccb(Assembler::less, L_check_fill_8_bytes); + vmovdqu(Address(to, 0), xtmp); + addptr(to, 32); + subl(count, 8 << shift); } else { - movq(Address(to, 0), xtmp); - movq(Address(to, 8), xtmp); - movq(Address(to, 16), xtmp); - movq(Address(to, 24), xtmp); + // Fill 32-byte chunks + pshufd(xtmp, xtmp, 0); + + subl(count, 8 << shift); + jcc(Assembler::less, L_check_fill_8_bytes); + align(16); + + BIND(L_fill_32_bytes_loop); + + if (UseUnalignedLoadStores) { + movdqu(Address(to, 0), xtmp); + movdqu(Address(to, 16), xtmp); + } else { + movq(Address(to, 0), xtmp); + movq(Address(to, 8), xtmp); + movq(Address(to, 16), xtmp); + movq(Address(to, 24), xtmp); + } + + addptr(to, 32); + subl(count, 8 << shift); + jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); } - - addptr(to, 32); - subl(count, 8 << shift); - jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); BIND(L_check_fill_8_bytes); addl(count, 8 << shift); jccb(Assembler::zero, L_exit);