Mercurial > hg > graal-jvmci-8
comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 7477:038dd2875b94
8005419: Improve intrinsics code performance on x86 by using AVX2
Summary: use 256-bit vpxor and vptest instructions in the String.compareTo() and equals() intrinsics.
Reviewed-by: twisti
author | kvn |
---|---|
date | Tue, 08 Jan 2013 11:30:51 -0800 |
parents | ffa87474d7a4 |
children | b30b3c2a0cf2 db9981fd3124 |
comparison
equal
deleted
inserted
replaced
7476:ffa87474d7a4 | 7477:038dd2875b94 |
---|---|
5673 | 5673 |
5674 // Is the minimum length zero? | 5674 // Is the minimum length zero? |
5675 testl(cnt2, cnt2); | 5675 testl(cnt2, cnt2); |
5676 jcc(Assembler::zero, LENGTH_DIFF_LABEL); | 5676 jcc(Assembler::zero, LENGTH_DIFF_LABEL); |
5677 | 5677 |
5678 // Load first characters | 5678 // Compare first characters |
5679 load_unsigned_short(result, Address(str1, 0)); | 5679 load_unsigned_short(result, Address(str1, 0)); |
5680 load_unsigned_short(cnt1, Address(str2, 0)); | 5680 load_unsigned_short(cnt1, Address(str2, 0)); |
5681 | |
5682 // Compare first characters | |
5683 subl(result, cnt1); | 5681 subl(result, cnt1); |
5684 jcc(Assembler::notZero, POP_LABEL); | 5682 jcc(Assembler::notZero, POP_LABEL); |
5685 decrementl(cnt2); | 5683 cmpl(cnt2, 1); |
5686 jcc(Assembler::zero, LENGTH_DIFF_LABEL); | 5684 jcc(Assembler::equal, LENGTH_DIFF_LABEL); |
5687 | 5685 |
5688 { | 5686 // Check if the strings start at the same location. |
5689 // Check after comparing first character to see if strings are equivalent | 5687 cmpptr(str1, str2); |
5690 Label LSkip2; | 5688 jcc(Assembler::equal, LENGTH_DIFF_LABEL); |
5691 // Check if the strings start at same location | |
5692 cmpptr(str1, str2); | |
5693 jccb(Assembler::notEqual, LSkip2); | |
5694 | |
5695 // Check if the length difference is zero (from stack) | |
5696 cmpl(Address(rsp, 0), 0x0); | |
5697 jcc(Assembler::equal, LENGTH_DIFF_LABEL); | |
5698 | |
5699 // Strings might not be equivalent | |
5700 bind(LSkip2); | |
5701 } | |
5702 | 5689 |
5703 Address::ScaleFactor scale = Address::times_2; | 5690 Address::ScaleFactor scale = Address::times_2; |
5704 int stride = 8; | 5691 int stride = 8; |
5705 | 5692 |
5706 // Advance to next element | 5693 if (UseAVX >= 2) { |
5707 addptr(str1, 16/stride); | 5694 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; |
5708 addptr(str2, 16/stride); | 5695 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; |
5709 | 5696 Label COMPARE_TAIL_LONG; |
5710 if (UseSSE42Intrinsics) { | 5697 int pcmpmask = 0x19; |
5698 | |
5699 // Setup to compare 16-chars (32-bytes) vectors, | |
5700 // start from first character again because it has aligned address. | |
5701 int stride2 = 16; | |
5702 int adr_stride = stride << scale; | |
5703 int adr_stride2 = stride2 << scale; | |
5704 | |
5705 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); | |
5706 // rax and rdx are used by pcmpestri as elements counters | |
5707 movl(result, cnt2); | |
5708 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count | |
5709 jcc(Assembler::zero, COMPARE_TAIL_LONG); | |
5710 | |
5711 // fast path : compare first 2 8-char vectors. | |
5712 bind(COMPARE_16_CHARS); | |
5713 movdqu(vec1, Address(str1, 0)); | |
5714 pcmpestri(vec1, Address(str2, 0), pcmpmask); | |
5715 jccb(Assembler::below, COMPARE_INDEX_CHAR); | |
5716 | |
5717 movdqu(vec1, Address(str1, adr_stride)); | |
5718 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); | |
5719 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); | |
5720 addl(cnt1, stride); | |
5721 | |
5722 // Compare the characters at index in cnt1 | |
5723 bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character | |
5724 load_unsigned_short(result, Address(str1, cnt1, scale)); | |
5725 load_unsigned_short(cnt2, Address(str2, cnt1, scale)); | |
5726 subl(result, cnt2); | |
5727 jmp(POP_LABEL); | |
5728 | |
5729 // Setup the registers to start vector comparison loop | |
5730 bind(COMPARE_WIDE_VECTORS); | |
5731 lea(str1, Address(str1, result, scale)); | |
5732 lea(str2, Address(str2, result, scale)); | |
5733 subl(result, stride2); | |
5734 subl(cnt2, stride2); | |
5735 jccb(Assembler::zero, COMPARE_WIDE_TAIL); | |
5736 negptr(result); | |
5737 | |
5738 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) | |
5739 bind(COMPARE_WIDE_VECTORS_LOOP); | |
5740 vmovdqu(vec1, Address(str1, result, scale)); | |
5741 vpxor(vec1, Address(str2, result, scale)); | |
5742 vptest(vec1, vec1); | |
5743 jccb(Assembler::notZero, VECTOR_NOT_EQUAL); | |
5744 addptr(result, stride2); | |
5745 subl(cnt2, stride2); | |
5746 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); | |
5747 | |
5748 // compare wide vectors tail | |
5749 bind(COMPARE_WIDE_TAIL); | |
5750 testptr(result, result); | |
5751 jccb(Assembler::zero, LENGTH_DIFF_LABEL); | |
5752 | |
5753 movl(result, stride2); | |
5754 movl(cnt2, result); | |
5755 negptr(result); | |
5756 jmpb(COMPARE_WIDE_VECTORS_LOOP); | |
5757 | |
5758 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. | |
5759 bind(VECTOR_NOT_EQUAL); | |
5760 lea(str1, Address(str1, result, scale)); | |
5761 lea(str2, Address(str2, result, scale)); | |
5762 jmp(COMPARE_16_CHARS); | |
5763 | |
5764 // Compare tail chars, length between 1 to 15 chars | |
5765 bind(COMPARE_TAIL_LONG); | |
5766 movl(cnt2, result); | |
5767 cmpl(cnt2, stride); | |
5768 jccb(Assembler::less, COMPARE_SMALL_STR); | |
5769 | |
5770 movdqu(vec1, Address(str1, 0)); | |
5771 pcmpestri(vec1, Address(str2, 0), pcmpmask); | |
5772 jcc(Assembler::below, COMPARE_INDEX_CHAR); | |
5773 subptr(cnt2, stride); | |
5774 jccb(Assembler::zero, LENGTH_DIFF_LABEL); | |
5775 lea(str1, Address(str1, result, scale)); | |
5776 lea(str2, Address(str2, result, scale)); | |
5777 negptr(cnt2); | |
5778 jmpb(WHILE_HEAD_LABEL); | |
5779 | |
5780 bind(COMPARE_SMALL_STR); | |
5781 } else if (UseSSE42Intrinsics) { | |
5711 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; | 5782 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; |
5712 int pcmpmask = 0x19; | 5783 int pcmpmask = 0x19; |
5713 // Setup to compare 16-byte vectors | 5784 // Setup to compare 8-char (16-byte) vectors, |
5785 // start from first character again because it has aligned address. | |
5714 movl(result, cnt2); | 5786 movl(result, cnt2); |
5715 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count | 5787 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count |
5716 jccb(Assembler::zero, COMPARE_TAIL); | 5788 jccb(Assembler::zero, COMPARE_TAIL); |
5717 | 5789 |
5718 lea(str1, Address(str1, result, scale)); | 5790 lea(str1, Address(str1, result, scale)); |
5740 addptr(result, stride); | 5812 addptr(result, stride); |
5741 subptr(cnt2, stride); | 5813 subptr(cnt2, stride); |
5742 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); | 5814 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); |
5743 | 5815 |
5744 // compare wide vectors tail | 5816 // compare wide vectors tail |
5745 testl(result, result); | 5817 testptr(result, result); |
5746 jccb(Assembler::zero, LENGTH_DIFF_LABEL); | 5818 jccb(Assembler::zero, LENGTH_DIFF_LABEL); |
5747 | 5819 |
5748 movl(cnt2, stride); | 5820 movl(cnt2, stride); |
5749 movl(result, stride); | 5821 movl(result, stride); |
5750 negptr(result); | 5822 negptr(result); |
5752 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); | 5824 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); |
5753 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); | 5825 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); |
5754 | 5826 |
5755 // Mismatched characters in the vectors | 5827 // Mismatched characters in the vectors |
5756 bind(VECTOR_NOT_EQUAL); | 5828 bind(VECTOR_NOT_EQUAL); |
5757 addptr(result, cnt1); | 5829 addptr(cnt1, result); |
5758 movptr(cnt2, result); | 5830 load_unsigned_short(result, Address(str1, cnt1, scale)); |
5759 load_unsigned_short(result, Address(str1, cnt2, scale)); | 5831 load_unsigned_short(cnt2, Address(str2, cnt1, scale)); |
5760 load_unsigned_short(cnt1, Address(str2, cnt2, scale)); | 5832 subl(result, cnt2); |
5761 subl(result, cnt1); | |
5762 jmpb(POP_LABEL); | 5833 jmpb(POP_LABEL); |
5763 | 5834 |
5764 bind(COMPARE_TAIL); // limit is zero | 5835 bind(COMPARE_TAIL); // limit is zero |
5765 movl(cnt2, result); | 5836 movl(cnt2, result); |
5766 // Fallthru to tail compare | 5837 // Fallthru to tail compare |
5767 } | 5838 } |
5768 | |
5769 // Shift str2 and str1 to the end of the arrays, negate min | 5839 // Shift str2 and str1 to the end of the arrays, negate min |
5770 lea(str1, Address(str1, cnt2, scale, 0)); | 5840 lea(str1, Address(str1, cnt2, scale)); |
5771 lea(str2, Address(str2, cnt2, scale, 0)); | 5841 lea(str2, Address(str2, cnt2, scale)); |
5842 decrementl(cnt2); // first character was compared already | |
5772 negptr(cnt2); | 5843 negptr(cnt2); |
5773 | 5844 |
5774 // Compare the rest of the elements | 5845 // Compare the rest of the elements |
5775 bind(WHILE_HEAD_LABEL); | 5846 bind(WHILE_HEAD_LABEL); |
5776 load_unsigned_short(result, Address(str1, cnt2, scale, 0)); | 5847 load_unsigned_short(result, Address(str1, cnt2, scale, 0)); |
5831 } | 5902 } |
5832 | 5903 |
5833 shll(limit, 1); // byte count != 0 | 5904 shll(limit, 1); // byte count != 0 |
5834 movl(result, limit); // copy | 5905 movl(result, limit); // copy |
5835 | 5906 |
5836 if (UseSSE42Intrinsics) { | 5907 if (UseAVX >= 2) { |
5908 // With AVX2, use 32-byte vector compare | |
5909 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; | |
5910 | |
5911 // Compare 32-byte vectors | |
5912 andl(result, 0x0000001e); // tail count (in bytes) | |
5913 andl(limit, 0xffffffe0); // vector count (in bytes) | |
5914 jccb(Assembler::zero, COMPARE_TAIL); | |
5915 | |
5916 lea(ary1, Address(ary1, limit, Address::times_1)); | |
5917 lea(ary2, Address(ary2, limit, Address::times_1)); | |
5918 negptr(limit); | |
5919 | |
5920 bind(COMPARE_WIDE_VECTORS); | |
5921 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); | |
5922 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); | |
5923 vpxor(vec1, vec2); | |
5924 | |
5925 vptest(vec1, vec1); | |
5926 jccb(Assembler::notZero, FALSE_LABEL); | |
5927 addptr(limit, 32); | |
5928 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); | |
5929 | |
5930 testl(result, result); | |
5931 jccb(Assembler::zero, TRUE_LABEL); | |
5932 | |
5933 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); | |
5934 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); | |
5935 vpxor(vec1, vec2); | |
5936 | |
5937 vptest(vec1, vec1); | |
5938 jccb(Assembler::notZero, FALSE_LABEL); | |
5939 jmpb(TRUE_LABEL); | |
5940 | |
5941 bind(COMPARE_TAIL); // limit is zero | |
5942 movl(limit, result); | |
5943 // Fallthru to tail compare | |
5944 } else if (UseSSE42Intrinsics) { | |
5837 // With SSE4.2, use double quad vector compare | 5945 // With SSE4.2, use double quad vector compare |
5838 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; | 5946 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
5839 | 5947 |
5840 // Compare 16-byte vectors | 5948 // Compare 16-byte vectors |
5841 andl(result, 0x0000000e); // tail count (in bytes) | 5949 andl(result, 0x0000000e); // tail count (in bytes) |