comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 7477:038dd2875b94

8005419: Improve intrinsics code performance on x86 by using AVX2. Summary: use 256-bit vpxor and vptest instructions in the String.compareTo() and equals() intrinsics. Reviewed-by: twisti
author kvn
date Tue, 08 Jan 2013 11:30:51 -0800
parents ffa87474d7a4
children b30b3c2a0cf2 db9981fd3124
comparison
equal deleted inserted replaced
7476:ffa87474d7a4 7477:038dd2875b94
5673 5673
5674 // Is the minimum length zero? 5674 // Is the minimum length zero?
5675 testl(cnt2, cnt2); 5675 testl(cnt2, cnt2);
5676 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 5676 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
5677 5677
5678 // Load first characters 5678 // Compare first characters
5679 load_unsigned_short(result, Address(str1, 0)); 5679 load_unsigned_short(result, Address(str1, 0));
5680 load_unsigned_short(cnt1, Address(str2, 0)); 5680 load_unsigned_short(cnt1, Address(str2, 0));
5681
5682 // Compare first characters
5683 subl(result, cnt1); 5681 subl(result, cnt1);
5684 jcc(Assembler::notZero, POP_LABEL); 5682 jcc(Assembler::notZero, POP_LABEL);
5685 decrementl(cnt2); 5683 cmpl(cnt2, 1);
5686 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 5684 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
5687 5685
5688 { 5686 // Check if the strings start at the same location.
5689 // Check after comparing first character to see if strings are equivalent 5687 cmpptr(str1, str2);
5690 Label LSkip2; 5688 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
5691 // Check if the strings start at same location
5692 cmpptr(str1, str2);
5693 jccb(Assembler::notEqual, LSkip2);
5694
5695 // Check if the length difference is zero (from stack)
5696 cmpl(Address(rsp, 0), 0x0);
5697 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
5698
5699 // Strings might not be equivalent
5700 bind(LSkip2);
5701 }
5702 5689
5703 Address::ScaleFactor scale = Address::times_2; 5690 Address::ScaleFactor scale = Address::times_2;
5704 int stride = 8; 5691 int stride = 8;
5705 5692
5706 // Advance to next element 5693 if (UseAVX >= 2) {
5707 addptr(str1, 16/stride); 5694 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
5708 addptr(str2, 16/stride); 5695 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
5709 5696 Label COMPARE_TAIL_LONG;
5710 if (UseSSE42Intrinsics) { 5697 int pcmpmask = 0x19;
5698
5699 // Setup to compare 16-chars (32-bytes) vectors,
5700 // start from first character again because it has aligned address.
5701 int stride2 = 16;
5702 int adr_stride = stride << scale;
5703 int adr_stride2 = stride2 << scale;
5704
5705 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
5706 // rax and rdx are used by pcmpestri as element counters
5707 movl(result, cnt2);
5708 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
5709 jcc(Assembler::zero, COMPARE_TAIL_LONG);
5710
5711 // Fast path: compare the first two 8-char vectors.
5712 bind(COMPARE_16_CHARS);
5713 movdqu(vec1, Address(str1, 0));
5714 pcmpestri(vec1, Address(str2, 0), pcmpmask);
5715 jccb(Assembler::below, COMPARE_INDEX_CHAR);
5716
5717 movdqu(vec1, Address(str1, adr_stride));
5718 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
5719 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
5720 addl(cnt1, stride);
5721
5722 // Compare the characters at index in cnt1
5723 bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
5724 load_unsigned_short(result, Address(str1, cnt1, scale));
5725 load_unsigned_short(cnt2, Address(str2, cnt1, scale));
5726 subl(result, cnt2);
5727 jmp(POP_LABEL);
5728
5729 // Setup the registers to start vector comparison loop
5730 bind(COMPARE_WIDE_VECTORS);
5731 lea(str1, Address(str1, result, scale));
5732 lea(str2, Address(str2, result, scale));
5733 subl(result, stride2);
5734 subl(cnt2, stride2);
5735 jccb(Assembler::zero, COMPARE_WIDE_TAIL);
5736 negptr(result);
5737
5738 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
5739 bind(COMPARE_WIDE_VECTORS_LOOP);
5740 vmovdqu(vec1, Address(str1, result, scale));
5741 vpxor(vec1, Address(str2, result, scale));
5742 vptest(vec1, vec1);
5743 jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
5744 addptr(result, stride2);
5745 subl(cnt2, stride2);
5746 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
5747
5748 // compare wide vectors tail
5749 bind(COMPARE_WIDE_TAIL);
5750 testptr(result, result);
5751 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
5752
5753 movl(result, stride2);
5754 movl(cnt2, result);
5755 negptr(result);
5756 jmpb(COMPARE_WIDE_VECTORS_LOOP);
5757
5758 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
5759 bind(VECTOR_NOT_EQUAL);
5760 lea(str1, Address(str1, result, scale));
5761 lea(str2, Address(str2, result, scale));
5762 jmp(COMPARE_16_CHARS);
5763
5764 // Compare tail chars, length between 1 and 15 chars
5765 bind(COMPARE_TAIL_LONG);
5766 movl(cnt2, result);
5767 cmpl(cnt2, stride);
5768 jccb(Assembler::less, COMPARE_SMALL_STR);
5769
5770 movdqu(vec1, Address(str1, 0));
5771 pcmpestri(vec1, Address(str2, 0), pcmpmask);
5772 jcc(Assembler::below, COMPARE_INDEX_CHAR);
5773 subptr(cnt2, stride);
5774 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
5775 lea(str1, Address(str1, result, scale));
5776 lea(str2, Address(str2, result, scale));
5777 negptr(cnt2);
5778 jmpb(WHILE_HEAD_LABEL);
5779
5780 bind(COMPARE_SMALL_STR);
5781 } else if (UseSSE42Intrinsics) {
5711 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 5782 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
5712 int pcmpmask = 0x19; 5783 int pcmpmask = 0x19;
5713 // Setup to compare 16-byte vectors 5784 // Setup to compare 8-char (16-byte) vectors,
5785 // start from first character again because it has aligned address.
5714 movl(result, cnt2); 5786 movl(result, cnt2);
5715 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 5787 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
5716 jccb(Assembler::zero, COMPARE_TAIL); 5788 jccb(Assembler::zero, COMPARE_TAIL);
5717 5789
5718 lea(str1, Address(str1, result, scale)); 5790 lea(str1, Address(str1, result, scale));
5740 addptr(result, stride); 5812 addptr(result, stride);
5741 subptr(cnt2, stride); 5813 subptr(cnt2, stride);
5742 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 5814 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
5743 5815
5744 // compare wide vectors tail 5816 // compare wide vectors tail
5745 testl(result, result); 5817 testptr(result, result);
5746 jccb(Assembler::zero, LENGTH_DIFF_LABEL); 5818 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
5747 5819
5748 movl(cnt2, stride); 5820 movl(cnt2, stride);
5749 movl(result, stride); 5821 movl(result, stride);
5750 negptr(result); 5822 negptr(result);
5752 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 5824 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
5753 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 5825 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
5754 5826
5755 // Mismatched characters in the vectors 5827 // Mismatched characters in the vectors
5756 bind(VECTOR_NOT_EQUAL); 5828 bind(VECTOR_NOT_EQUAL);
5757 addptr(result, cnt1); 5829 addptr(cnt1, result);
5758 movptr(cnt2, result); 5830 load_unsigned_short(result, Address(str1, cnt1, scale));
5759 load_unsigned_short(result, Address(str1, cnt2, scale)); 5831 load_unsigned_short(cnt2, Address(str2, cnt1, scale));
5760 load_unsigned_short(cnt1, Address(str2, cnt2, scale)); 5832 subl(result, cnt2);
5761 subl(result, cnt1);
5762 jmpb(POP_LABEL); 5833 jmpb(POP_LABEL);
5763 5834
5764 bind(COMPARE_TAIL); // limit is zero 5835 bind(COMPARE_TAIL); // limit is zero
5765 movl(cnt2, result); 5836 movl(cnt2, result);
5766 // Fallthru to tail compare 5837 // Fallthru to tail compare
5767 } 5838 }
5768
5769 // Shift str2 and str1 to the end of the arrays, negate min 5839 // Shift str2 and str1 to the end of the arrays, negate min
5770 lea(str1, Address(str1, cnt2, scale, 0)); 5840 lea(str1, Address(str1, cnt2, scale));
5771 lea(str2, Address(str2, cnt2, scale, 0)); 5841 lea(str2, Address(str2, cnt2, scale));
5842 decrementl(cnt2); // first character was compared already
5772 negptr(cnt2); 5843 negptr(cnt2);
5773 5844
5774 // Compare the rest of the elements 5845 // Compare the rest of the elements
5775 bind(WHILE_HEAD_LABEL); 5846 bind(WHILE_HEAD_LABEL);
5776 load_unsigned_short(result, Address(str1, cnt2, scale, 0)); 5847 load_unsigned_short(result, Address(str1, cnt2, scale, 0));
5831 } 5902 }
5832 5903
5833 shll(limit, 1); // byte count != 0 5904 shll(limit, 1); // byte count != 0
5834 movl(result, limit); // copy 5905 movl(result, limit); // copy
5835 5906
5836 if (UseSSE42Intrinsics) { 5907 if (UseAVX >= 2) {
5908 // With AVX2, use 32-byte vector compare
5909 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
5910
5911 // Compare 32-byte vectors
5912 andl(result, 0x0000001e); // tail count (in bytes)
5913 andl(limit, 0xffffffe0); // vector count (in bytes)
5914 jccb(Assembler::zero, COMPARE_TAIL);
5915
5916 lea(ary1, Address(ary1, limit, Address::times_1));
5917 lea(ary2, Address(ary2, limit, Address::times_1));
5918 negptr(limit);
5919
5920 bind(COMPARE_WIDE_VECTORS);
5921 vmovdqu(vec1, Address(ary1, limit, Address::times_1));
5922 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
5923 vpxor(vec1, vec2);
5924
5925 vptest(vec1, vec1);
5926 jccb(Assembler::notZero, FALSE_LABEL);
5927 addptr(limit, 32);
5928 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
5929
5930 testl(result, result);
5931 jccb(Assembler::zero, TRUE_LABEL);
5932
5933 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
5934 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
5935 vpxor(vec1, vec2);
5936
5937 vptest(vec1, vec1);
5938 jccb(Assembler::notZero, FALSE_LABEL);
5939 jmpb(TRUE_LABEL);
5940
5941 bind(COMPARE_TAIL); // limit is zero
5942 movl(limit, result);
5943 // Fallthru to tail compare
5944 } else if (UseSSE42Intrinsics) {
5837 // With SSE4.2, use double quad vector compare 5945 // With SSE4.2, use double quad vector compare
5838 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 5946 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
5839 5947
5840 // Compare 16-byte vectors 5948 // Compare 16-byte vectors
5841 andl(result, 0x0000000e); // tail count (in bytes) 5949 andl(result, 0x0000000e); // tail count (in bytes)