comparison src/share/vm/opto/superword.cpp @ 6614:006050192a5a

6340864: Implement vectorization optimizations in hotspot-server
Summary: Added asm encoding and mach nodes for vector arithmetic instructions on x86.
Reviewed-by: roland
author kvn
date Mon, 20 Aug 2012 09:07:21 -0700
parents 6f8f439e247d
children 4b0d6fd74911
comparing 6594:d5ec46c7da5c with 6614:006050192a5a

@@ -1355 +1355 @@ SuperWord::output()
       vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
     } else if (n->req() == 3) {
       // Promote operands to vector
       Node* in1 = vector_opd(p, 1);
       Node* in2 = vector_opd(p, 2);
+      if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
+        // Move invariant vector input into second position to avoid register spilling.
+        Node* tmp = in1;
+        in1 = in2;
+        in2 = tmp;
+      }
       vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
     } else {
       ShouldNotReachHere();
     }
     assert(vn != NULL, "sanity");
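
The hunk above swaps a loop-invariant first input of a commutative Add/Mul pack into the second operand position. A minimal standalone sketch of that canonicalization (Operand and its `invariant` flag are hypothetical stand-ins, not HotSpot types): keeping the invariant value second lets it stay in one vector register instead of being reloaded or spilled on every iteration.

#include <utility>

struct Operand {
  bool invariant;  // true if the value does not change inside the loop
};

// Only legal for commutative operations (Add/Mul in the hunk above).
inline void canonicalize_commutative(Operand*& in1, Operand*& in2) {
  if (in1->invariant) {
    std::swap(in1, in2);  // invariant input moves to the second position
  }
}
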
@@ -1397 +1403 @@ SuperWord::vector_opd()
   }

   if (same_opd) {
     if (opd->is_Vector() || opd->is_LoadVector()) {
       return opd; // input is matching vector
+    }
+    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
+      // No vector is needed for shift count.
+      // Vector instructions do not mask shift count, do it here.
+      Compile* C = _phase->C;
+      Node* cnt = opd;
+      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
+      const TypeInt* t = opd->find_int_type();
+      if (t != NULL && t->is_con()) {
+        juint shift = t->get_con();
+        if (shift > mask) { // Unsigned cmp
+          cnt = ConNode::make(C, TypeInt::make(shift & mask));
+        }
+      } else {
+        if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
+          cnt = ConNode::make(C, TypeInt::make(mask));
+          _phase->_igvn.register_new_node_with_optimizer(cnt);
+          cnt = new (C, 3) AndINode(opd, cnt);
+          _phase->_igvn.register_new_node_with_optimizer(cnt);
+          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+        }
+        assert(opd->bottom_type()->isa_int(), "int type only");
+        // Move non constant shift count into XMM register.
+        cnt = new (_phase->C, 2) MoveI2FNode(cnt);
+      }
+      if (cnt != opd) {
+        _phase->_igvn.register_new_node_with_optimizer(cnt);
+        _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+      }
+      return cnt;
     }
     assert(!opd->is_StoreVector(), "such vector is not expected here");
     // Convert scalar input to vector with the same number of elements as
     // p0's vector. Use p0's type because size of operand's container in
     // vector should match p0's size regardless operand's size.
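
The added block handles the shift-count operand of vector shifts: unlike scalar JVM shifts, the x86 vector shift instructions do not truncate the count, so it is masked here, folded into a constant or applied via an AndI node, and a variable count is then moved into an XMM register. A plain C++ sketch of the masking rule it implements (`is_long_shift` is an illustrative stand-in for the TypeInt::INT check above):

#include <cstdint>

inline uint32_t masked_shift_count(uint32_t cnt, bool is_long_shift) {
  const uint32_t mask = is_long_shift ? 63u : 31u;  // BitsPerLong-1 : BitsPerInt-1
  return cnt & mask;  // the value AndINode(opd, mask) produces for variable counts
}
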
@@ -1716 +1752 @@ SuperWord::compute_vector_element_type()
   // Propagate narrowed type backwards through operations
   // that don't depend on higher order bits
   for (int i = _block.length() - 1; i >= 0; i--) {
     Node* n = _block.at(i);
     // Only integer types need be examined
-    if (n->bottom_type()->isa_int()) {
+    const Type* vt = velt_type(n);
+    if (vt->basic_type() == T_INT) {
       uint start, end;
       vector_opd_range(n, &start, &end);
       const Type* vt = velt_type(n);

       for (uint j = start; j < end; j++) {
         Node* in = n->in(j);
-        // Don't propagate through a type conversion
-        if (n->bottom_type() != in->bottom_type())
-          continue;
-        switch(in->Opcode()) {
-        case Op_AddI: case Op_AddL:
-        case Op_SubI: case Op_SubL:
-        case Op_MulI: case Op_MulL:
-        case Op_AndI: case Op_AndL:
-        case Op_OrI: case Op_OrL:
-        case Op_XorI: case Op_XorL:
-        case Op_LShiftI: case Op_LShiftL:
-        case Op_CMoveI: case Op_CMoveL:
-          if (in_bb(in)) {
-            bool same_type = true;
-            for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
-              Node *use = in->fast_out(k);
-              if (!in_bb(use) || !same_velt_type(use, n)) {
-                same_type = false;
-                break;
-              }
-            }
-            if (same_type) {
-              set_velt_type(in, vt);
-            }
-          }
+        // Don't propagate through a memory
+        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
+            data_size(n) < data_size(in)) {
+          bool same_type = true;
+          for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
+            Node *use = in->fast_out(k);
+            if (!in_bb(use) || !same_velt_type(use, n)) {
+              same_type = false;
+              break;
+            }
+          }
+          if (same_type) {
+            set_velt_type(in, vt);
+          }
         }
       }
     }
   }
@@ -1790 +1816 @@ SuperWord::container_type()
   if (n->is_Mem()) {
     return Type::get_const_basic_type(n->as_Mem()->memory_type());
   }
   const Type* t = _igvn.type(n);
   if (t->basic_type() == T_INT) {
-    if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL;
-    if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE;
-    if (t->higher_equal(TypeInt::CHAR)) return TypeInt::CHAR;
-    if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT;
+    // A narrow type of arithmetic operations will be determined by
+    // propagating the type of memory operations.
     return TypeInt::INT;
   }
   return t;
 }

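
Together, the two hunks above change how narrow (sub-int) vector element types are found: container_type() no longer guesses BOOL/BYTE/CHAR/SHORT from the value type of an arithmetic node; instead the narrow type originates at memory operations and is propagated backwards to non-memory inputs whose data size is larger and whose uses all agree. An illustrative loop of the shape this enables (not taken from the changeset):

void add_shorts(short* a, const short* b, const short* c, int n) {
  for (int i = 0; i < n; i++) {
    a[i] = (short)(b[i] + c[i]);  // the int add is narrowed by the 16-bit store,
  }                               // so the whole pack can use T_SHORT vectors
}
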
@@ -1938 +1962 @@ SuperWord::align_initial_loop_index()

   // Given:
   //   lim0 == original pre loop limit
   //   V == v_align (power of 2)
   //   invar == extra invariant piece of the address expression
-  //   e == k [ +/- invar ]
+  //   e == offset [ +/- invar ]
   //
   // When reassociating expressions involving '%' the basic rules are:
   //   (a - b) % k == 0 => a % k == b % k
   // and:
   //   (a + b) % k == 0 => a % k == (k - b) % k
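
The two '%' rules quoted in the comment are what the code below relies on when it reassociates the alignment condition onto the pre-loop limit. A standalone numeric check of both rules (not part of the changeset), restricted to 0 <= b < k:

#include <cassert>

int main() {
  const int k = 8;  // stands in for v_align, a power of 2
  for (int a = 0; a < 64; ++a) {
    for (int b = 0; b < k; ++b) {
      if (a >= b && (a - b) % k == 0) assert(a % k == b % k);
      if ((a + b) % k == 0)           assert(a % k == (k - b) % k);
    }
  }
  return 0;
}
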
@@ -1991 +2015 @@ SuperWord::align_initial_loop_index()
   int stride = iv_stride();
   int scale = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
   int v_align = vw / elt_size;
   assert(v_align > 1, "sanity");
-  int k = align_to_ref_p.offset_in_bytes() / elt_size;
-
-  Node *kn = _igvn.intcon(k);
-
-  Node *e = kn;
+  int offset = align_to_ref_p.offset_in_bytes() / elt_size;
+  Node *offsn = _igvn.intcon(offset);
+
+  Node *e = offsn;
   if (align_to_ref_p.invar() != NULL) {
-    // incorporate any extra invariant piece producing k +/- invar >>> log2(elt)
+    // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
     Node* aref = new (_phase->C, 3) URShiftINode(align_to_ref_p.invar(), log2_elt);
     _phase->_igvn.register_new_node_with_optimizer(aref);
     _phase->set_ctrl(aref, pre_ctrl);
     if (align_to_ref_p.negate_invar()) {
@@ -2012 +2035 @@ SuperWord::align_initial_loop_index()
     _phase->_igvn.register_new_node_with_optimizer(e);
     _phase->set_ctrl(e, pre_ctrl);
   }
   if (vw > ObjectAlignmentInBytes) {
     // incorporate base e +/- base && Mask >>> log2(elt)
-    Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
     Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
     _phase->_igvn.register_new_node_with_optimizer(xbase);
-    Node* masked_xbase = new (_phase->C, 3) AndXNode(xbase, mask);
+#ifdef _LP64
+    xbase = new (_phase->C, 2) ConvL2INode(xbase);
+    _phase->_igvn.register_new_node_with_optimizer(xbase);
+#endif
+    Node* mask = _igvn.intcon(vw-1);
+    Node* masked_xbase = new (_phase->C, 3) AndINode(xbase, mask);
     _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
-#ifdef _LP64
-    masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase);
-    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
-#endif
     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
     Node* bref = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
     _phase->_igvn.register_new_node_with_optimizer(bref);
     _phase->set_ctrl(bref, pre_ctrl);
     e = new (_phase->C, 3) AddINode(e, bref);
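
For reference, a hedged scalar sketch of the value `e` that the node graph above computes, in element units: the offset, plus or minus the invariant piece shifted by log2(elt) (the +/- choice follows negate_invar in the lines elided above), plus the masked object base when vw exceeds ObjectAlignmentInBytes. All names below are descriptive stand-ins, not SuperWord fields.

#include <cstdint>

inline int alignment_expr(int offset_elts, uint32_t invar_bytes, bool negate_invar,
                          uintptr_t base, int vw, int log2_elt, bool need_base_term) {
  int e = offset_elts;
  int invar_elts = (int)(invar_bytes >> log2_elt);  // invar >>> log2(elt)
  e += negate_invar ? -invar_elts : invar_elts;
  if (need_base_term) {                             // vw > ObjectAlignmentInBytes
    e += (int)(((uint32_t)base & (uint32_t)(vw - 1)) >> log2_elt);
  }
  return e;
}
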