Mercurial > hg > truffle
diff src/share/vm/opto/superword.cpp @ 6179:8c92982cbbc4
7119644: Increase superword's vector size up to 256 bits
Summary: Increase vector size up to 256-bits for YMM AVX registers on x86.
Reviewed-by: never, twisti, roland
author | kvn |
---|---|
date | Fri, 15 Jun 2012 01:25:19 -0700 |
parents | 5e990493719e |
children | 6f8f439e247d |
line wrap: on
line diff
--- a/src/share/vm/opto/superword.cpp Thu Jun 14 14:59:52 2012 -0700 +++ b/src/share/vm/opto/superword.cpp Fri Jun 15 01:25:19 2012 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -67,6 +67,10 @@ //------------------------------transform_loop--------------------------- void SuperWord::transform_loop(IdealLoopTree* lpt) { + assert(UseSuperWord, "should be"); + // Do vectors exist on this architecture? + if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return; + assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); @@ -89,15 +93,12 @@ Node *pre_opaq1 = pre_end->limit(); if (pre_opaq1->Opcode() != Op_Opaque1) return; - // Do vectors exist on this architecture? - if (vector_width_in_bytes() == 0) return; - init(); // initialize data structures set_lpt(lpt); set_lp(cl); - // For now, define one block which is the entire loop body + // For now, define one block which is the entire loop body set_bb(cl); assert(_packset.length() == 0, "packset must be empty"); @@ -177,7 +178,7 @@ Node_List memops; for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); - if (n->is_Mem() && in_bb(n) && + if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) && is_java_primitive(n->as_Mem()->memory_type())) { int align = memory_alignment(n->as_Mem(), 0); if (align != bottom_align) { @@ -185,54 +186,130 @@ } } } - if (memops.size() == 0) return; - // Find a memory reference to align to. 
The pre-loop trip count - // is modified to align this reference to a vector-aligned address - find_align_to_ref(memops); - if (align_to_ref() == NULL) return; + Node_List align_to_refs; + int best_iv_adjustment = 0; + MemNode* best_align_to_mem_ref = NULL; - SWPointer align_to_ref_p(align_to_ref(), this); - int offset = align_to_ref_p.offset_in_bytes(); - int scale = align_to_ref_p.scale_in_bytes(); - int vw = vector_width_in_bytes(); - int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; - int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw; + while (memops.size() != 0) { + // Find a memory reference to align to. + MemNode* mem_ref = find_align_to_ref(memops); + if (mem_ref == NULL) break; + align_to_refs.push(mem_ref); + int iv_adjustment = get_iv_adjustment(mem_ref); -#ifndef PRODUCT - if (TraceSuperWord) - tty->print_cr("\noffset = %d iv_adjustment = %d elt_align = %d scale = %d iv_stride = %d", - offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride()); -#endif + if (best_align_to_mem_ref == NULL) { + // Set memory reference which is the best from all memory operations + // to be used for alignment. The pre-loop trip count is modified to align + // this reference to a vector-aligned address. + best_align_to_mem_ref = mem_ref; + best_iv_adjustment = iv_adjustment; + } - // Set alignment relative to "align_to_ref" - for (int i = memops.size() - 1; i >= 0; i--) { - MemNode* s = memops.at(i)->as_Mem(); - SWPointer p2(s, this); - if (p2.comparable(align_to_ref_p)) { - int align = memory_alignment(s, iv_adjustment); - set_alignment(s, align); - } else { - memops.remove(i); + SWPointer align_to_ref_p(mem_ref, this); + // Set alignment relative to "align_to_ref" for all related memory operations. 
+ for (int i = memops.size() - 1; i >= 0; i--) { + MemNode* s = memops.at(i)->as_Mem(); + if (isomorphic(s, mem_ref)) { + SWPointer p2(s, this); + if (p2.comparable(align_to_ref_p)) { + int align = memory_alignment(s, iv_adjustment); + set_alignment(s, align); + } + } } - } - // Create initial pack pairs of memory operations - for (uint i = 0; i < memops.size(); i++) { - Node* s1 = memops.at(i); - for (uint j = 0; j < memops.size(); j++) { - Node* s2 = memops.at(j); - if (s1 != s2 && are_adjacent_refs(s1, s2)) { - int align = alignment(s1); - if (stmts_can_pack(s1, s2, align)) { - Node_List* pair = new Node_List(); - pair->push(s1); - pair->push(s2); - _packset.append(pair); + // Create initial pack pairs of memory operations for which + // alignment is set and vectors will be aligned. + bool create_pack = true; + if (memory_alignment(mem_ref, best_iv_adjustment) != 0) { + if (same_velt_type(mem_ref, best_align_to_mem_ref)) { + // Can't allow vectorization of unaligned memory accesses with the + // same type since it could be overlapped accesses to the same array. + create_pack = false; + } else { + // Allow independent (different type) unaligned memory operations + // if HW supports them. + if (!Matcher::misaligned_vectors_ok()) { + create_pack = false; + } else { + // Check if packs of the same memory type but + // with a different alignment were created before. 
+ for (uint i = 0; i < align_to_refs.size(); i++) { + MemNode* mr = align_to_refs.at(i)->as_Mem(); + if (same_velt_type(mr, mem_ref) && + memory_alignment(mr, iv_adjustment) != 0) + create_pack = false; + } } } } - } + if (create_pack) { + for (uint i = 0; i < memops.size(); i++) { + Node* s1 = memops.at(i); + int align = alignment(s1); + if (align == top_align) continue; + for (uint j = 0; j < memops.size(); j++) { + Node* s2 = memops.at(j); + if (alignment(s2) == top_align) continue; + if (s1 != s2 && are_adjacent_refs(s1, s2)) { + if (stmts_can_pack(s1, s2, align)) { + Node_List* pair = new Node_List(); + pair->push(s1); + pair->push(s2); + _packset.append(pair); + } + } + } + } + } else { // Don't create unaligned pack + // First, remove remaining memory ops of the same type from the list. + for (int i = memops.size() - 1; i >= 0; i--) { + MemNode* s = memops.at(i)->as_Mem(); + if (same_velt_type(s, mem_ref)) { + memops.remove(i); + } + } + + // Second, remove already constructed packs of the same type. + for (int i = _packset.length() - 1; i >= 0; i--) { + Node_List* p = _packset.at(i); + MemNode* s = p->at(0)->as_Mem(); + if (same_velt_type(s, mem_ref)) { + remove_pack_at(i); + } + } + + // If needed find the best memory reference for loop alignment again. + if (same_velt_type(mem_ref, best_align_to_mem_ref)) { + // Put memory ops from remaining packs back on memops list for + // the best alignment search. + uint orig_msize = memops.size(); + for (int i = 0; i < _packset.length(); i++) { + Node_List* p = _packset.at(i); + MemNode* s = p->at(0)->as_Mem(); + assert(!same_velt_type(s, mem_ref), "sanity"); + memops.push(s); + } + MemNode* best_align_to_mem_ref = find_align_to_ref(memops); + if (best_align_to_mem_ref == NULL) break; + best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref); + // Restore list. + while (memops.size() > orig_msize) + (void)memops.pop(); + } + } // unaligned memory accesses + + // Remove used mem nodes. 
+ for (int i = memops.size() - 1; i >= 0; i--) { + MemNode* m = memops.at(i)->as_Mem(); + if (alignment(m) != top_align) { + memops.remove(i); + } + } + + } // while (memops.size() != 0 + set_align_to_ref(best_align_to_mem_ref); #ifndef PRODUCT if (TraceSuperWord) { @@ -246,7 +323,7 @@ // Find a memory reference to align the loop induction variable to. // Looks first at stores then at loads, looking for a memory reference // with the largest number of references similar to it. -void SuperWord::find_align_to_ref(Node_List &memops) { +MemNode* SuperWord::find_align_to_ref(Node_List &memops) { GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0); // Count number of comparable memory ops @@ -270,20 +347,28 @@ } } - // Find Store (or Load) with the greatest number of "comparable" references + // Find Store (or Load) with the greatest number of "comparable" references, + // biggest vector size, smallest data size and smallest iv offset. int max_ct = 0; + int max_vw = 0; int max_idx = -1; int min_size = max_jint; int min_iv_offset = max_jint; for (uint j = 0; j < memops.size(); j++) { MemNode* s = memops.at(j)->as_Mem(); if (s->is_Store()) { + int vw = vector_width_in_bytes(velt_basic_type(s)); + assert(vw > 1, "sanity"); SWPointer p(s, this); - if (cmp_ct.at(j) > max_ct || - cmp_ct.at(j) == max_ct && (data_size(s) < min_size || - data_size(s) == min_size && - p.offset_in_bytes() < min_iv_offset)) { + if (cmp_ct.at(j) > max_ct || + cmp_ct.at(j) == max_ct && + (vw > max_vw || + vw == max_vw && + (data_size(s) < min_size || + data_size(s) == min_size && + (p.offset_in_bytes() < min_iv_offset)))) { max_ct = cmp_ct.at(j); + max_vw = vw; max_idx = j; min_size = data_size(s); min_iv_offset = p.offset_in_bytes(); @@ -295,12 +380,18 @@ for (uint j = 0; j < memops.size(); j++) { MemNode* s = memops.at(j)->as_Mem(); if (s->is_Load()) { + int vw = vector_width_in_bytes(velt_basic_type(s)); + assert(vw > 1, "sanity"); SWPointer p(s, this); - if (cmp_ct.at(j) > max_ct 
|| - cmp_ct.at(j) == max_ct && (data_size(s) < min_size || - data_size(s) == min_size && - p.offset_in_bytes() < min_iv_offset)) { + if (cmp_ct.at(j) > max_ct || + cmp_ct.at(j) == max_ct && + (vw > max_vw || + vw == max_vw && + (data_size(s) < min_size || + data_size(s) == min_size && + (p.offset_in_bytes() < min_iv_offset)))) { max_ct = cmp_ct.at(j); + max_vw = vw; max_idx = j; min_size = data_size(s); min_iv_offset = p.offset_in_bytes(); @@ -309,10 +400,7 @@ } } - if (max_ct > 0) - set_align_to_ref(memops.at(max_idx)->as_Mem()); - -#ifndef PRODUCT +#ifdef ASSERT if (TraceSuperWord && Verbose) { tty->print_cr("\nVector memops after find_align_to_refs"); for (uint i = 0; i < memops.size(); i++) { @@ -321,6 +409,17 @@ } } #endif + + if (max_ct > 0) { +#ifdef ASSERT + if (TraceSuperWord) { + tty->print("\nVector align to node: "); + memops.at(max_idx)->as_Mem()->dump(); + } +#endif + return memops.at(max_idx)->as_Mem(); + } + return NULL; } //------------------------------ref_is_alignable--------------------------- @@ -341,7 +440,9 @@ // If initial offset from start of object is computable, // compute alignment within the vector. - int vw = vector_width_in_bytes(); + BasicType bt = velt_basic_type(p.mem()); + int vw = vector_width_in_bytes(bt); + assert(vw > 1, "sanity"); if (vw % span == 0) { Node* init_nd = pre_end->init_trip(); if (init_nd->is_Con() && p.invar() == NULL) { @@ -361,6 +462,26 @@ return false; } +//---------------------------get_iv_adjustment--------------------------- +// Calculate loop's iv adjustment for this memory ops. +int SuperWord::get_iv_adjustment(MemNode* mem_ref) { + SWPointer align_to_ref_p(mem_ref, this); + int offset = align_to_ref_p.offset_in_bytes(); + int scale = align_to_ref_p.scale_in_bytes(); + BasicType bt = velt_basic_type(mem_ref); + int vw = vector_width_in_bytes(bt); + assert(vw > 1, "sanity"); + int stride_sign = (scale * iv_stride()) > 0 ? 
1 : -1; + int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw; + +#ifndef PRODUCT + if (TraceSuperWord) + tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d", + offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw); +#endif + return iv_adjustment; +} + //---------------------------dependence_graph--------------------------- // Construct dependency graph. // Add dependence edges to load/store nodes for memory dependence @@ -488,9 +609,13 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { // Do not use superword for non-primitives - if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) || - (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type()))) + BasicType bt1 = velt_basic_type(s1); + BasicType bt2 = velt_basic_type(s2); + if(!is_java_primitive(bt1) || !is_java_primitive(bt2)) return false; + if (Matcher::max_vector_size(bt1) < 2) { + return false; // No vectors for this type + } if (isomorphic(s1, s2)) { if (independent(s1, s2)) { @@ -552,7 +677,7 @@ if (s1->Opcode() != s2->Opcode()) return false; if (s1->req() != s2->req()) return false; if (s1->in(0) != s2->in(0)) return false; - if (velt_type(s1) != velt_type(s2)) return false; + if (!same_velt_type(s1, s2)) return false; return true; } @@ -595,14 +720,16 @@ //------------------------------set_alignment--------------------------- void SuperWord::set_alignment(Node* s1, Node* s2, int align) { set_alignment(s1, align); - set_alignment(s2, align + data_size(s1)); + if (align == top_align || align == bottom_align) { + set_alignment(s2, align); + } else { + set_alignment(s2, align + data_size(s1)); + } } //------------------------------data_size--------------------------- int SuperWord::data_size(Node* s) { - const Type* t = velt_type(s); - BasicType bt = t->array_element_basic_type(); - int bsize = type2aelembytes(bt); + int bsize = type2aelembytes(velt_basic_type(s)); assert(bsize != 0, "valid 
size"); return bsize; } @@ -631,9 +758,9 @@ //------------------------------follow_use_defs--------------------------- // Extend the packset by visiting operand definitions of nodes in pack p bool SuperWord::follow_use_defs(Node_List* p) { + assert(p->size() == 2, "just checking"); Node* s1 = p->at(0); Node* s2 = p->at(1); - assert(p->size() == 2, "just checking"); assert(s1->req() == s2->req(), "just checking"); assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking"); @@ -718,7 +845,12 @@ for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break; for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break; if (i1 != i2) { - return false; + if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) { + // Further analysis relies on operands position matching. + u2->swap_edges(i1, i2); + } else { + return false; + } } } while (i1 < ct); return true; @@ -727,7 +859,7 @@ //------------------------------est_savings--------------------------- // Estimate the savings from executing s1 and s2 as a pack int SuperWord::est_savings(Node* s1, Node* s2) { - int save = 2 - 1; // 2 operations per instruction in packed form + int save_in = 2 - 1; // 2 operations per instruction in packed form // inputs for (uint i = 1; i < s1->req(); i++) { @@ -735,17 +867,18 @@ Node* x2 = s2->in(i); if (x1 != x2) { if (are_adjacent_refs(x1, x2)) { - save += adjacent_profit(x1, x2); + save_in += adjacent_profit(x1, x2); } else if (!in_packset(x1, x2)) { - save -= pack_cost(2); + save_in -= pack_cost(2); } else { - save += unpack_cost(2); + save_in += unpack_cost(2); } } } // uses of result uint ct = 0; + int save_use = 0; for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* s1_use = s1->fast_out(i); for (int j = 0; j < _packset.length(); j++) { @@ -756,7 +889,7 @@ if (p->at(p->size()-1) == s2_use) { ct++; if (are_adjacent_refs(s1_use, s2_use)) { - save += adjacent_profit(s1_use, s2_use); + save_use += adjacent_profit(s1_use, s2_use); } } } @@ -764,10 +897,10 @@ } } - if (ct 
< s1->outcnt()) save += unpack_cost(1); - if (ct < s2->outcnt()) save += unpack_cost(1); + if (ct < s1->outcnt()) save_use += unpack_cost(1); + if (ct < s2->outcnt()) save_use += unpack_cost(1); - return save; + return MAX2(save_in, save_use); } //------------------------------costs--------------------------- @@ -778,8 +911,9 @@ //------------------------------combine_packs--------------------------- // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last void SuperWord::combine_packs() { - bool changed; - do { + bool changed = true; + // Combine packs regardless max vector size. + while (changed) { changed = false; for (int i = 0; i < _packset.length(); i++) { Node_List* p1 = _packset.at(i); @@ -787,6 +921,7 @@ for (int j = 0; j < _packset.length(); j++) { Node_List* p2 = _packset.at(j); if (p2 == NULL) continue; + if (i == j) continue; if (p1->at(p1->size()-1) == p2->at(0)) { for (uint k = 1; k < p2->size(); k++) { p1->push(p2->at(k)); @@ -796,8 +931,39 @@ } } } - } while (changed); + } + // Split packs which have size greater then max vector size. + for (int i = 0; i < _packset.length(); i++) { + Node_List* p1 = _packset.at(i); + if (p1 != NULL) { + BasicType bt = velt_basic_type(p1->at(0)); + uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector + assert(is_power_of_2(max_vlen), "sanity"); + uint psize = p1->size(); + if (!is_power_of_2(psize)) { + // Skip pack which can't be vector. + // case1: for(...) { a[i] = i; } elements values are different (i+x) + // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store + _packset.at_put(i, NULL); + continue; + } + if (psize > max_vlen) { + Node_List* pack = new Node_List(); + for (uint j = 0; j < psize; j++) { + pack->push(p1->at(j)); + if (pack->size() >= max_vlen) { + assert(is_power_of_2(pack->size()), "sanity"); + _packset.append(pack); + pack = new Node_List(); + } + } + _packset.at_put(i, NULL); + } + } + } + + // Compress list. 
for (int i = _packset.length() - 1; i >= 0; i--) { Node_List* p1 = _packset.at(i); if (p1 == NULL) { @@ -880,8 +1046,7 @@ // Can code be generated for pack p? bool SuperWord::implemented(Node_List* p) { Node* p0 = p->at(0); - int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0)); - return vopc > 0 && Matcher::has_match_rule(vopc); + return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0)); } //------------------------------profitable--------------------------- @@ -939,36 +1104,36 @@ } //-------------------------------remove_and_insert------------------- -//remove "current" from its current position in the memory graph and insert -//it after the appropriate insertion point (lip or uip) +// Remove "current" from its current position in the memory graph and insert +// it after the appropriate insertion point (lip or uip). void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, Node *uip, Unique_Node_List &sched_before) { Node* my_mem = current->in(MemNode::Memory); - _igvn.rehash_node_delayed(current); - _igvn.hash_delete(my_mem); + bool sched_up = sched_before.member(current); - //remove current_store from its current position in the memmory graph + // remove current_store from its current position in the memmory graph for (DUIterator i = current->outs(); current->has_out(i); i++) { Node* use = current->out(i); if (use->is_Mem()) { assert(use->in(MemNode::Memory) == current, "must be"); - _igvn.rehash_node_delayed(use); if (use == prev) { // connect prev to my_mem - use->set_req(MemNode::Memory, my_mem); + _igvn.replace_input_of(use, MemNode::Memory, my_mem); + --i; //deleted this edge; rescan position } else if (sched_before.member(use)) { - _igvn.hash_delete(uip); - use->set_req(MemNode::Memory, uip); + if (!sched_up) { // Will be moved together with current + _igvn.replace_input_of(use, MemNode::Memory, uip); + --i; //deleted this edge; rescan position + } } else { - _igvn.hash_delete(lip); - 
use->set_req(MemNode::Memory, lip); + if (sched_up) { // Will be moved together with current + _igvn.replace_input_of(use, MemNode::Memory, lip); + --i; //deleted this edge; rescan position + } } - --i; //deleted this edge; rescan position } } - bool sched_up = sched_before.member(current); Node *insert_pt = sched_up ? uip : lip; - _igvn.hash_delete(insert_pt); // all uses of insert_pt's memory state should use current's instead for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) { @@ -988,7 +1153,7 @@ } //connect current to insert_pt - current->set_req(MemNode::Memory, insert_pt); + _igvn.replace_input_of(current, MemNode::Memory, insert_pt); } //------------------------------co_locate_pack---------------------------------- @@ -1025,7 +1190,7 @@ if (use->is_Mem() && use != previous) memops.push(use); } - if(current == first) break; + if (current == first) break; previous = current; current = current->in(MemNode::Memory)->as_Mem(); } @@ -1038,27 +1203,37 @@ Node *s2 = memops.at(j); if (!independent(s1, s2)) { if (in_pack(s2, pk) || schedule_before_pack.member(s2)) { - schedule_before_pack.push(s1); //s1 must be scheduled before + schedule_before_pack.push(s1); // s1 must be scheduled before Node_List* mem_pk = my_pack(s1); if (mem_pk != NULL) { for (uint ii = 0; ii < mem_pk->size(); ii++) { - Node* s = mem_pk->at(ii); // follow partner + Node* s = mem_pk->at(ii); // follow partner if (memops.member(s) && !schedule_before_pack.member(s)) schedule_before_pack.push(s); } } + break; } } } } } + Node* upper_insert_pt = first->in(MemNode::Memory); + // Following code moves loads connected to upper_insert_pt below aliased stores. + // Collect such loads here and reconnect them back to upper_insert_pt later. 
+ memops.clear(); + for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) { + Node* use = upper_insert_pt->out(i); + if (!use->is_Store()) + memops.push(use); + } + MemNode* lower_insert_pt = last; - Node* upper_insert_pt = first->in(MemNode::Memory); previous = last; //previous store in pk current = last->in(MemNode::Memory)->as_Mem(); - //start scheduling from "last" to "first" + // start scheduling from "last" to "first" while (true) { assert(in_bb(current), "stay in block"); assert(in_pack(previous, pk), "previous stays in pack"); @@ -1066,16 +1241,13 @@ if (in_pack(current, pk)) { // Forward users of my memory state (except "previous) to my input memory state - _igvn.hash_delete(current); for (DUIterator i = current->outs(); current->has_out(i); i++) { Node* use = current->out(i); if (use->is_Mem() && use != previous) { assert(use->in(MemNode::Memory) == current, "must be"); if (schedule_before_pack.member(use)) { - _igvn.hash_delete(upper_insert_pt); _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt); } else { - _igvn.hash_delete(lower_insert_pt); _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt); } --i; // deleted this edge; rescan position @@ -1089,6 +1261,14 @@ if (current == first) break; current = my_mem->as_Mem(); } // end while + + // Reconnect loads back to upper_insert_pt. + for (uint i = 0; i < memops.size(); i++) { + Node *ld = memops.at(i); + if (ld->in(MemNode::Memory) != upper_insert_pt) { + _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt); + } + } } else if (pk->at(0)->is_Load()) { //load // all loads in the pack should have the same memory state. By default, // we use the memory state of the last load. 
However, if any load could @@ -1149,35 +1329,30 @@ Node* vn = NULL; Node* low_adr = p->at(0); Node* first = executed_first(p); + int opc = n->Opcode(); if (n->is_Load()) { - int opc = n->Opcode(); Node* ctl = n->in(MemNode::Control); Node* mem = first->in(MemNode::Memory); Node* adr = low_adr->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); - vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen); - + vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n)); } else if (n->is_Store()) { // Promote value to be stored to vector Node* val = vector_opd(p, MemNode::ValueIn); - - int opc = n->Opcode(); Node* ctl = n->in(MemNode::Control); Node* mem = first->in(MemNode::Memory); Node* adr = low_adr->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); - vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen); - + vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen); } else if (n->req() == 3) { // Promote operands to vector Node* in1 = vector_opd(p, 1); Node* in2 = vector_opd(p, 2); - vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n)); - + vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n)); } else { ShouldNotReachHere(); } - + assert(vn != NULL, "sanity"); _phase->_igvn.register_new_node_with_optimizer(vn); _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0))); for (uint j = 0; j < p->size(); j++) { @@ -1185,6 +1360,12 @@ _igvn.replace_node(pm, vn); } _igvn._worklist.push(vn); +#ifdef ASSERT + if (TraceSuperWord) { + tty->print("new Vector node: "); + vn->dump(); + } +#endif } } } @@ -1207,10 +1388,10 @@ } if (same_opd) { - if (opd->is_Vector() || opd->is_VectorLoad()) { + if (opd->is_Vector() || opd->is_LoadVector()) { return opd; // input is matching vector } - assert(!opd->is_VectorStore(), "such vector is not expected here"); + assert(!opd->is_StoreVector(), "such vector is not expected here"); // Convert scalar input to 
vector with the same number of elements as // p0's vector. Use p0's type because size of operand's container in // vector should match p0's size regardless operand's size. @@ -1219,12 +1400,18 @@ _phase->_igvn.register_new_node_with_optimizer(vn); _phase->set_ctrl(vn, _phase->get_ctrl(opd)); +#ifdef ASSERT + if (TraceSuperWord) { + tty->print("new Vector node: "); + vn->dump(); + } +#endif return vn; } // Insert pack operation - const Type* p0_t = velt_type(p0); - PackNode* pk = PackNode::make(_phase->C, opd, p0_t); + BasicType bt = velt_basic_type(p0); + PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt); DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); ) for (uint i = 1; i < vlen; i++) { @@ -1232,10 +1419,16 @@ Node* in = pi->in(opd_idx); assert(my_pack(in) == NULL, "Should already have been unpacked"); assert(opd_bt == in->bottom_type()->basic_type(), "all same type"); - pk->add_opd(in); + pk->add_opd(i, in); } _phase->_igvn.register_new_node_with_optimizer(pk); _phase->set_ctrl(pk, _phase->get_ctrl(opd)); +#ifdef ASSERT + if (TraceSuperWord) { + tty->print("new Pack node: "); + pk->dump(); + } +#endif return pk; } @@ -1273,16 +1466,15 @@ // Insert extract operation _igvn.hash_delete(def); int def_pos = alignment(def) / data_size(def); - const Type* def_t = velt_type(def); - Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t); + Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def)); _phase->_igvn.register_new_node_with_optimizer(ex); _phase->set_ctrl(ex, _phase->get_ctrl(def)); _igvn.replace_input_of(use, idx, ex); _igvn._worklist.push(def); bb_insert_after(ex, bb_idx(def)); - set_velt_type(ex, def_t); + set_velt_type(ex, velt_type(def)); } } @@ -1509,10 +1701,7 @@ // Initial type for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); - const Type* t = n->is_Mem() ? 
Type::get_const_basic_type(n->as_Mem()->memory_type()) - : _igvn.type(n); - const Type* vt = container_type(t); - set_velt_type(n, vt); + set_velt_type(n, container_type(n)); } // Propagate narrowed type backwards through operations @@ -1543,7 +1732,7 @@ bool same_type = true; for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { Node *use = in->fast_out(k); - if (!in_bb(use) || velt_type(use) != vt) { + if (!in_bb(use) || !same_velt_type(use, n)) { same_type = false; break; } @@ -1575,20 +1764,24 @@ if (!p.valid()) { return bottom_align; } + int vw = vector_width_in_bytes(velt_basic_type(s)); + if (vw < 2) { + return bottom_align; // No vectors for this type + } int offset = p.offset_in_bytes(); offset += iv_adjust_in_bytes; - int off_rem = offset % vector_width_in_bytes(); - int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes(); + int off_rem = offset % vw; + int off_mod = off_rem >= 0 ? off_rem : off_rem + vw; return off_mod; } //---------------------------container_type--------------------------- // Smallest type containing range of values -const Type* SuperWord::container_type(const Type* t) { - const Type* tp = t->make_ptr(); - if (tp && tp->isa_aryptr()) { - t = tp->is_aryptr()->elem(); +const Type* SuperWord::container_type(Node* n) { + if (n->is_Mem()) { + return Type::get_const_basic_type(n->as_Mem()->memory_type()); } + const Type* t = _igvn.type(n); if (t->basic_type() == T_INT) { if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL; if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE; @@ -1599,11 +1792,22 @@ return t; } +bool SuperWord::same_velt_type(Node* n1, Node* n2) { + const Type* vt1 = velt_type(n1); + const Type* vt2 = velt_type(n2); // must read n2: taking n1 twice made "vt1 == vt2" below always true + if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) { + // Compare vectors element sizes for integer types.
+ return data_size(n1) == data_size(n2); + } + return vt1 == vt2; +} + //-------------------------vector_opd_range----------------------- // (Start, end] half-open range defining which operands are vector void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) { switch (n->Opcode()) { - case Op_LoadB: case Op_LoadUS: + case Op_LoadB: case Op_LoadUB: + case Op_LoadS: case Op_LoadUS: case Op_LoadI: case Op_LoadL: case Op_LoadF: case Op_LoadD: case Op_LoadP: @@ -1721,6 +1925,7 @@ assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, ""); SWPointer align_to_ref_p(align_to_ref, this); + assert(align_to_ref_p.valid(), "sanity"); // Given: // lim0 == original pre loop limit @@ -1773,10 +1978,12 @@ // N = (V - (e - lim0)) % V // lim = lim0 - (V - (e - lim0)) % V + int vw = vector_width_in_bytes(velt_basic_type(align_to_ref)); + assert(vw > 1, "sanity"); int stride = iv_stride(); int scale = align_to_ref_p.scale_in_bytes(); int elt_size = align_to_ref_p.memory_size(); - int v_align = vector_width_in_bytes() / elt_size; + int v_align = vw / elt_size; int k = align_to_ref_p.offset_in_bytes() / elt_size; Node *kn = _igvn.intcon(k); @@ -1796,6 +2003,25 @@ _phase->_igvn.register_new_node_with_optimizer(e); _phase->set_ctrl(e, pre_ctrl); } + if (vw > ObjectAlignmentInBytes) { + // incorporate base e +/- base && Mask >>> log2(elt) + Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw))); + Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base()); + _phase->_igvn.register_new_node_with_optimizer(xbase); + Node* masked_xbase = new (_phase->C, 3) AndXNode(xbase, mask); + _phase->_igvn.register_new_node_with_optimizer(masked_xbase); +#ifdef _LP64 + masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase); + _phase->_igvn.register_new_node_with_optimizer(masked_xbase); +#endif + Node* log2_elt = _igvn.intcon(exact_log2(elt_size)); + Node* bref = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
_phase->_igvn.register_new_node_with_optimizer(bref); + _phase->set_ctrl(bref, pre_ctrl); + e = new (_phase->C, 3) AddINode(e, bref); + _phase->_igvn.register_new_node_with_optimizer(e); + _phase->set_ctrl(e, pre_ctrl); + } // compute e +/- lim0 if (scale < 0) {