truffle: src/share/vm/opto/superword.cpp comparison

comparison src/share/vm/opto/superword.cpp @ 6179:8c92982cbbc4

7119644: Increase superword's vector size up to 256 bits Summary: Increase vector size up to 256-bits for YMM AVX registers on x86. Reviewed-by: never, twisti, roland

author	kvn
date	Fri, 15 Jun 2012 01:25:19 -0700
parents	5e990493719e
children	6f8f439e247d

comparison

equal deleted inserted replaced

-:eba1d5bce9e8
+:8c92982cbbc4
 /*
-* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 _iv(NULL)                               // induction var
 {}
 //------------------------------transform_loop---------------------------
 // Validate the loop described by lpt and, if every check passes, set up
 // the per-loop state and run SLP_extract(). Each failed check returns
 // immediately without touching the loop.
 void SuperWord::transform_loop(IdealLoopTree* lpt) {
+assert(UseSuperWord, "should be");
+// Do vectors exist on this architecture?
+if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
 assert(lpt->_head->is_CountedLoop(), "must be");
 CountedLoopNode *cl = lpt->_head->as_CountedLoop();
 if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 // A pre-loop end must be found, and its limit must be an Opaque1 node.
 CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 if (pre_end == NULL) return;
 Node *pre_opaq1 = pre_end->limit();
 if (pre_opaq1->Opcode() != Op_Opaque1) return;
-// Do vectors exist on this architecture?
-if (vector_width_in_bytes() == 0) return;
 init(); // initialize data structures
 set_lpt(lpt);
 set_lp(cl);
 // For now, define one block which is the entire loop body
 set_bb(cl);
 assert(_packset.length() == 0, "packset must be empty");
 SLP_extract();
 }
 void SuperWord::find_adjacent_refs() {
 // Get list of memory operations
 Node_List memops;
 for (int i = 0; i < _block.length(); i++) {
 Node* n = _block.at(i);
-if (n->is_Mem() && in_bb(n) &&
+if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
 is_java_primitive(n->as_Mem()->memory_type())) {
 int align = memory_alignment(n->as_Mem(), 0);
 if (align != bottom_align) {
 memops.push(n);
 }
 }
 }
-if (memops.size() == 0) return;
+Node_List align_to_refs;
-// Find a memory reference to align to.  The pre-loop trip count
+int best_iv_adjustment = 0;
-// is modified to align this reference to a vector-aligned address
+MemNode* best_align_to_mem_ref = NULL;
-find_align_to_ref(memops);
-if (align_to_ref() == NULL) return;
+while (memops.size() != 0) {
+// Find a memory reference to align to.
-SWPointer align_to_ref_p(align_to_ref(), this);
+MemNode* mem_ref = find_align_to_ref(memops);
-int offset = align_to_ref_p.offset_in_bytes();
+if (mem_ref == NULL) break;
-int scale  = align_to_ref_p.scale_in_bytes();
+align_to_refs.push(mem_ref);
-int vw              = vector_width_in_bytes();
+int iv_adjustment = get_iv_adjustment(mem_ref);
-int stride_sign     = (scale * iv_stride()) > 0 ? 1 : -1;
-int iv_adjustment   = (stride_sign * vw - (offset % vw)) % vw;
+if (best_align_to_mem_ref == NULL) {
+// Set memory reference which is the best from all memory operations
-#ifndef PRODUCT
+// to be used for alignment. The pre-loop trip count is modified to align
-if (TraceSuperWord)
+// this reference to a vector-aligned address.
-tty->print_cr("\noffset = %d iv_adjustment = %d  elt_align = %d scale = %d iv_stride = %d",
+best_align_to_mem_ref = mem_ref;
-offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride());
+best_iv_adjustment = iv_adjustment;
-#endif
+}
-// Set alignment relative to "align_to_ref"
+SWPointer align_to_ref_p(mem_ref, this);
-for (int i = memops.size() - 1; i >= 0; i--) {
+// Set alignment relative to "align_to_ref" for all related memory operations.
-MemNode* s = memops.at(i)->as_Mem();
+for (int i = memops.size() - 1; i >= 0; i--) {
-SWPointer p2(s, this);
+MemNode* s = memops.at(i)->as_Mem();
-if (p2.comparable(align_to_ref_p)) {
+if (isomorphic(s, mem_ref)) {
-int align = memory_alignment(s, iv_adjustment);
+SWPointer p2(s, this);
-set_alignment(s, align);
+if (p2.comparable(align_to_ref_p)) {
-} else {
+int align = memory_alignment(s, iv_adjustment);
-memops.remove(i);
+set_alignment(s, align);
 }
 }
+}
-// Create initial pack pairs of memory operations
-for (uint i = 0; i < memops.size(); i++) {
+// Create initial pack pairs of memory operations for which
-Node* s1 = memops.at(i);
+// alignment is set and vectors will be aligned.
-for (uint j = 0; j < memops.size(); j++) {
+bool create_pack = true;
-Node* s2 = memops.at(j);
+if (memory_alignment(mem_ref, best_iv_adjustment) != 0) {
-if (s1 != s2 && are_adjacent_refs(s1, s2)) {
+if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
+// Can't allow vectorization of unaligned memory accesses with the
+// same type since it could be overlapped accesses to the same array.
+create_pack = false;
+} else {
+// Allow independent (different type) unaligned memory operations
+// if HW supports them.
+if (!Matcher::misaligned_vectors_ok()) {
+create_pack = false;
+} else {
+// Check if packs of the same memory type but
+// with a different alignment were created before.
+for (uint i = 0; i < align_to_refs.size(); i++) {
+MemNode* mr = align_to_refs.at(i)->as_Mem();
+if (same_velt_type(mr, mem_ref) &&
+memory_alignment(mr, iv_adjustment) != 0)
+create_pack = false;
+}
+}
+}
+}
+if (create_pack) {
+for (uint i = 0; i < memops.size(); i++) {
+Node* s1 = memops.at(i);
 int align = alignment(s1);
-if (stmts_can_pack(s1, s2, align)) {
+if (align == top_align) continue;
-Node_List* pair = new Node_List();
+for (uint j = 0; j < memops.size(); j++) {
-pair->push(s1);
+Node* s2 = memops.at(j);
-pair->push(s2);
+if (alignment(s2) == top_align) continue;
-_packset.append(pair);
+if (s1 != s2 && are_adjacent_refs(s1, s2)) {
-}
+if (stmts_can_pack(s1, s2, align)) {
-}
+Node_List* pair = new Node_List();
-}
+pair->push(s1);
-}
+pair->push(s2);
+_packset.append(pair);
+}
+}
+}
+}
+} else { // Don't create unaligned pack
+// First, remove remaining memory ops of the same type from the list.
+for (int i = memops.size() - 1; i >= 0; i--) {
+MemNode* s = memops.at(i)->as_Mem();
+if (same_velt_type(s, mem_ref)) {
+memops.remove(i);
+}
+}
+// Second, remove already constructed packs of the same type.
+for (int i = _packset.length() - 1; i >= 0; i--) {
+Node_List* p = _packset.at(i);
+MemNode* s = p->at(0)->as_Mem();
+if (same_velt_type(s, mem_ref)) {
+remove_pack_at(i);
+}
+}
+// If needed find the best memory reference for loop alignment again.
+if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
+// Put memory ops from remaining packs back on memops list for
+// the best alignment search.
+uint orig_msize = memops.size();
+for (int i = 0; i < _packset.length(); i++) {
+Node_List* p = _packset.at(i);
+MemNode* s = p->at(0)->as_Mem();
+assert(!same_velt_type(s, mem_ref), "sanity");
+memops.push(s);
+}
+// Search into a temporary and then assign to the function-level
+// best_align_to_mem_ref. Declaring a new variable with that name here
+// would shadow it: best_iv_adjustment would then be derived from one
+// reference while set_align_to_ref() after the loop receives another.
+MemNode* new_best_ref = find_align_to_ref(memops);
+// NOTE(review): when no replacement is found the previous reference is
+// left in place, as before this fix; confirm that is what callers expect.
+if (new_best_ref == NULL) break;
+best_align_to_mem_ref = new_best_ref;
+best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
+// Restore list.
+while (memops.size() > orig_msize)
+(void)memops.pop();
+}
+} // unaligned memory accesses
+// Remove used mem nodes.
+for (int i = memops.size() - 1; i >= 0; i--) {
+MemNode* m = memops.at(i)->as_Mem();
+if (alignment(m) != top_align) {
+memops.remove(i);
+}
+}
+} // while (memops.size() != 0
+set_align_to_ref(best_align_to_mem_ref);
 #ifndef PRODUCT
 if (TraceSuperWord) {
 tty->print_cr("\nAfter find_adjacent_refs");
 print_packset();
 //------------------------------find_align_to_ref---------------------------
 // Find a memory reference to align the loop induction variable to.
 // Looks first at stores then at loads, looking for a memory reference
 // with the largest number of references similar to it.
-void SuperWord::find_align_to_ref(Node_List &memops) {
+MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 // Count number of comparable memory ops
 for (uint i = 0; i < memops.size(); i++) {
 MemNode* s1 = memops.at(i)->as_Mem();
 }
 }
 }
 }
-// Find Store (or Load) with the greatest number of "comparable" references
+// Find Store (or Load) with the greatest number of "comparable" references,
+// biggest vector size, smallest data size and smallest iv offset.
 int max_ct        = 0;
+int max_vw        = 0;
 int max_idx       = -1;
 int min_size      = max_jint;
 int min_iv_offset = max_jint;
 for (uint j = 0; j < memops.size(); j++) {
 MemNode* s = memops.at(j)->as_Mem();
 if (s->is_Store()) {
+int vw = vector_width_in_bytes(velt_basic_type(s));
+assert(vw > 1, "sanity");
 SWPointer p(s, this);
-if (cmp_ct.at(j) > max_ct ||
+if (cmp_ct.at(j) >  max_ct ||
-cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
+cmp_ct.at(j) == max_ct &&
-data_size(s) == min_size &&
+(vw >  max_vw ||
-p.offset_in_bytes() < min_iv_offset)) {
+vw == max_vw &&
+(data_size(s) <  min_size ||
+data_size(s) == min_size &&
+(p.offset_in_bytes() < min_iv_offset)))) {
 max_ct = cmp_ct.at(j);
+max_vw = vw;
 max_idx = j;
 min_size = data_size(s);
 min_iv_offset = p.offset_in_bytes();
 }
 }
 // If no stores, look at loads
 if (max_ct == 0) {
 for (uint j = 0; j < memops.size(); j++) {
 MemNode* s = memops.at(j)->as_Mem();
 if (s->is_Load()) {
+int vw = vector_width_in_bytes(velt_basic_type(s));
+assert(vw > 1, "sanity");
 SWPointer p(s, this);
-if (cmp_ct.at(j) > max_ct ||
+if (cmp_ct.at(j) >  max_ct ||
-cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
+cmp_ct.at(j) == max_ct &&
-data_size(s) == min_size &&
+(vw >  max_vw ||
-p.offset_in_bytes() < min_iv_offset)) {
+vw == max_vw &&
+(data_size(s) <  min_size ||
+data_size(s) == min_size &&
+(p.offset_in_bytes() < min_iv_offset)))) {
 max_ct = cmp_ct.at(j);
+max_vw = vw;
 max_idx = j;
 min_size = data_size(s);
 min_iv_offset = p.offset_in_bytes();
 }
 }
 }
 }
-if (max_ct > 0)
+#ifdef ASSERT
-set_align_to_ref(memops.at(max_idx)->as_Mem());
-#ifndef PRODUCT
 if (TraceSuperWord && Verbose) {
 tty->print_cr("\nVector memops after find_align_to_refs");
 for (uint i = 0; i < memops.size(); i++) {
 MemNode* s = memops.at(i)->as_Mem();
 s->dump();
 }
 }
 #endif
+if (max_ct > 0) {
+#ifdef ASSERT
+if (TraceSuperWord) {
+tty->print("\nVector align to node: ");
+memops.at(max_idx)->as_Mem()->dump();
+}
+#endif
+return memops.at(max_idx)->as_Mem();
+}
+return NULL;
 }
 //------------------------------ref_is_alignable---------------------------
 // Can the preloop align the reference to position zero in the vector?
 bool SuperWord::ref_is_alignable(SWPointer& p) {
 if (ABS(span) == p.memory_size())
 return true;
 // If initial offset from start of object is computable,
 // compute alignment within the vector.
-int vw = vector_width_in_bytes();
+BasicType bt = velt_basic_type(p.mem());
+int vw = vector_width_in_bytes(bt);
+assert(vw > 1, "sanity");
 if (vw % span == 0) {
 Node* init_nd = pre_end->init_trip();
 if (init_nd->is_Con() && p.invar() == NULL) {
 int init = init_nd->bottom_type()->is_int()->get_con();
 return (init_offset % vw) % -span == 0;
 }
 }
 }
 return false;
+}
+//---------------------------get_iv_adjustment---------------------------
+// Induction variable adjustment (in bytes) for mem_ref, computed from its
+// byte offset, the sign of scale * stride, and the vector width of its
+// element type.
+int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
+  SWPointer ref_ptr(mem_ref, this);
+  int byte_offset = ref_ptr.offset_in_bytes();
+  int byte_scale  = ref_ptr.scale_in_bytes();
+  int vw          = vector_width_in_bytes(velt_basic_type(mem_ref));
+  assert(vw > 1, "sanity");
+  // Sign of (scale * stride): +1 when positive, -1 otherwise.
+  int direction = -1;
+  if ((byte_scale * iv_stride()) > 0) {
+    direction = 1;
+  }
+  int signed_width = direction * vw;
+  int iv_adjust    = (signed_width - (byte_offset % vw)) % vw;
+#ifndef PRODUCT
+  if (TraceSuperWord) {
+    tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
+                  byte_offset, iv_adjust, ref_ptr.memory_size(), byte_scale, iv_stride(), vw);
+  }
+#endif
+  return iv_adjust;
+}
 //---------------------------dependence_graph---------------------------
 // Construct dependency graph.
 // Add dependence edges to load/store nodes for memory dependence
 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
 // s1 aligned at "align"
 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
 // Do not use superword for non-primitives
-if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) ||
+BasicType bt1 = velt_basic_type(s1);
-(s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type())))
+BasicType bt2 = velt_basic_type(s2);
+if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
 return false;
+if (Matcher::max_vector_size(bt1) < 2) {
+return false; // No vectors for this type
+}
 if (isomorphic(s1, s2)) {
 if (independent(s1, s2)) {
 if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
 if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
 // Are s1 and s2 similar?
 bool SuperWord::isomorphic(Node* s1, Node* s2) {
 // Reject as soon as one property differs: opcode, required-input count,
 // the input at index 0, or the vector element type.
 if (s1->Opcode() != s2->Opcode()) return false;
 if (s1->req() != s2->req()) return false;
 if (s1->in(0) != s2->in(0)) return false;
 // Deleted revision compared velt_type() pointers directly; the inserted
 // revision defers to same_velt_type().
-if (velt_type(s1) != velt_type(s2)) return false;
+if (!same_velt_type(s1, s2)) return false;
 return true;
 }
 //------------------------------independent---------------------------
 // Is there no data path from s1 to s2 or s2 to s1?
 }
 //------------------------------set_alignment---------------------------
 // Record alignment for a pair: s1 gets align, s2 gets the position
 // immediately after s1 (align + data_size(s1)).
 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
 set_alignment(s1, align);
 // Inserted revision: top_align and bottom_align are passed through to s2
 // unchanged instead of having data_size(s1) added to them.
-set_alignment(s2, align + data_size(s1));
+if (align == top_align || align == bottom_align) {
+set_alignment(s2, align);
+} else {
+set_alignment(s2, align + data_size(s1));
+}
 }
 //------------------------------data_size---------------------------
 // Size of s's vector element type as reported by type2aelembytes();
 // asserted to be non-zero.
 int SuperWord::data_size(Node* s) {
 // Deleted revision went through velt_type(s)->array_element_basic_type();
 // the inserted revision asks velt_basic_type(s) directly.
-const Type* t = velt_type(s);
+int bsize = type2aelembytes(velt_basic_type(s));
-BasicType  bt = t->array_element_basic_type();
-int bsize = type2aelembytes(bt);
 assert(bsize != 0, "valid size");
 return bsize;
 }
 //------------------------------extend_packlist---------------------------
 }
 //------------------------------follow_use_defs---------------------------
 // Extend the packset by visiting operand definitions of nodes in pack p
 bool SuperWord::follow_use_defs(Node_List* p) {
+assert(p->size() == 2, "just checking");
 Node* s1 = p->at(0);
 Node* s2 = p->at(1);
-assert(p->size() == 2, "just checking");
 assert(s1->req() == s2->req(), "just checking");
 assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
 if (s1->is_Load()) return false;
 uint i2 = 0;
 do {
 for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
 for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
 if (i1 != i2) {
-return false;
+if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
+// Further analysis relies on operands position matching.
+u2->swap_edges(i1, i2);
+} else {
+return false;
+}
 }
 } while (i1 < ct);
 return true;
 }
 //------------------------------est_savings---------------------------
 // Estimate the savings from executing s1 and s2 as a pack
 int SuperWord::est_savings(Node* s1, Node* s2) {
 // The deleted revision summed every contribution into one 'save' value.
 // The inserted revision keeps input-side (save_in) and use-side (save_use)
 // contributions apart and returns the larger of the two.
-int save = 2 - 1; // 2 operations per instruction in packed form
+int save_in = 2 - 1; // 2 operations per instruction in packed form
 // inputs
 // Compare s1 and s2 input by input (index 0 is not visited); identical
 // inputs contribute nothing.
 for (uint i = 1; i < s1->req(); i++) {
 Node* x1 = s1->in(i);
 Node* x2 = s2->in(i);
 if (x1 != x2) {
 if (are_adjacent_refs(x1, x2)) {
-save += adjacent_profit(x1, x2);
+save_in += adjacent_profit(x1, x2);
 } else if (!in_packset(x1, x2)) {
-save -= pack_cost(2);
+save_in -= pack_cost(2);
 } else {
-save += unpack_cost(2);
+save_in += unpack_cost(2);
 }
 }
 }
 // uses of result
 // ct counts (s1 use, s2 use) pairs that are the first and last element
 // of the same existing pack.
 uint ct = 0;
+int save_use = 0;
 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
 Node* s1_use = s1->fast_out(i);
 for (int j = 0; j < _packset.length(); j++) {
 Node_List* p = _packset.at(j);
 if (p->at(0) == s1_use) {
 for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
 Node* s2_use = s2->fast_out(k);
 if (p->at(p->size()-1) == s2_use) {
 ct++;
 if (are_adjacent_refs(s1_use, s2_use)) {
-save += adjacent_profit(s1_use, s2_use);
+save_use += adjacent_profit(s1_use, s2_use);
 }
 }
 }
 }
 }
 }
 // When fewer pairs were matched than a node has uses, unpack_cost(1)
 // is added for that node.
-if (ct < s1->outcnt()) save += unpack_cost(1);
+if (ct < s1->outcnt()) save_use += unpack_cost(1);
-if (ct < s2->outcnt()) save += unpack_cost(1);
+if (ct < s2->outcnt()) save_use += unpack_cost(1);
-return save;
+return MAX2(save_in, save_use);
 }
 //------------------------------costs---------------------------
 // Constant profit for an adjacent pair; neither argument is consulted.
 int SuperWord::adjacent_profit(Node* s1, Node* s2) {
   return 2;
 }
 // Cost of packing ct operands: one unit per operand.
 int SuperWord::pack_cost(int ct) {
   return ct;
 }
 // Cost of unpacking ct operands: one unit per operand.
 int SuperWord::unpack_cost(int ct) {
   return ct;
 }
 //------------------------------combine_packs---------------------------
 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
 void SuperWord::combine_packs() {
-bool changed;
+bool changed = true;
-do {
+// Combine packs regardless of max vector size.
+while (changed) {
 changed = false;
 for (int i = 0; i < _packset.length(); i++) {
 Node_List* p1 = _packset.at(i);
 if (p1 == NULL) continue;
 for (int j = 0; j < _packset.length(); j++) {
 Node_List* p2 = _packset.at(j);
 if (p2 == NULL) continue;
+if (i == j) continue;
 if (p1->at(p1->size()-1) == p2->at(0)) {
 for (uint k = 1; k < p2->size(); k++) {
 p1->push(p2->at(k));
 }
 _packset.at_put(j, NULL);
 changed = true;
 }
 }
 }
-} while (changed);
+}
+// Split packs which have size greater than max vector size.
+for (int i = 0; i < _packset.length(); i++) {
+Node_List* p1 = _packset.at(i);
+if (p1 != NULL) {
+BasicType bt = velt_basic_type(p1->at(0));
+uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
+assert(is_power_of_2(max_vlen), "sanity");
+uint psize = p1->size();
+if (!is_power_of_2(psize)) {
+// Skip pack which can't be vector.
+// case1: for(...) { a[i] = i; }    elements values are different (i+x)
+// case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
+_packset.at_put(i, NULL);
+continue;
+}
+if (psize > max_vlen) {
+Node_List* pack = new Node_List();
+for (uint j = 0; j < psize; j++) {
+pack->push(p1->at(j));
+if (pack->size() >= max_vlen) {
+assert(is_power_of_2(pack->size()), "sanity");
+_packset.append(pack);
+pack = new Node_List();
+}
+}
+_packset.at_put(i, NULL);
+}
+}
+}
+// Compress list.
 for (int i = _packset.length() - 1; i >= 0; i--) {
 Node_List* p1 = _packset.at(i);
 if (p1 == NULL) {
 _packset.remove_at(i);
 }
 //------------------------------implemented---------------------------
 // Can code be generated for pack p?
 bool SuperWord::implemented(Node_List* p) {
 // Only the first node of the pack is inspected, together with the pack size.
 Node* p0 = p->at(0);
 // Deleted revision: look up the vector opcode and require a matcher rule.
 // Inserted revision: delegate the whole decision to VectorNode::implemented().
-int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0));
+return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
-return vopc > 0 && Matcher::has_match_rule(vopc);
 }
 //------------------------------profitable---------------------------
 // For pack p, are all operands and all uses (with in the block) vector?
 bool SuperWord::profitable(Node_List* p) {
 co_locate_pack(_packset.at(i));
 }
 }
 //-------------------------------remove_and_insert-------------------
-//remove "current" from its current position in the memory graph and insert
+// Remove "current" from its current position in the memory graph and insert
-//it after the appropriate insertion point (lip or uip)
+// it after the appropriate insertion point (lip or uip).
 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
 Node *uip, Unique_Node_List &sched_before) {
 Node* my_mem = current->in(MemNode::Memory);
-_igvn.rehash_node_delayed(current);
+bool sched_up = sched_before.member(current);
-_igvn.hash_delete(my_mem);
+// remove current_store from its current position in the memory graph
-//remove current_store from its current position in the memmory graph
 for (DUIterator i = current->outs(); current->has_out(i); i++) {
 Node* use = current->out(i);
 if (use->is_Mem()) {
 assert(use->in(MemNode::Memory) == current, "must be");
-_igvn.rehash_node_delayed(use);
 if (use == prev) { // connect prev to my_mem
-use->set_req(MemNode::Memory, my_mem);
+_igvn.replace_input_of(use, MemNode::Memory, my_mem);
+--i; //deleted this edge; rescan position
 } else if (sched_before.member(use)) {
-_igvn.hash_delete(uip);
+if (!sched_up) { // Will be moved together with current
-use->set_req(MemNode::Memory, uip);
+_igvn.replace_input_of(use, MemNode::Memory, uip);
+--i; //deleted this edge; rescan position
+}
 } else {
-_igvn.hash_delete(lip);
+if (sched_up) { // Will be moved together with current
-use->set_req(MemNode::Memory, lip);
+_igvn.replace_input_of(use, MemNode::Memory, lip);
-}
+--i; //deleted this edge; rescan position
---i; //deleted this edge; rescan position
+}
 }
 }
+}
-bool sched_up = sched_before.member(current);
 Node *insert_pt =  sched_up ?  uip : lip;
-_igvn.hash_delete(insert_pt);
 // all uses of insert_pt's memory state should use current's instead
 for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
 Node* use = insert_pt->out(i);
 if (use->is_Mem()) {
 --i;
 }
 }
 //connect current to insert_pt
-current->set_req(MemNode::Memory, insert_pt);
+_igvn.replace_input_of(current, MemNode::Memory, insert_pt);
 }
 //------------------------------co_locate_pack----------------------------------
 // To schedule a store pack, we need to move any sandwiched memory ops either before
 // or after the pack, based upon dependence information:
 for (DUIterator i = current->outs(); current->has_out(i); i++) {
 Node* use = current->out(i);
 if (use->is_Mem() && use != previous)
 memops.push(use);
 }
-if(current == first) break;
+if (current == first) break;
 previous = current;
 current  = current->in(MemNode::Memory)->as_Mem();
 }
 // determine which memory operations should be scheduled before the pack
 if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
 for (uint j = 0; j< i; j++) {
 Node *s2 = memops.at(j);
 if (!independent(s1, s2)) {
 if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
-schedule_before_pack.push(s1); //s1 must be scheduled before
+schedule_before_pack.push(s1); // s1 must be scheduled before
 Node_List* mem_pk = my_pack(s1);
 if (mem_pk != NULL) {
 for (uint ii = 0; ii < mem_pk->size(); ii++) {
-Node* s = mem_pk->at(ii); // follow partner
+Node* s = mem_pk->at(ii);  // follow partner
 if (memops.member(s) && !schedule_before_pack.member(s))
 schedule_before_pack.push(s);
 }
 }
+break;
 }
 }
 }
 }
 }
+Node*    upper_insert_pt = first->in(MemNode::Memory);
+// Following code moves loads connected to upper_insert_pt below aliased stores.
+// Collect such loads here and reconnect them back to upper_insert_pt later.
+memops.clear();
+for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
+Node* use = upper_insert_pt->out(i);
+if (!use->is_Store())
+memops.push(use);
+}
 MemNode* lower_insert_pt = last;
-Node*    upper_insert_pt = first->in(MemNode::Memory);
 previous                 = last; //previous store in pk
 current                  = last->in(MemNode::Memory)->as_Mem();
-//start scheduling from "last" to "first"
+// start scheduling from "last" to "first"
 while (true) {
 assert(in_bb(current), "stay in block");
 assert(in_pack(previous, pk), "previous stays in pack");
 Node* my_mem = current->in(MemNode::Memory);
 if (in_pack(current, pk)) {
 // Forward users of my memory state (except "previous) to my input memory state
-_igvn.hash_delete(current);
 for (DUIterator i = current->outs(); current->has_out(i); i++) {
 Node* use = current->out(i);
 if (use->is_Mem() && use != previous) {
 assert(use->in(MemNode::Memory) == current, "must be");
 if (schedule_before_pack.member(use)) {
-_igvn.hash_delete(upper_insert_pt);
 _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
 } else {
-_igvn.hash_delete(lower_insert_pt);
 _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
 }
 --i; // deleted this edge; rescan position
 }
 }
 }
 if (current == first) break;
 current = my_mem->as_Mem();
 } // end while
+// Reconnect loads back to upper_insert_pt.
+for (uint i = 0; i < memops.size(); i++) {
+Node *ld = memops.at(i);
+if (ld->in(MemNode::Memory) != upper_insert_pt) {
+_igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
+}
+}
 } else if (pk->at(0)->is_Load()) { //load
 // all loads in the pack should have the same memory state. By default,
 // we use the memory state of the last load. However, if any load could
 // not be moved down due to the dependence constraint, we use the memory
 // state of the first load.
 if (p && n == executed_last(p)) {
 uint vlen = p->size();
 Node* vn = NULL;
 Node* low_adr = p->at(0);
 Node* first   = executed_first(p);
+int   opc = n->Opcode();
 if (n->is_Load()) {
-int   opc = n->Opcode();
 Node* ctl = n->in(MemNode::Control);
 Node* mem = first->in(MemNode::Memory);
 Node* adr = low_adr->in(MemNode::Address);
 const TypePtr* atyp = n->adr_type();
-vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen);
+vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
 } else if (n->is_Store()) {
 // Promote value to be stored to vector
 Node* val = vector_opd(p, MemNode::ValueIn);
-int   opc = n->Opcode();
 Node* ctl = n->in(MemNode::Control);
 Node* mem = first->in(MemNode::Memory);
 Node* adr = low_adr->in(MemNode::Address);
 const TypePtr* atyp = n->adr_type();
-vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
+vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
 } else if (n->req() == 3) {
 // Promote operands to vector
 Node* in1 = vector_opd(p, 1);
 Node* in2 = vector_opd(p, 2);
-vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n));
+vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
 } else {
 ShouldNotReachHere();
 }
+assert(vn != NULL, "sanity");
 _phase->_igvn.register_new_node_with_optimizer(vn);
 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
 for (uint j = 0; j < p->size(); j++) {
 Node* pm = p->at(j);
 _igvn.replace_node(pm, vn);
 }
 _igvn._worklist.push(vn);
+#ifdef ASSERT
+if (TraceSuperWord) {
+tty->print("new Vector node: ");
+vn->dump();
+}
+#endif
 }
 }
 }
 //------------------------------vector_opd---------------------------
 break;
 }
 }
 if (same_opd) {
-if (opd->is_Vector() || opd->is_VectorLoad()) {
+if (opd->is_Vector() || opd->is_LoadVector()) {
 return opd; // input is matching vector
 }
-assert(!opd->is_VectorStore(), "such vector is not expected here");
+assert(!opd->is_StoreVector(), "such vector is not expected here");
 // Convert scalar input to vector with the same number of elements as
 // p0's vector. Use p0's type because size of operand's container in
 // vector should match p0's size regardless operand's size.
 const Type* p0_t = velt_type(p0);
 VectorNode* vn = VectorNode::scalar2vector(_phase->C, opd, vlen, p0_t);
 _phase->_igvn.register_new_node_with_optimizer(vn);
 _phase->set_ctrl(vn, _phase->get_ctrl(opd));
+#ifdef ASSERT
+if (TraceSuperWord) {
+tty->print("new Vector node: ");
+vn->dump();
+}
+#endif
 return vn;
 }
 // Insert pack operation
-const Type* p0_t = velt_type(p0);
+BasicType bt = velt_basic_type(p0);
-PackNode* pk = PackNode::make(_phase->C, opd, p0_t);
+PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt);
 DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
 for (uint i = 1; i < vlen; i++) {
 Node* pi = p->at(i);
 Node* in = pi->in(opd_idx);
 assert(my_pack(in) == NULL, "Should already have been unpacked");
 assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
-pk->add_opd(in);
+pk->add_opd(i, in);
 }
 _phase->_igvn.register_new_node_with_optimizer(pk);
 _phase->set_ctrl(pk, _phase->get_ctrl(opd));
+#ifdef ASSERT
+if (TraceSuperWord) {
+tty->print("new Pack node: ");
+pk->dump();
+}
+#endif
 return pk;
 }
 //------------------------------insert_extracts---------------------------
 // If a use of pack p is not a vector use, then replace the
 Node* def = use->in(idx);
 // Insert extract operation
 _igvn.hash_delete(def);
 int def_pos = alignment(def) / data_size(def);
-const Type* def_t = velt_type(def);
+Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def));
-Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t);
 _phase->_igvn.register_new_node_with_optimizer(ex);
 _phase->set_ctrl(ex, _phase->get_ctrl(def));
 _igvn.replace_input_of(use, idx, ex);
 _igvn._worklist.push(def);
 bb_insert_after(ex, bb_idx(def));
-set_velt_type(ex, def_t);
+set_velt_type(ex, velt_type(def));
 }
 }
 //------------------------------is_vector_use---------------------------
 // Is use->in(u_idx) a vector use?
 #endif
 // Initial type
 for (int i = 0; i < _block.length(); i++) {
 Node* n = _block.at(i);
-const Type* t  = n->is_Mem() ? Type::get_const_basic_type(n->as_Mem()->memory_type())
+set_velt_type(n, container_type(n));
-: _igvn.type(n);
-const Type* vt = container_type(t);
-set_velt_type(n, vt);
 }
 // Propagate narrowed type backwards through operations
 // that don't depend on higher order bits
 for (int i = _block.length() - 1; i >= 0; i--) {
 case Op_CMoveI:  case Op_CMoveL:
 if (in_bb(in)) {
 bool same_type = true;
 for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
 Node *use = in->fast_out(k);
-if (!in_bb(use) || velt_type(use) != vt) {
+if (!in_bb(use) || !same_velt_type(use, n)) {
 same_type = false;
 break;
 }
 }
 if (same_type) {
 // Alignment of s within a vector after shifting its byte offset by
 // iv_adjust_in_bytes. Returns bottom_align when the SWPointer for s is
 // not valid or (inserted revision) when the vector width for s's
 // element type is below 2.
 int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
 SWPointer p(s, this);
 if (!p.valid()) {
 return bottom_align;
 }
+int vw = vector_width_in_bytes(velt_basic_type(s));
+if (vw < 2) {
+return bottom_align; // No vectors for this type
+}
 int offset  = p.offset_in_bytes();
 offset     += iv_adjust_in_bytes;
 // '%' can yield a negative remainder; add the vector width in that case
 // so the result is never negative.
-int off_rem = offset % vector_width_in_bytes();
+int off_rem = offset % vw;
-int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes();
+int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
 return off_mod;
 }
 //---------------------------container_type---------------------------
 // Smallest type containing range of values
 // Deleted revision took a Type and, for array pointers, used the array's
 // element type. Inserted revision takes the node: a Mem node yields the
 // type of its memory_type(), any other node starts from _igvn.type(n).
-const Type* SuperWord::container_type(const Type* t) {
+const Type* SuperWord::container_type(Node* n) {
-const Type* tp = t->make_ptr();
+if (n->is_Mem()) {
-if (tp && tp->isa_aryptr()) {
+return Type::get_const_basic_type(n->as_Mem()->memory_type());
-t = tp->is_aryptr()->elem();
+}
-}
+const Type* t = _igvn.type(n);
 // T_INT types are narrowed: the candidates are tried in the order
 // BOOL, BYTE, CHAR, SHORT and the first match is returned; INT otherwise.
 if (t->basic_type() == T_INT) {
 if (t->higher_equal(TypeInt::BOOL))  return TypeInt::BOOL;
 if (t->higher_equal(TypeInt::BYTE))  return TypeInt::BYTE;
 if (t->higher_equal(TypeInt::CHAR))  return TypeInt::CHAR;
 if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT;
 return TypeInt::INT;
 }
 // Any non-T_INT type is returned unchanged.
 return t;
 }
+//---------------------------same_velt_type---------------------------
+// Do n1 and n2 have the same vector element type?
+// When both element types have basic type T_INT only the element sizes
+// are compared; otherwise the two velt_type() values must be the very
+// same Type.
+bool SuperWord::same_velt_type(Node* n1, Node* n2) {
+  const Type* vt1 = velt_type(n1);
+  // Must be read from n2: reading n1 twice makes the final pointer
+  // comparison below always succeed.
+  const Type* vt2 = velt_type(n2);
+  if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
+    // Compare vectors element sizes for integer types.
+    return data_size(n1) == data_size(n2);
+  }
+  return vt1 == vt2;
+}
 //-------------------------vector_opd_range-----------------------
 // (Start, end] half-open range defining which operands are vector
 void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
 switch (n->Opcode()) {
-case Op_LoadB:   case Op_LoadUS:
+case Op_LoadB:   case Op_LoadUB:
+case Op_LoadS:   case Op_LoadUS:
 case Op_LoadI:   case Op_LoadL:
 case Op_LoadF:   case Op_LoadD:
 case Op_LoadP:
 *start = 0;
 *end   = 0;
 // pre-loop Opaque1 node.
 Node *orig_limit = pre_opaq->original_loop_limit();
 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
 SWPointer align_to_ref_p(align_to_ref, this);
+assert(align_to_ref_p.valid(), "sanity");
 // Given:
 //     lim0 == original pre loop limit
 //     V == v_align (power of 2)
 //     invar == extra invariant piece of the address expression
 //   Solving for lim:
 //     (e - lim0 + N) % V == 0
 //     N = (V - (e - lim0)) % V
 //     lim = lim0 - (V - (e - lim0)) % V
+int vw = vector_width_in_bytes(velt_basic_type(align_to_ref));
+assert(vw > 1, "sanity");
 int stride   = iv_stride();
 int scale    = align_to_ref_p.scale_in_bytes();
 int elt_size = align_to_ref_p.memory_size();
-int v_align  = vector_width_in_bytes() / elt_size;
+int v_align  = vw / elt_size;
 int k        = align_to_ref_p.offset_in_bytes() / elt_size;
 Node *kn   = _igvn.intcon(k);
 Node *e = kn;
 if (align_to_ref_p.negate_invar()) {
 e = new (_phase->C, 3) SubINode(e, aref);
 } else {
 e = new (_phase->C, 3) AddINode(e, aref);
 }
+_phase->_igvn.register_new_node_with_optimizer(e);
+_phase->set_ctrl(e, pre_ctrl);
+}
+if (vw > ObjectAlignmentInBytes) {
+// incorporate base e +/- base && Mask >>> log2(elt)
+Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
+Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
+_phase->_igvn.register_new_node_with_optimizer(xbase);
+Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
+_phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+#ifdef _LP64
+masked_xbase  = new (_phase->C, 2) ConvL2INode(masked_xbase);
+_phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+#endif
+Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
+Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
+_phase->_igvn.register_new_node_with_optimizer(bref);
+_phase->set_ctrl(bref, pre_ctrl);
+e = new (_phase->C, 3) AddINode(e, bref);
 _phase->_igvn.register_new_node_with_optimizer(e);
 _phase->set_ctrl(e, pre_ctrl);
 }
 // compute e +/- lim0

Mercurial > hg > truffle

comparison src/share/vm/opto/superword.cpp @ 6179:8c92982cbbc4