# HG changeset patch
# User kvn
# Date 1275497372 25200
# Node ID 3657cb01ffc5b74da8f698aab584a7c80786bea7
# Parent  1eb493f334235ffcc2ab032d15f69690944c179a
6954029: Improve implicit null check generation with compressed oops
Summary: Hoist DecodeN instruction above null check
Reviewed-by: never, twisti

diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/sparc/vm/sparc.ad
--- a/src/cpu/sparc/vm/sparc.ad	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Wed Jun 02 09:49:32 2010 -0700
@@ -1760,6 +1760,12 @@
 // registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = false;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  NOT_LP64(ShouldNotCallThis());
+  assert(UseCompressedOops, "only for compressed oops code");
+  return false;
+}
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/sparc/vm/vm_version_sparc.cpp
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -65,13 +65,6 @@
     FLAG_SET_DEFAULT(UseInlineCaches, false);
   }
 #ifdef _LP64
-  // Single issue niagara1 is slower for CompressedOops
-  // but niagaras after that it's fine.
-  if (!is_niagara1_plus()) {
-    if (FLAG_IS_DEFAULT(UseCompressedOops)) {
-      FLAG_SET_ERGO(bool, UseCompressedOops, false);
-    }
-  }
   // 32-bit oops don't make sense for the 64-bit VM on sparc
   // since the 32-bit VM has the same registers and smaller objects.
   Universe::set_narrow_oop_shift(LogMinObjAlignmentInBytes);
diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/x86/vm/x86_32.ad
--- a/src/cpu/x86/vm/x86_32.ad	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Wed Jun 02 09:49:32 2010 -0700
@@ -1377,6 +1377,12 @@
 // registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = true;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  ShouldNotCallThis();
+  return true;
+}
+
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/x86/vm/x86_64.ad
--- a/src/cpu/x86/vm/x86_64.ad	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Wed Jun 02 09:49:32 2010 -0700
@@ -2037,6 +2037,11 @@
 // into registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = true;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  assert(UseCompressedOops, "only for compressed oops code");
+  return (LogMinObjAlignmentInBytes <= 3);
+}
+
 // Is it better to copy float constants, or load them directly from
 // memory? Intel can load a float constant from a direct address,
 // requiring no extra registers. Most RISCs will have to materialize
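The three Matcher::narrow_oop_use_complex_address() implementations above encode one hardware fact: an x86-64 addressing mode can scale an index register by 1, 2, 4 or 8, i.e. shift it left by at most 3 bits, so the decode of a narrow oop folds into the memory access only when LogMinObjAlignmentInBytes <= 3. Sparc has no such mode and returns false; the 32-bit x86 port never runs with compressed oops, hence ShouldNotCallThis(). Below is a minimal standalone sketch of that constraint, with illustrative names rather than HotSpot code:

    // Standalone sketch (not HotSpot source): why the x86-64 matcher can fold
    // a narrow-oop decode into an addressing mode only when the shift is <= 3.
    #include <cassert>
    #include <cstdint>

    // x86-64 addressing modes support scale factors 1, 2, 4 and 8, i.e. an
    // index shifted left by at most 3 bits: [base + index*2^shift + disp].
    bool shift_folds_into_x86_address(int log_min_obj_alignment_in_bytes) {
      return log_min_obj_alignment_in_bytes <= 3;
    }

    // Decoding a compressed oop is: oop = heap_base + (narrow << shift). When
    // the shift fits a scale factor, a load through the decoded oop can be a
    // single instruction, e.g. mov rax, [r12 + narrow_reg*8 + off], which then
    // doubles as the implicit null check.
    uint64_t decode_narrow_oop(uint64_t heap_base, uint32_t narrow, int shift) {
      return heap_base + (static_cast<uint64_t>(narrow) << shift);
    }

    int main() {
      assert(shift_folds_into_x86_address(3));   // 8-byte object alignment
      assert(!shift_folds_into_x86_address(4));  // 16-byte alignment: no fold
      assert(decode_narrow_oop(0x800000000ULL, 0x10, 3) == 0x800000080ULL);
      return 0;
    }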
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/compile.cpp
--- a/src/share/vm/opto/compile.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/compile.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -2176,14 +2176,14 @@
 
 #ifdef _LP64
   case Op_CastPP:
-    if (n->in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks()) {
+    if (n->in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks()) {
       Compile* C = Compile::current();
       Node* in1 = n->in(1);
       const Type* t = n->bottom_type();
       Node* new_in1 = in1->clone();
       new_in1->as_DecodeN()->set_type(t);
 
-      if (!Matcher::clone_shift_expressions) {
+      if (!Matcher::narrow_oop_use_complex_address()) {
         //
         // x86, ARM and friends can handle 2 adds in addressing mode
         // and Matcher can fold a DecodeN node into address by using
@@ -2231,8 +2231,12 @@
         new_in2 = in2->in(1);
       } else if (in2->Opcode() == Op_ConP) {
         const Type* t = in2->bottom_type();
-        if (t == TypePtr::NULL_PTR && Universe::narrow_oop_use_implicit_null_checks()) {
-          new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
+        if (t == TypePtr::NULL_PTR) {
+          // Don't convert the CmpP null check into CmpN if the compressed
+          // oops implicit null check is not generated; this allows a
+          // normal oop implicit null check to be generated instead.
+          if (Matcher::gen_narrow_oop_implicit_null_checks())
+            new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
           //
           // This transformation together with CastPP transformation above
           // will generate code for implicit NULL checks for compressed oops.
@@ -2289,9 +2293,9 @@
 
   case Op_DecodeN:
     assert(!n->in(1)->is_EncodeP(), "should be optimized out");
-    // DecodeN could be pinned on Sparc where it can't be fold into
+    // DecodeN could be pinned when it can't be folded into
     // an address expression, see the code for Op_CastPP above.
-    assert(n->in(0) == NULL || !Matcher::clone_shift_expressions, "no control except on sparc");
+    assert(n->in(0) == NULL || !Matcher::narrow_oop_use_complex_address(), "no control");
     break;
 
   case Op_EncodeP: {
@@ -2496,6 +2500,10 @@
     }
   }
 
+  // Skip the next transformation if compressed oops are not used.
+  if (!UseCompressedOops || !Matcher::gen_narrow_oop_implicit_null_checks())
+    return;
+
   // Go over safepoint nodes to skip DecodeN nodes for debug edges.
   // It could be done for uncommon traps or any safepoints/calls
   // if the DecodeN node is referenced only in debug info.
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/connode.cpp
--- a/src/share/vm/opto/connode.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/connode.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -437,7 +437,7 @@
 // If not converting int->oop, throw away cast after constant propagation
 Node *CastPPNode::Ideal_DU_postCCP( PhaseCCP *ccp ) {
   const Type *t = ccp->type(in(1));
-  if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks())) {
+  if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks())) {
     return NULL; // do not transform raw pointers or narrow oops
   }
   return ConstraintCastNode::Ideal_DU_postCCP(ccp);
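The compile.cpp change gates two transformations on the new Matcher::gen_narrow_oop_implicit_null_checks() predicate: cloning the DecodeN under a CastPP, and rewriting CmpP(DecodeN(x), NULL) to CmpN(x, NULL). The rewrite itself is sound because a narrow oop of zero denotes the NULL oop regardless of heap base and shift, so the null test can be done on the 32-bit value before decoding. A standalone sketch of that equivalence follows; kHeapBase and kShift are assumed illustrative values, not HotSpot code:

    // Standalone sketch (not HotSpot source): CmpP(DecodeN(x), NULL) and
    // CmpN(x, NULL) always agree, because narrow oop 0 encodes the NULL oop.
    #include <cassert>
    #include <cstdint>

    const uint64_t kHeapBase = 0x800000000ULL; // assumed base, illustration only
    const int      kShift    = 3;              // assumed shift (8-byte alignment)

    // Full decode: narrow 0 maps to NULL; otherwise base + (narrow << shift).
    uint64_t decode(uint32_t narrow) {
      return narrow == 0 ? 0
                         : kHeapBase + (static_cast<uint64_t>(narrow) << kShift);
    }

    int main() {
      const uint32_t samples[] = { 0u, 1u, 0x7fffffffu };
      for (uint32_t narrow : samples) {
        // The two null tests always agree, so the compiler may compare the
        // narrow value directly and keep the DecodeN on the non-null path.
        assert((decode(narrow) == 0) == (narrow == 0));
      }
      return 0;
    }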
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/lcm.cpp
--- a/src/share/vm/opto/lcm.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/lcm.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -32,7 +32,8 @@
 // with suitable memory ops nearby. Use the memory op to do the NULL check.
 // I can generate a memory op if there is not one nearby.
 // The proj is the control projection for the not-null case.
-// The val is the pointer being checked for nullness.
+// The val is the pointer being checked for nullness, or the
+// decodeHeapOop_not_null node if it did not fold into the address.
 void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowed_reasons) {
   // Assume if null check need for 0 offset then always needed
   // Intel solaris doesn't support any null checks yet and no
@@ -96,6 +97,13 @@
     }
   }
 
+  // Check for a decodeHeapOop_not_null node which did not fold into the address
+  bool is_decoden = ((intptr_t)val) & 1;
+  val = (Node*)(((intptr_t)val) & ~1);
+
+  assert(!is_decoden || ((val->in(0) == NULL) && val->is_Mach() &&
+         (val->as_Mach()->ideal_Opcode() == Op_DecodeN)), "sanity");
+
   // Search the successor block for a load or store whose base value is also
   // the tested value. There may be several.
   Node_List *out = new Node_List(Thread::current()->resource_area());
@@ -148,7 +156,8 @@
     if( !mach->needs_anti_dependence_check() )
       continue;               // Not a memory op; skip it
     {
-      // Check that value is used in memory address.
+      // Check that the value is used in a memory address in
+      // instructions with an embedded load (CmpP val1,(val2+off)).
       Node* base;
       Node* index;
       const MachOper* oper = mach->memory_inputs(base, index);
@@ -213,7 +222,11 @@
   uint vidx = 0;              // Capture index of value into memop
   uint j;
   for( j = mach->req()-1; j > 0; j-- ) {
-    if( mach->in(j) == val ) vidx = j;
+    if( mach->in(j) == val ) {
+      vidx = j;
+      // Ignore the DecodeN val, which could be hoisted to where it is needed.
+      if( is_decoden ) continue;
+    }
     // Block of memory-op input
     Block *inb = cfg->_bbs[mach->in(j)->_idx];
     Block *b = this;          // Start from null check
@@ -270,6 +283,26 @@
   extern int implicit_null_checks;
   implicit_null_checks++;
 
+  if( is_decoden ) {
+    // Check if we need to hoist the decodeHeapOop_not_null node first.
+    Block *valb = cfg->_bbs[val->_idx];
+    if( this != valb && this->_dom_depth < valb->_dom_depth ) {
+      // Hoist it up to the end of the test block.
+      valb->find_remove(val);
+      this->add_inst(val);
+      cfg->_bbs.map(val->_idx,this);
+      // DecodeN on x86 may kill flags. Check for flag-killing projections
+      // that also need to be hoisted.
+      for (DUIterator_Fast jmax, j = val->fast_outs(jmax); j < jmax; j++) {
+        Node* n = val->fast_out(j);
+        if( n->Opcode() == Op_MachProj ) {
+          cfg->_bbs[n->_idx]->find_remove(n);
+          this->add_inst(n);
+          cfg->_bbs.map(n->_idx,this);
+        }
+      }
+    }
+  }
+
   // Hoist the memory candidate up to the end of the test block.
   Block *old_block = cfg->_bbs[best->_idx];
   old_block->find_remove(best);
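The is_decoden flag decoded above is smuggled through the val pointer itself: Node objects are word-aligned, so bit 0 of a valid pointer is always clear and can carry the "val is an unfolded DecodeN" mark that Matcher::collect_null_checks() sets in the matcher.cpp hunks below. A standalone sketch of this low-bit tagging scheme, with a stand-in Node struct rather than the HotSpot class:

    // Standalone sketch (not HotSpot source) of the low-bit pointer tagging
    // used here: Node pointers are word-aligned, so bit 0 is free to carry
    // the "unfolded DecodeN" flag through the _null_check_tests worklist
    // without widening its entries.
    #include <cassert>
    #include <cstdint>

    struct Node { int dummy; };  // stand-in for the HotSpot Node class

    Node* tag(Node* n)       { return (Node*)(((intptr_t)n) | 1); }
    bool  is_tagged(Node* n) { return (((intptr_t)n) & 1) != 0; }
    Node* untag(Node* n)     { return (Node*)(((intptr_t)n) & ~1); }

    int main() {
      Node n;
      Node* marked = tag(&n);       // mark: the CmpN uses an unfolded DecodeN
      assert(is_tagged(marked));
      assert(untag(marked) == &n);  // recover the real pointer before any use
      assert(!is_tagged(&n));       // ordinary worklist entries stay untagged
      return 0;
    }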
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/matcher.cpp
--- a/src/share/vm/opto/matcher.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/matcher.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -1334,7 +1334,7 @@
     if( j == max_scan )       // No post-domination before scan end?
       return true;            // Then break the match tree up
   }
-  if (m->is_DecodeN() && Matcher::clone_shift_expressions) {
+  if (m->is_DecodeN() && Matcher::narrow_oop_use_complex_address()) {
     // These are commonly used in address expressions and can
     // efficiently fold into them on X64 in some cases.
     return false;
@@ -2110,8 +2110,8 @@
     _null_check_tests.push(proj);
     Node* val = cmp->in(1);
 #ifdef _LP64
-    if (UseCompressedOops && !Matcher::clone_shift_expressions &&
-        val->bottom_type()->isa_narrowoop()) {
+    if (val->bottom_type()->isa_narrowoop() &&
+        !Matcher::narrow_oop_use_complex_address()) {
       //
       // Look for a DecodeN node which should be pinned to orig_proj.
       // On platforms (Sparc) which cannot handle 2 adds
@@ -2127,6 +2127,9 @@
         if (d->is_DecodeN() && d->in(1) == val) {
           val = d;
           val->set_req(0, NULL); // Unpin now.
+          // Mark this as a special case to distinguish it from
+          // the regular case: CmpP(DecodeN, NULL).
+          val = (Node*)(((intptr_t)val) | 1);
           break;
         }
       }
@@ -2146,9 +2149,21 @@
   for( uint i=0; i < cnt; i+=2 ) {
     Node *test = _null_check_tests[i];
     Node *val = _null_check_tests[i+1];
+    bool is_decoden = ((intptr_t)val) & 1;
+    val = (Node*)(((intptr_t)val) & ~1);
     if (has_new_node(val)) {
+      Node* new_val = new_node(val);
+      if (is_decoden) {
+        assert(val->is_DecodeN() && val->in(0) == NULL, "sanity");
+        // Note: new_val may have a control edge if
+        // the original ideal node DecodeN was matched before
+        // it was unpinned in Matcher::collect_null_checks().
+        // Unpin the mach node and mark it.
+        new_val->set_req(0, NULL);
+        new_val = (Node*)(((intptr_t)new_val) | 1);
+      }
       // Is a match-tree root, so replace with the matched value
-      _null_check_tests.map(i+1, new_node(val));
+      _null_check_tests.map(i+1, new_val);
     } else {
       // Yank from candidate list
       _null_check_tests.map(i+1,_null_check_tests[--cnt]);
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/matcher.hpp
--- a/src/share/vm/opto/matcher.hpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/matcher.hpp	Wed Jun 02 09:49:32 2010 -0700
@@ -352,6 +352,38 @@
   // registers? True for Intel but false for most RISCs
   static const bool clone_shift_expressions;
 
+  static bool narrow_oop_use_complex_address();
+
+  // Generate an implicit null check for narrow oops if it can fold
+  // into the address expression (x64):
+  //
+  //    [R12 + narrow_oop_reg<<3 + offset] // fold into address expression
+  //    NullCheck narrow_oop_reg
+  //
+  // When narrow oops can't fold into the address expression (Sparc) and
+  // the base is not null, use decode_not_null and a normal implicit null
+  // check. Note: the decode_not_null node can be used here since it is
+  // referenced only on the non-null path, but it requires special
+  // handling; see collect_null_checks():
+  //
+  //    decode_not_null narrow_oop_reg, oop_reg // 'shift' and 'add base'
+  //    [oop_reg + offset]
+  //    NullCheck oop_reg
+  //
+  // With a zero base, when narrow oops can't fold into the address
+  // expression, use a normal implicit null check, since only a shift
+  // is needed to decode the narrow oop:
+  //
+  //    decode narrow_oop_reg, oop_reg // only 'shift'
+  //    [oop_reg + offset]
+  //    NullCheck oop_reg
+  //
+  inline static bool gen_narrow_oop_implicit_null_checks() {
+    return Universe::narrow_oop_use_implicit_null_checks() &&
+           (narrow_oop_use_complex_address() ||
+            Universe::narrow_oop_base() != NULL);
+  }
+
   // Is it better to copy float constants, or load them directly from memory?
   // Intel can load a float constant from a direct address, requiring no
   // extra registers. Most RISCs will have to materialize an address into a
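The matcher.hpp comment above enumerates three code shapes; gen_narrow_oop_implicit_null_checks() collapses them into one boolean decision. The sketch below, with illustrative parameter names standing in for the Universe and Matcher queries (not HotSpot code), tabulates which configurations keep the narrow-oop form of the implicit null check:

    // Standalone sketch (not HotSpot source): the decision encoded by
    // gen_narrow_oop_implicit_null_checks(). Parameter names are illustrative
    // stand-ins for the Universe/Matcher queries named in the comments.
    #include <cstdio>

    bool gen_narrow_oop_implicit_null_checks(
        bool use_implicit_null_checks,  // ~ Universe::narrow_oop_use_implicit_null_checks()
        bool complex_address,           // ~ Matcher::narrow_oop_use_complex_address()
        bool nonzero_heap_base) {       // ~ Universe::narrow_oop_base() != NULL
      return use_implicit_null_checks && (complex_address || nonzero_heap_base);
    }

    int main() {
      // x64: the decode folds into [R12 + narrow<<3 + off], so the narrow
      // oop itself is null-checked.
      printf("x64 fold:       %d\n", gen_narrow_oop_implicit_null_checks(true, true, true));
      // Sparc with a non-zero base: hoist decode_not_null and null-check the
      // decoded oop; the CmpP -> CmpN conversion is still worthwhile.
      printf("sparc, base!=0: %d\n", gen_narrow_oop_implicit_null_checks(true, false, true));
      // Zero base and no fold: a plain decode (shift only) plus a normal oop
      // implicit null check is better, so the narrow-oop form is rejected.
      printf("zero base:      %d\n", gen_narrow_oop_implicit_null_checks(true, false, false));
      return 0;
    }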