# HG changeset patch
# User kvn
# Date 1275497372 25200
# Node ID 3657cb01ffc5b74da8f698aab584a7c80786bea7
# Parent  1eb493f334235ffcc2ab032d15f69690944c179a
6954029: Improve implicit null check generation with compressed oops
Summary: Hoist DecodeN instruction above null check
Reviewed-by: never, twisti

diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/sparc/vm/sparc.ad
--- a/src/cpu/sparc/vm/sparc.ad	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Wed Jun 02 09:49:32 2010 -0700
@@ -1760,6 +1760,12 @@
 // registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = false;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  NOT_LP64(ShouldNotCallThis());
+  assert(UseCompressedOops, "only for compressed oops code");
+  return false;
+}
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/sparc/vm/vm_version_sparc.cpp
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -65,13 +65,6 @@
     FLAG_SET_DEFAULT(UseInlineCaches, false);
   }
 #ifdef _LP64
-  // Single issue niagara1 is slower for CompressedOops
-  // but niagaras after that it's fine.
-  if (!is_niagara1_plus()) {
-    if (FLAG_IS_DEFAULT(UseCompressedOops)) {
-      FLAG_SET_ERGO(bool, UseCompressedOops, false);
-    }
-  }
   // 32-bit oops don't make sense for the 64-bit VM on sparc
   // since the 32-bit VM has the same registers and smaller objects.
   Universe::set_narrow_oop_shift(LogMinObjAlignmentInBytes);
diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/x86/vm/x86_32.ad
--- a/src/cpu/x86/vm/x86_32.ad	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Wed Jun 02 09:49:32 2010 -0700
@@ -1377,6 +1377,12 @@
 // registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = true;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  ShouldNotCallThis();
+  return true;
+}
+
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
diff -r 1eb493f33423 -r 3657cb01ffc5 src/cpu/x86/vm/x86_64.ad
--- a/src/cpu/x86/vm/x86_64.ad	Sat May 29 19:22:32 2010 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Wed Jun 02 09:49:32 2010 -0700
@@ -2037,6 +2037,11 @@
 // into registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = true;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  assert(UseCompressedOops, "only for compressed oops code");
+  return (LogMinObjAlignmentInBytes <= 3);
+}
+
 // Is it better to copy float constants, or load them directly from
 // memory? Intel can load a float constant from a direct address,
 // requiring no extra registers. Most RISCs will have to materialize
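The three Matcher::narrow_oop_use_complex_address() implementations above encode one hardware fact: an x86-64 addressing mode can scale an index register by 1, 2, 4 or 8, i.e. shift it left by at most 3 bits, so the decode of a narrow oop folds into the memory access only when LogMinObjAlignmentInBytes <= 3. Sparc has no such mode and returns false; the 32-bit x86 port never runs with compressed oops, hence ShouldNotCallThis(). Below is a minimal standalone sketch of that constraint, with illustrative names rather than HotSpot code:

    // Standalone sketch (not HotSpot source): why the x86-64 matcher can fold
    // a narrow-oop decode into an addressing mode only when the shift is <= 3.
    #include <cassert>
    #include <cstdint>

    // x86-64 addressing modes support scale factors 1, 2, 4 and 8, i.e. an
    // index shifted left by at most 3 bits: [base + index*2^shift + disp].
    bool shift_folds_into_x86_address(int log_min_obj_alignment_in_bytes) {
      return log_min_obj_alignment_in_bytes <= 3;
    }

    // Decoding a compressed oop is: oop = heap_base + (narrow << shift). When
    // the shift fits a scale factor, a load through the decoded oop can be a
    // single instruction, e.g. mov rax, [r12 + narrow_reg*8 + off], which then
    // doubles as the implicit null check.
    uint64_t decode_narrow_oop(uint64_t heap_base, uint32_t narrow, int shift) {
      return heap_base + (static_cast<uint64_t>(narrow) << shift);
    }

    int main() {
      assert(shift_folds_into_x86_address(3));   // 8-byte object alignment
      assert(!shift_folds_into_x86_address(4));  // 16-byte alignment: no fold
      assert(decode_narrow_oop(0x800000000ULL, 0x10, 3) == 0x800000080ULL);
      return 0;
    }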
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/compile.cpp
--- a/src/share/vm/opto/compile.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/compile.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -2176,14 +2176,14 @@
 
 #ifdef _LP64
   case Op_CastPP:
-    if (n->in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks()) {
+    if (n->in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks()) {
       Compile* C = Compile::current();
       Node* in1 = n->in(1);
       const Type* t = n->bottom_type();
       Node* new_in1 = in1->clone();
       new_in1->as_DecodeN()->set_type(t);
 
-      if (!Matcher::clone_shift_expressions) {
+      if (!Matcher::narrow_oop_use_complex_address()) {
         //
         // x86, ARM and friends can handle 2 adds in addressing mode
         // and Matcher can fold a DecodeN node into address by using
@@ -2231,8 +2231,12 @@
         new_in2 = in2->in(1);
       } else if (in2->Opcode() == Op_ConP) {
         const Type* t = in2->bottom_type();
-        if (t == TypePtr::NULL_PTR && Universe::narrow_oop_use_implicit_null_checks()) {
-          new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
+        if (t == TypePtr::NULL_PTR) {
+          // Don't convert the CmpP null check into CmpN if the compressed
+          // oops implicit null check is not generated; this allows a
+          // normal oop implicit null check to be generated instead.
+          if (Matcher::gen_narrow_oop_implicit_null_checks())
+            new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
           //
           // This transformation together with CastPP transformation above
           // will generate code for implicit NULL checks for compressed oops.
@@ -2289,9 +2293,9 @@
 
   case Op_DecodeN:
     assert(!n->in(1)->is_EncodeP(), "should be optimized out");
-    // DecodeN could be pinned on Sparc where it can't be fold into
+    // DecodeN could be pinned when it can't be folded into
     // an address expression, see the code for Op_CastPP above.
-    assert(n->in(0) == NULL || !Matcher::clone_shift_expressions, "no control except on sparc");
+    assert(n->in(0) == NULL || !Matcher::narrow_oop_use_complex_address(), "no control");
     break;
 
   case Op_EncodeP: {
@@ -2496,6 +2500,10 @@
     }
   }
 
+  // Skip the next transformation if compressed oops are not used.
+  if (!UseCompressedOops || !Matcher::gen_narrow_oop_implicit_null_checks())
+    return;
+
   // Go over safepoint nodes to skip DecodeN nodes for debug edges.
   // It could be done for uncommon traps or any safepoints/calls
   // if the DecodeN node is referenced only in debug info.
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/connode.cpp
--- a/src/share/vm/opto/connode.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/connode.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -437,7 +437,7 @@
 // If not converting int->oop, throw away cast after constant propagation
 Node *CastPPNode::Ideal_DU_postCCP( PhaseCCP *ccp ) {
   const Type *t = ccp->type(in(1));
-  if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks())) {
+  if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks())) {
     return NULL; // do not transform raw pointers or narrow oops
   }
   return ConstraintCastNode::Ideal_DU_postCCP(ccp);
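The compile.cpp change gates two transformations on the new Matcher::gen_narrow_oop_implicit_null_checks() predicate: cloning the DecodeN under a CastPP, and rewriting CmpP(DecodeN(x), NULL) to CmpN(x, NULL). The rewrite itself is sound because a narrow oop of zero denotes the NULL oop regardless of heap base and shift, so the null test can be done on the 32-bit value before decoding. A standalone sketch of that equivalence follows; kHeapBase and kShift are assumed illustrative values, not HotSpot code:

    // Standalone sketch (not HotSpot source): CmpP(DecodeN(x), NULL) and
    // CmpN(x, NULL) always agree, because narrow oop 0 encodes the NULL oop.
    #include <cassert>
    #include <cstdint>

    const uint64_t kHeapBase = 0x800000000ULL; // assumed base, illustration only
    const int      kShift    = 3;              // assumed shift (8-byte alignment)

    // Full decode: narrow 0 maps to NULL; otherwise base + (narrow << shift).
    uint64_t decode(uint32_t narrow) {
      return narrow == 0 ? 0
                         : kHeapBase + (static_cast<uint64_t>(narrow) << kShift);
    }

    int main() {
      const uint32_t samples[] = { 0u, 1u, 0x7fffffffu };
      for (uint32_t narrow : samples) {
        // The two null tests always agree, so the compiler may compare the
        // narrow value directly and keep the DecodeN on the non-null path.
        assert((decode(narrow) == 0) == (narrow == 0));
      }
      return 0;
    }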
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/lcm.cpp
--- a/src/share/vm/opto/lcm.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/lcm.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -32,7 +32,8 @@
 // with suitable memory ops nearby. Use the memory op to do the NULL check.
 // I can generate a memory op if there is not one nearby.
 // The proj is the control projection for the not-null case.
-// The val is the pointer being checked for nullness.
+// The val is the pointer being checked for nullness, or the
+// decodeHeapOop_not_null node if it did not fold into the address.
 void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowed_reasons) {
   // Assume if null check need for 0 offset then always needed
   // Intel solaris doesn't support any null checks yet and no
@@ -96,6 +97,13 @@
     }
   }
 
+  // Check for a decodeHeapOop_not_null node which did not fold into the address
+  bool is_decoden = ((intptr_t)val) & 1;
+  val = (Node*)(((intptr_t)val) & ~1);
+
+  assert(!is_decoden || ((val->in(0) == NULL) && val->is_Mach() &&
+         (val->as_Mach()->ideal_Opcode() == Op_DecodeN)), "sanity");
+
   // Search the successor block for a load or store whose base value is also
   // the tested value. There may be several.
   Node_List *out = new Node_List(Thread::current()->resource_area());
@@ -148,7 +156,8 @@
     if( !mach->needs_anti_dependence_check() )
       continue;               // Not a memory op; skip it
     {
-      // Check that value is used in memory address.
+      // Check that the value is used in a memory address in
+      // instructions with an embedded load (CmpP val1,(val2+off)).
       Node* base;
       Node* index;
       const MachOper* oper = mach->memory_inputs(base, index);
@@ -213,7 +222,11 @@
   uint vidx = 0;              // Capture index of value into memop
   uint j;
   for( j = mach->req()-1; j > 0; j-- ) {
-    if( mach->in(j) == val ) vidx = j;
+    if( mach->in(j) == val ) {
+      vidx = j;
+      // Ignore the DecodeN val, which could be hoisted to where it is needed.
+      if( is_decoden ) continue;
+    }
     // Block of memory-op input
     Block *inb = cfg->_bbs[mach->in(j)->_idx];
     Block *b = this;          // Start from null check
@@ -270,6 +283,26 @@
   extern int implicit_null_checks;
   implicit_null_checks++;
 
+  if( is_decoden ) {
+    // Check if we need to hoist the decodeHeapOop_not_null node first.
+    Block *valb = cfg->_bbs[val->_idx];
+    if( this != valb && this->_dom_depth < valb->_dom_depth ) {
+      // Hoist it up to the end of the test block.
+      valb->find_remove(val);
+      this->add_inst(val);
+      cfg->_bbs.map(val->_idx,this);
+      // DecodeN on x86 may kill flags. Check for flag-killing projections
+      // that also need to be hoisted.
+      for (DUIterator_Fast jmax, j = val->fast_outs(jmax); j < jmax; j++) {
+        Node* n = val->fast_out(j);
+        if( n->Opcode() == Op_MachProj ) {
+          cfg->_bbs[n->_idx]->find_remove(n);
+          this->add_inst(n);
+          cfg->_bbs.map(n->_idx,this);
+        }
+      }
+    }
+  }
+
   // Hoist the memory candidate up to the end of the test block.
   Block *old_block = cfg->_bbs[best->_idx];
   old_block->find_remove(best);
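The is_decoden flag decoded above is smuggled through the val pointer itself: Node objects are word-aligned, so bit 0 of a valid pointer is always clear and can carry the "val is an unfolded DecodeN" mark that Matcher::collect_null_checks() sets in the matcher.cpp hunks below. A standalone sketch of this low-bit tagging scheme, with a stand-in Node struct rather than the HotSpot class:

    // Standalone sketch (not HotSpot source) of the low-bit pointer tagging
    // used here: Node pointers are word-aligned, so bit 0 is free to carry
    // the "unfolded DecodeN" flag through the _null_check_tests worklist
    // without widening its entries.
    #include <cassert>
    #include <cstdint>

    struct Node { int dummy; };  // stand-in for the HotSpot Node class

    Node* tag(Node* n)       { return (Node*)(((intptr_t)n) | 1); }
    bool  is_tagged(Node* n) { return (((intptr_t)n) & 1) != 0; }
    Node* untag(Node* n)     { return (Node*)(((intptr_t)n) & ~1); }

    int main() {
      Node n;
      Node* marked = tag(&n);       // mark: the CmpN uses an unfolded DecodeN
      assert(is_tagged(marked));
      assert(untag(marked) == &n);  // recover the real pointer before any use
      assert(!is_tagged(&n));       // ordinary worklist entries stay untagged
      return 0;
    }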
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/matcher.cpp
--- a/src/share/vm/opto/matcher.cpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/matcher.cpp	Wed Jun 02 09:49:32 2010 -0700
@@ -1334,7 +1334,7 @@
     if( j == max_scan )       // No post-domination before scan end?
       return true;            // Then break the match tree up
   }
-  if (m->is_DecodeN() && Matcher::clone_shift_expressions) {
+  if (m->is_DecodeN() && Matcher::narrow_oop_use_complex_address()) {
     // These are commonly used in address expressions and can
     // efficiently fold into them on X64 in some cases.
     return false;
@@ -2110,8 +2110,8 @@
     _null_check_tests.push(proj);
     Node* val = cmp->in(1);
 #ifdef _LP64
-    if (UseCompressedOops && !Matcher::clone_shift_expressions &&
-        val->bottom_type()->isa_narrowoop()) {
+    if (val->bottom_type()->isa_narrowoop() &&
+        !Matcher::narrow_oop_use_complex_address()) {
       //
       // Look for a DecodeN node which should be pinned to orig_proj.
       // On platforms (Sparc) which cannot handle 2 adds
@@ -2127,6 +2127,9 @@
         if (d->is_DecodeN() && d->in(1) == val) {
           val = d;
           val->set_req(0, NULL); // Unpin now.
+          // Mark this as a special case to distinguish it from
+          // the regular case: CmpP(DecodeN, NULL).
+          val = (Node*)(((intptr_t)val) | 1);
           break;
         }
       }
@@ -2146,9 +2149,21 @@
   for( uint i=0; i < cnt; i+=2 ) {
     Node *test = _null_check_tests[i];
     Node *val = _null_check_tests[i+1];
+    bool is_decoden = ((intptr_t)val) & 1;
+    val = (Node*)(((intptr_t)val) & ~1);
     if (has_new_node(val)) {
+      Node* new_val = new_node(val);
+      if (is_decoden) {
+        assert(val->is_DecodeN() && val->in(0) == NULL, "sanity");
+        // Note: new_val may have a control edge if
+        // the original ideal node DecodeN was matched before
+        // it was unpinned in Matcher::collect_null_checks().
+        // Unpin the mach node and mark it.
+        new_val->set_req(0, NULL);
+        new_val = (Node*)(((intptr_t)new_val) | 1);
+      }
       // Is a match-tree root, so replace with the matched value
-      _null_check_tests.map(i+1, new_node(val));
+      _null_check_tests.map(i+1, new_val);
     } else {
       // Yank from candidate list
       _null_check_tests.map(i+1,_null_check_tests[--cnt]);
diff -r 1eb493f33423 -r 3657cb01ffc5 src/share/vm/opto/matcher.hpp
--- a/src/share/vm/opto/matcher.hpp	Sat May 29 19:22:32 2010 -0700
+++ b/src/share/vm/opto/matcher.hpp	Wed Jun 02 09:49:32 2010 -0700
@@ -352,6 +352,38 @@
   // registers? True for Intel but false for most RISCs
   static const bool clone_shift_expressions;
 
+  static bool narrow_oop_use_complex_address();
+
+  // Generate an implicit null check for narrow oops if it can fold
+  // into the address expression (x64):
+  //
+  //    [R12 + narrow_oop_reg<<3 + offset] // fold into address expression
+  //    NullCheck narrow_oop_reg
+  //
+  // When narrow oops can't fold into the address expression (Sparc) and
+  // the base is not null, use decode_not_null and a normal implicit null
+  // check. Note: the decode_not_null node can be used here since it is
+  // referenced only on the non-null path, but it requires special
+  // handling; see collect_null_checks():
+  //
+  //    decode_not_null narrow_oop_reg, oop_reg // 'shift' and 'add base'
+  //    [oop_reg + offset]
+  //    NullCheck oop_reg
+  //
+  // With a zero base, when narrow oops can't fold into the address
+  // expression, use a normal implicit null check, since only a shift
+  // is needed to decode the narrow oop:
+  //
+  //    decode narrow_oop_reg, oop_reg // only 'shift'
+  //    [oop_reg + offset]
+  //    NullCheck oop_reg
+  //
+  inline static bool gen_narrow_oop_implicit_null_checks() {
+    return Universe::narrow_oop_use_implicit_null_checks() &&
+           (narrow_oop_use_complex_address() ||
+            Universe::narrow_oop_base() != NULL);
+  }
+
   // Is it better to copy float constants, or load them directly from memory?
   // Intel can load a float constant from a direct address, requiring no
   // extra registers. Most RISCs will have to materialize an address into a
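The matcher.hpp comment above enumerates three code shapes; gen_narrow_oop_implicit_null_checks() collapses them into one boolean decision. The sketch below, with illustrative parameter names standing in for the Universe and Matcher queries (not HotSpot code), tabulates which configurations keep the narrow-oop form of the implicit null check:

    // Standalone sketch (not HotSpot source): the decision encoded by
    // gen_narrow_oop_implicit_null_checks(). Parameter names are illustrative
    // stand-ins for the Universe/Matcher queries named in the comments.
    #include <cstdio>

    bool gen_narrow_oop_implicit_null_checks(
        bool use_implicit_null_checks,  // ~ Universe::narrow_oop_use_implicit_null_checks()
        bool complex_address,           // ~ Matcher::narrow_oop_use_complex_address()
        bool nonzero_heap_base) {       // ~ Universe::narrow_oop_base() != NULL
      return use_implicit_null_checks && (complex_address || nonzero_heap_base);
    }

    int main() {
      // x64: the decode folds into [R12 + narrow<<3 + off], so the narrow
      // oop itself is null-checked.
      printf("x64 fold:       %d\n", gen_narrow_oop_implicit_null_checks(true, true, true));
      // Sparc with a non-zero base: hoist decode_not_null and null-check the
      // decoded oop; the CmpP -> CmpN conversion is still worthwhile.
      printf("sparc, base!=0: %d\n", gen_narrow_oop_implicit_null_checks(true, false, true));
      // Zero base and no fold: a plain decode (shift only) plus a normal oop
      // implicit null check is better, so the narrow-oop form is rejected.
      printf("zero base:      %d\n", gen_narrow_oop_implicit_null_checks(true, false, false));
      return 0;
    }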