comparison src/share/vm/opto/loopTransform.cpp @ 1763:d6f45b55c972

4809552: Optimize Arrays.fill(...) Reviewed-by: kvn
author never
date Fri, 27 Aug 2010 17:33:49 -0700
parents 6027dddc26c6
children 5e4f03302987
comparison
equal deleted inserted replaced
1731:ee5cc9e78493 1763:d6f45b55c972
2047 } 2047 }
2048 const CmpNode *cmp = bol->in(1)->as_Cmp(); 2048 const CmpNode *cmp = bol->in(1)->as_Cmp();
2049 if (cmp->Opcode() != Op_CmpU ) { 2049 if (cmp->Opcode() != Op_CmpU ) {
2050 return false; 2050 return false;
2051 } 2051 }
2052 if (cmp->in(2)->Opcode() != Op_LoadRange) { 2052 Node* range = cmp->in(2);
2053 return false; 2053 if (range->Opcode() != Op_LoadRange) {
2054 } 2054 const TypeInt* tint = phase->_igvn.type(range)->isa_int();
2055 LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2); 2055 if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) {
2056 if (!invar.is_invariant(lr)) { // loadRange must be invariant 2056 // Allow predication on positive values that aren't LoadRanges.
2057 // This allows optimization of loops where the length of the
2058 // array is a known value and doesn't need to be loaded back
2059 // from the array.
2060 return false;
2061 }
2062 }
2063 if (!invar.is_invariant(range)) {
2057 return false; 2064 return false;
2058 } 2065 }
2059 Node *iv = _head->as_CountedLoop()->phi(); 2066 Node *iv = _head->as_CountedLoop()->phi();
2060 int scale = 0; 2067 int scale = 0;
2061 Node *offset = NULL; 2068 Node *offset = NULL;
2246 2253
2247 // Range check for counted loops 2254 // Range check for counted loops
2248 const Node* cmp = bol->in(1)->as_Cmp(); 2255 const Node* cmp = bol->in(1)->as_Cmp();
2249 Node* idx = cmp->in(1); 2256 Node* idx = cmp->in(1);
2250 assert(!invar.is_invariant(idx), "index is variant"); 2257 assert(!invar.is_invariant(idx), "index is variant");
2251 assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be"); 2258 assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be");
2252 Node* ld_rng = cmp->in(2); // LoadRangeNode 2259 Node* rng = cmp->in(2);
2253 assert(invar.is_invariant(ld_rng), "load range must be invariant"); 2260 assert(invar.is_invariant(rng), "range must be invariant");
2254 int scale = 1; 2261 int scale = 1;
2255 Node* offset = zero; 2262 Node* offset = zero;
2256 bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); 2263 bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset);
2257 assert(ok, "must be index expression"); 2264 assert(ok, "must be index expression");
2258 2265
2269 assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate"); 2276 assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate");
2270 Node *ctrl = lower_bound_proj->in(0)->as_If()->in(0); 2277 Node *ctrl = lower_bound_proj->in(0)->as_If()->in(0);
2271 2278
2272 // Perform cloning to keep Invariance state correct since the 2279 // Perform cloning to keep Invariance state correct since the
2273 // late schedule will place invariant things in the loop. 2280 // late schedule will place invariant things in the loop.
2274 ld_rng = invar.clone(ld_rng, ctrl); 2281 rng = invar.clone(rng, ctrl);
2275 if (offset && offset != zero) { 2282 if (offset && offset != zero) {
2276 assert(invar.is_invariant(offset), "offset must be loop invariant"); 2283 assert(invar.is_invariant(offset), "offset must be loop invariant");
2277 offset = invar.clone(offset, ctrl); 2284 offset = invar.clone(offset, ctrl);
2278 } 2285 }
2279 2286
2280 // Test the lower bound 2287 // Test the lower bound
2281 Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false); 2288 Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false);
2282 IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); 2289 IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
2283 _igvn.hash_delete(lower_bound_iff); 2290 _igvn.hash_delete(lower_bound_iff);
2284 lower_bound_iff->set_req(1, lower_bound_bol); 2291 lower_bound_iff->set_req(1, lower_bound_bol);
2285 if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx); 2292 if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);
2286 2293
2287 // Test the upper bound 2294 // Test the upper bound
2288 Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true); 2295 Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true);
2289 IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); 2296 IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
2290 _igvn.hash_delete(upper_bound_iff); 2297 _igvn.hash_delete(upper_bound_iff);
2291 upper_bound_iff->set_req(1, upper_bound_bol); 2298 upper_bound_iff->set_req(1, upper_bound_bol);
2292 if (TraceLoopPredicate) tty->print_cr("upper bound check if: %d", lower_bound_iff->_idx); 2299 if (TraceLoopPredicate) tty->print_cr("upper bound check if: %d", lower_bound_iff->_idx);
2293 2300
2364 hoisted |= _next->loop_predication( phase); 2371 hoisted |= _next->loop_predication( phase);
2365 } 2372 }
2366 2373
2367 return hoisted; 2374 return hoisted;
2368 } 2375 }
2376
2377
2378 // Process all the loops in the loop tree and replace any fill
2379 // patterns with an intrisc version.
2380 bool PhaseIdealLoop::do_intrinsify_fill() {
2381 bool changed = false;
2382 for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
2383 IdealLoopTree* lpt = iter.current();
2384 changed |= intrinsify_fill(lpt);
2385 }
2386 return changed;
2387 }
2388
2389
// Examine an inner loop looking for a single store of an invariant
// value in a unit stride loop.  On success the out-parameters describe
// the pieces of the fill pattern:
//   store       - the single store node found in the loop body
//   store_value - the loop-invariant value being stored
//   shift       - the LShiftX scaling the index, or NULL (byte-sized elements)
//   con         - the constant portion of the address expression, or NULL
// Returns true only if the entire loop body matches the fill pattern.
// NOTE(review): 'store' is tested against NULL before being assigned but is
// not reset here — the caller is expected to pass it in initialized to NULL.
bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
                                     Node*& shift, Node*& con) {
  const char* msg = NULL;   // non-NULL => reason this loop was rejected
  Node* msg_node = NULL;    // node dumped alongside the rejection message

  store_value = NULL;
  con = NULL;
  shift = NULL;

  // Process the loop looking for stores.  If there are multiple
  // stores or extra control flow, give up at this point.
  CountedLoopNode* head = lpt->_head->as_CountedLoop();
  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    if (n->outcnt() == 0) continue; // Ignore dead
    if (n->is_Store()) {
      if (store != NULL) {
        msg = "multiple stores";
        break;
      }
      int opc = n->Opcode();
      // Oop stores carry GC bookkeeping (card marks, barriers) that a raw
      // fill stub would not perform, so they are excluded.
      if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) {
        msg = "oop fills not handled";
        break;
      }
      Node* value = n->in(MemNode::ValueIn);
      if (!lpt->is_invariant(value)) {
        msg = "variant store value";
      }
      store = n;
      store_value = value;
    } else if (n->is_If() && n != head->loopexit()) {
      // Any If other than the counted-loop exit test means extra control
      // flow inside the body, which the fill pattern cannot express.
      msg = "extra control flow";
      msg_node = n;
    }
  }

  if (store == NULL) {
    // No store in loop
    return false;
  }

  if (msg == NULL && head->stride_con() != 1) {
    // could handle negative strides too
    if (head->stride_con() < 0) {
      msg = "negative stride";
    } else {
      msg = "non-unit stride";
    }
  }

  // The address must be a plain AddP chain so it can be decomposed below.
  if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) {
    msg = "can't handle store address";
    msg_node = store->in(MemNode::Address);
  }

  // Make sure there is an appropriate fill routine
  BasicType t = store->as_Mem()->memory_type();
  const char* fill_name;
  if (msg == NULL &&
      StubRoutines::select_fill_function(t, false, fill_name) == NULL) {
    msg = "unsupported store";
    msg_node = store;
  }

  if (msg != NULL) {
#ifndef PRODUCT
    if (TraceOptimizeFill) {
      tty->print_cr("not fill intrinsic candidate: %s", msg);
      if (msg_node != NULL) msg_node->dump();
    }
#endif
    return false;
  }

  // Make sure the address expression can be handled.  It should be
  // head->phi * elsize + con.  head->phi might have a ConvI2L.
  Node* elements[4];
  Node* conv = NULL;
  int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
  for (int e = 0; e < count; e++) {
    Node* n = elements[e];
    if (n->is_Con() && con == NULL) {
      con = n;
    } else if (n->Opcode() == Op_LShiftX && shift == NULL) {
      // The scaled index: (phi or ConvI2L(phi)) << log2(element size).
      Node* value = n->in(1);
#ifdef _LP64
      if (value->Opcode() == Op_ConvI2L) {
        conv = value;
        value = value->in(1);
      }
#endif
      if (value != head->phi()) {
        msg = "unhandled shift in address";
      } else {
        shift = n;
        // The shift amount must match the element size or the address
        // would not step one element per iteration.
        assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match");
      }
    } else if (n->Opcode() == Op_ConvI2L && conv == NULL) {
      // Unshifted ConvI2L of the phi (byte-sized elements on 64-bit).
      if (n->in(1) == head->phi()) {
        conv = n;
      } else {
        msg = "unhandled input to ConvI2L";
      }
    } else if (n == head->phi()) {
      // no shift, check below for allowed cases
    } else {
      msg = "unhandled node in address";
      msg_node = n;
    }
  }

  // unpack_offsets returns -1 when the AddP chain doesn't decompose.
  if (count == -1) {
    msg = "malformed address expression";
    msg_node = store;
  }

  // byte sized items won't have a shift
  if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) {
    msg = "can't find shift";
    msg_node = store;
  }

  if (msg != NULL) {
#ifndef PRODUCT
    if (TraceOptimizeFill) {
      tty->print_cr("not fill intrinsic: %s", msg);
      if (msg_node != NULL) msg_node->dump();
    }
#endif
    return false;
  }

  // Now make sure all the other nodes in the loop can be handled.
  // Build a whitelist of the nodes that are expected/allowed, then
  // reject the loop if anything else shows up in the body.
  VectorSet ok(Thread::current()->resource_area());

  // store related values are ok
  ok.set(store->_idx);
  ok.set(store->in(MemNode::Memory)->_idx);

  // Loop structure is ok
  ok.set(head->_idx);
  ok.set(head->loopexit()->_idx);
  ok.set(head->phi()->_idx);
  ok.set(head->incr()->_idx);
  ok.set(head->loopexit()->cmp_node()->_idx);
  ok.set(head->loopexit()->in(1)->_idx);

  // Address elements are ok
  if (con) ok.set(con->_idx);
  if (shift) ok.set(shift->_idx);
  if (conv) ok.set(conv->_idx);

  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    if (n->outcnt() == 0) continue; // Ignore dead
    if (ok.test(n->_idx)) continue;
    // Backedge projection is ok
    if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue;
    if (!n->is_AddP()) {
      msg = "unhandled node";
      msg_node = n;
      break;
    }
  }

  // Make sure no unexpected values are used outside the loop
  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    // These values can be replaced with other nodes if they are used
    // outside the loop.
    if (n == store || n == head->loopexit() || n == head->incr()) continue;
    for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) {
      Node* use = iter.get();
      if (!lpt->_body.contains(use)) {
        msg = "node is used outside loop";
        // lpt->_body.dump();
        msg_node = n;
        break;
      }
    }
  }

#ifdef ASSERT
  if (TraceOptimizeFill) {
    if (msg != NULL) {
      tty->print_cr("no fill intrinsic: %s", msg);
      if (msg_node != NULL) msg_node->dump();
    } else {
      tty->print_cr("fill intrinsic for:");
    }
    store->dump();
    if (Verbose) {
      lpt->_body.dump();
    }
  }
#endif

  return msg == NULL;
}
2592
2593
2594
// Replace a loop that matches the fill pattern (a unit-stride counted
// loop storing a single invariant value into an array) with a call to
// an out-of-line fill stub covering the same region.  Returns true if
// the loop was replaced.
bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) {
  // Only for counted inner loops
  if (!lpt->is_counted() || !lpt->is_inner()) {
    return false;
  }

  // Must have constant stride
  CountedLoopNode* head = lpt->_head->as_CountedLoop();
  if (!head->stride_is_con() || !head->is_normal_loop()) {
    return false;
  }

  // Check that the body only contains a store of a loop invariant
  // value that is indexed by the loop phi.
  Node* store = NULL;
  Node* store_value = NULL;
  Node* shift = NULL;
  Node* offset = NULL;   // the constant 'con' from match_fill_loop
  if (!match_fill_loop(lpt, store, store_value, shift, offset)) {
    return false;
  }

  // Now replace the whole loop body by a call to a fill routine that
  // covers the same region as the loop.
  Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base);

  // Build an expression for the beginning of the copy region:
  // from = base + (init_trip << shift) + offset
  Node* index = head->init_trip();
#ifdef _LP64
  index = new (C, 2) ConvI2LNode(index);
  _igvn.register_new_node_with_optimizer(index);
#endif
  if (shift != NULL) {
    // byte arrays don't require a shift but others do.
    index = new (C, 3) LShiftXNode(index, shift->in(2));
    _igvn.register_new_node_with_optimizer(index);
  }
  index = new (C, 4) AddPNode(base, base, index);
  _igvn.register_new_node_with_optimizer(index);
  Node* from = new (C, 4) AddPNode(base, index, offset);
  _igvn.register_new_node_with_optimizer(from);
  // Compute the number of elements to copy (not bytes; the stub scales).
  Node* len = new (C, 3) SubINode(head->limit(), head->init_trip());
  _igvn.register_new_node_with_optimizer(len);

  BasicType t = store->as_Mem()->memory_type();
  bool aligned = false;
  if (offset != NULL && head->init_trip()->is_Con()) {
    // With a constant start index and constant offset we can prove
    // heap-word alignment and select the faster aligned stub variant.
    int element_size = type2aelembytes(t);
    aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0;
  }

  // Build a call to the fill routine
  const char* fill_name;
  address fill = StubRoutines::select_fill_function(t, aligned, fill_name);
  // match_fill_loop already verified a stub exists for this type.
  assert(fill != NULL, "what?");

  // Convert float/double to int/long for fill routines, which take the
  // raw bit pattern rather than a floating-point value.
  if (t == T_FLOAT) {
    store_value = new (C, 2) MoveF2INode(store_value);
    _igvn.register_new_node_with_optimizer(store_value);
  } else if (t == T_DOUBLE) {
    store_value = new (C, 2) MoveD2LNode(store_value);
    _igvn.register_new_node_with_optimizer(store_value);
  }

  Node* mem_phi = store->in(MemNode::Memory);
  Node* result_ctrl;
  Node* result_mem;
  const TypeFunc* call_type = OptoRuntime::array_fill_Type();
  int size = call_type->domain()->cnt();
  CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill,
                                                      fill_name, TypeAryPtr::get_array_body_type(t));
  call->init_req(TypeFunc::Parms+0, from);
  call->init_req(TypeFunc::Parms+1, store_value);
  call->init_req(TypeFunc::Parms+2, len);
  // Wire the call in place of the loop: control from the loop entry,
  // memory from the state entering the loop (bypassing the loop's phi).
  call->init_req( TypeFunc::Control, head->init_control());
  call->init_req( TypeFunc::I_O , C->top() ) ; // does no i/o
  call->init_req( TypeFunc::Memory , mem_phi->in(LoopNode::EntryControl) );
  call->init_req( TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr) );
  call->init_req( TypeFunc::FramePtr, C->start()->proj_out(TypeFunc::FramePtr) );
  _igvn.register_new_node_with_optimizer(call);
  result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control);
  _igvn.register_new_node_with_optimizer(result_ctrl);
  result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory);
  _igvn.register_new_node_with_optimizer(result_mem);

  // If this fill is tightly coupled to an allocation and overwrites
  // the whole body, allow it to take over the zeroing.
  AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this);
  if (alloc != NULL && alloc->is_AllocateArray()) {
    Node* length = alloc->as_AllocateArray()->Ideal_length();
    if (head->limit() == length &&
        head->init_trip() == _igvn.intcon(0)) {
      if (TraceOptimizeFill) {
        tty->print_cr("Eliminated zeroing in allocation");
      }
      alloc->maybe_set_complete(&_igvn);
    } else {
#ifdef ASSERT
      if (TraceOptimizeFill) {
        tty->print_cr("filling array but bounds don't match");
        alloc->dump();
        head->init_trip()->dump();
        head->limit()->dump();
        length->dump();
      }
#endif
    }
  }

  // Redirect the old control and memory edges that are outside the loop.
  Node* exit = head->loopexit()->proj_out(0);
  _igvn.replace_node(exit, result_ctrl);
  // The store's memory state is now produced by the call.
  _igvn.replace_node(store, result_mem);
  // Any uses the increment outside of the loop become the loop limit.
  _igvn.replace_node(head->incr(), head->limit());

  // Disconnect the head from the loop.
  for (uint i = 0; i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    _igvn.replace_node(n, C->top());
  }

  return true;
}