diff src/cpu/ppc/vm/stubGenerator_ppc.cpp @ 14445:67fa91961822

8029940: PPC64 (part 122): C2 compiler port
Reviewed-by: kvn
author:   goetz
date:     Wed, 11 Dec 2013 00:06:11 +0100
parents:  ec28f9c041ff
children: b858620b0081
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Tue Dec 10 14:29:43 2013 +0100
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Wed Dec 11 00:06:11 2013 +0100
@@ -146,14 +146,14 @@
       // FIXME: use round_to() here
       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
       __ sldi(r_frame_alignment_in_bytes,
-                  r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
+              r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
 
       // size = unaligned size of arguments + top abi's size
       __ addi(r_frame_size, r_argument_size_in_bytes,
               frame::top_ijava_frame_abi_size);
       // size += arguments alignment
       __ add(r_frame_size,
-                 r_frame_size, r_frame_alignment_in_bytes);
+             r_frame_size, r_frame_alignment_in_bytes);
       // size += size of call_stub locals
       __ addi(r_frame_size,
               r_frame_size, frame::entry_frame_locals_size);
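
The hunk above only re-aligns continuation lines, but the computation it touches is worth spelling out: the stub pads the argument area by one stack element when the argument count is odd (the andi_/sldi pair flagged by the FIXME), then adds the top ABI area and the call_stub locals. A minimal C++ sketch of that arithmetic, assuming the 64-bit stack element size of 8 bytes and with parameter names standing in for the frame:: constants:

    // Sketch only: top_abi_size and locals_size stand in for
    // frame::top_ijava_frame_abi_size / frame::entry_frame_locals_size.
    long call_stub_frame_size(long arg_count, long arg_size_in_bytes,
                              long top_abi_size, long locals_size) {
      // Odd argument count -> pad one 8-byte element (andi_ + sldi,
      // assuming Interpreter::logStackElementSize == 3 on 64-bit).
      long align_pad = (arg_count & 1) << 3;
      return arg_size_in_bytes + top_abi_size + align_pad + locals_size;
    }
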
@@ -179,7 +179,7 @@
       __ addi(r_top_of_arguments_addr,
               R1_SP, frame::top_ijava_frame_abi_size);
       __ add(r_top_of_arguments_addr,
-                 r_top_of_arguments_addr, r_frame_alignment_in_bytes);
+             r_top_of_arguments_addr, r_frame_alignment_in_bytes);
 
       // any arguments to copy?
       __ cmpdi(CCR0, r_arg_argument_count, 0);
@@ -229,22 +229,23 @@
 
       // Register state on entry to frame manager / native entry:
       //
-      //   R17_tos     -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
+      //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
       //   R19_method  -  Method
       //   R16_thread  -  JavaThread*
 
-      // R17_tos must point to last argument - element_size.
-      __ addi(R17_tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
+      // Tos must point to last argument - element_size.
+      const Register tos = R17_tos;
+      __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 
       // initialize call_stub locals (step 2)
-      // now save R17_tos as arguments_tos_address
-      __ std(R17_tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
+      // now save tos as arguments_tos_address
+      __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 
       // load argument registers for call
       __ mr(R19_method, r_arg_method);
       __ mr(R16_thread, r_arg_thread);
-      assert(R17_tos != r_arg_method, "trashed r_arg_method");
-      assert(R17_tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
+      assert(tos != r_arg_method, "trashed r_arg_method");
+      assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 
       // Set R15_prev_state to 0 for simplifying checks in callee.
       __ li(R15_prev_state, 0);
@@ -274,7 +275,7 @@
       // Do a light-weight C-call here, r_new_arg_entry holds the address
       // of the interpreter entry point (frame manager or native entry)
       // and save runtime-value of LR in return_address.
-      assert(r_new_arg_entry != R17_tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
+      assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
              "trashed r_new_arg_entry");
       return_address = __ call_stub(r_new_arg_entry);
     }
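
A note on the rename above: binding the fixed register once (const Register tos = R17_tos) lets the loads and the asserts read uniformly. The asserts encode a move-ordering invariant: every destination written so far (tos, R19_method) must differ from any source register still to be read, or the copies would silently trash an argument. A tiny sketch of that invariant, with Register reduced to an integer id:

    // Sketch only: Register is reduced to an integer id.
    #include <cassert>
    typedef int Register;
    void load_entry_registers(Register tos, Register R19_method,
                              Register r_arg_method, Register r_arg_thread) {
      // tos was written first, so it must not alias a source still needed.
      assert(tos != r_arg_method && "trashed r_arg_method");
      assert(tos != r_arg_thread && R19_method != r_arg_thread);
    }
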
@@ -326,8 +327,8 @@
       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
-      __ cmpwi(CCR5,  r_arg_result_type, T_FLOAT);
-      __ cmpwi(CCR6,  r_arg_result_type, T_DOUBLE);
+      __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
+      __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
 
       // restore non-volatile registers
       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
@@ -345,8 +346,8 @@
 
       __ beq(CCR0, ret_is_object);
       __ beq(CCR1, ret_is_long);
-      __ beq(CCR5,  ret_is_float);
-      __ beq(CCR6,  ret_is_double);
+      __ beq(CCR5, ret_is_float);
+      __ beq(CCR6, ret_is_double);
 
       // default:
       __ stw(R3_RET, 0, r_arg_result_addr);
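
The whitespace fix above sits in a nice idiom: all four type compares are issued early into distinct condition-register fields (CCR0, CCR1, CCR5, CCR6), so after the non-volatile registers are restored the stub can branch on the preserved compare results without redoing them. Semantically it is just a switch on the result type; a hedged C++ sketch, with store widths inferred from the std/stw/stfs/stfd pattern:

    #include <cstdint>
    // Sketch of the dispatch semantics only, not the CR-field trick;
    // BasicTypeLike mirrors the few HotSpot BasicType values used here.
    enum BasicTypeLike { T_OBJECT, T_LONG, T_FLOAT, T_DOUBLE, T_INT };
    void store_call_stub_result(BasicTypeLike type, int64_t int_ret,
                                double fp_ret, void* result_addr) {
      switch (type) {
        case T_OBJECT:                                             // std
        case T_LONG:   *(int64_t*)result_addr = int_ret;           break;
        case T_FLOAT:  *(float*)result_addr  = (float)fp_ret;      break; // stfs
        case T_DOUBLE: *(double*)result_addr = fp_ret;             break; // stfd
        default:       *(int32_t*)result_addr = (int32_t)int_ret;  break; // stw
      }
    }
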
@@ -614,6 +615,17 @@
         if (!dest_uninitialized) {
           const int spill_slots = 4 * wordSize;
           const int frame_size  = frame::abi_112_size + spill_slots;
+          Label filtered;
+
+          // Is marking active?
+          if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+            __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
+          } else {
+            guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
+            __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
+          }
+          __ cmpdi(CCR0, Rtmp1, 0);
+          __ beq(CCR0, filtered);
 
           __ save_LR_CR(R0);
           __ push_frame_abi112(spill_slots, R0);
@@ -628,6 +640,8 @@
           __ ld(count, frame_size - 3 * wordSize, R1_SP);
           __ pop_frame();
           __ restore_LR_CR(R0);
+
+          __ bind(filtered);
         }
         break;
       case BarrierSet::CardTableModRef:
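
The block added here is the standard G1 "is marking active?" filter for the pre-barrier: each JavaThread carries an SATB mark queue whose active flag is either 1 or 4 bytes wide depending on the build, so the stub selects lbz or lwz accordingly and jumps past the whole frame-push/VM-call sequence when the flag is zero. A minimal C++ sketch of the same filter, with illustrative field names rather than the real HotSpot offsets:

    #include <cstdint>
    // Sketch only: layout and names are illustrative, not HotSpot's.
    struct SatbQueueLike { uint32_t active; };  // width varies by build
    struct ThreadLike    { SatbQueueLike satb; };

    inline void g1_pre_barrier(ThreadLike* thread, void (*slow_path)()) {
      if (thread->satb.active == 0) return;  // "filtered": not marking
      slow_path();  // save LR/CR, push frame, call the VM, pop frame
    }
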
@@ -648,21 +662,28 @@
   //
   //  The input registers and R0 are overwritten.
   //
-  void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
+  void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
     BarrierSet* const bs = Universe::heap()->barrier_set();
 
     switch (bs->kind()) {
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
         {
-          __ save_LR_CR(R0);
-          // We need this frame only that the callee can spill LR/CR.
-          __ push_frame_abi112(0, R0);
-
-          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
-
-          __ pop_frame();
-          __ restore_LR_CR(R0);
+          if (branchToEnd) {
+            __ save_LR_CR(R0);
+            // We need this frame only to spill LR.
+            __ push_frame_abi112(0, R0);
+            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
+            __ pop_frame();
+            __ restore_LR_CR(R0);
+          } else {
+            // Tail call: fake call from stub caller by branching without linking.
+            address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
+            __ mr_if_needed(R3_ARG1, addr);
+            __ mr_if_needed(R4_ARG2, count);
+            __ load_const(R11, entry_point, R0);
+            __ call_c_and_return_to_caller(R11);
+          }
         }
         break;
       case BarrierSet::CardTableModRef:
@@ -697,9 +718,12 @@
           __ addi(addr, addr, 1);
           __ bdnz(Lstore_loop);
           __ bind(Lskip_loop);
+
+          if (!branchToEnd) __ blr();
         }
       break;
       case BarrierSet::ModRef:
+        if (!branchToEnd) __ blr();
         break;
       default:
         ShouldNotReachHere();
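
The new branchToEnd parameter lets stubs that end in the post barrier skip building a frame just to keep LR alive: arguments are moved into R3/R4 if needed and the stub branches to the runtime routine without linking (call_c_and_return_to_caller), so the C function's own return goes straight back to the stub's caller. In C++ terms this is an ordinary tail call; a hedged sketch of the two shapes:

    #include <cstddef>
    extern "C" void static_write_ref_array_post(void* addr, size_t count);

    // branchToEnd == true: a frame exists only so the stub's return
    // address survives the call; execution continues afterwards.
    void post_barrier_then_continue(void* addr, size_t count) {
      static_write_ref_array_post(addr, count);
      // ... more stub code, returns later
    }

    // branchToEnd == false: the callee's return IS the stub's return,
    // so no frame and no trailing blr are needed.
    void post_barrier_tail(void* addr, size_t count) {
      return static_write_ref_array_post(addr, count);  // branch, no link
    }
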
@@ -847,30 +871,28 @@
  // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
  // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
   //
-  // Source code in function is_range_check_if() shows OptimizeFill relaxed the condition
+  // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
   // for turning on loop predication optimization, and hence the behavior of "array range check"
   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
   //
-  // We leave the code here and see if Oracle has updates in later releases(later than HS20).
-  //
-  //  Generate stub for disjoint short fill.  If "aligned" is true, the
-  //  "to" address is assumed to be heapword aligned.
+  // Generate stub for disjoint short fill. If "aligned" is true, the
+  // "to" address is assumed to be heapword aligned.
   //
   // Arguments for generated stub:
-  //      to:    R3_ARG1
-  //      value: R4_ARG2
-  //      count: R5_ARG3 treated as signed
+  //   to:    R3_ARG1
+  //   value: R4_ARG2
+  //   count: R5_ARG3 treated as signed
   //
   address generate_fill(BasicType t, bool aligned, const char* name) {
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ emit_fd();
 
-    const Register to        = R3_ARG1;   // source array address
-    const Register value     = R4_ARG2;   // fill value
-    const Register count     = R5_ARG3;   // elements count
-    const Register temp      = R6_ARG4;   // temp register
+    const Register to    = R3_ARG1;   // source array address
+    const Register value = R4_ARG2;   // fill value
+    const Register count = R5_ARG3;   // elements count
+    const Register temp  = R6_ARG4;   // temp register
 
-    //assert_clean_int(count, O3);     // Make sure 'count' is clean int.
+    //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 
     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
@@ -879,31 +901,31 @@
     switch (t) {
        case T_BYTE:
         shift = 2;
-        // clone bytes (zero extend not needed because store instructions below ignore high order bytes)
+        // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
-        __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
+        __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
         __ blt(CCR0, L_fill_elements);
         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
         break;
        case T_SHORT:
         shift = 1;
-        // clone bytes (zero extend not needed because store instructions below ignore high order bytes)
+        // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
-        __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
+        __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
         __ blt(CCR0, L_fill_elements);
         break;
       case T_INT:
         shift = 0;
-        __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element
+        __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
         __ blt(CCR0, L_fill_4_bytes);
         break;
       default: ShouldNotReachHere();
     }
 
     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
-      // align source address at 4 bytes address boundary
+      // Align source address at 4 bytes address boundary.
       if (t == T_BYTE) {
-        // One byte misalignment happens only for byte arrays
+        // One byte misalignment happens only for byte arrays.
         __ andi_(temp, to, 1);
         __ beq(CCR0, L_skip_align1);
         __ stb(value, 0, to);
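
The rldimi sequence in the switch above (completed by the 32-to-64-bit step a few hunks down) replicates the fill value by doubling its width: 8 -> 16 -> 32 -> 64 bits. A C++ sketch of the same cloning, assuming the incoming value is already zero-extended to its element width (the stub can skip that because stb/sth/stw ignore the high-order bytes anyway):

    #include <cstdint>
    uint64_t clone_fill_value(uint64_t v, int element_bytes) {
      if (element_bytes == 1) v |= v << 8;   // 8 bit  -> 16 bit
      if (element_bytes <= 2) v |= v << 16;  // 16 bit -> 32 bit
      v |= v << 32;                          // 32 bit -> 64 bit
      return v;
    }
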
@@ -930,12 +952,12 @@
       __ bind(L_fill_32_bytes);
     }
 
-    __ li(temp, 8<<shift);              // prepare for 32 byte loop
-    // clone bytes int->long as above
-    __ rldimi(value, value, 32, 0);     // 32 bit -> 64 bit
+    __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
+    // Clone bytes int->long as above.
+    __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 
     Label L_check_fill_8_bytes;
-    // Fill 32-byte chunks
+    // Fill 32-byte chunks.
     __ subf_(count, temp, count);
     __ blt(CCR0, L_check_fill_8_bytes);
 
@@ -945,7 +967,7 @@
 
     __ std(value, 0, to);
     __ std(value, 8, to);
-    __ subf_(count, temp, count); // update count
+    __ subf_(count, temp, count);           // Update count.
     __ std(value, 16, to);
     __ std(value, 24, to);
 
@@ -968,7 +990,7 @@
     __ addi(to, to, 8);
     __ bge(CCR0, L_fill_8_bytes_loop);
 
-    // fill trailing 4 bytes
+    // Fill trailing 4 bytes.
     __ bind(L_fill_4_bytes);
     __ andi_(temp, count, 1<<shift);
     __ beq(CCR0, L_fill_2_bytes);
@@ -976,14 +998,14 @@
     __ stw(value, 0, to);
     if (t == T_BYTE || t == T_SHORT) {
       __ addi(to, to, 4);
-      // fill trailing 2 bytes
+      // Fill trailing 2 bytes.
       __ bind(L_fill_2_bytes);
       __ andi_(temp, count, 1<<(shift-1));
       __ beq(CCR0, L_fill_byte);
       __ sth(value, 0, to);
       if (t == T_BYTE) {
         __ addi(to, to, 2);
-        // fill trailing byte
+        // Fill trailing byte.
         __ bind(L_fill_byte);
         __ andi_(count, count, 1);
         __ beq(CCR0, L_exit);
@@ -997,7 +1019,7 @@
     __ bind(L_exit);
     __ blr();
 
-    // Handle copies less than 8 bytes.  Int is handled elsewhere.
+    // Handle copies less than 8 bytes. Int is handled elsewhere.
     if (t == T_BYTE) {
       __ bind(L_fill_elements);
       Label L_fill_2, L_fill_4;
@@ -1039,7 +1061,7 @@
   }
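
Taken together, the fill stub widens the value to 64 bits, aligns the destination if necessary, blasts 32-byte chunks with four std instructions per iteration, and then mops up 8/4/2/1-byte tails guarded by the corresponding bits of the residual count. A byte-count C++ sketch of that overall shape (the stub itself counts elements and routes very short arrays through L_fill_elements instead):

    #include <cstdint>
    #include <cstring>
    void fill_shape_sketch(uint8_t* to, uint64_t v64, long n /* bytes */) {
      for (; n >= 32; n -= 32, to += 32) {       // 32-byte chunks
        std::memcpy(to +  0, &v64, 8);
        std::memcpy(to +  8, &v64, 8);
        std::memcpy(to + 16, &v64, 8);
        std::memcpy(to + 24, &v64, 8);
      }
      for (; n >= 8; n -= 8, to += 8)            // trailing doublewords
        std::memcpy(to, &v64, 8);
      if (n & 4) { std::memcpy(to, &v64, 4); to += 4; }  // trailing word
      if (n & 2) { std::memcpy(to, &v64, 2); to += 2; }  // trailing halfword
      if (n & 1) { *to = (uint8_t)v64; }                 // trailing byte
    }
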
 
 
-  // Generate overlap test for array copy stubs
+  // Generate overlap test for array copy stubs.
   //
   // Input:
   //   R3_ARG1    -  from
@@ -1873,10 +1895,7 @@
       generate_conjoint_long_copy_core(aligned);
     }
 
-    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
-
-    __ blr();
-
+    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
     return start;
   }
 
@@ -1906,9 +1925,7 @@
       generate_disjoint_long_copy_core(aligned);
     }
 
-    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
-
-    __ blr();
+    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
 
     return start;
   }
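
With branchToEnd passed as false at both call sites, the dropped __ blr() lines are deliberate: the post barrier now terminates the stub itself, either with its own blr after the inline card-marking loop (CardTableModRef/ModRef) or by tail-calling the G1 runtime routine, which returns directly to the stub's caller.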