diff src/gpu/hsail/vm/gpu_hsail.cpp @ 15482:a250a512434d

HSAIL: support for object values in stack slots at deoptimization points Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Fri, 02 May 2014 21:58:28 +0200
parents 66e3af78ea96
children d370d87e528f
line wrap: on
line diff
--- a/src/gpu/hsail/vm/gpu_hsail.cpp	Fri May 02 11:04:51 2014 -0700
+++ b/src/gpu/hsail/vm/gpu_hsail.cpp	Fri May 02 21:58:28 2014 +0200
@@ -67,7 +67,7 @@
 JNINativeMethod Hsail::HSAIL_methods[] = {
   {CC"initialize",       CC"()Z",                               FN_PTR(Hsail::initialize)},
   {CC"generateKernel",   CC"([B" STRING ")J",                   FN_PTR(Hsail::generate_kernel)},
-  {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
+  {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
 };
 
 void * Hsail::_device_context = NULL;
@@ -147,7 +147,7 @@
 }
 
 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save,
-                                                      jobject donor_threads, jint allocBytesPerWorkitem))
+                                                      jobject donor_threads, jint allocBytesPerWorkitem, jobject oop_map_array))
 
   ResourceMark rm;
   jlong nmethodValue = InstalledCode::address(kernel_handle);
@@ -163,7 +163,7 @@
     SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
   }
 
-  return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, CHECK_0);
+return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
 GPU_END
 
 static void showRanges(jboolean *a, int len) {
@@ -215,9 +215,139 @@
   tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
 }
 
+class OopSaver : public StackObj {
+private:
+  objArrayOop _oopsSaveArray;
+  typeArrayOop _oopMapArray;
+  jobject  _oops_save;
+  jobject _oop_map_array;
+  int _last_pcoffset;
+  int _last_idx;
+  int _saveAreaCounts;
 
-jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, 
-                                                jobject donor_threads, int allocBytesPerWorkitem, TRAPS) {
+  enum {
+    SAVEAREACOUNTS_OFST=0,
+    SPAN_OFST=1,
+    HEADERSIZE=2
+  }; 
+  int mapPcOffsetToIndex(int pcOffset) {
+    if (pcOffset == _last_pcoffset) {
+      return _last_idx;
+    }
+    int span = _oopMapArray->int_at(SPAN_OFST);
+    for (int idx = HEADERSIZE; idx < _oopMapArray->length(); idx += span) {
+      int ofst = _oopMapArray->int_at(idx);
+      if (ofst == pcOffset) {
+        _last_pcoffset = pcOffset;
+        _last_idx = idx + 1;
+        return _last_idx;
+      }
+    }
+  }
+
+public:
+  OopSaver(jobject oops_save, jobject oop_map_array) {
+    _oops_save = oops_save;
+    _oop_map_array = oop_map_array;
+    _last_pcoffset = -1;
+    _saveAreaCounts = getSaveAreaCounts(oop_map_array);
+    resolveArrays();
+  }
+ 
+  void resolveArrays() {
+    _oopsSaveArray = (objArrayOop) JNIHandles::resolve(_oops_save);
+    _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array);
+  }
+
+  void * getOopForBit(HSAILFrame * hsailFrame, int bit) {
+    assert(isOop(hsailFrame, bit), "");
+    void *oop;
+    if (bit < hsailFrame->num_d_regs()) {
+      // d register
+      oop = (void*) hsailFrame->get_d_reg(bit);
+    } else {
+      // stack slot
+      int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
+      oop = (void *) hsailFrame->get_stackslot64(stackOffset);
+    }
+    return oop;
+  }
+
+  void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) {
+    assert(isOop(hsailFrame, bit), "");
+    if (bit < hsailFrame->num_d_regs()) {
+      // d register
+      hsailFrame->put_d_reg(bit, (jlong) oop);
+    } else {
+      // stack slot
+      int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
+      hsailFrame->put_stackslot64(stackOffset, (jlong) oop);
+    }
+  }
+
+  void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){
+    // as used, no need to resolve arrays on each call
+    int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
+
+    // handle the dregister and stackSlot based oops
+    for (int bit = 0; bit < oopsPerDeopt; bit++) {
+      if (isOop(hsailFrame, bit)) {
+        void* saved_oop = getOopForBit(hsailFrame, bit);
+        int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
+        _oopsSaveArray->obj_at_put(saveArrayIndex, (oop) saved_oop);
+      }
+    }
+  }
+
+  void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){
+    // need to re-resolve on each restore
+    resolveArrays();
+    int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
+
+    // handle the dregister and stackSlot based oops
+    for (int bit = 0; bit < oopsPerDeopt; bit++) {
+      if (isOop(hsailFrame, bit)) {
+        // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame
+        int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
+        void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex);
+        void * oldValue = getOopForBit(hsailFrame, bit);
+        assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved");
+        if (newValue != oldValue) {
+          if (TraceGPUInteraction) {
+            int numDRegs = hsailFrame->num_d_regs();
+            const char *name = (bit < numDRegs ? "$d" : "stk");
+            int num = (bit < numDRegs ? bit : bit - numDRegs);
+            tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p",
+                          name, num, workitem, deoptSlot, oldValue, newValue);
+          }
+          putOopForBit(hsailFrame, bit, newValue);
+        }
+      }
+    }
+  }
+
+  bool isOop(HSAILFrame * hsailFrame, int bit){
+    // re-resolve on each access
+    resolveArrays();
+    if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) {
+      return false;
+    }
+    int pcOffset = hsailFrame->pc_offset();
+    int bits_int_idx = mapPcOffsetToIndex(pcOffset) + (bit / 32);
+    int bitpos = bit % 32;
+    int bits = _oopMapArray->int_at(bits_int_idx);
+    return ((bits & (1 << bitpos)) != 0);
+  }
+
+  static int getSaveAreaCounts(jobject oopMapArrayObject) {
+    typeArrayOop oopMapArray = (typeArrayOop) JNIHandles::resolve(oopMapArrayObject);
+    return oopMapArray->int_at(SAVEAREACOUNTS_OFST);
+  }
+
+};
+
+jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save,
+                                                jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
   ResourceMark rm(THREAD);
   objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
 
@@ -260,9 +390,16 @@
   // Reset the kernel arguments
   _okra_clearargs(kernel);
 
+  // get how many bytes per deopt save area are required
+  int saveAreaCounts = OopSaver::getSaveAreaCounts(oop_map_array);
+  int numSRegs = saveAreaCounts & 0xff;
+  int numDRegs = (saveAreaCounts >> 8) & 0xff;
+  int numStackSlots = (saveAreaCounts >> 16);
+  int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8;
+
   HSAILDeoptimizationInfo* e;
   if (UseHSAILDeoptimization) {
-    e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo();
+    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea);
     e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal));
     memset(e->never_ran_array(), 0, dimX * sizeof(jboolean));
     e->set_donor_threads(donorThreads);
@@ -318,7 +455,6 @@
   if (UseHSAILDeoptimization) {
     kernelStats.incDeopts();
     // check if any workitem requested a deopt
-    // currently we only support at most one such workitem
     int deoptcode = e->deopt_occurred();
     if (deoptcode != 1) {
       if (deoptcode == 0) {
@@ -337,40 +473,30 @@
         TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction);
         if (TraceGPUInteraction) {
           tty->print_cr("deopt happened.");
-          HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0];
+          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0);
           tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
         }
 
         // Before handling any deopting workitems, save the pointers from
         // the hsail frames in oops_save so they get adjusted by any
         // GC. Need to do this before leaving thread_in_vm mode.
+        OopSaver oopSaver(oops_save, oop_map_array);
         // resolve handle only needed once here (not exiting vm mode)
-        objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save);
+        oopSaver.resolveArrays();
 
         // since slots are allocated from the beginning, we know how far to look
-        assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow");
+        assert(e->num_deopts() < e->num_slots(), "deopt save state overflow");
         for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k];
-          jint workitem = pdeopt->workitem();
-          if (workitem != -1) {
-            // this is a workitem that deopted
-            HSAILFrame *hsailFrame = pdeopt->first_frame();
-            int dregOopMap = hsailFrame->dreg_oops_map();
-            for (int bit = 0; bit < 16; bit++) {
-              if ((dregOopMap & (1 << bit)) != 0) {
-                // the dregister at this bit is an oop, save it in the array
-                int index = k * 16 + bit;
-                void* saved_oop = (void*) hsailFrame->get_d_reg(bit);
-                oopsSaveArray->obj_at_put(index, (oop) saved_oop);
-              }
-            }
-          }
+          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
+          assert (pdeopt->workitem() >= 0, "bad workitem in deopt");
+          // this is a workitem that deopted
+          oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k);
         }
 
         // Handle any deopting workitems.
         int count_deoptimized = 0;
         for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k];
+          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
 
           jint workitem = pdeopt->workitem();
           if (workitem != -1) {
@@ -378,25 +504,8 @@
             HSAILFrame *hsailFrame = pdeopt->first_frame();
 
             // update the hsailFrame from the oopsSaveArray
-            // re-resolve the handle
-            oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save);
-
-            int dregOopMap = hsailFrame->dreg_oops_map();
-            for (int bit = 0; bit < 16; bit++) {
-              if ((dregOopMap & (1 << bit)) != 0) {
-                // the dregister at this bit is an oop, retrieve it from array and put back in frame
-                int index = k * 16 + bit;
-                void * dregValue = (void *) oopsSaveArray->obj_at(index);
-                void * oldDregValue = (void *) hsailFrame->get_d_reg(bit);
-                assert((oldDregValue != 0 ? dregValue != 0 : dregValue == 0), "bad dregValue retrieved");
-                if (TraceGPUInteraction) {
-                  if (dregValue != oldDregValue) {
-                    tty->print_cr("oop moved for $d%d, workitem %d, slot %d, old=%p, new=%p", bit, workitem, k, oldDregValue, dregValue);
-                  }
-                }
-                hsailFrame->put_d_reg(bit, (jlong) dregValue);
-              }
-            }
+            // will re-resolve the handles each time
+            oopSaver.restoreOopsToFrame(hsailFrame, k, workitem);
 
             JavaValue result(T_VOID);
             JavaCallArguments javaArgs;
@@ -410,12 +519,19 @@
             javaArgs.push_int(myActionReason);
             javaArgs.push_oop((oop) NULL);
             if (TraceGPUInteraction) {
-              int dregOopMap = hsailFrame->dreg_oops_map();
-              tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d, dregOopMap=%04x", workitem, k, deoptId, hsailFrame, myActionReason, dregOopMap);
-              // show the registers containing references
-              for (int bit = 0; bit < 16; bit++) {
-                if ((dregOopMap & (1 << bit)) != 0) {
-                  tty->print_cr("  oop $d%d = %p", bit, hsailFrame->get_d_reg(bit));
+              tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d", workitem, k, deoptId, hsailFrame, myActionReason);
+              // show the $d registers or stack slots containing references
+              int maxOopBits = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
+              for (int bit = 0; bit < maxOopBits; bit++) {
+                if (oopSaver.isOop(hsailFrame, bit)) {
+                  if (bit < hsailFrame->num_d_regs()) {
+                    // show $d reg oop
+                    tty->print_cr("  oop $d%d = %p", bit, oopSaver.getOopForBit(hsailFrame, bit));
+                  } else {
+                    // show stack slot oop
+                    int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
+                    tty->print_cr("  oop stk:%d = %p", stackOffset, oopSaver.getOopForBit(hsailFrame, bit));
+                  }
                 }
               }
             }
@@ -461,7 +577,7 @@
           }
         }
         TraceGPUInteraction = savedTraceGPUInteraction;
-        if (TraceGPUInteraction) {
+        if (TraceGPUInteraction && (count_never_ran > 0)) {
           tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran);
           showRanges(never_ran_array, dimX);
         }