diff src/gpu/hsail/vm/gpu_hsail.cpp @ 16076:06eedda53e14

HSAIL: add support to allocate new TLAB from GPU Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Tue, 10 Jun 2014 22:36:26 +0200
parents 66a9286203a2
children e9998e2be7f5
line wrap: on
line diff
--- a/src/gpu/hsail/vm/gpu_hsail.cpp	Tue Jun 10 19:08:33 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.cpp	Tue Jun 10 22:36:26 2014 +0200
@@ -69,8 +69,8 @@
   {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
 };
 
-void * Hsail::_device_context = NULL;
-jint   Hsail::_notice_safepoints = false;
+void* Hsail::_device_context = NULL;
+jint  Hsail::_notice_safepoints = false;
 
 Hsail::okra_create_context_func_t  Hsail::_okra_create_context;
 Hsail::okra_create_kernel_func_t   Hsail::_okra_create_kernel;
@@ -85,43 +85,6 @@
 Hsail::okra_clearargs_func_t       Hsail::_okra_clearargs;
 Hsail::okra_register_heap_func_t   Hsail::_okra_register_heap;
 
-struct Stats {
-  int _dispatches;
-  int _deopts;
-  int _overflows;
-  bool _changeSeen;
-
-public:
-  Stats() {
-    _dispatches = _deopts = _overflows = 0;
-    _changeSeen = false;
-  }
-
-  void incDeopts() {
-    _deopts++;
-    _changeSeen = true;
-  }
-  void incOverflows() {
-    _overflows++;
-    _changeSeen = true;
-  }
-
-  void finishDispatch() {
-    _dispatches++;
-    if (_changeSeen) {
-      // print();
-      _changeSeen = false;
-    }
-  }
-
-  void print() {
-    tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows);
-  }
-
-};
-
-static Stats kernelStats;
-
 //static jint in_kernel = 0;
 
 void Hsail::notice_safepoints() {
@@ -165,7 +128,7 @@
 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
 GPU_END
 
-static void showRanges(jboolean *a, int len) {
+static void showRanges(jboolean* a, int len) {
   // show ranges
   bool lookFor = true;
   for (int i = 0; i < len; i++) {
@@ -182,38 +145,6 @@
   }
 }
 
-// fill and retire old tlab and get a new one
-// if we can't get one, no problem someone will eventually do a gc
-void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) {
-  tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)
-
-  // get a size for a new tlab that is at least tlabMinHsail.
-  size_t new_tlab_size = tlab->compute_size(tlabMinHsail);
-  if (new_tlab_size == 0) return;
-
-  HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
-  if (tlab_start == NULL) return;
-
-  // ..and clear it if required
-  if (ZeroTLAB) {
-    Copy::zero_to_words(tlab_start, new_tlab_size);
-  }
-  // and init the tlab pointers
-  tlab->fill(tlab_start, tlab_start, new_tlab_size);
-}
-
-static void printTlabInfo (ThreadLocalAllocBuffer* tlab) {
-  HeapWord *start = tlab->start();
-  HeapWord *top = tlab->top();
-  HeapWord *end = tlab->end();
-  // sizes are in bytes
-  size_t tlabFree = tlab->free() * HeapWordSize;
-  size_t tlabUsed = tlab->used() * HeapWordSize;
-  size_t tlabSize = tlabFree + tlabUsed;
-  double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
-  tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
-}
-
 class OopSaver : public StackObj {
 private:
   objArrayOop _oopsSaveArray;
@@ -260,21 +191,21 @@
     _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array);
   }
 
-  void * getOopForBit(HSAILFrame * hsailFrame, int bit) {
+  void* getOopForBit(HSAILFrame* hsailFrame, int bit) {
     assert(isOop(hsailFrame, bit), "");
-    void *oop;
+    void* oop;
     if (bit < hsailFrame->num_d_regs()) {
       // d register
       oop = (void*) hsailFrame->get_d_reg(bit);
     } else {
       // stack slot
       int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
-      oop = (void *) hsailFrame->get_stackslot64(stackOffset);
+      oop = (void*) hsailFrame->get_stackslot64(stackOffset);
     }
     return oop;
   }
 
-  void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) {
+  void putOopForBit(HSAILFrame* hsailFrame, int bit, void* oop) {
     assert(isOop(hsailFrame, bit), "");
     if (bit < hsailFrame->num_d_regs()) {
       // d register
@@ -286,7 +217,7 @@
     }
   }
 
-  void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){
+  void saveOopsFromFrame(HSAILFrame* hsailFrame, int deoptSlot){
     // as used, no need to resolve arrays on each call
     int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
 
@@ -300,7 +231,7 @@
     }
   }
 
-  void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){
+  void restoreOopsToFrame(HSAILFrame* hsailFrame, int deoptSlot, int workitem){
     // need to re-resolve on each restore
     resolveArrays();
     int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
@@ -310,13 +241,13 @@
       if (isOop(hsailFrame, bit)) {
         // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame
         int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
-        void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex);
-        void * oldValue = getOopForBit(hsailFrame, bit);
+        void* newValue = (void*) _oopsSaveArray->obj_at(saveArrayIndex);
+        void* oldValue = getOopForBit(hsailFrame, bit);
         assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved");
         if (newValue != oldValue) {
           if (TraceGPUInteraction) {
             int numDRegs = hsailFrame->num_d_regs();
-            const char *name = (bit < numDRegs ? "$d" : "stk");
+            const char* name = (bit < numDRegs ? "$d" : "stk");
             int num = (bit < numDRegs ? bit : bit - numDRegs);
             tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p",
                           name, num, workitem, deoptSlot, oldValue, newValue);
@@ -327,7 +258,7 @@
     }
   }
 
-  bool isOop(HSAILFrame * hsailFrame, int bit){
+  bool isOop(HSAILFrame* hsailFrame, int bit){
     // re-resolve on each access
     resolveArrays();
     if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) {
@@ -347,47 +278,15 @@
 
 };
 
-jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save,
+jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oops_save,
                                                 jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
   ResourceMark rm(THREAD);
   objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
 
-  // TODO: avoid donor thread logic if kernel does not allocate
-  objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads);
-  int numDonorThreads = donorThreadObjects->length();
-  guarantee(numDonorThreads > 0, "need at least one donor thread");
-  JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads);
-  for (int i = 0; i < numDonorThreads; i++) {
-    donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
-  }
-
-
-  // compute tlabMinHsail based on number of workitems, number of donor
-  // threads, allocBytesPerWorkitem rounded up
-  size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads;
-  if (TraceGPUInteraction) {
-    tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail);
-  }
-
-  for (int i = 0; i < numDonorThreads; i++) {
-    JavaThread* donorThread = donorThreads[i];
-    ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
-    if (TraceGPUInteraction) {
-      tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
-      printTlabInfo(tlab);
-    }
-
-    // note: this used vs. free limit checking should be based on some
-    // heuristic where we see how much this kernel tends to allocate
-    if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) {
-      getNewTlabForDonorThread(tlab, tlabMinHsail);
-      if (TraceGPUInteraction) {
-        tty->print("donorThread %d, refilled tlab, -> ", i);
-        printTlabInfo(tlab);
-      }
-    }
-  }
-
+  // We avoid HSAILAllocationInfo logic if kernel does not allocate
+  // in which case the donor_thread array passed in will be null
+  HSAILAllocationInfo* allocInfo = (donor_threads == NULL ? NULL : new HSAILAllocationInfo(donor_threads, dimX, allocBytesPerWorkitem));
+  
   // Reset the kernel arguments
   _okra_clearargs(kernel);
 
@@ -400,7 +299,11 @@
     int numStackSlots = (saveAreaCounts >> 16);
     int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8;
 
-    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, donorThreads);
+    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, allocInfo);
+    // copy cur_tlab_infos
+    if (allocInfo != NULL) {
+      e->setCurTlabInfos(allocInfo->getCurTlabInfos());
+    }
   }
 
   // This object sets up the kernel arguments
@@ -409,8 +312,8 @@
     tty->print_cr("[HSAIL] range=%d", dimX);
   }
 
-  // if any object passed was null, throw an exception here
-  // doing this means the kernel code can avoid null checks on the object parameters.
+  // If any object passed was null, throw an exception here. Doing this
+  // means the kernel code can avoid null checks on the object parameters.
   if (hka.getFirstNullParameterIndex() >= 0) {
     char buf[64];
     sprintf(buf, "Null Kernel Parameter seen, Parameter Index: %d", hka.getFirstNullParameterIndex());
@@ -431,23 +334,9 @@
     //in_kernel = 0;
   }
 
-  // fix up any tlab tops that overflowed
-  bool anyOverflows = false;
-  for (int i = 0; i < numDonorThreads; i++) {
-    JavaThread * donorThread = donorThreads[i];
-    ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
-    if (tlab->top() > tlab->end()) {
-      anyOverflows = true;
-      long overflowAmount = (long) tlab->top() - (long) tlab->pf_top(); 
-      // tlab->set_top is private this ugly hack gets around that
-      *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top();
-      if (TraceGPUInteraction) {
-        tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top());
-      }
-    }
-  }
-  if (anyOverflows) {
-    kernelStats.incOverflows();
+  // avoid HSAILAllocationInfo logic if kernel does not allocate
+  if (allocInfo != NULL) {
+    allocInfo->postKernelCleanup();
   }
 
   if (UseHSAILDeoptimization) {
@@ -465,13 +354,11 @@
         guarantee(deoptcode == 1, msg);
       }
     } else {
-      kernelStats.incDeopts();
-
       {
         TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction);
         if (TraceGPUInteraction) {
           tty->print_cr("deopt happened.");
-          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0);
+          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(0);
           tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
         }
 
@@ -485,7 +372,7 @@
         // since slots are allocated from the beginning, we know how far to look
         assert(e->num_deopts() < e->num_slots(), "deopt save state overflow");
         for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
+          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k);
           assert (pdeopt->workitem() >= 0, "bad workitem in deopt");
           // this is a workitem that deopted
           oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k);
@@ -494,15 +381,15 @@
         // Handle any deopting workitems.
         int count_deoptimized = 0;
         for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k);
+          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k);
 
           jint workitem = pdeopt->workitem();
           if (workitem != -1) {
             int deoptId = pdeopt->pc_offset();
-            HSAILFrame *hsailFrame = pdeopt->first_frame();
+            HSAILFrame* hsailFrame = pdeopt->first_frame();
 
-            // update the hsailFrame from the oopsSaveArray
-            // will re-resolve the handles each time
+            // Update the hsailFrame from the oopsSaveArray
+            // will re-resolve the handles each time.
             oopSaver.restoreOopsToFrame(hsailFrame, k, workitem);
 
             JavaValue result(T_VOID);
@@ -511,7 +398,7 @@
             javaArgs.push_int(deoptId);
             javaArgs.push_long((jlong) hsailFrame);
 
-            // override the deoptimization action with Action_none until we decide
+            // Override the deoptimization action with Action_none until we decide
             // how to handle the other actions.
             int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none);
             javaArgs.push_int(myActionReason);
@@ -551,7 +438,7 @@
       // turn off verbose trace stuff for javacall arg setup
       bool savedTraceGPUInteraction = TraceGPUInteraction;
       TraceGPUInteraction = false;
-      jboolean *never_ran_array = e->never_ran_array();
+      jboolean* never_ran_array = e->never_ran_array();
       if (handleNeverRansHere) {
         for (int k = 0; k < dimX; k++) {
           if (never_ran_array[k]) {
@@ -562,9 +449,10 @@
             JavaCallArguments javaArgs;
             // re-resolve the args_handle here
             objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args);
-            // This object sets up the javaCall arguments
-            // the way argsArray is set up, this should work for instance methods as well
-            // (the receiver will be the first oop pushed)
+
+            // This object sets up the javaCall arguments. The way
+            // argsArray is set up, this should work for instance
+            // methods as well (the receiver will be the first oop pushed)
             HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static());
             if (mh->is_static()) {
               JavaCalls::call_static(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD);
@@ -583,19 +471,19 @@
     }
 
     delete e;
+    delete allocInfo;
   }
-  kernelStats.finishDispatch();
   return success;
 }
 
-GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle))
+GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle))
   guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked");
   ResourceMark rm;
   jsize name_len = env->GetStringLength(name_handle);
   jsize code_len = env->GetArrayLength(code_handle);
 
   char* name = NEW_RESOURCE_ARRAY(char, name_len + 1);
-  unsigned char *code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1);
+  unsigned char* code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1);
 
   code[code_len] = 0;
   name[name_len] = 0;
@@ -631,7 +519,7 @@
         return false; \
   } \
 
-GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv *env, jclass))
+GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv* env, jclass))
   if (okra_library_name == NULL) {
     if (TraceGPUInteraction) {
       tty->print_cr("Unsupported HSAIL platform");
@@ -641,14 +529,14 @@
 
   // here we know we have a valid okra_library_name to try to load
   char ebuf[O_BUFLEN];
-  char *okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_");
+  char* okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_");
   if (okra_lib_name_from_env_var != NULL) {
     okra_library_name = okra_lib_name_from_env_var;
   }
   if (TraceGPUInteraction) {
     tty->print_cr("[HSAIL] library is %s", okra_library_name);
   }
-  void *okra_lib_handle = NULL;
+  void* okra_lib_handle = NULL;
 #if defined(LINUX)
   // Check first if the Okra library is already loaded.
   // TODO: Figure out how to do this on other OSes.
@@ -668,8 +556,8 @@
   
   guarantee(_okra_create_context == NULL, "cannot repeat GPU initialization");
 
-  // at this point we know  okra_lib_handle is valid whether we loaded
-  // here or earlier.  In either case, we can lookup the functions
+  // At this point we know  okra_lib_handle is valid whether we loaded
+  // here or earlier.  In either case, we can lookup the functions.
   LOOKUP_OKRA_FUNCTION(okra_create_context, okra_create_context);
   LOOKUP_OKRA_FUNCTION(okra_create_kernel, okra_create_kernel);
   LOOKUP_OKRA_FUNCTION(okra_push_object, okra_push_object);