diff src/gpu/hsail/vm/gpu_hsail_Tlab.hpp @ 16076:06eedda53e14

HSAIL: add support to allocate new TLAB from GPU
Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Tue, 10 Jun 2014 22:36:26 +0200
parents
children f1d1ec9bcf24
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp	Tue Jun 10 22:36:26 2014 +0200
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
+#define GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
+
+#include "graal/graalEnv.hpp"
+#include "code/debugInfo.hpp"
+#include "code/location.hpp"
+#include "gpu_hsail.hpp"
+
+class HSAILAllocationInfo;
+
+class HSAILTlabInfo VALUE_OBJ_CLASS_SPEC {
+  friend class VMStructs;
+public:
+  // uses only the necessary fields from a full TLAB
+  HeapWord* _start;
+  HeapWord* _top;
+  HeapWord* _end;
+  HeapWord* _last_good_top;
+  HeapWord* _original_top;
+  JavaThread* _donor_thread;         // donor thread associated with this tlabInfo
+  HSAILAllocationInfo* _alloc_info;   // same as what is in HSAILDeoptimizationInfo
+
+  // Accessors
+  HeapWord* start() { return _start; }
+  HeapWord* top() { return _top; }
+  HeapWord* end() { return _end; }
+  HeapWord* last_good_top() { return _last_good_top; }
+  HeapWord* original_top() { return _original_top; }
+  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) {
+    _start = start;
+    _top = _original_top = top;
+    _end = end;
+    _donor_thread = donorThread;
+    _alloc_info = allocInfo;
+  }
+};
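+
+// Illustrative note (an assumption, not code introduced by this change): the
+// device-side fast path presumably bump-allocates from _top toward _end,
+// roughly along these lines:
+//
+//   HeapWord* obj = atomic_add(&tlabInfo->_top, size);   // hypothetical helper
+//   if (obj + size > tlabInfo->_end) {                    // overflowed this tlab
+//     tlabInfo->_last_good_top = obj;   // remember the last valid top
+//     // grab a fresh HSAILTlabInfo from the pool, or deoptimize
+//   }
+//
+// which is why postKernelCleanup() below treats top() > end() as an overflow
+// and rolls _top back to _last_good_top.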
+
+
+class HSAILAllocationInfo : public CHeapObj<mtInternal> {
+  friend class VMStructs;
+private:
+  JavaThread** donorThreads;
+  jint _num_donor_threads;
+  size_t _tlab_align_reserve_bytes;    // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
+  HSAILTlabInfo** _cur_tlab_infos;    // array of current tlab info pointers, one per donor_thread
+  HSAILTlabInfo* _tlab_infos_pool_start;    // pool for new tlab_infos
+  HSAILTlabInfo* _tlab_infos_pool_next;     // where next will be allocated from
+  HSAILTlabInfo* _tlab_infos_pool_end;      // end of the tlab_infos pool
+
+public:
+  HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) {
+    // fill in the donorThreads array
+    objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj);
+    _num_donor_threads = donorThreadObjects->length();
+    guarantee(_num_donor_threads > 0, "need at least one donor thread");
+    donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal);
+    for (int i = 0; i < _num_donor_threads; i++) {
+      donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
+    }
+    
+    // Compute max_tlab_infos based on amount of free heap space
+    size_t max_tlab_infos;
+    {
+      JavaThread* donorThread = donorThreads[0];
+      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      size_t new_tlab_size = tlab->compute_size(0);
+      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread);
+      if (new_tlab_size != 0) {
+        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads));
+      } else {
+        max_tlab_infos = 8 * _num_donor_threads;   // an arbitrary multiple
+      }
+      if (TraceGPUInteraction) {
+        tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
+      }
+    }
+
+    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal);
+    _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
+    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads];
+    _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
+    _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();
+      
+    // we will fill the first N tlabInfos from the donor threads
+    for (int i = 0; i < _num_donor_threads; i++) {
+      JavaThread* donorThread = donorThreads[i];
+      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      if (TraceGPUInteraction) {
+        tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
+        printTlabInfoFromThread(tlab);
+      }
+      
+      // Here we try to get a new tlab if the current one is null. Note:
+      // eventually we may want to test whether the size is too small,
+      // based on some heuristic of how much this kernel tends to
+      // allocate, but for now we can just let it overflow and let the
+      // GPU allocate new tlabs. Also, if we can't prime a tlab here,
+      // it might make sense to do a gc now rather than start the
+      // kernel and have it deoptimize.  How to do that?
+      if (tlab->end() == NULL) {
+        bool success = getNewTlabForDonorThread(tlab, i);
+        if (TraceGPUInteraction) {
+          if (success) {
+            tty->print("donorThread %d, refilled tlab, -> ", i);
+            printTlabInfoFromThread(tlab);
+          } else {
+            tty->print("donorThread %d, could not refill tlab, left as ", i);
+            printTlabInfoFromThread(tlab);
+          }
+        }
+      }
+
+      // extract the necessary tlab fields into a TlabInfo record
+      HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
+      _cur_tlab_infos[i] = pTlabInfo;
+      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this);
+    }
+  }
+
+  ~HSAILAllocationInfo() {
+    FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
+    FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
+    FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
+  }
+
+  void postKernelCleanup() {
+    // Go through all the tlabInfos: fix up any tlab tops that overflowed,
+    // complete the tlabs that overflowed, and update the donor
+    // threads' tlabs when appropriate.
+    bool anyOverflows = false;
+    size_t bytesAllocated = 0;
+    // if there was an overflow in allocating tlabInfos, correct it here
+    if (_tlab_infos_pool_next > _tlab_infos_pool_end) {
+      if (TraceGPUInteraction) {
+        int overflowAmount = _tlab_infos_pool_next - _tlab_infos_pool_end;
+        tty->print_cr("tlabInfo allocation overflowed by %d units", overflowAmount);
+      }
+      _tlab_infos_pool_next = _tlab_infos_pool_end;
+    }
+    for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) {
+      if (TraceGPUInteraction) {
+        tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, 
+                      tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
+      }
+      JavaThread* donorThread = tlabInfo->_donor_thread;
+      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      bool overflowed = false;
+      // if a tlabInfo has NULL fields (i.e. we could not prime it on entry,
+      // or we could not get a tlab on the gpu), ignore it here
+      if (tlabInfo->start() != NULL) {
+        if (tlabInfo->top() > tlabInfo->end()) {
+          anyOverflows = true;
+          overflowed = true;
+          if (TraceGPUInteraction) {
+            long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); 
+            tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top());
+          }
+          tlabInfo->_top = tlabInfo->last_good_top();
+        }
+
+        // fill the donor thread tlab with the tlabInfo information;
+        // we do this even if it will get overwritten by a later tlabInfo
+        // because it helps with tlab statistics for that donor thread
+        tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());
+
+        // if there was an overflow, make it parsable with retire = true
+        if (overflowed) {
+          tlab->make_parsable(true);
+        }
+        
+        size_t delta = (long)(tlabInfo->top()) - (long)(tlabInfo->original_top());
+        if (TraceGPUInteraction) {
+          tty->print_cr("%ld bytes were allocated by tlabInfo %p (start %p, top %p, end %p", delta, tlabInfo,
+                        tlabInfo->start(), tlabInfo->top(), tlabInfo->end());
+        }
+        bytesAllocated += delta;
+      }
+    }
+    if (TraceGPUInteraction) {
+      tty->print_cr("%ld total bytes were allocated in this kernel", bytesAllocated);
+    }
+    if (anyOverflows) {
+      // Hsail::kernelStats.incOverflows();
+    }
+  }
+
+  HSAILTlabInfo** getCurTlabInfos() {
+    return _cur_tlab_infos;
+  }
+
+private:
+  // fill and retire the old tlab and get a new one;
+  // if we can't get one, no problem; someone will eventually do a gc
+  bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) {
+
+    tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)
+    
+    // get a size for a new tlab that is based on the desired_size
+    size_t new_tlab_size = tlab->compute_size(0);
+    if (new_tlab_size == 0) return false;
+    
+    HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
+    if (tlab_start == NULL) return false;
+    
+    // ... and clear it if required
+    if (ZeroTLAB) {
+      Copy::zero_to_words(tlab_start, new_tlab_size);
+    }
+    // and init the tlab pointers
+    tlab->fill(tlab_start, tlab_start, new_tlab_size);
+    return true;
+  }
+  
+  void printTlabInfoFromThread(ThreadLocalAllocBuffer* tlab) {
+    HeapWord* start = tlab->start();
+    HeapWord* top = tlab->top();
+    HeapWord* end = tlab->end();
+    // sizes are in bytes
+    size_t tlabFree = tlab->free() * HeapWordSize;
+    size_t tlabUsed = tlab->used() * HeapWordSize;
+    size_t tlabSize = tlabFree + tlabUsed;
+    double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
+    tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
+  }
+  
+};
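+
+// A minimal usage sketch (an assumed host-side flow; the actual call sites
+// live in the HSAIL kernel-dispatch code, not in this header):
+//
+//   HSAILAllocationInfo* alloc_info =
+//       new HSAILAllocationInfo(donor_threads_jobj, dimX, allocBytesPerWorkitem);
+//   // hand alloc_info->getCurTlabInfos() to the kernel launch (the kernel
+//   // also reaches this HSAILAllocationInfo via HSAILDeoptimizationInfo)
+//   // ... dispatch and wait for the kernel ...
+//   alloc_info->postKernelCleanup();  // roll back overflows, update donor tlabs
+//   delete alloc_info;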
+  
+#endif // GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP