comparison src/gpu/hsail/vm/gpu_hsail_Tlab.hpp @ 16076:06eedda53e14

HSAIL: add support to allocate new TLAB from GPU Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Tue, 10 Jun 2014 22:36:26 +0200
parents
children f1d1ec9bcf24
comparison
equal deleted inserted replaced
16074:b6ab7e7fa0a5 16076:06eedda53e14
1 /*
2 * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
26 #define GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
27
28 #include "graal/graalEnv.hpp"
29 #include "code/debugInfo.hpp"
30 #include "code/location.hpp"
31 #include "gpu_hsail.hpp"
32
33 class HSAILAllocationInfo;
34
35 class HSAILTlabInfo VALUE_OBJ_CLASS_SPEC {
36 friend class VMStructs;
37 public:
38 // uses only the necessary fields from a full TLAB
39 HeapWord* _start;
40 HeapWord* _top;
41 HeapWord* _end;
42 HeapWord* _last_good_top;
43 HeapWord* _original_top;
44 JavaThread* _donor_thread; // donor thread associated with this tlabInfo
45 HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo
46
47 // Accessors
48 HeapWord* start() { return _start; }
49 HeapWord* top() { return _top; }
50 HeapWord* end() { return _end; }
51 HeapWord* last_good_top() { return _last_good_top; }
52 HeapWord* original_top() { return _original_top; }
53 void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) {
54 _start = start;
55 _top = _original_top = top;
56 _end = end;
57 _donor_thread = donorThread;
58 _alloc_info = allocInfo;
59 }
60 };
61
62
63 class HSAILAllocationInfo : public CHeapObj<mtInternal> {
64 friend class VMStructs;
65 private:
66 JavaThread** donorThreads;
67 jint _num_donor_threads;
68 size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
69 HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per donor_thread
70 HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos
71 HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from
72 HSAILTlabInfo* _tlab_infos_pool_end; // where next will be allocated from
73
74 public:
75 HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) {
76 // fill in the donorThreads array
77 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj);
78 _num_donor_threads = donorThreadObjects->length();
79 guarantee(_num_donor_threads > 0, "need at least one donor thread");
80 donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal);
81 for (int i = 0; i < _num_donor_threads; i++) {
82 donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
83 }
84
85 // Compute max_tlab_infos based on amount of free heap space
86 size_t max_tlab_infos;
87 {
88 JavaThread* donorThread = donorThreads[0];
89 ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
90 size_t new_tlab_size = tlab->compute_size(0);
91 size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread);
92 if (new_tlab_size != 0) {
93 max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads));
94 } else {
95 max_tlab_infos = 8 * _num_donor_threads; // an arbitrary multiple
96 }
97 if (TraceGPUInteraction) {
98 tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
99 }
100 }
101
102 _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal);
103 _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
104 _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads];
105 _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
106 _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();
107
108 // we will fill the first N tlabInfos from the donor threads
109 for (int i = 0; i < _num_donor_threads; i++) {
110 JavaThread* donorThread = donorThreads[i];
111 ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
112 if (TraceGPUInteraction) {
113 tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
114 printTlabInfoFromThread(tlab);
115 }
116
117 // Here we try to get a new tlab if current one is null. Note:
118 // eventually we may want to test if the size is too small based
119 // on some heuristic where we see how much this kernel tends to
120 // allocate, but for now we can just let it overflow and let the
121 // GPU allocate new tlabs. Actually, if we can't prime a tlab
122 // here, it might make sense to do a gc now rather than to start
123 // the kernel and have it deoptimize. How to do that?
124 if (tlab->end() == NULL) {
125 bool success = getNewTlabForDonorThread(tlab, i);
126 if (TraceGPUInteraction) {
127 if (success) {
128 tty->print("donorThread %d, refilled tlab, -> ", i);
129 printTlabInfoFromThread(tlab);
130 } else {
131 tty->print("donorThread %d, could not refill tlab, left as ", i);
132 printTlabInfoFromThread(tlab);
133 }
134 }
135 }
136
137 // extract the necessary tlab fields into a TlabInfo record
138 HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
139 _cur_tlab_infos[i] = pTlabInfo;
140 pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this);
141 }
142 }
143
144 ~HSAILAllocationInfo() {
145 FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
146 FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
147 FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
148 }
149
150 void postKernelCleanup() {
151 // go thru all the tlabInfos, fix up any tlab tops that overflowed
152 // complete the tlabs if they overflowed
153 // update the donor threads tlabs when appropriate
154 bool anyOverflows = false;
155 size_t bytesAllocated = 0;
156 // if there was an overflow in allocating tlabInfos, correct it here
157 if (_tlab_infos_pool_next > _tlab_infos_pool_end) {
158 if (TraceGPUInteraction) {
159 int overflowAmount = _tlab_infos_pool_next - _tlab_infos_pool_end;
160 tty->print_cr("tlabInfo allocation overflowed by %d units", overflowAmount);
161 }
162 _tlab_infos_pool_next = _tlab_infos_pool_end;
163 }
164 for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) {
165 if (TraceGPUInteraction) {
166 tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo,
167 tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
168 }
169 JavaThread* donorThread = tlabInfo->_donor_thread;
170 ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
171 bool overflowed = false;
172 // if a tlabInfo has NULL fields, i.e. we could not prime it on entry,
173 // or we could not get a tlab from the gpu, so ignore tlabInfo here
174 if (tlabInfo->start() != NULL) {
175 if (tlabInfo->top() > tlabInfo->end()) {
176 anyOverflows = true;
177 overflowed = true;
178 if (TraceGPUInteraction) {
179 long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top();
180 tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top());
181 }
182 tlabInfo->_top = tlabInfo->last_good_top();
183 }
184
185 // fill the donor thread tlab with the tlabInfo information
186 // we do this even if it will get overwritten by a later tlabinfo
187 // because it helps with tlab statistics for that donor thread
188 tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());
189
190 // if there was an overflow, make it parsable with retire = true
191 if (overflowed) {
192 tlab->make_parsable(true);
193 }
194
195 size_t delta = (long)(tlabInfo->top()) - (long)(tlabInfo->original_top());
196 if (TraceGPUInteraction) {
197 tty->print_cr("%ld bytes were allocated by tlabInfo %p (start %p, top %p, end %p", delta, tlabInfo,
198 tlabInfo->start(), tlabInfo->top(), tlabInfo->end());
199 }
200 bytesAllocated += delta;
201 }
202 }
203 if (TraceGPUInteraction) {
204 tty->print_cr("%ld total bytes were allocated in this kernel", bytesAllocated);
205 }
206 if (anyOverflows) {
207 // Hsail::kernelStats.incOverflows();
208 }
209 }
210
211 HSAILTlabInfo** getCurTlabInfos() {
212 return _cur_tlab_infos;
213 }
214
215 private:
216 // fill and retire old tlab and get a new one
217 // if we can't get one, no problem someone will eventually do a gc
218 bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) {
219
220 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null)
221
222 // get a size for a new tlab that is based on the desired_size
223 size_t new_tlab_size = tlab->compute_size(0);
224 if (new_tlab_size == 0) return false;
225
226 HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
227 if (tlab_start == NULL) return false;
228
229 // ..and clear it if required
230 if (ZeroTLAB) {
231 Copy::zero_to_words(tlab_start, new_tlab_size);
232 }
233 // and init the tlab pointers
234 tlab->fill(tlab_start, tlab_start, new_tlab_size);
235 return true;
236 }
237
238 void printTlabInfoFromThread (ThreadLocalAllocBuffer* tlab) {
239 HeapWord* start = tlab->start();
240 HeapWord* top = tlab->top();
241 HeapWord* end = tlab->end();
242 // sizes are in bytes
243 size_t tlabFree = tlab->free() * HeapWordSize;
244 size_t tlabUsed = tlab->used() * HeapWordSize;
245 size_t tlabSize = tlabFree + tlabUsed;
246 double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
247 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
248 }
249
250 };
251
252 #endif // GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP