Mercurial > hg > graal-compiler
comparison src/gpu/hsail/vm/gpu_hsail_Tlab.hpp @ 16795:a29e6e7b7a86
Replace hsail donor threads with hsail tlabs
author | Tom Rodriguez <tom.rodriguez@oracle.com> |
---|---|
date | Tue, 12 Aug 2014 16:30:17 -0700 |
parents | f1d1ec9bcf24 |
children | 82e5b5ccdb0c |
comparison
equal
deleted
inserted
replaced
16794:74c02c90a3f9 | 16795:a29e6e7b7a86 |
---|---|
39 HeapWord* _start; | 39 HeapWord* _start; |
40 HeapWord* _top; | 40 HeapWord* _top; |
41 HeapWord* _end; | 41 HeapWord* _end; |
42 HeapWord* _last_good_top; | 42 HeapWord* _last_good_top; |
43 HeapWord* _original_top; | 43 HeapWord* _original_top; |
44 JavaThread* _donor_thread; // donor thread associated with this tlabInfo | 44 ThreadLocalAllocBuffer* _tlab; // tlab associated with this tlabInfo |
45 HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo | 45 HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo |
46 | 46 |
47 // Accessors | 47 // Accessors |
48 HeapWord* start() { return _start; } | 48 HeapWord* start() { return _start; } |
49 HeapWord* top() { return _top; } | 49 HeapWord* top() { return _top; } |
50 HeapWord* end() { return _end; } | 50 HeapWord* end() { return _end; } |
51 HeapWord* last_good_top() { return _last_good_top; } | 51 HeapWord* last_good_top() { return _last_good_top; } |
52 HeapWord* original_top() { return _original_top; } | 52 HeapWord* original_top() { return _original_top; } |
53 void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) { | 53 ThreadLocalAllocBuffer* tlab() { return _tlab; } |
54 void initialize(HeapWord* start, HeapWord* top, HeapWord* end, ThreadLocalAllocBuffer* tlab, HSAILAllocationInfo* allocInfo) { | |
54 _start = start; | 55 _start = start; |
55 _top = _original_top = top; | 56 _top = _original_top = top; |
56 _end = end; | 57 _end = end; |
57 _donor_thread = donorThread; | 58 _tlab = tlab; |
58 _alloc_info = allocInfo; | 59 _alloc_info = allocInfo; |
59 } | 60 } |
60 }; | 61 }; |
61 | 62 |
62 | 63 |
63 class HSAILAllocationInfo : public CHeapObj<mtInternal> { | 64 class HSAILAllocationInfo : public CHeapObj<mtInternal> { |
64 friend class VMStructs; | 65 friend class VMStructs; |
65 private: | 66 private: |
66 JavaThread** donorThreads; | 67 jint _num_tlabs; |
67 jint _num_donor_threads; | 68 size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() |
68 size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() | 69 HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per num_tlabs |
69 HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per donor_thread | |
70 HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos | 70 HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos |
71 HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from | 71 HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from |
72 HSAILTlabInfo* _tlab_infos_pool_end; // end of the pool (one past the last tlab_info) | 72 HSAILTlabInfo* _tlab_infos_pool_end; // end of the pool (one past the last tlab_info) |
73 | 73 |
74 public: | 74 public: |
75 HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) { | 75 HSAILAllocationInfo(jint num_tlabs, int dimX, int allocBytesPerWorkitem) { |
76 // fill in the donorThreads array | 76 _num_tlabs = num_tlabs; |
77 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj); | 77 // if this thread doesn't have gpu_hsail_tlabs allocated yet, do so now |
78 _num_donor_threads = donorThreadObjects->length(); | 78 JavaThread* thread = JavaThread::current(); |
79 guarantee(_num_donor_threads > 0, "need at least one donor thread"); | 79 if (thread->get_gpu_hsail_tlabs_count() == 0) { |
80 donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal); | 80 thread->initialize_gpu_hsail_tlabs(num_tlabs); |
81 for (int i = 0; i < _num_donor_threads; i++) { | 81 if (TraceGPUInteraction) { |
82 donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); | 82 for (int i = 0; i < num_tlabs; i++) { |
83 } | 83 ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i); |
84 | 84 tty->print("initialized gpu_hsail_tlab %d at %p -> ", i, tlab); |
85 printTlabInfoFromThread(tlab); | |
86 } | |
87 } | |
88 } | |
89 | |
85 // Compute max_tlab_infos based on amount of free heap space | 90 // Compute max_tlab_infos based on amount of free heap space |
86 size_t max_tlab_infos; | 91 size_t max_tlab_infos; |
87 { | 92 { |
88 JavaThread* donorThread = donorThreads[0]; | 93 ThreadLocalAllocBuffer* tlab = &thread->tlab(); |
89 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); | |
90 size_t new_tlab_size = tlab->compute_size(0); | 94 size_t new_tlab_size = tlab->compute_size(0); |
91 size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread); | 95 size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(thread); |
92 if (new_tlab_size != 0) { | 96 if (new_tlab_size != 0) { |
93 max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads)); | 97 max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_tlabs)); |
94 } else { | 98 } else { |
95 max_tlab_infos = 8 * _num_donor_threads; // an arbitrary multiple | 99 max_tlab_infos = 8 * _num_tlabs; // an arbitrary multiple |
96 } | 100 } |
97 if (TraceGPUInteraction) { | 101 if (TraceGPUInteraction) { |
98 tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos); | 102 tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos); |
99 } | 103 } |
100 } | 104 } |
101 | 105 |
102 _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal); | 106 _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_tlabs, mtInternal); |
103 _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal); | 107 _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal); |
104 _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads]; | 108 _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_tlabs]; |
105 _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos]; | 109 _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos]; |
106 _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes(); | 110 _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes(); |
107 | 111 |
108 // we will fill the first N tlabInfos from the donor threads | 112 // we will fill the first N tlabInfos from the gpu_hsail_tlabs |
109 for (int i = 0; i < _num_donor_threads; i++) { | 113 for (int i = 0; i < _num_tlabs; i++) { |
110 JavaThread* donorThread = donorThreads[i]; | 114 ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i); |
111 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); | 115 if (TraceGPUInteraction) { |
112 if (TraceGPUInteraction) { | 116 tty->print("gpu_hsail_tlab %d at %p -> ", i, tlab); |
113 tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); | |
114 printTlabInfoFromThread(tlab); | 117 printTlabInfoFromThread(tlab); |
115 } | 118 } |
116 | 119 |
117 // Here we try to get a new tlab if current one is null. Note: | 120 // Here we try to get a new tlab if current one is null. Note: |
118 // eventually we may want to test if the size is too small based | 121 // eventually we may want to test if the size is too small based |
120 // allocate, but for now we can just let it overflow and let the | 123 // allocate, but for now we can just let it overflow and let the |
121 // GPU allocate new tlabs. Actually, if we can't prime a tlab | 124 // GPU allocate new tlabs. Actually, if we can't prime a tlab |
122 // here, it might make sense to do a gc now rather than to start | 125 // here, it might make sense to do a gc now rather than to start |
123 // the kernel and have it deoptimize. How to do that? | 126 // the kernel and have it deoptimize. How to do that? |
124 if (tlab->end() == NULL) { | 127 if (tlab->end() == NULL) { |
125 bool success = getNewTlabForDonorThread(tlab, i); | 128 bool success = getNewGpuHsailTlab(tlab); |
126 if (TraceGPUInteraction) { | 129 if (TraceGPUInteraction) { |
127 if (success) { | 130 if (success) { |
128 tty->print("donorThread %d, refilled tlab, -> ", i); | 131 tty->print("gpu_hsail_tlab %d, refilled tlab, -> ", i); |
129 printTlabInfoFromThread(tlab); | 132 printTlabInfoFromThread(tlab); |
130 } else { | 133 } else { |
131 tty->print("donorThread %d, could not refill tlab, left as ", i); | 134 tty->print("gpu_hsail_tlab %d, could not refill tlab, left as ", i); |
132 printTlabInfoFromThread(tlab); | 135 printTlabInfoFromThread(tlab); |
133 } | 136 } |
134 } | 137 } |
135 } | 138 } |
136 | 139 |
137 // extract the necessary tlab fields into a TlabInfo record | 140 // extract the necessary tlab fields into a TlabInfo record |
138 HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i]; | 141 HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i]; |
139 _cur_tlab_infos[i] = pTlabInfo; | 142 _cur_tlab_infos[i] = pTlabInfo; |
140 pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this); | 143 pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), tlab, this); |
141 | |
142 // reset the real tlab fields to zero so we are sure the thread doesn't use it | |
143 tlab->set_start(NULL); | |
144 tlab->set_top(NULL); | |
145 tlab->set_pf_top(NULL); | |
146 tlab->set_end(NULL); | |
147 } | 144 } |
148 } | 145 } |
149 | 146 |
150 ~HSAILAllocationInfo() { | 147 ~HSAILAllocationInfo() { |
151 FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal); | 148 FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal); |
152 FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal); | 149 FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal); |
153 FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal); | |
154 } | 150 } |
155 | 151 |
156 void postKernelCleanup() { | 152 void postKernelCleanup() { |
157 // go thru all the tlabInfos, fix up any tlab tops that overflowed | 153 // go thru all the tlabInfos, fix up any tlab tops that overflowed |
158 // complete the tlabs if they overflowed | 154 // complete the tlabs if they overflowed |
159 // update the donor threads tlabs when appropriate | 155 // update the gpu_hsail_tlabs when appropriate |
160 bool anyOverflows = false; | 156 bool anyOverflows = false; |
161 size_t bytesAllocated = 0; | 157 size_t bytesAllocated = 0; |
162 // if there was an overflow in allocating tlabInfos, correct it here | 158 // if there was an overflow in allocating tlabInfos, correct it here |
163 if (_tlab_infos_pool_next > _tlab_infos_pool_end) { | 159 if (_tlab_infos_pool_next > _tlab_infos_pool_end) { |
164 if (TraceGPUInteraction) { | 160 if (TraceGPUInteraction) { |
170 for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) { | 166 for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) { |
171 if (TraceGPUInteraction) { | 167 if (TraceGPUInteraction) { |
172 tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, | 168 tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, |
173 tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top()); | 169 tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top()); |
174 } | 170 } |
175 JavaThread* donorThread = tlabInfo->_donor_thread; | 171 ThreadLocalAllocBuffer* tlab = tlabInfo->tlab(); |
176 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); | |
177 bool overflowed = false; | 172 bool overflowed = false; |
178 // a tlabInfo has NULL fields if we could not prime it on entry | 173 // a tlabInfo has NULL fields if we could not prime it on entry |
179 // or could not get a tlab from the gpu; ignore such a tlabInfo here | 174 // or could not get a tlab from the gpu; ignore such a tlabInfo here |
180 if (tlabInfo->start() != NULL) { | 175 if (tlabInfo->start() != NULL) { |
181 if (tlabInfo->top() > tlabInfo->end()) { | 176 if (tlabInfo->top() > tlabInfo->end()) { |
182 anyOverflows = true; | 177 anyOverflows = true; |
183 overflowed = true; | 178 overflowed = true; |
184 if (TraceGPUInteraction) { | 179 if (TraceGPUInteraction) { |
185 long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); | 180 long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); |
186 tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top()); | 181 tty->print_cr("tlabInfo %p (tlab = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, tlab, overflowAmount, tlabInfo->last_good_top()); |
187 } | 182 } |
188 tlabInfo->_top = tlabInfo->last_good_top(); | 183 tlabInfo->_top = tlabInfo->last_good_top(); |
189 } | 184 } |
190 | 185 |
191 // if the donor thread allocated anything while we were running | 186 // fill the gpu_hsail_tlab with the tlabInfo information |
192 // we will retire its tlab before overwriting with our new one | |
193 if (tlab->top() != NULL) { | |
194 if (TraceGPUInteraction) { | |
195 tty->print("Donor Thread allocated new tlab"); | |
196 printTlabInfoFromThread(tlab); | |
197 } | |
198 tlab->make_parsable(true); | |
199 } | |
200 | |
201 // fill the donor thread tlab with the tlabInfo information | |
202 // we do this even if it will get overwritten by a later tlabinfo | 187 // we do this even if it will get overwritten by a later tlabinfo |
203 // because it helps with tlab statistics for that donor thread | 188 // because it helps with tlab statistics for that tlab |
204 tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve()); | 189 tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve()); |
205 | 190 |
206 // if there was an overflow, make it parsable with retire = true | 191 // if there was an overflow, make it parsable with retire = true |
207 if (overflowed) { | 192 if (overflowed) { |
208 tlab->make_parsable(true); | 193 tlab->make_parsable(true); |
229 } | 214 } |
230 | 215 |
231 private: | 216 private: |
232 // fill and retire old tlab and get a new one | 217 // fill and retire old tlab and get a new one |
233 // if we can't get one, no problem: someone will eventually do a gc | 218 // if we can't get one, no problem: someone will eventually do a gc |
234 bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) { | 219 bool getNewGpuHsailTlab(ThreadLocalAllocBuffer* tlab) { |
235 | 220 |
236 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) | 221 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) |
237 | 222 |
238 // get a size for a new tlab that is based on the desired_size | 223 // get a size for a new tlab that is based on the desired_size |
239 size_t new_tlab_size = tlab->compute_size(0); | 224 size_t new_tlab_size = tlab->compute_size(0); |