comparison src/gpu/hsail/vm/gpu_hsail_Tlab.hpp @ 16795:a29e6e7b7a86

Replace hsail donor threads with hsail tlabs
author Tom Rodriguez <tom.rodriguez@oracle.com>
date Tue, 12 Aug 2014 16:30:17 -0700
parents f1d1ec9bcf24
children 82e5b5ccdb0c
comparison legend: equal | deleted | inserted | replaced
left column: 16794:74c02c90a3f9 | right column: 16795:a29e6e7b7a86
39 HeapWord* _start; 39 HeapWord* _start;
40 HeapWord* _top; 40 HeapWord* _top;
41 HeapWord* _end; 41 HeapWord* _end;
42 HeapWord* _last_good_top; 42 HeapWord* _last_good_top;
43 HeapWord* _original_top; 43 HeapWord* _original_top;
44 JavaThread* _donor_thread; // donor thread associated with this tlabInfo 44 ThreadLocalAllocBuffer* _tlab; // tlab associated with this tlabInfo
45 HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo 45 HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo
46 46
47 // Accessors 47 // Accessors
48 HeapWord* start() { return _start; } 48 HeapWord* start() { return _start; }
49 HeapWord* top() { return _top; } 49 HeapWord* top() { return _top; }
50 HeapWord* end() { return _end; } 50 HeapWord* end() { return _end; }
51 HeapWord* last_good_top() { return _last_good_top; } 51 HeapWord* last_good_top() { return _last_good_top; }
52 HeapWord* original_top() { return _original_top; } 52 HeapWord* original_top() { return _original_top; }
53 void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) { 53 ThreadLocalAllocBuffer* tlab() { return _tlab; }
54 void initialize(HeapWord* start, HeapWord* top, HeapWord* end, ThreadLocalAllocBuffer* tlab, HSAILAllocationInfo* allocInfo) {
54 _start = start; 55 _start = start;
55 _top = _original_top = top; 56 _top = _original_top = top;
56 _end = end; 57 _end = end;
57 _donor_thread = donorThread; 58 _tlab = tlab;
58 _alloc_info = allocInfo; 59 _alloc_info = allocInfo;
59 } 60 }
60 }; 61 };
61 62
62 63
63 class HSAILAllocationInfo : public CHeapObj<mtInternal> { 64 class HSAILAllocationInfo : public CHeapObj<mtInternal> {
64 friend class VMStructs; 65 friend class VMStructs;
65 private: 66 private:
66 JavaThread** donorThreads; 67 jint _num_tlabs;
67 jint _num_donor_threads; 68 size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
68 size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() 69 HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per num_tlabs
69 HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per donor_thread
70 HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos 70 HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos
71 HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from 71 HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from
72 HSAILTlabInfo* _tlab_infos_pool_end; // where next will be allocated from 72 HSAILTlabInfo* _tlab_infos_pool_end; // where next will be allocated from
73 73
74 public: 74 public:
75 HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) { 75 HSAILAllocationInfo(jint num_tlabs, int dimX, int allocBytesPerWorkitem) {
76 // fill in the donorThreads array 76 _num_tlabs = num_tlabs;
77 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj); 77 // if this thread doesn't have gpu_hsail_tlabs allocated yet, do so now
78 _num_donor_threads = donorThreadObjects->length(); 78 JavaThread* thread = JavaThread::current();
79 guarantee(_num_donor_threads > 0, "need at least one donor thread"); 79 if (thread->get_gpu_hsail_tlabs_count() == 0) {
80 donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal); 80 thread->initialize_gpu_hsail_tlabs(num_tlabs);
81 for (int i = 0; i < _num_donor_threads; i++) { 81 if (TraceGPUInteraction) {
82 donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); 82 for (int i = 0; i < num_tlabs; i++) {
83 } 83 ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i);
84 84 tty->print("initialized gpu_hsail_tlab %d at %p -> ", i, tlab);
85 printTlabInfoFromThread(tlab);
86 }
87 }
88 }
89
85 // Compute max_tlab_infos based on amount of free heap space 90 // Compute max_tlab_infos based on amount of free heap space
86 size_t max_tlab_infos; 91 size_t max_tlab_infos;
87 { 92 {
88 JavaThread* donorThread = donorThreads[0]; 93 ThreadLocalAllocBuffer* tlab = &thread->tlab();
89 ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
90 size_t new_tlab_size = tlab->compute_size(0); 94 size_t new_tlab_size = tlab->compute_size(0);
91 size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread); 95 size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(thread);
92 if (new_tlab_size != 0) { 96 if (new_tlab_size != 0) {
93 max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads)); 97 max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_tlabs));
94 } else { 98 } else {
95 max_tlab_infos = 8 * _num_donor_threads; // an arbitrary multiple 99 max_tlab_infos = 8 * _num_tlabs; // an arbitrary multiple
96 } 100 }
97 if (TraceGPUInteraction) { 101 if (TraceGPUInteraction) {
98 tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos); 102 tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
99 } 103 }
100 } 104 }
101 105
102 _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal); 106 _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_tlabs, mtInternal);
103 _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal); 107 _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
104 _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads]; 108 _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_tlabs];
105 _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos]; 109 _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
106 _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes(); 110 _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();
107 111
108 // we will fill the first N tlabInfos from the donor threads 112 // we will fill the first N tlabInfos from the gpu_hsail_tlabs
109 for (int i = 0; i < _num_donor_threads; i++) { 113 for (int i = 0; i < _num_tlabs; i++) {
110 JavaThread* donorThread = donorThreads[i]; 114 ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i);
111 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); 115 if (TraceGPUInteraction) {
112 if (TraceGPUInteraction) { 116 tty->print("gpu_hsail_tlab %d at %p -> ", i, tlab);
113 tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
114 printTlabInfoFromThread(tlab); 117 printTlabInfoFromThread(tlab);
115 } 118 }
116 119
117 // Here we try to get a new tlab if current one is null. Note: 120 // Here we try to get a new tlab if current one is null. Note:
118 // eventually we may want to test if the size is too small based 121 // eventually we may want to test if the size is too small based
120 // allocate, but for now we can just let it overflow and let the 123 // allocate, but for now we can just let it overflow and let the
121 // GPU allocate new tlabs. Actually, if we can't prime a tlab 124 // GPU allocate new tlabs. Actually, if we can't prime a tlab
122 // here, it might make sense to do a gc now rather than to start 125 // here, it might make sense to do a gc now rather than to start
123 // the kernel and have it deoptimize. How to do that? 126 // the kernel and have it deoptimize. How to do that?
124 if (tlab->end() == NULL) { 127 if (tlab->end() == NULL) {
125 bool success = getNewTlabForDonorThread(tlab, i); 128 bool success = getNewGpuHsailTlab(tlab);
126 if (TraceGPUInteraction) { 129 if (TraceGPUInteraction) {
127 if (success) { 130 if (success) {
128 tty->print("donorThread %d, refilled tlab, -> ", i); 131 tty->print("gpu_hsail_tlab %d, refilled tlab, -> ", i);
129 printTlabInfoFromThread(tlab); 132 printTlabInfoFromThread(tlab);
130 } else { 133 } else {
131 tty->print("donorThread %d, could not refill tlab, left as ", i); 134 tty->print("gpu_hsail_tlab %d, could not refill tlab, left as ", i);
132 printTlabInfoFromThread(tlab); 135 printTlabInfoFromThread(tlab);
133 } 136 }
134 } 137 }
135 } 138 }
136 139
137 // extract the necessary tlab fields into a TlabInfo record 140 // extract the necessary tlab fields into a TlabInfo record
138 HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i]; 141 HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
139 _cur_tlab_infos[i] = pTlabInfo; 142 _cur_tlab_infos[i] = pTlabInfo;
140 pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this); 143 pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), tlab, this);
141
142 // reset the real tlab fields to zero so we are sure the thread doesn't use it
143 tlab->set_start(NULL);
144 tlab->set_top(NULL);
145 tlab->set_pf_top(NULL);
146 tlab->set_end(NULL);
147 } 144 }
148 } 145 }
149 146
150 ~HSAILAllocationInfo() { 147 ~HSAILAllocationInfo() {
151 FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal); 148 FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
152 FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal); 149 FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
153 FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
154 } 150 }
155 151
156 void postKernelCleanup() { 152 void postKernelCleanup() {
157 // go thru all the tlabInfos, fix up any tlab tops that overflowed 153 // go thru all the tlabInfos, fix up any tlab tops that overflowed
158 // complete the tlabs if they overflowed 154 // complete the tlabs if they overflowed
159 // update the donor threads tlabs when appropriate 155 // update the gpu_hsail_tlabs when appropriate
160 bool anyOverflows = false; 156 bool anyOverflows = false;
161 size_t bytesAllocated = 0; 157 size_t bytesAllocated = 0;
162 // if there was an overflow in allocating tlabInfos, correct it here 158 // if there was an overflow in allocating tlabInfos, correct it here
163 if (_tlab_infos_pool_next > _tlab_infos_pool_end) { 159 if (_tlab_infos_pool_next > _tlab_infos_pool_end) {
164 if (TraceGPUInteraction) { 160 if (TraceGPUInteraction) {
170 for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) { 166 for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) {
171 if (TraceGPUInteraction) { 167 if (TraceGPUInteraction) {
172 tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, 168 tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo,
173 tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top()); 169 tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
174 } 170 }
175 JavaThread* donorThread = tlabInfo->_donor_thread; 171 ThreadLocalAllocBuffer* tlab = tlabInfo->tlab();
176 ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
177 bool overflowed = false; 172 bool overflowed = false;
178 // if a tlabInfo has NULL fields, i.e. we could not prime it on entry, 173 // if a tlabInfo has NULL fields, i.e. we could not prime it on entry,
179 // or we could not get a tlab from the gpu, so ignore tlabInfo here 174 // or we could not get a tlab from the gpu, so ignore tlabInfo here
180 if (tlabInfo->start() != NULL) { 175 if (tlabInfo->start() != NULL) {
181 if (tlabInfo->top() > tlabInfo->end()) { 176 if (tlabInfo->top() > tlabInfo->end()) {
182 anyOverflows = true; 177 anyOverflows = true;
183 overflowed = true; 178 overflowed = true;
184 if (TraceGPUInteraction) { 179 if (TraceGPUInteraction) {
185 long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); 180 long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top();
186 tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top()); 181 tty->print_cr("tlabInfo %p (tlab = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, tlab, overflowAmount, tlabInfo->last_good_top());
187 } 182 }
188 tlabInfo->_top = tlabInfo->last_good_top(); 183 tlabInfo->_top = tlabInfo->last_good_top();
189 } 184 }
190 185
191 // if the donor thread allocated anything while we were running 186 // fill the gpu_hsail_tlab with the tlabInfo information
192 // we will retire its tlab before overwriting with our new one
193 if (tlab->top() != NULL) {
194 if (TraceGPUInteraction) {
195 tty->print("Donor Thread allocated new tlab");
196 printTlabInfoFromThread(tlab);
197 }
198 tlab->make_parsable(true);
199 }
200
201 // fill the donor thread tlab with the tlabInfo information
202 // we do this even if it will get overwritten by a later tlabinfo 187 // we do this even if it will get overwritten by a later tlabinfo
203 // because it helps with tlab statistics for that donor thread 188 // because it helps with tlab statistics for that tlab
204 tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve()); 189 tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());
205 190
206 // if there was an overflow, make it parsable with retire = true 191 // if there was an overflow, make it parsable with retire = true
207 if (overflowed) { 192 if (overflowed) {
208 tlab->make_parsable(true); 193 tlab->make_parsable(true);
229 } 214 }
230 215
231 private: 216 private:
232 // fill and retire old tlab and get a new one 217 // fill and retire old tlab and get a new one
233 // if we can't get one, no problem someone will eventually do a gc 218 // if we can't get one, no problem someone will eventually do a gc
234 bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) { 219 bool getNewGpuHsailTlab(ThreadLocalAllocBuffer* tlab) {
235 220
236 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) 221 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null)
237 222
238 // get a size for a new tlab that is based on the desired_size 223 // get a size for a new tlab that is based on the desired_size
239 size_t new_tlab_size = tlab->compute_size(0); 224 size_t new_tlab_size = tlab->compute_size(0);