Mercurial > hg > graal-jvmci-8
comparison src/gpu/hsail/vm/gpu_hsail.cpp @ 15066:2cae21d9f122
HSAIL: initial support for object allocation in HSAIL kernels
Contributed-by: Tom Deneau <tom.deneau@amd.com>
author | Doug Simon <doug.simon@oracle.com> |
---|---|
date | Fri, 11 Apr 2014 17:12:08 +0200 |
parents | a6c144380ce7 |
children | 0e689f20706e |
comparison
equal
deleted
inserted
replaced
15065:f5ef63b5b5ed | 15066:2cae21d9f122 |
---|---|
57 #define CC (char*) /*cast a literal from (const char*)*/ | 57 #define CC (char*) /*cast a literal from (const char*)*/ |
58 #define FN_PTR(f) CAST_FROM_FN_PTR(void*, &(f)) | 58 #define FN_PTR(f) CAST_FROM_FN_PTR(void*, &(f)) |
59 | 59 |
60 #define OBJECT "Ljava/lang/Object;" | 60 #define OBJECT "Ljava/lang/Object;" |
61 #define STRING "Ljava/lang/String;" | 61 #define STRING "Ljava/lang/String;" |
62 #define JLTHREAD "Ljava/lang/Thread;" | |
62 #define HS_INSTALLED_CODE "Lcom/oracle/graal/hotspot/meta/HotSpotInstalledCode;" | 63 #define HS_INSTALLED_CODE "Lcom/oracle/graal/hotspot/meta/HotSpotInstalledCode;" |
63 #define HS_COMPILED_NMETHOD "Lcom/oracle/graal/hotspot/HotSpotCompiledNmethod;" | 64 #define HS_COMPILED_NMETHOD "Lcom/oracle/graal/hotspot/HotSpotCompiledNmethod;" |
64 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;" | 65 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;" |
65 | 66 |
66 // public native void executeKernel(HotSpotNmethod kernel, int jobSize, int i, int j, Object[] args) throws InvalidInstalledCodeException; | |
67 | |
68 JNINativeMethod Hsail::HSAIL_methods[] = { | 67 JNINativeMethod Hsail::HSAIL_methods[] = { |
69 {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, | 68 {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)}, |
70 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, | 69 {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)}, |
71 {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT")Z", FN_PTR(Hsail::execute_kernel_void_1d)}, | 70 {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, |
72 }; | 71 }; |
73 | 72 |
74 void * Hsail::_device_context = NULL; | 73 void * Hsail::_device_context = NULL; |
75 | 74 |
76 Hsail::okra_create_context_func_t Hsail::_okra_create_context; | 75 Hsail::okra_create_context_func_t Hsail::_okra_create_context; |
84 Hsail::okra_push_long_func_t Hsail::_okra_push_long; | 83 Hsail::okra_push_long_func_t Hsail::_okra_push_long; |
85 Hsail::okra_execute_with_range_func_t Hsail::_okra_execute_with_range; | 84 Hsail::okra_execute_with_range_func_t Hsail::_okra_execute_with_range; |
86 Hsail::okra_clearargs_func_t Hsail::_okra_clearargs; | 85 Hsail::okra_clearargs_func_t Hsail::_okra_clearargs; |
87 Hsail::okra_register_heap_func_t Hsail::_okra_register_heap; | 86 Hsail::okra_register_heap_func_t Hsail::_okra_register_heap; |
88 | 87 |
88 struct Stats { | |
89 int _dispatches; | |
90 int _deopts; | |
91 int _overflows; | |
92 bool _changeSeen; | |
93 | |
94 public: | |
95 Stats() { | |
96 _dispatches = _deopts = _overflows = 0; | |
97 _changeSeen = false; | |
98 } | |
99 | |
100 void incDeopts() { | |
101 _deopts++; | |
102 _changeSeen = true; | |
103 } | |
104 void incOverflows() { | |
105 _overflows++; | |
106 _changeSeen = true; | |
107 } | |
108 | |
109 void finishDispatch() { | |
110 _dispatches++; | |
111 if (_changeSeen) { | |
112 // print(); | |
113 _changeSeen = false; | |
114 } | |
115 } | |
116 | |
117 void print() { | |
118 tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows); | |
119 } | |
120 | |
121 }; | |
122 | |
123 static Stats kernelStats; | |
124 | |
89 | 125 |
90 void Hsail::register_heap() { | 126 void Hsail::register_heap() { |
91 // After the okra functions are set up and the heap is initialized, register the java heap with HSA | 127 // After the okra functions are set up and the heap is initialized, register the java heap with HSA |
92 guarantee(Universe::heap() != NULL, "heap should be there by now."); | 128 guarantee(Universe::heap() != NULL, "heap should be there by now."); |
93 if (TraceGPUInteraction) { | 129 if (TraceGPUInteraction) { |
95 tty->print_cr("[HSAIL] base=0x%08x, capacity=%ld", Universe::heap()->base(), Universe::heap()->capacity()); | 131 tty->print_cr("[HSAIL] base=0x%08x, capacity=%ld", Universe::heap()->base(), Universe::heap()->capacity()); |
96 } | 132 } |
97 _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity()); | 133 _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity()); |
98 } | 134 } |
99 | 135 |
100 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args_handle, jobject oops_save_handle)) | 136 GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save, |
137 jobject donor_threads, jint allocBytesPerWorkitem)) | |
101 | 138 |
102 ResourceMark rm; | 139 ResourceMark rm; |
103 jlong nmethodValue = HotSpotInstalledCode::codeBlob(kernel_handle); | 140 jlong nmethodValue = HotSpotInstalledCode::codeBlob(kernel_handle); |
104 if (nmethodValue == 0) { | 141 if (nmethodValue == 0) { |
105 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); | 142 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); |
111 void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle); | 148 void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle); |
112 if (kernel == NULL) { | 149 if (kernel == NULL) { |
113 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); | 150 SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL); |
114 } | 151 } |
115 | 152 |
116 return execute_kernel_void_1d_internal((address) kernel, dimX, args_handle, mh, nm, oops_save_handle, CHECK_0); | 153 return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, CHECK_0); |
117 GPU_END | 154 GPU_END |
118 | 155 |
119 static void showRanges(jboolean *a, int len) { | 156 static void showRanges(jboolean *a, int len) { |
120 // show ranges | 157 // show ranges |
121 bool lookFor = true; | 158 bool lookFor = true; |
131 if (lookFor == false) { | 168 if (lookFor == false) { |
132 tty->print_cr("-%d", len-1); | 169 tty->print_cr("-%d", len-1); |
133 } | 170 } |
134 } | 171 } |
135 | 172 |
136 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args_handle, methodHandle& mh, nmethod *nm, jobject oops_save_handle, TRAPS) { | 173 // fill and retire old tlab and get a new one |
137 | 174 // if we can't get one, no problem someone will eventually do a gc |
175 void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) { | |
176 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) | |
177 | |
178 // get a size for a new tlab that is at least tlabMinHsail. | |
179 size_t new_tlab_size = tlab->compute_size(tlabMinHsail); | |
180 if (new_tlab_size == 0) return; | |
181 | |
182 HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size); | |
183 if (tlab_start == NULL) return; | |
184 | |
185 // ..and clear it if required | |
186 if (ZeroTLAB) { | |
187 Copy::zero_to_words(tlab_start, new_tlab_size); | |
188 } | |
189 // and init the tlab pointers | |
190 tlab->fill(tlab_start, tlab_start, new_tlab_size); | |
191 } | |
192 | |
193 static void printTlabInfo (ThreadLocalAllocBuffer* tlab) { | |
194 HeapWord *start = tlab->start(); | |
195 HeapWord *top = tlab->top(); | |
196 HeapWord *end = tlab->end(); | |
197 // sizes are in bytes | |
198 size_t tlabFree = tlab->free() * HeapWordSize; | |
199 size_t tlabUsed = tlab->used() * HeapWordSize; | |
200 size_t tlabSize = tlabFree + tlabUsed; | |
201 double freePct = 100.0 * (double) tlabFree/(double) tlabSize; | |
202 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); | |
203 } | |
204 | |
205 | |
206 | |
207 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, | |
208 jobject donor_threads, int allocBytesPerWorkitem, TRAPS) { | |
138 ResourceMark rm(THREAD); | 209 ResourceMark rm(THREAD); |
139 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args_handle); | 210 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); |
211 | |
212 // TODO: avoid donor thread logic if kernel does not allocate | |
213 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); | |
214 int numDonorThreads = donorThreadObjects->length(); | |
215 guarantee(numDonorThreads > 0, "need at least one donor thread"); | |
216 JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads); | |
217 for (int i = 0; i < numDonorThreads; i++) { | |
218 donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); | |
219 } | |
220 | |
221 | |
222 // compute tlabMinHsail based on number of workitems, number of donor | |
223 // threads, allocBytesPerWorkitem rounded up | |
224 size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads; | |
225 if (TraceGPUInteraction) { | |
226 tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail); | |
227 } | |
228 | |
229 for (int i = 0; i < numDonorThreads; i++) { | |
230 JavaThread* donorThread = donorThreads[i]; | |
231 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); | |
232 if (TraceGPUInteraction) { | |
233 tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); | |
234 printTlabInfo(tlab); | |
235 } | |
236 | |
237 // note: this used vs. free limit checking should be based on some | |
238 // heuristic where we see how much this kernel tends to allocate | |
239 if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) { | |
240 getNewTlabForDonorThread(tlab, tlabMinHsail); | |
241 if (TraceGPUInteraction) { | |
242 tty->print("donorThread %d, refilled tlab, -> ", i); | |
243 printTlabInfo(tlab); | |
244 } | |
245 } | |
246 } | |
140 | 247 |
141 // Reset the kernel arguments | 248 // Reset the kernel arguments |
142 _okra_clearargs(kernel); | 249 _okra_clearargs(kernel); |
143 | 250 |
144 HSAILDeoptimizationInfo* e; | 251 HSAILDeoptimizationInfo* e; |
145 if (UseHSAILDeoptimization) { | 252 if (UseHSAILDeoptimization) { |
146 e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo(); | 253 e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo(); |
147 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); | 254 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); |
148 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); | 255 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); |
256 e->set_donor_threads(donorThreads); | |
149 } | 257 } |
150 | 258 |
151 // This object sets up the kernel arguments | 259 // This object sets up the kernel arguments |
152 HSAILKernelArguments hka((address) kernel, mh->signature(), argsArray, mh->is_static(), e); | 260 HSAILKernelArguments hka((address) kernel, mh->signature(), argsArray, mh->is_static(), e); |
153 | 261 |
165 // Run the kernel | 273 // Run the kernel |
166 bool success = false; | 274 bool success = false; |
167 { | 275 { |
168 TraceTime t1("execute kernel", TraceGPUInteraction); | 276 TraceTime t1("execute kernel", TraceGPUInteraction); |
169 success = _okra_execute_with_range(kernel, dimX); | 277 success = _okra_execute_with_range(kernel, dimX); |
278 } | |
279 | |
280 // fix up any tlab tops that overflowed | |
281 bool anyOverflows = false; | |
282 for (int i = 0; i < numDonorThreads; i++) { | |
283 JavaThread * donorThread = donorThreads[i]; | |
284 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); | |
285 if (tlab->top() > tlab->end()) { | |
286 anyOverflows = true; | |
287 long overflowAmount = (long) tlab->top() - (long) tlab->pf_top(); | |
288 // tlab->set_top is private this ugly hack gets around that | |
289 *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top(); | |
290 if (TraceGPUInteraction) { | |
291 tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top()); | |
292 } | |
293 } | |
294 } | |
295 if (anyOverflows) { | |
296 kernelStats.incOverflows(); | |
170 } | 297 } |
171 | 298 |
172 if (UseHSAILDeoptimization) { | 299 if (UseHSAILDeoptimization) { |
173 // check if any workitem requested a deopt | 300 // check if any workitem requested a deopt |
174 // currently we only support at most one such workitem | 301 // currently we only support at most one such workitem |
178 // error condition detected in deopt code | 305 // error condition detected in deopt code |
179 char msg[200]; | 306 char msg[200]; |
180 sprintf(msg, "deopt error detected, slot for workitem %d was not empty", -1 * (deoptcode + 1)); | 307 sprintf(msg, "deopt error detected, slot for workitem %d was not empty", -1 * (deoptcode + 1)); |
181 guarantee(deoptcode == 1, msg); | 308 guarantee(deoptcode == 1, msg); |
182 } | 309 } |
183 | 310 kernelStats.incDeopts(); |
184 { | 311 { |
185 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); | 312 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); |
186 | |
187 if (TraceGPUInteraction) { | 313 if (TraceGPUInteraction) { |
188 tty->print_cr("deopt happened."); | 314 tty->print_cr("deopt happened."); |
189 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0]; | 315 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0]; |
190 tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); | 316 tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); |
191 } | 317 } |
192 | 318 |
193 // Before handling any deopting workitems, save the pointers from | 319 // Before handling any deopting workitems, save the pointers from |
194 // the hsail frames in oops_save so they get adjusted by any | 320 // the hsail frames in oops_save so they get adjusted by any |
195 // GC. Need to do this before leaving thread_in_vm mode. | 321 // GC. Need to do this before leaving thread_in_vm mode. |
196 // resolve handle only needed once here (not exiting vm mode) | 322 // resolve handle only needed once here (not exiting vm mode) |
197 objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle); | 323 objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); |
198 | 324 |
199 // since slots are allocated from the beginning, we know how far to look | 325 // since slots are allocated from the beginning, we know how far to look |
200 assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow"); | 326 assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow"); |
201 for (int k = 0; k < e->num_deopts(); k++) { | 327 for (int k = 0; k < e->num_deopts(); k++) { |
202 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; | 328 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; |
226 int deoptId = pdeopt->pc_offset(); | 352 int deoptId = pdeopt->pc_offset(); |
227 HSAILFrame *hsailFrame = pdeopt->first_frame(); | 353 HSAILFrame *hsailFrame = pdeopt->first_frame(); |
228 | 354 |
229 // update the hsailFrame from the oopsSaveArray | 355 // update the hsailFrame from the oopsSaveArray |
230 // re-resolve the handle | 356 // re-resolve the handle |
231 oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle); | 357 oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save); |
232 | 358 |
233 int dregOopMap = hsailFrame->dreg_oops_map(); | 359 int dregOopMap = hsailFrame->dreg_oops_map(); |
234 for (int bit = 0; bit < 16; bit++) { | 360 for (int bit = 0; bit < 16; bit++) { |
235 if ((dregOopMap & (1 << bit)) != 0) { | 361 if ((dregOopMap & (1 << bit)) != 0) { |
236 // the dregister at this bit is an oop, retrieve it from array and put back in frame | 362 // the dregister at this bit is an oop, retrieve it from array and put back in frame |
294 KlassHandle methKlass = mh->method_holder(); | 420 KlassHandle methKlass = mh->method_holder(); |
295 Thread* THREAD = Thread::current(); | 421 Thread* THREAD = Thread::current(); |
296 JavaValue result(T_VOID); | 422 JavaValue result(T_VOID); |
297 JavaCallArguments javaArgs; | 423 JavaCallArguments javaArgs; |
298 // re-resolve the args_handle here | 424 // re-resolve the args_handle here |
299 objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args_handle); | 425 objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args); |
300 // This object sets up the javaCall arguments | 426 // This object sets up the javaCall arguments |
301 // the way argsArray is set up, this should work for instance methods as well | 427 // the way argsArray is set up, this should work for instance methods as well |
302 // (the receiver will be the first oop pushed) | 428 // (the receiver will be the first oop pushed) |
303 HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static()); | 429 HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static()); |
304 if (mh->is_static()) { | 430 if (mh->is_static()) { |
315 showRanges(never_ran_array, dimX); | 441 showRanges(never_ran_array, dimX); |
316 } | 442 } |
317 } // end of never-ran handling | 443 } // end of never-ran handling |
318 } | 444 } |
319 } | 445 } |
320 | 446 |
321 FREE_C_HEAP_ARRAY(jboolean, e->never_ran_array(), mtInternal); | 447 FREE_C_HEAP_ARRAY(jboolean, e->never_ran_array(), mtInternal); |
322 delete e; | 448 delete e; |
323 } | 449 } |
450 kernelStats.finishDispatch(); | |
324 return success; | 451 return success; |
325 } | 452 } |
326 | 453 |
327 GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle)) | 454 GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle)) |
328 guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked"); | 455 guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked"); |