comparison src/gpu/hsail/vm/gpu_hsail.cpp @ 15066:2cae21d9f122

HSAIL: initial support for object allocation in HSAIL kernels

Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Fri, 11 Apr 2014 17:12:08 +0200
parents a6c144380ce7
children 0e689f20706e
comparing 15065:f5ef63b5b5ed with 15066:2cae21d9f122
@@ -57,20 +57,19 @@
 #define CC (char*) /*cast a literal from (const char*)*/
 #define FN_PTR(f) CAST_FROM_FN_PTR(void*, &(f))
 
 #define OBJECT "Ljava/lang/Object;"
 #define STRING "Ljava/lang/String;"
+#define JLTHREAD "Ljava/lang/Thread;"
 #define HS_INSTALLED_CODE "Lcom/oracle/graal/hotspot/meta/HotSpotInstalledCode;"
 #define HS_COMPILED_NMETHOD "Lcom/oracle/graal/hotspot/HotSpotCompiledNmethod;"
 #define HS_NMETHOD "Lcom/oracle/graal/hotspot/meta/HotSpotNmethod;"
 
-// public native void executeKernel(HotSpotNmethod kernel, int jobSize, int i, int j, Object[] args) throws InvalidInstalledCodeException;
-
 JNINativeMethod Hsail::HSAIL_methods[] = {
   {CC"initialize", CC"()Z", FN_PTR(Hsail::initialize)},
   {CC"generateKernel", CC"([B" STRING ")J", FN_PTR(Hsail::generate_kernel)},
-  {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT")Z", FN_PTR(Hsail::execute_kernel_void_1d)},
+  {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z", FN_PTR(Hsail::execute_kernel_void_1d)},
 };
 
 void * Hsail::_device_context = NULL;
 
 Hsail::okra_create_context_func_t Hsail::_okra_create_context;
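
Note on the registration table above: expanding the string macros, the new executeKernel0 descriptor reads as sketched below. This is a reading aid derived from the macros; the Java-side parameter names are illustrative, not taken from this changeset.

    // CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I)Z" expands to
    //   (Lcom/oracle/graal/hotspot/meta/HotSpotInstalledCode;I[Ljava/lang/Object;[Ljava/lang/Object;[Ljava/lang/Thread;I)Z
    // i.e. a Java method of the shape
    //   boolean executeKernel0(HotSpotInstalledCode code, int dimX, Object[] args,
    //                          Object[] oopsSave, Thread[] donorThreads, int allocBytesPerWorkitem);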
@@ -84,10 +83,47 @@
 Hsail::okra_push_long_func_t Hsail::_okra_push_long;
 Hsail::okra_execute_with_range_func_t Hsail::_okra_execute_with_range;
 Hsail::okra_clearargs_func_t Hsail::_okra_clearargs;
 Hsail::okra_register_heap_func_t Hsail::_okra_register_heap;
 
+struct Stats {
+  int _dispatches;
+  int _deopts;
+  int _overflows;
+  bool _changeSeen;
+
+ public:
+  Stats() {
+    _dispatches = _deopts = _overflows = 0;
+    _changeSeen = false;
+  }
+
+  void incDeopts() {
+    _deopts++;
+    _changeSeen = true;
+  }
+  void incOverflows() {
+    _overflows++;
+    _changeSeen = true;
+  }
+
+  void finishDispatch() {
+    _dispatches++;
+    if (_changeSeen) {
+      // print();
+      _changeSeen = false;
+    }
+  }
+
+  void print() {
+    tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows);
+  }
+
+};
+
+static Stats kernelStats;
+
 
 void Hsail::register_heap() {
   // After the okra functions are set up and the heap is initialized, register the java heap with HSA
   guarantee(Universe::heap() != NULL, "heap should be there by now.");
   if (TraceGPUInteraction) {
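
The Stats counters added above are driven from the dispatch path later in this change: incOverflows after the kernel returns, incDeopts in the deoptimization branch, finishDispatch just before returning. A minimal sketch of that flow; recordDispatch and its flag parameters are hypothetical names, only kernelStats and its methods come from the changeset.

    // Sketch only: per-dispatch bookkeeping with the new kernelStats counters.
    static void recordDispatch(bool anyOverflows, bool anyDeopts) {
      if (anyOverflows) kernelStats.incOverflows();
      if (anyDeopts)    kernelStats.incDeopts();
      kernelStats.finishDispatch();  // bumps _dispatches; print() is currently commented out
    }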
@@ -95,11 +131,12 @@
     tty->print_cr("[HSAIL] base=0x%08x, capacity=%ld", Universe::heap()->base(), Universe::heap()->capacity());
   }
   _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity());
 }
 
-GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args_handle, jobject oops_save_handle))
+GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save,
+                                                      jobject donor_threads, jint allocBytesPerWorkitem))
 
   ResourceMark rm;
   jlong nmethodValue = HotSpotInstalledCode::codeBlob(kernel_handle);
   if (nmethodValue == 0) {
     SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
@@ -111,11 +148,11 @@
   void* kernel = (void*) HotSpotInstalledCode::codeStart(kernel_handle);
   if (kernel == NULL) {
     SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
   }
 
-  return execute_kernel_void_1d_internal((address) kernel, dimX, args_handle, mh, nm, oops_save_handle, CHECK_0);
+  return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, CHECK_0);
 GPU_END
 
 static void showRanges(jboolean *a, int len) {
   // show ranges
   bool lookFor = true;
@@ -131,23 +168,94 @@
   if (lookFor == false) {
     tty->print_cr("-%d", len-1);
   }
 }
 
-jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args_handle, methodHandle& mh, nmethod *nm, jobject oops_save_handle, TRAPS) {
-
+// fill and retire old tlab and get a new one
+// if we can't get one, no problem someone will eventually do a gc
+void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) {
+  tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)
+
+  // get a size for a new tlab that is at least tlabMinHsail.
+  size_t new_tlab_size = tlab->compute_size(tlabMinHsail);
+  if (new_tlab_size == 0) return;
+
+  HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
+  if (tlab_start == NULL) return;
+
+  // ..and clear it if required
+  if (ZeroTLAB) {
+    Copy::zero_to_words(tlab_start, new_tlab_size);
+  }
+  // and init the tlab pointers
+  tlab->fill(tlab_start, tlab_start, new_tlab_size);
+}
+
+static void printTlabInfo (ThreadLocalAllocBuffer* tlab) {
+  HeapWord *start = tlab->start();
+  HeapWord *top = tlab->top();
+  HeapWord *end = tlab->end();
+  // sizes are in bytes
+  size_t tlabFree = tlab->free() * HeapWordSize;
+  size_t tlabUsed = tlab->used() * HeapWordSize;
+  size_t tlabSize = tlabFree + tlabUsed;
+  double freePct = 100.0 * (double) tlabFree/(double) tlabSize;
+  tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
+}
+
+
+
+jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save,
+                                                jobject donor_threads, int allocBytesPerWorkitem, TRAPS) {
   ResourceMark rm(THREAD);
-  objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args_handle);
+  objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
+
+  // TODO: avoid donor thread logic if kernel does not allocate
+  objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads);
+  int numDonorThreads = donorThreadObjects->length();
+  guarantee(numDonorThreads > 0, "need at least one donor thread");
+  JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads);
+  for (int i = 0; i < numDonorThreads; i++) {
+    donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
+  }
+
+
+  // compute tlabMinHsail based on number of workitems, number of donor
+  // threads, allocBytesPerWorkitem rounded up
+  size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads;
+  if (TraceGPUInteraction) {
+    tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail);
+  }
+
+  for (int i = 0; i < numDonorThreads; i++) {
+    JavaThread* donorThread = donorThreads[i];
+    ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+    if (TraceGPUInteraction) {
+      tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
+      printTlabInfo(tlab);
+    }
+
+    // note: this used vs. free limit checking should be based on some
+    // heuristic where we see how much this kernel tends to allocate
+    if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) {
+      getNewTlabForDonorThread(tlab, tlabMinHsail);
+      if (TraceGPUInteraction) {
+        tty->print("donorThread %d, refilled tlab, -> ", i);
+        printTlabInfo(tlab);
+      }
+    }
+  }
 
   // Reset the kernel arguments
   _okra_clearargs(kernel);
 
   HSAILDeoptimizationInfo* e;
   if (UseHSAILDeoptimization) {
     e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo();
     e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal));
     memset(e->never_ran_array(), 0, dimX * sizeof(jboolean));
+    e->set_donor_threads(donorThreads);
   }
 
   // This object sets up the kernel arguments
   HSAILKernelArguments hka((address) kernel, mh->signature(), argsArray, mh->is_static(), e);
 
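
The tlabMinHsail computation in the hunk above is a round-up (ceiling) division: each donor thread's TLAB must cover its share of dimX * allocBytesPerWorkitem bytes. A standalone sketch with illustrative numbers; the values are hypothetical, only the formula matches the changeset.

    #include <cstddef>
    #include <cstdio>

    int main() {
      size_t allocBytesPerWorkitem = 64;   // hypothetical per-workitem allocation
      size_t dimX = 1000;                  // hypothetical number of workitems
      size_t numDonorThreads = 3;          // hypothetical donor thread count
      // same rounding-up division as in execute_kernel_void_1d_internal
      size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads;
      printf("tlabMinHsail = %zu bytes\n", tlabMinHsail);   // 64000 / 3 rounded up = 21334
      return 0;
    }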
@@ -165,10 +273,29 @@
   // Run the kernel
   bool success = false;
   {
     TraceTime t1("execute kernel", TraceGPUInteraction);
     success = _okra_execute_with_range(kernel, dimX);
+  }
+
+  // fix up any tlab tops that overflowed
+  bool anyOverflows = false;
+  for (int i = 0; i < numDonorThreads; i++) {
+    JavaThread * donorThread = donorThreads[i];
+    ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+    if (tlab->top() > tlab->end()) {
+      anyOverflows = true;
+      long overflowAmount = (long) tlab->top() - (long) tlab->pf_top();
+      // tlab->set_top is private this ugly hack gets around that
+      *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top();
+      if (TraceGPUInteraction) {
+        tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top());
+      }
+    }
+  }
+  if (anyOverflows) {
+    kernelStats.incOverflows();
   }
 
   if (UseHSAILDeoptimization) {
     // check if any workitem requested a deopt
     // currently we only support at most one such workitem
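
The overflow fix-up above resets a TLAB's top by writing through the field's byte offset, since ThreadLocalAllocBuffer::set_top is private (the "ugly hack" the comment admits to). A self-contained illustration of that pattern on a toy type; Buffer, its top_offset and the values are hypothetical, only the pointer arithmetic mirrors the changeset.

    #include <cstddef>
    #include <cstdio>

    struct Buffer {
      long _top;
      static size_t top_offset() { return offsetof(Buffer, _top); }
    };

    int main() {
      Buffer b = { 42 };
      // same shape as: *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top();
      *(long *)((char *)&b + Buffer::top_offset()) = 7;
      printf("top = %ld\n", b._top);   // prints 7
      return 0;
    }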
@@ -178,25 +305,24 @@
       // error condition detected in deopt code
       char msg[200];
       sprintf(msg, "deopt error detected, slot for workitem %d was not empty", -1 * (deoptcode + 1));
       guarantee(deoptcode == 1, msg);
     }
-
+    kernelStats.incDeopts();
     {
       TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction);
-
       if (TraceGPUInteraction) {
         tty->print_cr("deopt happened.");
         HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0];
         tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
       }
 
       // Before handling any deopting workitems, save the pointers from
       // the hsail frames in oops_save so they get adjusted by any
       // GC. Need to do this before leaving thread_in_vm mode.
       // resolve handle only needed once here (not exiting vm mode)
-      objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle);
+      objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save);
 
       // since slots are allocated from the beginning, we know how far to look
       assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow");
       for (int k = 0; k < e->num_deopts(); k++) {
         HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k];
@@ -226,11 +352,11 @@
         int deoptId = pdeopt->pc_offset();
         HSAILFrame *hsailFrame = pdeopt->first_frame();
 
         // update the hsailFrame from the oopsSaveArray
         // re-resolve the handle
-        oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle);
+        oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save);
 
         int dregOopMap = hsailFrame->dreg_oops_map();
         for (int bit = 0; bit < 16; bit++) {
           if ((dregOopMap & (1 << bit)) != 0) {
             // the dregister at this bit is an oop, retrieve it from array and put back in frame
@@ -294,11 +420,11 @@
           KlassHandle methKlass = mh->method_holder();
           Thread* THREAD = Thread::current();
           JavaValue result(T_VOID);
           JavaCallArguments javaArgs;
           // re-resolve the args_handle here
-          objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args_handle);
+          objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args);
           // This object sets up the javaCall arguments
           // the way argsArray is set up, this should work for instance methods as well
           // (the receiver will be the first oop pushed)
           HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static());
           if (mh->is_static()) {
@@ -315,14 +441,15 @@
             showRanges(never_ran_array, dimX);
           }
         } // end of never-ran handling
       }
     }
 
     FREE_C_HEAP_ARRAY(jboolean, e->never_ran_array(), mtInternal);
     delete e;
   }
+  kernelStats.finishDispatch();
   return success;
 }
 
 GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle))
   guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked");