comparison src/gpu/hsail/vm/gpu_hsail.cpp @ 14969:a6c144380ce7

HSAIL: added UseHSAILDeoptimization VM option for disabling HSAIL deopt support. Contributed-by: Eric Caspole <eric.caspole@amd.com>
author Doug Simon <doug.simon@oracle.com>
date Fri, 04 Apr 2014 12:22:49 +0200
parents 3e9a960f0da1
children 2cae21d9f122
comparison
equal deleted inserted replaced
14968:169caf662ac7 14969:a6c144380ce7
131 if (lookFor == false) { 131 if (lookFor == false) {
132 tty->print_cr("-%d", len-1); 132 tty->print_cr("-%d", len-1);
133 } 133 }
134 } 134 }
135 135
136
137 // for experimentation
138 static bool useDeoptInfo = true;
139
140 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args_handle, methodHandle& mh, nmethod *nm, jobject oops_save_handle, TRAPS) { 136 jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args_handle, methodHandle& mh, nmethod *nm, jobject oops_save_handle, TRAPS) {
141 137
142 ResourceMark rm(THREAD); 138 ResourceMark rm(THREAD);
143 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args_handle); 139 objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args_handle);
144 140
145 // Reset the kernel arguments 141 // Reset the kernel arguments
146 _okra_clearargs(kernel); 142 _okra_clearargs(kernel);
147 143
148
149 HSAILDeoptimizationInfo* e; 144 HSAILDeoptimizationInfo* e;
150 if (useDeoptInfo) { 145 if (UseHSAILDeoptimization) {
151 e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo(); 146 e = new (ResourceObj::C_HEAP, mtInternal) HSAILDeoptimizationInfo();
152 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal)); 147 e->set_never_ran_array(NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal));
153 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean)); 148 memset(e->never_ran_array(), 0, dimX * sizeof(jboolean));
154 } 149 }
155 150
166 thread->set_gpu_exception_method(mh()); 161 thread->set_gpu_exception_method(mh());
167 THROW_MSG_0(vmSymbols::java_lang_NullPointerException(), buf); 162 THROW_MSG_0(vmSymbols::java_lang_NullPointerException(), buf);
168 } 163 }
169 164
170 // Run the kernel 165 // Run the kernel
171 bool success = _okra_execute_with_range(kernel, dimX); 166 bool success = false;
172 // check if any workitem requested a deopt 167 {
173 // currently we only support at most one such workitem 168 TraceTime t1("execute kernel", TraceGPUInteraction);
174 169 success = _okra_execute_with_range(kernel, dimX);
175 170 }
176 int deoptcode = e->deopt_occurred(); 171
177 if (useDeoptInfo && deoptcode != 0) { 172 if (UseHSAILDeoptimization) {
178 if (deoptcode != 1) { 173 // check if any workitem requested a deopt
179 // error condition detected in deopt code 174 // currently we only support at most one such workitem
180 char msg[200]; 175 int deoptcode = e->deopt_occurred();
181 sprintf(msg, "deopt error detected, slot for workitem %d was not empty", -1*(deoptcode + 1)); 176 if (deoptcode != 0) {
182 guarantee(deoptcode == 1, msg); 177 if (deoptcode != 1) {
183 } 178 // error condition detected in deopt code
184 if (TraceGPUInteraction) { 179 char msg[200];
185 tty->print_cr("deopt happened."); 180 sprintf(msg, "deopt error detected, slot for workitem %d was not empty", -1 * (deoptcode + 1));
186 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0]; 181 guarantee(deoptcode == 1, msg);
187 tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); 182 }
188 } 183
189 184 {
190 // Before handling any deopting workitems, save the pointers from 185 TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction);
191 // the hsail frames in oops_save so they get adjusted by any 186
192 // GC. Need to do this before leaving thread_in_vm mode. 187 if (TraceGPUInteraction) {
193 // resolve handle only needed once here (not exiting vm mode) 188 tty->print_cr("deopt happened.");
194 objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle); 189 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[0];
195 190 tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
196 // since slots are allocated from the beginning, we know how far to look
197 assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow");
198 for (int k = 0; k < e->num_deopts(); k++) {
199 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k];
200 jint workitem = pdeopt->workitem();
201 if (workitem != -1) {
202 // this is a workitem that deopted
203 HSAILFrame *hsailFrame = pdeopt->first_frame();
204 int dregOopMap = hsailFrame->dreg_oops_map();
205 for (int bit = 0; bit < 16; bit++) {
206 if ((dregOopMap & (1 << bit)) != 0) {
207 // the dregister at this bit is an oop, save it in the array
208 int index = k * 16 + bit;
209 void* saved_oop = (void*) hsailFrame->get_d_reg(bit);
210 oopsSaveArray->obj_at_put(index, (oop) saved_oop);
211 }
212 } 191 }
213 } 192
214 } 193 // Before handling any deopting workitems, save the pointers from
215 194 // the hsail frames in oops_save so they get adjusted by any
216 // Handle any deopting workitems. 195 // GC. Need to do this before leaving thread_in_vm mode.
217 int count_deoptimized = 0; 196 // resolve handle only needed once here (not exiting vm mode)
218 for (int k = 0; k < e->num_deopts(); k++) { 197 objArrayOop oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle);
219 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k]; 198
220 199 // since slots are allocated from the beginning, we know how far to look
221 jint workitem = pdeopt->workitem(); 200 assert(e->num_deopts() < MAX_DEOPT_SAVE_STATES_SIZE, "deopt save state overflow");
222 if (workitem != -1) { 201 for (int k = 0; k < e->num_deopts(); k++) {
223 int deoptId = pdeopt->pc_offset(); 202 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k];
224 HSAILFrame *hsailFrame = pdeopt->first_frame(); 203 jint workitem = pdeopt->workitem();
225 204 if (workitem != -1) {
226 // update the hsailFrame from the oopsSaveArray 205 // this is a workitem that deopted
227 // re-resolve the handle 206 HSAILFrame *hsailFrame = pdeopt->first_frame();
228 oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle); 207 int dregOopMap = hsailFrame->dreg_oops_map();
229 208 for (int bit = 0; bit < 16; bit++) {
230 int dregOopMap = hsailFrame->dreg_oops_map(); 209 if ((dregOopMap & (1 << bit)) != 0) {
231 for (int bit = 0; bit < 16; bit++) { 210 // the dregister at this bit is an oop, save it in the array
232 if ((dregOopMap & (1 << bit)) != 0) { 211 int index = k * 16 + bit;
233 // the dregister at this bit is an oop, retrieve it from array and put back in frame 212 void* saved_oop = (void*) hsailFrame->get_d_reg(bit);
234 int index = k * 16 + bit; 213 oopsSaveArray->obj_at_put(index, (oop) saved_oop);
235 void * dregValue = (void *) oopsSaveArray->obj_at(index);
236 void * oldDregValue = (void *) hsailFrame->get_d_reg(bit);
237 assert((oldDregValue != 0 ? dregValue != 0 : dregValue == 0) , "bad dregValue retrieved");
238 if (TraceGPUInteraction) {
239 if (dregValue != oldDregValue) {
240 tty->print_cr("oop moved for $d%d, workitem %d, slot %d, old=%p, new=%p", bit, workitem, k, oldDregValue, dregValue);
241 } 214 }
242 }
243 hsailFrame->put_d_reg(bit, (jlong) dregValue);
244 }
245 }
246
247 JavaValue result(T_VOID);
248 JavaCallArguments javaArgs;
249 javaArgs.set_alternative_target(nm);
250 javaArgs.push_int(deoptId);
251 javaArgs.push_long((jlong) hsailFrame);
252
253 // override the deoptimization action with Action_none until we decide
254 // how to handle the other actions.
255 int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none);
256 javaArgs.push_int(myActionReason);
257 javaArgs.push_oop((oop)NULL);
258 if (TraceGPUInteraction) {
259 int dregOopMap = hsailFrame->dreg_oops_map();
260 tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d, dregOopMap=%04x", workitem, k, deoptId, hsailFrame, myActionReason, dregOopMap);
261 // show the registers containing references
262 for (int bit = 0; bit < 16; bit++) {
263 if ((dregOopMap & (1 << bit)) != 0) {
264 tty->print_cr(" oop $d%d = %p", bit, hsailFrame->get_d_reg(bit));
265 } 215 }
266 } 216 }
267 } 217 }
268 JavaCalls::call(&result, mh, &javaArgs, THREAD); 218
269 count_deoptimized++; 219 // Handle any deopting workitems.
270 } 220 int count_deoptimized = 0;
271 } 221 for (int k = 0; k < e->num_deopts(); k++) {
272 if (TraceGPUInteraction) { 222 HSAILKernelDeoptimization * pdeopt = &e->_deopt_save_states[k];
273 tty->print_cr("[HSAIL] Deoptimizing to host completed for %d workitems", count_deoptimized); 223
274 } 224 jint workitem = pdeopt->workitem();
275 225 if (workitem != -1) {
276 // Handle any never_ran workitems if there were any 226 int deoptId = pdeopt->pc_offset();
277 int count_never_ran = 0; 227 HSAILFrame *hsailFrame = pdeopt->first_frame();
278 bool handleNeverRansHere = true; 228
279 // turn off verbose trace stuff for javacall arg setup 229 // update the hsailFrame from the oopsSaveArray
280 bool savedTraceGPUInteraction = TraceGPUInteraction; 230 // re-resolve the handle
281 TraceGPUInteraction = false; 231 oopsSaveArray = (objArrayOop) JNIHandles::resolve(oops_save_handle);
282 jboolean *never_ran_array = e->never_ran_array(); 232
283 if (handleNeverRansHere) { 233 int dregOopMap = hsailFrame->dreg_oops_map();
284 for (int k = 0; k < dimX; k++) { 234 for (int bit = 0; bit < 16; bit++) {
285 if (never_ran_array[k]) { 235 if ((dregOopMap & (1 << bit)) != 0) {
286 // run it as a javaCall 236 // the dregister at this bit is an oop, retrieve it from array and put back in frame
287 KlassHandle methKlass = mh->method_holder(); 237 int index = k * 16 + bit;
288 Thread* THREAD = Thread::current(); 238 void * dregValue = (void *) oopsSaveArray->obj_at(index);
289 JavaValue result(T_VOID); 239 void * oldDregValue = (void *) hsailFrame->get_d_reg(bit);
290 JavaCallArguments javaArgs; 240 assert((oldDregValue != 0 ? dregValue != 0 : dregValue == 0), "bad dregValue retrieved");
291 // re-resolve the args_handle here 241 if (TraceGPUInteraction) {
292 objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args_handle); 242 if (dregValue != oldDregValue) {
293 // This object sets up the javaCall arguments 243 tty->print_cr("oop moved for $d%d, workitem %d, slot %d, old=%p, new=%p", bit, workitem, k, oldDregValue, dregValue);
294 // the way argsArray is set up, this should work for instance methods as well 244 }
295 // (the receiver will be the first oop pushed) 245 }
296 HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static()); 246 hsailFrame->put_d_reg(bit, (jlong) dregValue);
297 if (mh->is_static()) { 247 }
298 JavaCalls::call_static(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD); 248 }
299 } else { 249
300 JavaCalls::call_virtual(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD); 250 JavaValue result(T_VOID);
251 JavaCallArguments javaArgs;
252 javaArgs.set_alternative_target(nm);
253 javaArgs.push_int(deoptId);
254 javaArgs.push_long((jlong) hsailFrame);
255
256 // override the deoptimization action with Action_none until we decide
257 // how to handle the other actions.
258 int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none);
259 javaArgs.push_int(myActionReason);
260 javaArgs.push_oop((oop) NULL);
261 if (TraceGPUInteraction) {
262 int dregOopMap = hsailFrame->dreg_oops_map();
263 tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d, dregOopMap=%04x", workitem, k, deoptId, hsailFrame, myActionReason, dregOopMap);
264 // show the registers containing references
265 for (int bit = 0; bit < 16; bit++) {
266 if ((dregOopMap & (1 << bit)) != 0) {
267 tty->print_cr(" oop $d%d = %p", bit, hsailFrame->get_d_reg(bit));
268 }
269 }
270 }
271 JavaCalls::call(&result, mh, &javaArgs, THREAD);
272 count_deoptimized++;
301 } 273 }
302 count_never_ran++; 274 }
275 if (TraceGPUInteraction) {
276 tty->print_cr("[HSAIL] Deoptimizing to host completed for %d workitems", count_deoptimized);
303 } 277 }
304 } 278 }
305 TraceGPUInteraction = savedTraceGPUInteraction; 279
306 if (TraceGPUInteraction) { 280 {
307 tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran); 281 TraceTime t3("handle never-rans", TraceGPUInteraction);
308 showRanges(never_ran_array, dimX); 282
283 // Handle any never_ran workitems if there were any
284 int count_never_ran = 0;
285 bool handleNeverRansHere = true;
286 // turn off verbose trace stuff for javacall arg setup
287 bool savedTraceGPUInteraction = TraceGPUInteraction;
288 TraceGPUInteraction = false;
289 jboolean *never_ran_array = e->never_ran_array();
290 if (handleNeverRansHere) {
291 for (int k = 0; k < dimX; k++) {
292 if (never_ran_array[k]) {
293 // run it as a javaCall
294 KlassHandle methKlass = mh->method_holder();
295 Thread* THREAD = Thread::current();
296 JavaValue result(T_VOID);
297 JavaCallArguments javaArgs;
298 // re-resolve the args_handle here
299 objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args_handle);
300 // This object sets up the javaCall arguments
301 // the way argsArray is set up, this should work for instance methods as well
302 // (the receiver will be the first oop pushed)
303 HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static());
304 if (mh->is_static()) {
305 JavaCalls::call_static(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD);
306 } else {
307 JavaCalls::call_virtual(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD);
308 }
309 count_never_ran++;
310 }
311 }
312 TraceGPUInteraction = savedTraceGPUInteraction;
313 if (TraceGPUInteraction) {
314 tty->print_cr("%d workitems never ran, have been run via JavaCall", count_never_ran);
315 showRanges(never_ran_array, dimX);
316 }
317 } // end of never-ran handling
309 } 318 }
310 } // end of never-ran handling 319 }
311 320
312 }
313
314 if (useDeoptInfo) {
315 FREE_C_HEAP_ARRAY(jboolean, e->never_ran_array(), mtInternal); 321 FREE_C_HEAP_ARRAY(jboolean, e->never_ran_array(), mtInternal);
316 delete e; 322 delete e;
317 } 323 }
318 return success; 324 return success;
319 } 325 }