changeset 16242:e9998e2be7f5

use oops_do to modify saved hsail state Contributed-by: Tom Deneau <tom.deneau@amd.com>
author Gilles Duboscq <duboscq@ssw.jku.at>
date Thu, 26 Jun 2014 18:25:35 +0200
parents c6ebc1997a55
children fb77eab05bd3
files graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/BoundsCatchMost20000StressGCTest.java graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ArrayListSetTest.java graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java src/gpu/hsail/vm/gpu_hsail.cpp src/gpu/hsail/vm/gpu_hsail.hpp src/gpu/hsail/vm/gpu_hsail_Frame.hpp src/gpu/hsail/vm/gpu_hsail_OopMapHelper.hpp src/share/vm/classfile/javaClasses.cpp src/share/vm/runtime/thread.cpp src/share/vm/runtime/thread.hpp
diffstat 10 files changed, 321 insertions(+), 211 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/BoundsCatchMost20000StressGCTest.java	Thu Jun 26 18:25:35 2014 +0200
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package com.oracle.graal.compiler.hsail.test;
+
+import org.junit.*;
+
+/**
+ * Stress variant in which most of the 20000 workitems take the out-of-bounds path and the catch
+ * handler allocates, so a GC is likely while the never_rans are running; good for oops_do testing.
+ */
+public class BoundsCatchMost20000StressGCTest extends BoundsCatchManyBase {
+
+    @Override
+    int getGlobalSize() {
+        return 20000;
+    }
+
+    boolean isMyDeoptGid(int gid) {
+        return (gid > 100 && gid % 100 != 1); // every gid above 100 except those with gid % 100 == 1
+    }
+
+    int[] dummyArray;
+
+    // the run routine is copied here because otherwise there would be polymorphic calls to isDeoptGid
+    @Override
+    public void run(int gid) {
+        int outval = getOutval(gid);
+        try {
+            int index = (isMyDeoptGid(gid) ? num + 1 : gid); // num + 1 is presumably out of bounds for outArray - TODO confirm in base class
+            outArray[index] = outval;
+        } catch (ArrayIndexOutOfBoundsException e) {
+            // accumulate (+=) rather than assign so a second pass through this handler is detectable
+            outArray[gid] += outval;
+            // note: cannot record the exceptiongid here for many deopts in parallel
+
+            // allocate something so GCs happen more often
+            dummyArray = new int[1000];
+        }
+    }
+
+    @Override
+    public void runTest() {
+        setupArrays();
+
+        for (int i = 0; i < 10; i++) {
+            // we should not get an exception escaping from the kernel
+            dispatchMethodKernel(num);
+        }
+    }
+
+    @Test
+    public void test() {
+        testGeneratedHsail();
+    }
+}
--- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ArrayListSetTest.java	Thu Jun 26 13:42:29 2014 +0200
+++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/ArrayListSetTest.java	Thu Jun 26 18:25:35 2014 +0200
@@ -43,10 +43,6 @@
         dispatchLambdaKernel(NUM, (gid) -> {
             aryList.set(gid, gid);
         });
-
-        // for (int i = 0; i < NUM; i++) {
-        // System.out.println(aryList.get(i));
-        // }
     }
 
     @Override
@@ -55,6 +51,7 @@
     }
 
     @Test
+    @Ignore
     public void testUsingLambdaMethod() {
         testGeneratedHsailUsingLambdaMethod();
     }
--- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Thu Jun 26 13:42:29 2014 +0200
+++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java	Thu Jun 26 18:25:35 2014 +0200
@@ -379,22 +379,13 @@
             throw new GraalInternalError("Cannot execute GPU kernel if device is not initialized");
         }
         int[] oopMapArray = ((HSAILHotSpotNmethod) kernel).getOopMapArray();
-        Object[] oopsSaveArea;
-        if (getRuntime().getConfig().useHSAILDeoptimization) {
-            int saveAreaCounts = OopMapArrayBuilder.getSaveAreaCounts(oopMapArray);
-            int numDRegs = (saveAreaCounts >> 8) & 0xff;
-            int numStackSlots = (saveAreaCounts >> 16);
-            // pessimistically assume that any of the DRegs or stackslots could be oops
-            oopsSaveArea = new Object[maxDeoptIndex * (numDRegs + numStackSlots)];
-        } else {
-            oopsSaveArea = null;
-        }
+
         // Pass donorThreadPoolArray if this kernel uses allocation, otherwise null
         Thread[] donorThreadArray = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? donorThreadPool.get().getThreads() : null;
-        return executeKernel0(kernel, jobSize, args, oopsSaveArea, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
+        return executeKernel0(kernel, jobSize, args, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
     }
 
-    private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Object[] oopsSave, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray)
+    private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray)
                     throws InvalidInstalledCodeException;
 
     /**
@@ -1069,10 +1060,6 @@
             int arrIndex = HEADERSIZE + infoIndex * intsPerInfopoint + 1 + intIndex;
             return array[arrIndex];
         }
-
-        public static int getSaveAreaCounts(int[] array) {
-            return array[SAVEAREACOUNTS_OFST];
-        }
     }
 
     private static StructuredGraph prepareHostGraph(ResolvedJavaMethod method, List<DeoptimizingOp> deopts, HotSpotProviders providers, HotSpotVMConfig config, int numSRegs, int numDRegs) {
--- a/src/gpu/hsail/vm/gpu_hsail.cpp	Thu Jun 26 13:42:29 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.cpp	Thu Jun 26 18:25:35 2014 +0200
@@ -66,7 +66,7 @@
 JNINativeMethod Hsail::HSAIL_methods[] = {
   {CC"initialize",       CC"()Z",                               FN_PTR(Hsail::initialize)},
   {CC"generateKernel",   CC"([B" STRING ")J",                   FN_PTR(Hsail::generate_kernel)},
-  {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
+  {CC"executeKernel0",   CC"("HS_INSTALLED_CODE"I["OBJECT"["JLTHREAD"I[I)Z",  FN_PTR(Hsail::execute_kernel_void_1d)},
 };
 
 void* Hsail::_device_context = NULL;
@@ -108,7 +108,7 @@
   _okra_register_heap(Universe::heap()->base(), Universe::heap()->capacity());
 }
 
-GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args, jobject oops_save,
+GPU_VMENTRY(jboolean, Hsail::execute_kernel_void_1d, (JNIEnv* env, jclass, jobject kernel_handle, jint dimX, jobject args,
                                                       jobject donor_threads, jint allocBytesPerWorkitem, jobject oop_map_array))
 
   ResourceMark rm;
@@ -125,7 +125,7 @@
     SharedRuntime::throw_and_post_jvmti_exception(JavaThread::current(), vmSymbols::com_oracle_graal_api_code_InvalidInstalledCodeException(), NULL);
   }
 
-return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
+return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0);
 GPU_END
 
 static void showRanges(jboolean* a, int len) {
@@ -145,143 +145,11 @@
   }
 }
 
-class OopSaver : public StackObj {
-private:
-  objArrayOop _oopsSaveArray;
-  typeArrayOop _oopMapArray;
-  jobject  _oops_save;
-  jobject _oop_map_array;
-  int _last_pcoffset;
-  int _last_idx;
-  int _saveAreaCounts;
-
-  enum {
-    SAVEAREACOUNTS_OFST=0,
-    SPAN_OFST=1,
-    HEADERSIZE=2
-  }; 
-  int mapPcOffsetToIndex(int pcOffset) {
-    if (pcOffset == _last_pcoffset) {
-      return _last_idx;
-    }
-    int span = _oopMapArray->int_at(SPAN_OFST);
-    for (int idx = HEADERSIZE; idx < _oopMapArray->length(); idx += span) {
-      int ofst = _oopMapArray->int_at(idx);
-      if (ofst == pcOffset) {
-        _last_pcoffset = pcOffset;
-        _last_idx = idx + 1;
-        return _last_idx;
-      }
-    }
-    ShouldNotReachHere();
-    return -1;
-  }
-
-public:
-  OopSaver(jobject oops_save, jobject oop_map_array) {
-    _oops_save = oops_save;
-    _oop_map_array = oop_map_array;
-    _last_pcoffset = -1;
-    _saveAreaCounts = getSaveAreaCounts(oop_map_array);
-    resolveArrays();
-  }
- 
-  void resolveArrays() {
-    _oopsSaveArray = (objArrayOop) JNIHandles::resolve(_oops_save);
-    _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array);
-  }
-
-  void* getOopForBit(HSAILFrame* hsailFrame, int bit) {
-    assert(isOop(hsailFrame, bit), "");
-    void* oop;
-    if (bit < hsailFrame->num_d_regs()) {
-      // d register
-      oop = (void*) hsailFrame->get_d_reg(bit);
-    } else {
-      // stack slot
-      int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
-      oop = (void*) hsailFrame->get_stackslot64(stackOffset);
-    }
-    return oop;
-  }
-
-  void putOopForBit(HSAILFrame* hsailFrame, int bit, void* oop) {
-    assert(isOop(hsailFrame, bit), "");
-    if (bit < hsailFrame->num_d_regs()) {
-      // d register
-      hsailFrame->put_d_reg(bit, (jlong) oop);
-    } else {
-      // stack slot
-      int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
-      hsailFrame->put_stackslot64(stackOffset, (jlong) oop);
-    }
-  }
-
-  void saveOopsFromFrame(HSAILFrame* hsailFrame, int deoptSlot){
-    // as used, no need to resolve arrays on each call
-    int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
-
-    // handle the dregister and stackSlot based oops
-    for (int bit = 0; bit < oopsPerDeopt; bit++) {
-      if (isOop(hsailFrame, bit)) {
-        void* saved_oop = getOopForBit(hsailFrame, bit);
-        int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
-        _oopsSaveArray->obj_at_put(saveArrayIndex, (oop) saved_oop);
-      }
-    }
-  }
-
-  void restoreOopsToFrame(HSAILFrame* hsailFrame, int deoptSlot, int workitem){
-    // need to re-resolve on each restore
-    resolveArrays();
-    int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
-
-    // handle the dregister and stackSlot based oops
-    for (int bit = 0; bit < oopsPerDeopt; bit++) {
-      if (isOop(hsailFrame, bit)) {
-        // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame
-        int saveArrayIndex = deoptSlot * oopsPerDeopt + bit;
-        void* newValue = (void*) _oopsSaveArray->obj_at(saveArrayIndex);
-        void* oldValue = getOopForBit(hsailFrame, bit);
-        assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved");
-        if (newValue != oldValue) {
-          if (TraceGPUInteraction) {
-            int numDRegs = hsailFrame->num_d_regs();
-            const char* name = (bit < numDRegs ? "$d" : "stk");
-            int num = (bit < numDRegs ? bit : bit - numDRegs);
-            tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p",
-                          name, num, workitem, deoptSlot, oldValue, newValue);
-          }
-          putOopForBit(hsailFrame, bit, newValue);
-        }
-      }
-    }
-  }
-
-  bool isOop(HSAILFrame* hsailFrame, int bit){
-    // re-resolve on each access
-    resolveArrays();
-    if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) {
-      return false;
-    }
-    int pcOffset = hsailFrame->pc_offset();
-    int bits_int_idx = mapPcOffsetToIndex(pcOffset) + (bit / 32);
-    int bitpos = bit % 32;
-    int bits = _oopMapArray->int_at(bits_int_idx);
-    return ((bits & (1 << bitpos)) != 0);
-  }
-
-  static int getSaveAreaCounts(jobject oopMapArrayObject) {
-    typeArrayOop oopMapArray = (typeArrayOop) JNIHandles::resolve(oopMapArrayObject);
-    return oopMapArray->int_at(SAVEAREACOUNTS_OFST);
-  }
-
-};
-
-jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oops_save,
+jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm,
                                                 jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) {
   ResourceMark rm(THREAD);
   objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args);
+  assert(THREAD->is_Java_thread(), "must be a JavaThread");
 
   // We avoid HSAILAllocationInfo logic if kernel does not allocate
   // in which case the donor_thread array passed in will be null
@@ -290,20 +158,23 @@
   // Reset the kernel arguments
   _okra_clearargs(kernel);
 
+  JavaThread* thread = (JavaThread*)THREAD;
   HSAILDeoptimizationInfo* e;
   if (UseHSAILDeoptimization) {
     // get how many bytes per deopt save area are required
-    int saveAreaCounts = OopSaver::getSaveAreaCounts(oop_map_array);
+    int saveAreaCounts = HSAILOopMapHelper::get_save_area_counts(oop_map_array);
     int numSRegs = saveAreaCounts & 0xff;
     int numDRegs = (saveAreaCounts >> 8) & 0xff;
     int numStackSlots = (saveAreaCounts >> 16);
     int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8;
 
-    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, allocInfo);
+    e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, allocInfo, oop_map_array);
     // copy cur_tlab_infos
     if (allocInfo != NULL) {
-      e->setCurTlabInfos(allocInfo->getCurTlabInfos());
+      e->set_cur_tlabInfos(allocInfo->getCurTlabInfos());
     }
+    // set deopt info in thread so gc oops_do processing can find it
+    thread->set_gpu_hsail_deopt_info(e);
   }
 
   // This object sets up the kernel arguments
@@ -317,7 +188,6 @@
   if (hka.getFirstNullParameterIndex() >= 0) {
     char buf[64];
     sprintf(buf, "Null Kernel Parameter seen, Parameter Index: %d", hka.getFirstNullParameterIndex());
-    JavaThread* thread = (JavaThread*)THREAD;
     thread->set_gpu_exception_bci(0);
     thread->set_gpu_exception_method(mh());
     THROW_MSG_0(vmSymbols::java_lang_NullPointerException(), buf);
@@ -362,22 +232,6 @@
           tty->print_cr("first deopter was workitem %d", pdeopt->workitem());
         }
 
-        // Before handling any deopting workitems, save the pointers from
-        // the hsail frames in oops_save so they get adjusted by any
-        // GC. Need to do this before leaving thread_in_vm mode.
-        OopSaver oopSaver(oops_save, oop_map_array);
-        // resolve handle only needed once here (not exiting vm mode)
-        oopSaver.resolveArrays();
-
-        // since slots are allocated from the beginning, we know how far to look
-        assert(e->num_deopts() < e->num_slots(), "deopt save state overflow");
-        for (int k = 0; k < e->num_deopts(); k++) {
-          HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k);
-          assert (pdeopt->workitem() >= 0, "bad workitem in deopt");
-          // this is a workitem that deopted
-          oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k);
-        }
-
         // Handle any deopting workitems.
         int count_deoptimized = 0;
         for (int k = 0; k < e->num_deopts(); k++) {
@@ -388,10 +242,6 @@
             int deoptId = pdeopt->pc_offset();
             HSAILFrame* hsailFrame = pdeopt->first_frame();
 
-            // Update the hsailFrame from the oopsSaveArray
-            // will re-resolve the handles each time.
-            oopSaver.restoreOopsToFrame(hsailFrame, k, workitem);
-
             JavaValue result(T_VOID);
             JavaCallArguments javaArgs;
             javaArgs.set_alternative_target(nm);
@@ -407,21 +257,24 @@
               tty->print_cr("[HSAIL] Deoptimizing to host for workitem=%d (slot=%d) with deoptId=%d, frame=" INTPTR_FORMAT ", actionAndReason=%d", workitem, k, deoptId, hsailFrame, myActionReason);
               // show the $d registers or stack slots containing references
               int maxOopBits = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots();
+              HSAILOopMapHelper oopMapHelper(oop_map_array);
+              int pc_offset = hsailFrame->pc_offset();
               for (int bit = 0; bit < maxOopBits; bit++) {
-                if (oopSaver.isOop(hsailFrame, bit)) {
+                if (oopMapHelper.is_oop(pc_offset, bit)) {
                   if (bit < hsailFrame->num_d_regs()) {
                     // show $d reg oop
-                    tty->print_cr("  oop $d%d = %p", bit, oopSaver.getOopForBit(hsailFrame, bit));
+                    tty->print_cr("  oop $d%d = %p", bit, hsailFrame->get_oop_for_bit(bit));
                   } else {
                     // show stack slot oop
                     int stackOffset = (bit - hsailFrame->num_d_regs()) * 8;  // 8 bytes per stack slot
-                    tty->print_cr("  oop stk:%d = %p", stackOffset, oopSaver.getOopForBit(hsailFrame, bit));
+                    tty->print_cr("  oop stk:%d = %p", stackOffset, hsailFrame->get_oop_for_bit(bit));
                   }
                 }
               }
             }
             JavaCalls::call(&result, mh, &javaArgs, THREAD);
             count_deoptimized++;
+            e->set_deopt_work_index(k + 1);
           }
         }
         if (TraceGPUInteraction) {
@@ -429,6 +282,9 @@
         }
       }
     }
+    // when we are done with the deopts, we don't need to oops_do anything
+    // in the saved state anymore
+    thread->set_gpu_hsail_deopt_info(NULL);  
 
     // Handle any never_ran workitems if there were any
     {
@@ -595,3 +451,27 @@
   }
   return true;
 }
+// Applies closure f to the oops in the saved HSAIL frames of the deopts not yet processed, i.e. slots
+// deopt_work_index() .. num_deopts()-1. Reached from the thread's oop scan via the registered deopt info.
+void Hsail::HSAILDeoptimizationInfo::oops_do(OopClosure* f) {
+  int unprocessed_deopts = num_deopts() - deopt_work_index();  // slots below deopt_work_index() are already handled
+  if (TraceGPUInteraction) {
+    tty->print_cr("HSAILDeoptimizationInfo::oops_do deopt_occurred=%d, total_deopts=%d, unprocessed_deopts=%d, oop_map_array=%p", _deopt_occurred, num_deopts(), unprocessed_deopts, _oop_map_array);
+  }
+  if (num_deopts() == 0 || unprocessed_deopts <= 0) {
+    return; // nothing to do
+  }
+  HSAILOopMapHelper oopMapHelper(_oop_map_array);
+  oopMapHelper.resolve_arrays();  // resolve once before processing
+
+  // go through the unprocessed deopt frames, finding each oop and applying the closure
+  for (int k = deopt_work_index(); k < num_deopts(); k++) {
+    HSAILKernelDeoptimization* pdeopt = get_deopt_save_state(k);
+    assert (pdeopt->workitem() >= 0, "bad workitem in deopt");
+    if (TraceGPUInteraction) {
+      tty->print_cr("  deopt %d, workitem %d, pc %d", k, pdeopt->workitem(), pdeopt->pc_offset());
+    }
+    HSAILFrame* hsailFrame = pdeopt->first_frame();
+    hsailFrame->oops_do(f, &oopMapHelper);  // the frame consults the oop map at its own pc_offset
+  }
+}
--- a/src/gpu/hsail/vm/gpu_hsail.hpp	Thu Jun 26 13:42:29 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail.hpp	Thu Jun 26 18:25:35 2014 +0200
@@ -28,6 +28,7 @@
 #include "runtime/gpu.hpp"
 #include "utilities/exceptions.hpp"
 #include "graal/graalEnv.hpp"
+#include "gpu_hsail_OopMapHelper.hpp"
 #include "gpu_hsail_Frame.hpp"
 #include "gpu_hsail_Tlab.hpp"
 
@@ -101,9 +102,11 @@
     jint _deopt_next_index;
     jint _num_slots;
     jint _deopt_span;
+    jint _deopt_work_index;           // how far we are in processing the deopts
     HSAILTlabInfo** _cur_tlab_info;   // copy of what was in the HSAILAllocationInfo, to avoid an extra indirection
     HSAILAllocationInfo* _alloc_info;
     char _ignore;
+    jobject _oop_map_array;
     // keep a pointer last so save area following it is word aligned
     jboolean* _never_ran_array; 
 
@@ -119,14 +122,16 @@
       return (jbyte*) (this) + hdr_size();
     }
 
-    inline HSAILDeoptimizationInfo(int numSlots, int bytesPerSaveArea, int dimX, HSAILAllocationInfo* allocInfo) {
+    inline HSAILDeoptimizationInfo(int numSlots, int bytesPerSaveArea, int dimX, HSAILAllocationInfo* allocInfo, jobject oop_map_array) {
       _notice_safepoints = &Hsail::_notice_safepoints;
       _deopt_occurred = 0;
       _deopt_next_index = 0;
+      _deopt_work_index = 0;
       _num_slots = numSlots;
       _never_ran_array = NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal);
       memset(_never_ran_array, 0, dimX * sizeof(jboolean));
       _alloc_info = allocInfo;
+      _oop_map_array = oop_map_array;
       _deopt_span = sizeof(HSAILKernelDeoptimization) + sizeof(HSAILFrame) + bytesPerSaveArea;
       if (TraceGPUInteraction) {
         tty->print_cr("HSAILDeoptimizationInfo allocated, %d slots of size %d, total size = 0x%lx bytes", _num_slots, _deopt_span, (_num_slots * _deopt_span + sizeof(HSAILDeoptimizationInfo)));
@@ -143,16 +148,20 @@
     inline jint num_deopts() { return _deopt_next_index; }
     inline jboolean* never_ran_array() { return _never_ran_array; }
     inline jint num_slots() {return _num_slots;}
+    inline void set_deopt_work_index(int val) { _deopt_work_index = val; }
+    inline jint deopt_work_index() { return _deopt_work_index; }
 
     inline HSAILKernelDeoptimization* get_deopt_save_state(int slot) {
       // use _deopt_span to index into _deopt_states
       return (HSAILKernelDeoptimization*) (save_area_start() + _deopt_span * slot);
     }
 
-    void setCurTlabInfos(HSAILTlabInfo** ptlabInfos) {
+    void set_cur_tlabInfos(HSAILTlabInfo** ptlabInfos) {
       _cur_tlab_info = ptlabInfos;
     }
 
+    void oops_do(OopClosure* f);
+
     void* operator new (size_t hdrSize, int numSlots, int bytesPerSaveArea) {
       assert(hdrSize <= hdr_size(), "");
       size_t totalSizeBytes = hdr_size()  + numSlots * (sizeof(HSAILKernelDeoptimization) + sizeof(HSAILFrame) + bytesPerSaveArea);
@@ -175,10 +184,10 @@
   JNIEXPORT static jlong generate_kernel(JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle);
 
   // static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args);
-  JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv* env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args, jobject oopsSave,
+  JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv* env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args,
                                                    jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array);
 
-  static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oopsSave,
+  static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm,
                                                   jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS);
 
   static void register_heap();
--- a/src/gpu/hsail/vm/gpu_hsail_Frame.hpp	Thu Jun 26 13:42:29 2014 +0200
+++ b/src/gpu/hsail/vm/gpu_hsail_Frame.hpp	Thu Jun 26 18:25:35 2014 +0200
@@ -37,36 +37,85 @@
   jbyte _num_d_regs;
   jshort _num_stack_slots; 
 
+  jbyte* data_start() {return (jbyte*) this  + sizeof(*this); }  // saved register/stack data begins right after this object
+  int sreg_ofst_start() { return 0; }  // s regs (one jint each) come first in the data area
+  int dreg_ofst_start() { return sreg_ofst_start() + num_s_regs() * sizeof(jint); }  // d regs (one jlong each) follow the s regs
+  int stackslot_ofst_start() { return dreg_ofst_start() + num_d_regs() * sizeof(jlong); }  // stack slots follow the d regs
+
+  int sreg_ofst(int idx) {  // byte offset, relative to data_start(), of s register idx
+    assert(idx >= 0 && idx < num_s_regs(), "bad sreg index");
+    return sreg_ofst_start() + idx * sizeof(jint);
+  }
+
+  int dreg_ofst(int idx) {  // byte offset, relative to data_start(), of d register idx
+    assert(idx >= 0 && idx < num_d_regs(), "bad dreg index");
+    return dreg_ofst_start() + idx * sizeof(jlong);
+  }
+
+  int stackslot_ofst(int stackOffset) {  // stackOffset is a byte offset into the stack-slot area, not a slot index
+    assert(stackOffset >= 0 && (unsigned int) stackOffset < num_stack_slots() * sizeof(jlong), "bad stackoffset");
+    return stackslot_ofst_start() + stackOffset;
+  }
+
+  // the _ptr versions just return a pointer to the indicated s reg, d reg or stack slot (32- or 64-bit)
+  // some of these are used for oops_do processing
+  jint* get_s_reg_ptr(int idx) {
+    return((jint*) (data_start() + sreg_ofst(idx)));
+  }
+
+  jlong* get_d_reg_ptr(int idx) {
+    return((jlong*) (data_start() + dreg_ofst(idx)));
+  }
+
+  jlong* get_stackslot64_ptr(int stackOffset) {
+    return((jlong*) (data_start() + stackslot_ofst(stackOffset)));
+  }
+
+  jint* get_stackslot32_ptr(int stackOffset) {
+    return((jint*) (data_start() + stackslot_ofst(stackOffset)));
+  }
+
+  void* get_oop_ptr_for_bit(int bit) {  // bit numbers cover the d regs first, then the 8-byte stack slots
+    void* oop_ptr;
+    if (bit < num_d_regs()) {
+      // d register
+      oop_ptr = (void*) get_d_reg_ptr(bit);
+    } else {
+      // stack slot
+      int stackOffset = (bit - num_d_regs()) * 8;  // 8 bytes per stack slot
+      oop_ptr = (void*) get_stackslot64_ptr(stackOffset);
+    }
+    return oop_ptr;  // address of the slot itself, not the value stored in it
+  }
+
 public:
   // Accessors
   jint pc_offset() { return _pc_offset; }
   jint num_s_regs() {return _num_s_regs; }
   jint num_d_regs() {return _num_d_regs; }
   jint num_stack_slots() {return _num_stack_slots; }
-  jbyte* data_start() {return (jbyte*) this  + sizeof(*this); }
-  jlong get_d_reg(int idx) {
-    int ofst = num_s_regs() * 4 + idx * 8;
-    return(*(jlong*) (data_start() + ofst));
-  }
-  jint get_s_reg(int idx) {
-    int ofst = idx * 4;
-    return(*(jint*) (data_start() + ofst));
+
+  jlong get_oop_for_bit(int bit) {
+    return * (jlong *) get_oop_ptr_for_bit(bit);
   }
-  void put_d_reg(int idx, jlong val) {
-    int ofst = num_s_regs() * 4 + idx * 8;
-    (*(jlong*) (data_start() + ofst)) = val;
-  }
-  jint get_stackslot32(int stackOffset) {
-    int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset;
-    return(*(jint*) (data_start() + ofst));
-  }
-  jlong get_stackslot64(int stackOffset) {
-    int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset;
-    return(*(jlong*) (data_start() + ofst));
-  }
-  void put_stackslot64(int stackOffset, jlong val) {
-    int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset;
-    (*(jlong*) (data_start() + ofst)) = val;
+    
+  // apply closure f to every d reg / stack slot of this frame that the oop map flags as an oop at pc_offset()
+  void oops_do(OopClosure* f, HSAILOopMapHelper* oopMapHelper) {
+    int oops_per_deopt = num_d_regs() + num_stack_slots();  // one oop-map bit per d reg and per stack slot
+
+    // handle the dregister and stackSlot based oops
+    for (int bit = 0; bit < oops_per_deopt; bit++) {
+      if (oopMapHelper->is_oop(pc_offset(), bit)) {
+        void* oop_ptr = get_oop_ptr_for_bit(bit);
+        // the oops we are dealing with here in the hsailFrame are always uncompressed
+        oop old_oop = oopDesc::load_heap_oop((oop *)oop_ptr);  // only used by the trace output below
+        f->do_oop((oop*) oop_ptr);  // the slot is reloaded below because the closure may have updated it in place
+        if (TraceGPUInteraction) {
+          oop new_oop = oopDesc::load_heap_oop((oop *)oop_ptr);
+          tty->print_cr("bit=%d, oop_ptr=%p, old=%p, new=%p", bit, oop_ptr, (void *)old_oop, (void *)new_oop);
+        }
+      }
+    }
+  }
 };
   
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/gpu/hsail/vm/gpu_hsail_OopMapHelper.hpp	Thu Jun 26 18:25:35 2014 +0200
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef GPU_HSAIL_VM_GPU_HSAIL_OOPMAPHELPER_HPP
+#define GPU_HSAIL_VM_GPU_HSAIL_OOPMAPHELPER_HPP
+
+#include "graal/graalEnv.hpp"
+#include "code/debugInfo.hpp"
+#include "code/location.hpp"
+
+// Takes the jobject for the array of ints created by the java side and decodes it: given a
+// pc_offset, finds that pc's entry and reports which save-area bits (d regs / stack slots) are oops
+class HSAILOopMapHelper : public StackObj {
+private:
+  jobject _oop_map_array_jobject;  // handle to the int array supplied by the caller
+  typeArrayOop _oop_map_array;     // array resolved from the handle by resolve_arrays()
+  int _last_pcoffset;              // pc offset of the last successful lookup, -1 until one succeeds
+  int _last_idx;                   // lookup result cached for _last_pcoffset
+
+  enum {
+    SAVEAREACOUNTS_OFST=0,         // header int returned by get_save_area_counts()
+    SPAN_OFST=1,                   // header int giving the stride, in ints, between per-pc entries
+    HEADERSIZE=2                   // per-pc entries start at this index
+  }; 
+  int mapPcOffsetToIndex(int pcOffset) {  // returns the index of the int following the matching pc offset
+    if (pcOffset == _last_pcoffset) {
+      return _last_idx;  // same pc as the previous lookup, reuse the cached index
+    }
+    int span = _oop_map_array->int_at(SPAN_OFST);
+    for (int idx = HEADERSIZE; idx < _oop_map_array->length(); idx += span) {
+      int ofst = _oop_map_array->int_at(idx);  // first int of each entry is its pc offset
+      if (ofst == pcOffset) {
+        _last_pcoffset = pcOffset;
+        _last_idx = idx + 1;  // the oop bits start right after the pc offset
+        return _last_idx;
+      }
+    }
+    ShouldNotReachHere();  // callers are expected to pass only pc offsets that have an entry
+    return -1;
+  }
+
+public:
+  HSAILOopMapHelper(jobject oop_map_array_jobject) {
+    _oop_map_array_jobject = oop_map_array_jobject;
+    _last_pcoffset = -1;  // no lookup cached yet
+    resolve_arrays();
+  }
+ 
+  void resolve_arrays() {  // NOTE(review): presumably must be re-run after any point where GC could move the array - confirm
+    _oop_map_array = (typeArrayOop) JNIHandles::resolve(_oop_map_array_jobject);
+  }
+
+  static int get_save_area_counts(jobject oop_map_array_jobject) {  // static: usable without constructing a helper
+    typeArrayOop oop_map_array_resolved = (typeArrayOop) JNIHandles::resolve(oop_map_array_jobject);
+    return oop_map_array_resolved->int_at(SAVEAREACOUNTS_OFST);
+  }
+
+  bool is_oop(int pcOffset, int bit){  // true if the entry for pcOffset has this bit set
+    int bits_int_idx = mapPcOffsetToIndex(pcOffset) + (bit / 32);  // 32 bits per int
+    int bitpos = bit % 32;
+    int bits = _oop_map_array->int_at(bits_int_idx);
+    return ((bits & (1 << bitpos)) != 0);
+  }
+
+};
+
+#endif // GPU_HSAIL_VM_GPU_HSAIL_OOPMAPHELPER_HPP
--- a/src/share/vm/classfile/javaClasses.cpp	Thu Jun 26 13:42:29 2014 +0200
+++ b/src/share/vm/classfile/javaClasses.cpp	Thu Jun 26 18:25:35 2014 +0200
@@ -1532,6 +1532,7 @@
     return;
   }
   
+#ifdef GRAAL
   // Check for gpu exception to add as top frame
   Method* gpu_method = thread->get_gpu_exception_method();
   if (gpu_method != NULL) {
@@ -1541,6 +1542,7 @@
     thread->set_gpu_exception_bci(0);
     thread->set_gpu_exception_method(NULL);  
   }
+#endif
 
   // Instead of using vframe directly, this version of fill_in_stack_trace
   // basically handles everything by hand. This significantly improved the
--- a/src/share/vm/runtime/thread.cpp	Thu Jun 26 13:42:29 2014 +0200
+++ b/src/share/vm/runtime/thread.cpp	Thu Jun 26 18:25:35 2014 +0200
@@ -54,6 +54,9 @@
 #include "runtime/fprofiler.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/gpu.hpp"
+#ifdef GRAAL
+# include "hsail/vm/gpu_hsail.hpp"
+#endif
 #include "runtime/init.hpp"
 #include "runtime/interfaceSupport.hpp"
 #include "runtime/java.hpp"
@@ -1467,8 +1470,11 @@
   clear_must_deopt_id();
   set_monitor_chunks(NULL);
   set_next(NULL);
+#ifdef GRAAL
   set_gpu_exception_bci(0);
   set_gpu_exception_method(NULL);  
+  set_gpu_hsail_deopt_info(NULL);  
+#endif
   set_thread_state(_thread_new);
 #if INCLUDE_NMT
   set_recorder(NULL);
@@ -2853,6 +2859,13 @@
     // a scan.
     cf->do_code_blob(_scanned_nmethod);
   }
+
+#ifdef GRAAL
+  Hsail::HSAILDeoptimizationInfo* gpu_hsail_deopt_info = (Hsail::HSAILDeoptimizationInfo*) get_gpu_hsail_deopt_info();
+  if (gpu_hsail_deopt_info != NULL) {
+    gpu_hsail_deopt_info->oops_do(f);
+  }
+#endif
 }
 
 void JavaThread::nmethods_do(CodeBlobClosure* cf) {
--- a/src/share/vm/runtime/thread.hpp	Thu Jun 26 13:42:29 2014 +0200
+++ b/src/share/vm/runtime/thread.hpp	Thu Jun 26 18:25:35 2014 +0200
@@ -944,15 +944,24 @@
   volatile address _exception_handler_pc;        // PC for handler of exception
   volatile int     _is_method_handle_return;     // true (== 1) if the current exception PC is a MethodHandle call site.
 
+#ifdef GRAAL
   // Record the method and bci from a gpu kernel exception so
   // it can be added into the exception stack trace
   jint    _gpu_exception_bci;
   Method* _gpu_exception_method;
+  // Record the hsailDeoptimization info so gc oops_do processing can find it
+  void*   _gpu_hsail_deopt_info;
+#endif
+
  public:
+#ifdef GRAAL
   void set_gpu_exception_bci(jint bci)           { _gpu_exception_bci = bci; } 
   jint get_gpu_exception_bci()                   { return _gpu_exception_bci; }
   void set_gpu_exception_method(Method* method)  { _gpu_exception_method = method; }
   Method* get_gpu_exception_method()             { return _gpu_exception_method; }
+  void set_gpu_hsail_deopt_info(void * deoptInfo) { _gpu_hsail_deopt_info = deoptInfo; }
+  void* get_gpu_hsail_deopt_info()               { return _gpu_hsail_deopt_info; }
+#endif
   
  private:  
   // support for JNI critical regions