# HG changeset patch
# User S.Bharadwaj Yadavalli <bharadwaj.yadavalli@oracle.com>
# Date 1392061122 18000
# Node ID 0995dcbd6dd8d62f8fc77fc19c5526c445420692
# Parent  eb48fac53e6f7e6c1855775aa92d6005203e6b93
Change CUDA context management to support multiple executions of a kernel. Exclude GPU offloading of lambdas from java.* library code.

diff -r eb48fac53e6f -r 0995dcbd6dd8 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java
--- a/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Mon Feb 10 16:13:21 2014 +0100
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotBackend.java	Mon Feb 10 14:38:42 2014 -0500
@@ -137,9 +137,18 @@
             long launchKernel = getLaunchKernelAddress();
             hostForeignCalls.linkForeignCall(hostProviders, CALL_KERNEL, launchKernel, false, NOT_LEAF, NOT_REEXECUTABLE, ANY_LOCATION);
         }
+        /* Add a shutdown hook to destroy CUDA context(s) */
+        Runtime.getRuntime().addShutdownHook(new Thread("PTXShutdown") {
+            @Override
+            public void run() {
+                destroyContext();
+            }
+        });
         super.completeInitialization();
     }
 
+    private static native void destroyContext();
+
     /**
      * Gets the address of {@code Ptx::execute_kernel_from_vm()}.
      */
@@ -365,7 +374,7 @@
 
     @Override
     public LIRGenerator newLIRGenerator(StructuredGraph graph, FrameMap frameMap, CallingConvention cc, LIR lir) {
-        return new PTXLIRGenerator(graph, getProviders(), frameMap, cc, lir);
+        return new PTXHotSpotLIRGenerator(graph, getProviders(), getRuntime().getConfig(), frameMap, cc, lir);
     }
 
     private static void emitKernelEntry(CompilationResultBuilder crb, LIRGenerator lirGen, ResolvedJavaMethod codeCacheOwner) {
diff -r eb48fac53e6f -r 0995dcbd6dd8 graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotLIRGenerator.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/graal/com.oracle.graal.hotspot.ptx/src/com/oracle/graal/hotspot/ptx/PTXHotSpotLIRGenerator.java	Mon Feb 10 14:38:42 2014 -0500
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package com.oracle.graal.hotspot.ptx;
+
+import com.oracle.graal.api.code.CallingConvention;
+import com.oracle.graal.api.code.StackSlot;
+import com.oracle.graal.api.meta.DeoptimizationAction;
+import com.oracle.graal.api.meta.DeoptimizationReason;
+import com.oracle.graal.api.meta.Value;
+import com.oracle.graal.compiler.ptx.PTXLIRGenerator;
+import com.oracle.graal.graph.GraalInternalError;
+import com.oracle.graal.hotspot.HotSpotLIRGenerator;
+import com.oracle.graal.hotspot.HotSpotVMConfig;
+import com.oracle.graal.hotspot.meta.HotSpotProviders;
+import com.oracle.graal.hotspot.nodes.DirectCompareAndSwapNode;
+import com.oracle.graal.lir.FrameMap;
+import com.oracle.graal.lir.LIR;
+import com.oracle.graal.nodes.StructuredGraph;
+import com.oracle.graal.nodes.ValueNode;
+
+/**
+ * PTX flavour of the HotSpot LIR generator. Only {@link #emitPrefetchAllocate} completes normally (it emits nothing); every other method declared in this class throws {@link GraalInternalError#unimplemented()}.
+ */
+public class PTXHotSpotLIRGenerator extends PTXLIRGenerator implements HotSpotLIRGenerator {
+
+    private final HotSpotVMConfig config;
+
+    protected PTXHotSpotLIRGenerator(StructuredGraph graph, HotSpotProviders providers, HotSpotVMConfig config, FrameMap frameMap, CallingConvention cc, LIR lir) {
+        super(graph, providers, frameMap, cc, lir);
+        this.config = config;
+        assert this.config.basicLockSize == 8;
+    }
+
+    public HotSpotProviders getProviders() {
+        throw GraalInternalError.unimplemented();
+    }
+
+    public StackSlot getLockSlot(int lockDepth) {
+        throw GraalInternalError.unimplemented();
+    }
+
+    public void visitDirectCompareAndSwap(DirectCompareAndSwapNode x) {
+        throw GraalInternalError.unimplemented();
+    }
+
+    public void emitPrefetchAllocate(ValueNode address, ValueNode distance) {
+        // deliberately empty: no code is emitted for this operation
+    }
+
+    public void emitTailcall(Value[] args, Value address) {
+        throw GraalInternalError.unimplemented();
+    }
+
+    public void emitDeoptimizeCaller(DeoptimizationAction action, DeoptimizationReason reason) {
+        throw GraalInternalError.unimplemented();
+    }
+
+    public void emitPatchReturnAddress(ValueNode address) {
+        throw GraalInternalError.unimplemented();
+    }
+
+    public void emitJumpToExceptionHandlerInCaller(ValueNode handlerInCallerPc, ValueNode exception, ValueNode exceptionPc) {
+        throw GraalInternalError.unimplemented();
+    }
+}
diff -r eb48fac53e6f -r 0995dcbd6dd8 src/gpu/ptx/vm/gpu_ptx.cpp
--- a/src/gpu/ptx/vm/gpu_ptx.cpp	Mon Feb 10 16:13:21 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.cpp	Mon Feb 10 14:38:42 2014 -0500
@@ -48,7 +48,7 @@
 // Entry to GPU native method implementation that transitions current thread to '_thread_in_vm'.
 #define GPU_VMENTRY(result_type, name, signature) \
   JNIEXPORT result_type JNICALL name signature { \
-  if (TraceGPUInteraction) tty->print_cr("[CUDA] Ptx::" #name); \
+  if (TraceGPUInteraction) tty->print_cr("[CUDA] " #name); \
   GRAAL_VM_ENTRY_MARK; \
 
 // Entry to GPU native method implementation that calls a JNI function
@@ -69,9 +69,10 @@
   {CC"generateKernel",          CC"([B" STRING ")J",   FN_PTR(Ptx::generate_kernel)},
   {CC"getLaunchKernelAddress",  CC"()J",               FN_PTR(Ptx::get_execute_kernel_from_vm_address)},
   {CC"getAvailableProcessors0", CC"()I",               FN_PTR(Ptx::get_total_cores)},
+  {CC"destroyContext",          CC"()V",               FN_PTR(Ptx::destroy_ptx_context)},
 };
 
-void * Ptx::_device_context;
+void * Ptx::_device_context = 0;
 int    Ptx::_cu_device = 0;
 
 Ptx::cuda_cu_init_func_t Ptx::_cuda_cu_init;
@@ -218,8 +219,8 @@
   version = (float) major + ((float) minor)/10;
 
   if (version < GRAAL_SUPPORTED_COMPUTE_CAPABILITY_VERSION) {
-    tty->print_cr("[CUDA] Only cuda compute capability 3.0 and later supported. Device %d supports %.1f",
-                  _cu_device, version);
+    tty->print_cr("[CUDA] Only cuda compute capability %.1f and later supported. Device %d supports %.1f",
+                  (float) GRAAL_SUPPORTED_COMPUTE_CAPABILITY_VERSION, _cu_device, version);
     return false;
   }
 
@@ -253,6 +254,18 @@
     tty->print_cr("[CUDA] Using %s", device_name);
   }
 
+  // Create CUDA context to compile and execute the kernel
+
+  status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device);
+
+  if (status != GRAAL_CUDA_SUCCESS) {
+    tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status);
+    return false;
+  }
+  if (TraceGPUInteraction) {
+    tty->print_cr("[CUDA] Success: Created context for device: %d", _cu_device);
+  }
+
   gpu::initialized_gpu(device_name);
 
   return true;
@@ -381,23 +394,20 @@
   jit_options[2] = GRAAL_CU_JIT_MAX_REGISTERS;
   jit_option_values[2] = (void *)(size_t)jit_register_count;
 
-  // Create CUDA context to compile and execute the kernel
-  int status = _cuda_cu_ctx_create(&_device_context, GRAAL_CU_CTX_MAP_HOST, _cu_device);
+  // Set CUDA context to compile and execute the kernel
 
-  if (status != GRAAL_CUDA_SUCCESS) {
-    tty->print_cr("[CUDA] Failed to create CUDA context for device(%d): %d", _cu_device, status);
-    return 0L;
-  }
-  if (TraceGPUInteraction) {
-    tty->print_cr("[CUDA] Success: Created context for device: %d", _cu_device);
+  if (_device_context == NULL) {
+    tty->print_cr("[CUDA] Encountered uninitialized CUDA context for device(%d)", _cu_device);
+      return 0L;
   }
 
-  status = _cuda_cu_ctx_set_current(_device_context);
+  int status = _cuda_cu_ctx_set_current(_device_context);
 
   if (status != GRAAL_CUDA_SUCCESS) {
     tty->print_cr("[CUDA] Failed to set current context for device: %d", _cu_device);
     return 0L;
   }
+
   if (TraceGPUInteraction) {
     tty->print_cr("[CUDA] Success: Set current context for device: %d", _cu_device);
     tty->print_cr("[CUDA] PTX Kernel\n%s", code);
@@ -573,17 +583,9 @@
     }
   }
 
-  void destroy_context() {
-    if (Ptx::_device_context != NULL) {
-      check(Ptx::_cuda_cu_ctx_destroy(Ptx::_device_context), "Destroy context");
-      Ptx::_device_context = NULL;
-    }
-  }
-
   ~PtxCall() {
     unpin_objects();
     free_return_value();
-    destroy_context();
     if (_gc_locked) {
       GC_locker::unlock_critical(_thread);
       if (TraceGPUInteraction) {
@@ -669,6 +671,23 @@
   }
 }
 
+// Destroys the CUDA context held in _device_context, if any. Registered as the native "destroyContext" entry.
+GPU_VMENTRY(void, Ptx::destroy_ptx_context, (void))
+    if (_device_context != NULL) {
+      int status = _cuda_cu_ctx_destroy(_device_context);
+      // Drop the handle on success as well as on failure so a destroyed context is never set current again.
+      _device_context = NULL;
+      if (TraceGPUInteraction) {
+        if (status != GRAAL_CUDA_SUCCESS) {
+          tty->print_cr("[CUDA] Error(%d) : Failed to destroy context", status);
+        } else {
+          tty->print_cr("[CUDA] Destroyed context");
+        }
+      }
+    }
+
+GPU_END
+
 GPU_VMENTRY(jlong, Ptx::get_execute_kernel_from_vm_address, (JNIEnv *env, jclass))
   return (jlong) Ptx::execute_kernel_from_vm;
 GPU_END
@@ -720,7 +739,7 @@
 JRT_END
 
 #if defined(LINUX)
-static const char cuda_library_name[] = "libcuda.so";
+static const char cuda_library_name[] = "/usr/lib/libcuda.so";
 #elif defined(__APPLE__)
 static char const cuda_library_name[] = "/usr/local/cuda/lib/libcuda.dylib";
 #else
diff -r eb48fac53e6f -r 0995dcbd6dd8 src/gpu/ptx/vm/gpu_ptx.hpp
--- a/src/gpu/ptx/vm/gpu_ptx.hpp	Mon Feb 10 16:13:21 2014 +0100
+++ b/src/gpu/ptx/vm/gpu_ptx.hpp	Mon Feb 10 14:38:42 2014 -0500
@@ -112,6 +112,8 @@
   // static native int getAvailableProcessors0();
   JNIEXPORT static jint get_total_cores(JNIEnv *env, jobject);
 
+  JNIEXPORT static void destroy_ptx_context();
+
   // Links the CUDA driver library functions
   static bool link();
 
diff -r eb48fac53e6f -r 0995dcbd6dd8 src/share/vm/runtime/compilationPolicy.cpp
--- a/src/share/vm/runtime/compilationPolicy.cpp	Mon Feb 10 16:13:21 2014 +0100
+++ b/src/share/vm/runtime/compilationPolicy.cpp	Mon Feb 10 14:38:42 2014 -0500
@@ -172,16 +172,20 @@
       {
         ResourceMark rm;
         if (klass_name != NULL) {
-          if (klass_name != NULL && method_name != NULL) {
-            const char* lambdaPrefix = "lambda$";
-            char* methodPrefix = strstr(method_name->as_C_string(), lambdaPrefix);
-            if (methodPrefix != 0) {
-              if ((strncmp(lambdaPrefix, methodPrefix, strlen(lambdaPrefix)) == 0)) {
-                if (TraceGPUInteraction) {
-                  char buf[O_BUFLEN];
-                  tty->print_cr("Selected lambda method %s for GPU offload", m->name_and_sig_as_C_string(buf, O_BUFLEN));
+          const char* javaClass = "java/";
+          // Exclude java library classes - for now
+          if (strncmp(klass_name->as_C_string(), javaClass, strlen(javaClass))) {
+            if (method_name != NULL) {
+              const char* lambdaPrefix = "lambda$";
+              char* methodPrefix = strstr(method_name->as_C_string(), lambdaPrefix);
+              if (methodPrefix != 0) {
+                if ((strncmp(lambdaPrefix, methodPrefix, strlen(lambdaPrefix)) == 0)) {
+                  if (TraceGPUInteraction) {
+                    char buf[O_BUFLEN];
+                    tty->print_cr("Selected lambda method %s for GPU offload", m->name_and_sig_as_C_string(buf, O_BUFLEN));
+                  }
+                  return true;
                 }
-                return true;
               }
             }
           }