# HG changeset patch # User Doug Simon # Date 1402432586 -7200 # Node ID 06eedda53e1450b4548f1b5751bc5e4292af6215 # Parent b6ab7e7fa0a53c1e51f29cee5e1c31e3d21955fb HSAIL: add support to allocate new TLAB from GPU Contributed-by: Tom Deneau diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test.infra/src/com/oracle/graal/compiler/hsail/test/infra/GraalKernelTester.java --- a/graal/com.oracle.graal.compiler.hsail.test.infra/src/com/oracle/graal/compiler/hsail/test/infra/GraalKernelTester.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test.infra/src/com/oracle/graal/compiler/hsail/test/infra/GraalKernelTester.java Tue Jun 10 22:36:26 2014 +0200 @@ -152,13 +152,17 @@ return true; } + HotSpotNmethod installedCode; + @Override protected void dispatchKernelOkra(int range, Object... args) { HSAILHotSpotBackend backend = getHSAILBackend(); if (backend.isDeviceInitialized()) { try { - HotSpotNmethod code = backend.compileAndInstallKernel(testMethod); - backend.executeKernel(code, range, args); + if (installedCode == null) { + installedCode = backend.compileAndInstallKernel(testMethod); + } + backend.executeKernel(installedCode, range, args); } catch (InvalidInstalledCodeException e) { Debug.log("WARNING:Invalid installed code: " + e); e.printStackTrace(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndAddTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndAddTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndAddTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -45,11 +45,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 
graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndSetTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndSetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicIntGetAndSetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -45,11 +45,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndAddTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndAddTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndAddTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -45,11 +45,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndSetTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndSetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/AtomicLongGetAndSetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -45,11 +45,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetGidTest.java --- 
a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetGidTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetGidTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -47,11 +47,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntAddAndGetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntDecAndGetTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntDecAndGetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntDecAndGetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndAddTest.java --- 
a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndAddTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndAddTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndDecTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndDecTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndDecTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndIncTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndIncTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntGetAndIncTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntIncAndGetTest.java --- 
a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntIncAndGetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicIntIncAndGetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongAddAndGetTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongAddAndGetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongAddAndGetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -46,11 +46,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndAddTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndAddTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndAddTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndIncTest.java --- 
a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndIncTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongGetAndIncTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongIncAndGetTest.java --- a/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongIncAndGetTest.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.compiler.hsail.test/src/com/oracle/graal/compiler/hsail/test/lambda/AtomicLongIncAndGetTest.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,11 +44,6 @@ } @Override - protected boolean supportsRequiredCapabilities() { - return (canDeoptimize()); - } - - @Override public void runTest() { setupArrays(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.gpu/src/com/oracle/graal/gpu/ExternalCompilationResult.java --- a/graal/com.oracle.graal.gpu/src/com/oracle/graal/gpu/ExternalCompilationResult.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.gpu/src/com/oracle/graal/gpu/ExternalCompilationResult.java Tue Jun 10 22:36:26 2014 +0200 @@ -44,6 +44,7 @@ private StructuredGraph hostGraph; private int[] oopMapArray; + private boolean usesAllocation; /** * Set the address for the point of entry to the external compilation result. 
@@ -86,4 +87,12 @@ return oopMapArray; } + public void setUsesAllocationFlag(boolean val) { + usesAllocation = val; + } + + public boolean getUsesAllocationFlag() { + return usesAllocation; + } + } diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotBackend.java Tue Jun 10 22:36:26 2014 +0200 @@ -66,7 +66,6 @@ import com.oracle.graal.lir.gen.*; import com.oracle.graal.lir.hsail.*; import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizingOp; -import com.oracle.graal.lir.hsail.HSAILMove.AtomicReadAndAddOp; import com.oracle.graal.nodes.*; import com.oracle.graal.nodes.StructuredGraph.GuardsStage; import com.oracle.graal.nodes.extended.*; @@ -103,8 +102,10 @@ paramTypeMap.put("HotSpotResolvedPrimitiveType", "f64"); paramTypeMap.put("HotSpotResolvedPrimitiveType", "s64"); - // The order of the conjunction below is important: the OkraUtil - // call may provision the native library required by the initialize() call + /* + * The order of the conjunction below is important: the OkraUtil call may provision the + * native library required by the initialize() call + */ deviceInitialized = OkraUtil.okraLibExists() && initialize(); } @@ -261,8 +262,7 @@ */ public final HotSpotNmethod installKernel(ResolvedJavaMethod method, ExternalCompilationResult hsailCode) { assert hsailCode.getEntryPoint() != 0L; - // code below here lifted from HotSpotCodeCacheProviders.addExternalMethod - // used to be return getProviders().getCodeCache().addExternalMethod(method, hsailCode); + // Code here based on HotSpotCodeCacheProvider.addExternalMethod(). 
HotSpotResolvedJavaMethod javaMethod = (HotSpotResolvedJavaMethod) method; if (hsailCode.getId() == -1) { hsailCode.setId(javaMethod.allocateCompileId(hsailCode.getEntryBCI())); @@ -294,6 +294,7 @@ HSAILHotSpotNmethod code = new HSAILHotSpotNmethod(javaMethod, hsailCode.getName(), false, true); code.setOopMapArray(hsailCode.getOopMapArray()); + code.setUsesAllocationFlag(hsailCode.getUsesAllocationFlag()); HotSpotCompiledNmethod compiled = new HotSpotCompiledNmethod(getTarget(), javaMethod, compilationResult); CodeInstallResult result = getRuntime().getCompilerToVM().installCode(compiled, code, null); if (result != CodeInstallResult.OK) { @@ -388,7 +389,9 @@ } else { oopsSaveArea = null; } - return executeKernel0(kernel, jobSize, args, oopsSaveArea, donorThreadPool.get().getThreads(), HsailAllocBytesPerWorkitem.getValue(), oopMapArray); + // Pass donorThreadPoolArray if this kernel uses allocation, otherwise null + Thread[] donorThreadArray = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? 
donorThreadPool.get().getThreads() : null; + return executeKernel0(kernel, jobSize, args, oopsSaveArea, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray); } private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Object[] oopsSave, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray) @@ -449,6 +452,7 @@ static class HSAILHotSpotNmethod extends HotSpotNmethod { private int[] oopMapArray; + private boolean usesAllocation; HSAILHotSpotNmethod(HotSpotResolvedJavaMethod method, String name, boolean isDefault, boolean isExternal) { super(method, name, isDefault, isExternal); @@ -461,6 +465,14 @@ int[] getOopMapArray() { return oopMapArray; } + + public void setUsesAllocationFlag(boolean val) { + usesAllocation = val; + } + + public boolean getUsesAllocationFlag() { + return usesAllocation; + } } @Override @@ -493,19 +505,22 @@ Debug.log("+UseHSAILSafepoints requires +UseHSAILDeoptimization"); } - // see what graph nodes we have to see if we are using the thread register - // if not, we don't have to emit the code that sets that up - // maybe there is a better way to do this? - boolean usesThreadRegister = false; + /* + * See what graph nodes we have to see if we are using the thread register. If not, we don't + * have to emit the code that sets it up. Maybe there is a better way to do this? 
+ */ + boolean usesAllocation = false; search: for (AbstractBlock b : lir.linearScanOrder()) { for (LIRInstruction op : lir.getLIRforBlock(b)) { - if (op instanceof AtomicReadAndAddOp) { - usesThreadRegister = true; + if ((op instanceof HSAILMove.LoadOp) && ((HSAILMove.LoadOp) op).usesThreadRegister()) { + usesAllocation = true; assert useHSAILDeoptimization : "cannot use thread register if HSAIL deopt support is disabled"; break search; } } } + // save usesAllocation flag in ExternalCompilationResult + ((ExternalCompilationResult) crb.compilationResult).setUsesAllocationFlag(usesAllocation); // Emit the prologue. HSAILAssembler asm = (HSAILAssembler) crb.asm; @@ -527,8 +542,7 @@ nonConstantParamCount++; } - // If this is an instance method, include mappings for the "this" parameter - // as the first parameter. + // If this is an instance method, include the "this" parameter if (!isStatic) { nonConstantParamCount++; } @@ -564,8 +578,10 @@ // Include the gid. System.arraycopy(paramtypes, 0, ccParamTypes, 0, nonConstantParamCount); - // Last entry is always int (its register gets used in the workitemabsid instruction) - // this is true even for object stream labmdas + /* + * Last entry is always int (its register gets used in the workitemabsid instruction). This + * is true even for object stream lambdas. 
+ */ if (sigParamCount > 0) { ccParamTypes[ccParamTypes.length - 1] = metaAccess.lookupJavaType(int.class); } @@ -621,7 +637,6 @@ if (useHSAILDeoptimization) { // Aliases for d16 RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordKind); - RegisterValue d16_donorThreads = d16_deoptInfo; // Aliases for d17 RegisterValue d17_donorThreadIndex = HSAIL.d17.asValue(wordKind); @@ -645,21 +660,20 @@ asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(Kind.Int, d16_deoptInfo, config.hsailDeoptOccurredOffset).toAddress()); asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false); asm.cbr(deoptInProgressLabel); - // load thread register if needed - if (usesThreadRegister) { + // load thread register if this kernel performs allocation + if (usesAllocation) { + RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordKind); assert HsailDonorThreads.getValue() > 0; - asm.emitLoad(wordKind, d16_donorThreads, new HSAILAddressValue(wordKind, d16_deoptInfo, config.hsailDonorThreadsOffset).toAddress()); + asm.emitLoad(wordKind, threadReg, new HSAILAddressValue(wordKind, d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress()); if (HsailDonorThreads.getValue() != 1) { asm.emitComment("// map workitem to a donor thread"); asm.emitString(String.format("rem_u32 $%s, %s, %d;", s34_donorThreadIndex.getRegister(), workItemReg, HsailDonorThreads.getValue())); asm.emitConvert(d17_donorThreadIndex, s34_donorThreadIndex, wordKind, Kind.Int); - asm.emit("mad", d16_donorThreads, d17_donorThreadIndex, Constant.forInt(8), d16_donorThreads); + asm.emit("mad", threadReg, d17_donorThreadIndex, Constant.forInt(8), threadReg); } else { // workitem is already mapped to solitary donor thread } - AllocatableValue threadRegValue = getProviders().getRegisters().getThreadRegister().asValue(wordKind); - asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to a donor thread for this workitem"); - 
asm.emitLoad(wordKind, threadRegValue, new HSAILAddressValue(wordKind, d16_donorThreads).toAddress()); + asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem"); } } @@ -676,8 +690,10 @@ boolean useCompressedOops = config.useCompressedOops; final int arrayElementsOffset = HotSpotGraalRuntime.getArrayBaseOffset(wordKind); String iterationObjArgReg = HSAIL.mapRegister(cc.getArgument(nonConstantParamCount - 1)); - // iterationObjArgReg will be the highest $d register in use (it is the last parameter) - // so tempReg can be the next higher $d register + /* + * iterationObjArgReg will be the highest $d register in use (it is the last parameter) + * so tempReg can be the next higher $d register + */ String tmpReg = "$d" + (asRegister(cc.getArgument(nonConstantParamCount - 1)).encoding() + 1); // Convert gid to long. asm.emitString("cvt_u64_s32 " + tmpReg + ", " + workItemReg + "; // Convert gid to long"); @@ -740,8 +756,10 @@ int numDRegs = 0; int numStackSlotBytes = 0; if (useHSAILDeoptimization) { - // get the union of registers and stack slots needed to be saved at the infopoints - // while doing this compute the highest register in each category + /* + * Get the union of registers and stack slots needed to be saved at the infopoints. While + * doing this compute the highest register in each category. 
+ */ HSAILHotSpotRegisterConfig hsailRegConfig = (HSAILHotSpotRegisterConfig) regConfig; Set infoUsedRegs = new TreeSet<>(); Set infoUsedStackSlots = new HashSet<>(); @@ -836,13 +854,16 @@ asm.emitComment("// Determine next deopt save slot"); asm.emitAtomicAdd(scratch32, deoptNextIndexAddr, Constant.forInt(1)); - // scratch32 now holds next index to use - // set error condition if no room in save area + /* + * scratch32 now holds next index to use set error condition if no room in save area + */ asm.emitComment("// assert room to save deopt"); asm.emitCompare(Kind.Int, scratch32, Constant.forInt(maxDeoptIndex), "lt", false, false); asm.cbr("@L_StoreDeopt"); - // if assert fails, store a guaranteed negative workitemid in top level deopt occurred - // flag + /* + * if assert fails, store a guaranteed negative workitemid in top level deopt occurred + * flag + */ asm.emitWorkItemAbsId(scratch32); asm.emit("mad", scratch32, scratch32, Constant.forInt(-1), Constant.forInt(-1)); asm.emitStore(scratch32, deoptInfoAddr); @@ -880,8 +901,10 @@ asm.emitComment("// store regCounts (" + numSRegs + " $s registers, " + numDRegs + " $d registers, " + numStackSlots + " stack slots)"); asm.emitStore(Kind.Int, Constant.forInt(numSRegs + (numDRegs << 8) + (numStackSlots << 16)), regCountsAddr); - // loop thru the usedValues storing each of the registers that are used. - // we always store in a fixed location, even if some registers are skipped + /* + * Loop thru the usedValues storing each of the registers that are used. We always store + * in a fixed location, even if some registers are skipped. 
+ */ asm.emitComment("// store used regs"); for (Register reg : infoUsedRegs) { if (hsailRegConfig.isAllocatableSReg(reg)) { @@ -961,11 +984,11 @@ private int intsPerInfopoint; int[] build(List infoList, int numSRegs, int numDRegs, int numStackSlots, HSAILHotSpotRegisterConfig hsailRegConfig) { - // we are told that infoList is always sorted - // each infoPoint can have a different oopMap - - // since numStackSlots is the number of 8-byte stack slots used, it is an upper limit on - // the number of oop stack slots + /* + * We are told that infoList is always sorted. Each infoPoint can have a different + * oopMap. Since numStackSlots is the number of 8-byte stack slots used, it is an upper + * limit on the number of oop stack slots + */ int bitsPerInfopoint = numDRegs + numStackSlots; int intsForBits = (bitsPerInfopoint + 31) / 32; int numInfopoints = infoList.size(); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotLIRGenerator.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotLIRGenerator.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/HSAILHotSpotLIRGenerator.java Tue Jun 10 22:36:26 2014 +0200 @@ -38,17 +38,8 @@ import com.oracle.graal.lir.StandardOp.SaveRegistersOp; import com.oracle.graal.lir.gen.*; import com.oracle.graal.lir.hsail.*; -import com.oracle.graal.lir.hsail.HSAILControlFlow.CondMoveOp; -import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizeOp; -import com.oracle.graal.lir.hsail.HSAILControlFlow.ForeignCall1ArgOp; -import com.oracle.graal.lir.hsail.HSAILControlFlow.ForeignCall2ArgOp; -import com.oracle.graal.lir.hsail.HSAILControlFlow.ForeignCallNoArgOp; -import com.oracle.graal.lir.hsail.HSAILMove.CompareAndSwapOp; -import com.oracle.graal.lir.hsail.HSAILMove.LoadOp; -import com.oracle.graal.lir.hsail.HSAILMove.MoveFromRegOp; -import 
com.oracle.graal.lir.hsail.HSAILMove.MoveToRegOp; -import com.oracle.graal.lir.hsail.HSAILMove.StoreConstantOp; -import com.oracle.graal.lir.hsail.HSAILMove.StoreOp; +import com.oracle.graal.lir.hsail.HSAILControlFlow.*; +import com.oracle.graal.lir.hsail.HSAILMove.*; import com.oracle.graal.phases.util.*; /** @@ -126,6 +117,13 @@ return result; } + public Variable emitLoadAcquire(PlatformKind kind, Value address, LIRFrameState state) { + HSAILAddressValue loadAddress = asAddressValue(address); + Variable result = newVariable(kind); + append(new LoadAcquireOp(getMemoryKind(kind), result, loadAddress, state)); + return result; + } + @Override public void emitStore(PlatformKind kind, Value address, Value inputVal, LIRFrameState state) { HSAILAddressValue storeAddress = asAddressValue(address); @@ -147,6 +145,13 @@ } } + public void emitStoreRelease(PlatformKind kind, Value address, Value inputVal, LIRFrameState state) { + HSAILAddressValue storeAddress = asAddressValue(address); + // TODO: handle Constants here + Variable input = load(inputVal); + append(new StoreReleaseOp(getMemoryKind(kind), storeAddress, input, state)); + } + public Value emitCompareAndSwap(Value address, Value expectedValue, Value newValue, Value trueValue, Value falseValue) { PlatformKind kind = newValue.getPlatformKind(); assert kind == expectedValue.getPlatformKind(); @@ -314,4 +319,10 @@ emitMove(obj, address); append(new HSAILMove.NullCheckOp(obj, state)); } + + public Variable emitWorkItemAbsId() { + Variable result = newVariable(Kind.Int); + append(new WorkItemAbsIdOp(result)); + return result; + } } diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectLoadAcquireNode.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectLoadAcquireNode.java Tue Jun 10 22:36:26 2014 +0200 @@ -0,0 +1,52 @@ +/* + * Copyright (c) 
2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package com.oracle.graal.hotspot.hsail.replacements; + +import com.oracle.graal.api.meta.*; +import com.oracle.graal.nodes.*; +import com.oracle.graal.nodes.spi.*; +import com.oracle.graal.replacements.nodes.*; +import com.oracle.graal.hotspot.hsail.*; +import com.oracle.graal.word.*; + +public class HSAILDirectLoadAcquireNode extends DirectReadNode { + + public HSAILDirectLoadAcquireNode(ValueNode address, Kind readKind) { + super(address, readKind); + } + + @Override + public void generate(NodeLIRBuilderTool gen) { + HSAILHotSpotLIRGenerator hsailgen = (HSAILHotSpotLIRGenerator) (gen.getLIRGeneratorTool()); + Value result = hsailgen.emitLoadAcquire(getKind(), gen.operand(getAddress()), null); + gen.setResult(this, result); + } + + @NodeIntrinsic + public static native long loadAcquire(long address, @ConstantNodeParameter Kind kind); + + public static long loadAcquireLong(Word address) { + return loadAcquire(address.rawValue(), Kind.Long); + } + +} diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectStoreReleaseNode.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILDirectStoreReleaseNode.java Tue Jun 10 22:36:26 2014 +0200 @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package com.oracle.graal.hotspot.hsail.replacements; + +import com.oracle.graal.api.meta.*; +import com.oracle.graal.nodes.*; +import com.oracle.graal.nodes.spi.*; +import com.oracle.graal.replacements.nodes.*; +import com.oracle.graal.hotspot.hsail.*; +import com.oracle.graal.word.*; + +public class HSAILDirectStoreReleaseNode extends DirectStoreNode { + + public HSAILDirectStoreReleaseNode(ValueNode address, ValueNode value, Kind kind) { + super(address, value, kind); + } + + @Override + public void generate(NodeLIRBuilderTool gen) { + HSAILHotSpotLIRGenerator hsailgen = (HSAILHotSpotLIRGenerator) (gen.getLIRGeneratorTool()); + Value v = gen.operand(getValue()); + hsailgen.emitStoreRelease(getKind(), gen.operand(getAddress()), v, null); + } + + @NodeIntrinsic + public static native void storeRelease(long address, long value, @ConstantNodeParameter Kind kind); + + public static void storeReleaseLong(Word address, long value) { + storeRelease(address.rawValue(), value, Kind.Long); + } + +} diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILHotSpotReplacementsUtil.java Tue Jun 10 22:36:26 
2014 +0200 @@ -43,11 +43,37 @@ hsailRegisters = registers; } + public static final LocationIdentity TLAB_INFO_LOCATION = new NamedLocationIdentity("TlabInfo"); + public static final LocationIdentity TLABINFO_LASTGOODTOP_LOCATION = new NamedLocationIdentity("TlabInfoLastGoodTop"); + public static final LocationIdentity TLABINFO_END_LOCATION = new NamedLocationIdentity("TlabInfoEnd"); + public static final LocationIdentity TLABINFO_TOP_LOCATION = new NamedLocationIdentity("TlabInfoTop"); + public static final LocationIdentity TLABINFO_START_LOCATION = new NamedLocationIdentity("TlabInfoStart"); + public static final LocationIdentity TLABINFO_ALLOCINFO_LOCATION = new NamedLocationIdentity("TlabInfoAllocInfo"); + public static final LocationIdentity TLABINFO_ORIGINALTOP_LOCATION = new NamedLocationIdentity("TlabInfoOriginalTop"); + public static final LocationIdentity TLABINFO_DONORTHREAD_LOCATION = new NamedLocationIdentity("TlabInfoDonorThread"); + + public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLNEXT_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolNext"); + public static final LocationIdentity ALLOCINFO_TLABINFOSPOOLEND_LOCATION = new NamedLocationIdentity("AllocInfoTlabInfosPoolEnd"); + public static final LocationIdentity ALLOCINFO_TLABALIGNRESERVEBYTES_LOCATION = new NamedLocationIdentity("AllocInfoTlabAlignreservebytes"); + /** - * Gets the value of the thread register as a Word. + * Gets the value of the thread register as a Word. There is a level of indirection here. Thread + * register actually points to a holder for tlab info. 
*/ - public static Word thread() { - return registerAsWord(threadRegister(), true, false); + public static Word getTlabInfoPtr() { + Word threadRegAsWord = registerAsWord(threadRegister(), true, false); + return threadRegAsWord.readWord(0, TLAB_INFO_LOCATION); + } + + public static Word getTlabInfoPtrLoadAcquire() { + Word threadRegAsWord = registerAsWord(threadRegister(), true, false); + return Word.unsigned(HSAILDirectLoadAcquireNode.loadAcquireLong(threadRegAsWord)); + } + + public static void writeTlabInfoPtrStoreRelease(Word val) { + // this only gets done in the waiting loop so we will always use Store Release + Word threadRegAsWord = registerAsWord(threadRegister(), true, false); + HSAILDirectStoreReleaseNode.storeReleaseLong(threadRegAsWord, val.rawValue()); } @Fold @@ -55,19 +81,64 @@ return hsailRegisters.getThreadRegister(); } - public static Word atomicGetAndAddTlabTop(Word thread, int size) { - return Word.unsigned(AtomicReadAndAddNode.getAndAddLong(null, thread.rawValue() + threadTlabTopOffset(), size, TLAB_TOP_LOCATION)); + public static Word atomicGetAndAddTlabInfoTop(Word tlabInfo, int delta) { + return Word.unsigned(AtomicReadAndAddNode.getAndAddLong(null, tlabInfo.rawValue() + config().hsailTlabInfoTopOffset, delta, TLABINFO_TOP_LOCATION)); + } + + public static Word readTlabInfoEnd(Word tlabInfo) { + return tlabInfo.readWord(config().hsailTlabInfoEndOffset, TLABINFO_END_LOCATION); + } + + public static Word readTlabInfoStart(Word tlabInfo) { + return tlabInfo.readWord(config().hsailTlabInfoStartOffset, TLABINFO_START_LOCATION); + } + + public static void writeTlabInfoLastGoodTop(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoLastGoodTopOffset, val, TLABINFO_LASTGOODTOP_LOCATION); + } + + public static void writeTlabInfoStart(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoStartOffset, val, TLABINFO_START_LOCATION); + } + + public static void writeTlabInfoTop(Word tlabInfo, Word val) { + 
tlabInfo.writeWord(config().hsailTlabInfoTopOffset, val, TLABINFO_TOP_LOCATION); + } + + public static void writeTlabInfoEnd(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoEndOffset, val, TLABINFO_END_LOCATION); } - public static final LocationIdentity TLAB_PFTOP_LOCATION = new NamedLocationIdentity("TlabPfTop"); + public static Word readTlabInfoAllocInfo(Word tlabInfo) { + return tlabInfo.readWord(config().hsailTlabInfoAllocInfoOffset, TLABINFO_ALLOCINFO_LOCATION); + } - @Fold - public static int threadTlabPfTopOffset() { - return config().threadTlabPfTopOffset(); + public static void writeTlabInfoAllocInfo(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoAllocInfoOffset, val, TLABINFO_ALLOCINFO_LOCATION); + } + + public static void writeTlabInfoOriginalTop(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoOriginalTopOffset, val, TLABINFO_ORIGINALTOP_LOCATION); } - public static void writeTlabPfTop(Word thread, Word val) { - thread.writeWord(threadTlabPfTopOffset(), val, TLAB_PFTOP_LOCATION); + public static void writeTlabInfoDonorThread(Word tlabInfo, Word val) { + tlabInfo.writeWord(config().hsailTlabInfoDonorThreadOffset, val, TLABINFO_DONORTHREAD_LOCATION); + } + + public static Word readTlabInfoDonorThread(Word tlabInfo) { + return tlabInfo.readWord(config().hsailTlabInfoDonorThreadOffset, TLABINFO_DONORTHREAD_LOCATION); + } + + public static Word readAllocInfoTlabInfosPoolEnd(Word allocInfo) { + return allocInfo.readWord(config().hsailAllocInfoTlabInfosPoolEndOffset, ALLOCINFO_TLABINFOSPOOLEND_LOCATION); + } + + public static Word readAllocInfoTlabAlignReserveBytes(Word allocInfo) { + return allocInfo.readWord(config().hsailAllocInfoTlabAlignReserveBytesOffset, ALLOCINFO_TLABALIGNRESERVEBYTES_LOCATION); + } + + public static Word atomicGetAndAddAllocInfoTlabInfosPoolNext(Word allocInfo, int delta) { + return Word.unsigned(AtomicReadAndAddNode.getAndAddLong(null, allocInfo.rawValue() + 
config().hsailAllocInfoTlabInfosPoolNextOffset, delta, ALLOCINFO_TLABINFOSPOOLNEXT_LOCATION)); } } diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java --- a/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILNewObjectSnippets.java Tue Jun 10 22:36:26 2014 +0200 @@ -60,6 +60,9 @@ @Option(help = "In HSAIL allocation, allow allocation from eden as fallback if TLAB is full") static final OptionValue HsailUseEdenAllocate = new OptionValue<>(false); + @Option(help = "In HSAIL allocation, allow GPU to allocate a new tlab if TLAB is full") + static final OptionValue HsailNewTlabAllocate = new OptionValue<>(true); + @Option(help = "Estimate of number of bytes allocated by each HSAIL workitem, used to size TLABs") static public final OptionValue HsailAllocBytesPerWorkitem = new OptionValue<>(64); @@ -67,44 +70,130 @@ } private static final boolean hsailUseEdenAllocate = HsailUseEdenAllocate.getValue(); + private static final boolean hsailNewTlabAllocate = HsailNewTlabAllocate.getValue(); + + protected static Word fillNewTlabInfoWithTlab(Word oldTlabInfo) { + Word allocInfo = readTlabInfoAllocInfo(oldTlabInfo); + Word newTlabInfo = atomicGetAndAddAllocInfoTlabInfosPoolNext(allocInfo, config().hsailTlabInfoSize); + Word tlabInfosPoolEnd = readAllocInfoTlabInfosPoolEnd(allocInfo); + if (newTlabInfo.aboveOrEqual(tlabInfosPoolEnd)) { + // could not get a new tlab info, mark zero and we will later deoptimize + return (Word.zero()); + } + + // make new size depend on old tlab size + Word newTlabSize = readTlabInfoEnd(oldTlabInfo).subtract(readTlabInfoStart(oldTlabInfo)); + // try to allocate a new tlab + Word tlabStart = NewInstanceStub.edenAllocate(newTlabSize, false); + 
writeTlabInfoStart(newTlabInfo, tlabStart); // write this field even if zero + if (tlabStart.equal(0)) { + // could not get a new tlab, mark zero and we will later deoptimize + return (Word.zero()); + } + // here we have a new tlab and a tlabInfo, we can fill it in + writeTlabInfoTop(newTlabInfo, tlabStart); + writeTlabInfoOriginalTop(newTlabInfo, tlabStart); + // set end so that we leave space for the tlab "alignment reserve" + Word alignReserveBytes = readAllocInfoTlabAlignReserveBytes(allocInfo); + writeTlabInfoEnd(newTlabInfo, tlabStart.add(newTlabSize.subtract(alignReserveBytes))); + writeTlabInfoAllocInfo(newTlabInfo, allocInfo); + writeTlabInfoDonorThread(newTlabInfo, readTlabInfoDonorThread(oldTlabInfo)); + return (newTlabInfo); + } + + protected static Word allocateFromTlabSlowPath(Word fastPathTlabInfo, int size, Word fastPathTop, Word fastPathEnd) { + // eventually this will be a separate call, not inlined + + // we come here from the fastpath allocation + // here we know that the tlab has overflowed (top + size > end) + // find out if we are the first overflower + Word tlabInfo = fastPathTlabInfo; + Word top = fastPathTop; + Word end = fastPathEnd; + + // start a loop where we try to get a new tlab and then try to allocate from it + // keep doing this until we run out of tlabs or tlabInfo structures + // initialize result with error return value + Word result = Word.zero(); + while (result.equal(Word.zero()) && tlabInfo.notEqual(Word.zero())) { + boolean firstOverflower = top.belowOrEqual(end); + if (firstOverflower) { + // store the last good top before overflow into last_good_top field + // we will move it back into top later when back in the VM + writeTlabInfoLastGoodTop(tlabInfo, top); + } + + // if all this allocate tlab from gpu logic is disabled, + // just immediately set tlabInfo to 0 here + if (!hsailNewTlabAllocate) { + tlabInfo = Word.zero(); + } else { + // loop here waiting for the first overflower to get a new tlab + // note that on an hsa 
device we must be careful how we loop in order to ensure + // "forward progress". For example we must not break out of the loop. + Word oldTlabInfo = tlabInfo; + do { + if (firstOverflower) { + // allocate new tlabInfo and new tlab to fill it, returning 0 if any + // problems + // this will get all spinners out of this loop. + tlabInfo = fillNewTlabInfoWithTlab(oldTlabInfo); + writeTlabInfoPtrStoreRelease(tlabInfo); + } else { + tlabInfo = getTlabInfoPtrLoadAcquire(); + } + } while (tlabInfo.equal(oldTlabInfo)); + // when we get out of the loop if tlabInfoPtr contains 0, it means we + // can't get any more tlabs and will have to deoptimize + // otherwise, we have a valid new tlabInfo/tlab and can try to allocate again. + if (tlabInfo.notEqual(0)) { + top = atomicGetAndAddTlabInfoTop(tlabInfo, size); + end = readTlabInfoEnd(tlabInfo); + Word newTop = top.add(size); + if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) { + result = top; + } + } + } + } // while (result == 0) && (tlabInfo != 0)) + return result; + } + + protected static Object addressToFormattedObject(Word addr, @ConstantParameter int size, Word hub, Word prototypeMarkWord, @ConstantParameter boolean fillContents, + @ConstantParameter String typeContext) { + Object result = formatObject(hub, size, addr, prototypeMarkWord, fillContents, true, false, true); + profileAllocation("instance", size, typeContext); + return piCast(verifyOop(result), StampFactory.forNodeIntrinsic()); + } @Snippet public static Object allocateInstanceAtomic(@ConstantParameter int size, Word hub, Word prototypeMarkWord, @ConstantParameter boolean fillContents, @ConstantParameter String typeContext) { - Word thread = thread(); boolean haveResult = false; if (useTLAB()) { - Word top = atomicGetAndAddTlabTop(thread, size); - Word end = readTlabEnd(thread); - Word newTop = top.add(size); - if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) { - // writeTlabTop(thread, newTop) was done by the 
atomicGetAndAdd - Object result = formatObject(hub, size, top, prototypeMarkWord, fillContents, true, false, true); - profileAllocation("instance", size, typeContext); - return piCast(verifyOop(result), StampFactory.forNodeIntrinsic()); - } else { - // only one overflower will be the first overflower, detectable because - // oldtop was still below end - if (top.belowOrEqual(end)) { - // hack alert: store the last good top before overflow into pf_top - // we will move it back into top later when back in the VM - writeTlabPfTop(thread, top); + // inlining this manually here because it resulted in better fastpath codegen + Word tlabInfo = getTlabInfoPtr(); + if (probability(FAST_PATH_PROBABILITY, tlabInfo.notEqual(0))) { + Word top = atomicGetAndAddTlabInfoTop(tlabInfo, size); + Word end = readTlabInfoEnd(tlabInfo); + Word newTop = top.add(size); + if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) { + return addressToFormattedObject(top, size, hub, prototypeMarkWord, fillContents, typeContext); + } else { + Word addr = allocateFromTlabSlowPath(tlabInfo, size, top, end); + if (addr.notEqual(0)) { + return addressToFormattedObject(addr, size, hub, prototypeMarkWord, fillContents, typeContext); + } } - // useless logic but see notes on deopt path below - haveResult = newTop.belowOrEqual(end); } } + + // we could not allocate from tlab, try allocating directly from eden if (hsailUseEdenAllocate) { - // originally: - // result = NewInstanceStubCall.call(hub); - - // we could not allocate from tlab, try allocating directly from eden // false for no logging - Word memory = NewInstanceStub.edenAllocate(Word.unsigned(size), false); - if (memory.notEqual(0)) { + Word addr = NewInstanceStub.edenAllocate(Word.unsigned(size), false); + if (addr.notEqual(0)) { new_eden.inc(); - Object result = formatObject(hub, size, memory, prototypeMarkWord, fillContents, true, false, true); - profileAllocation("instance", size, typeContext); - return piCast(verifyOop(result), 
StampFactory.forNodeIntrinsic()); + return addressToFormattedObject(addr, size, hub, prototypeMarkWord, fillContents, typeContext); } } // haveResult test here helps avoid dropping earlier stores were seen to be dropped without @@ -126,44 +215,43 @@ return allocateArrayAtomicImpl(hub, length, prototypeMarkWord, headerSize, log2ElementSize, fillContents, maybeUnroll, typeContext); } + protected static Object addressToFormattedArray(Word addr, int allocationSize, int length, int headerSize, Word hub, Word prototypeMarkWord, boolean fillContents, boolean maybeUnroll, + @ConstantParameter String typeContext) { + // we are not in a stub so we can set useSnippetCounters to true + Object result = formatArray(hub, allocationSize, length, headerSize, addr, prototypeMarkWord, fillContents, maybeUnroll, true); + profileAllocation("array", allocationSize, typeContext); + return piArrayCast(verifyOop(result), length, StampFactory.forNodeIntrinsic()); + } + private static Object allocateArrayAtomicImpl(Word hub, int length, Word prototypeMarkWord, int headerSize, int log2ElementSize, boolean fillContents, boolean maybeUnroll, String typeContext) { int alignment = wordSize(); int allocationSize = computeArrayAllocationSize(length, alignment, headerSize, log2ElementSize); - Word thread = thread(); boolean haveResult = false; if (useTLAB()) { - Word top = atomicGetAndAddTlabTop(thread, allocationSize); - Word end = readTlabEnd(thread); - Word newTop = top.add(allocationSize); - if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) { - // writeTlabTop(thread, newTop) was done by the atomicGetAndAdd - newarray_loopInit.inc(); - // we are not in a stub so we can set useSnippetCounters to true - Object result = formatArray(hub, allocationSize, length, headerSize, top, prototypeMarkWord, fillContents, maybeUnroll, true); - profileAllocation("array", allocationSize, typeContext); - return piArrayCast(verifyOop(result), length, StampFactory.forNodeIntrinsic()); - } else { - // 
only one overflower will be the first overflower, detectable because - // oldtop was still below end - if (top.belowOrEqual(end)) { - // hack alert: store the last good top before overflow into pf_top - // we will move it back into top later when back in the VM - writeTlabPfTop(thread, top); + // inlining this manually here because it resulted in better fastpath codegen + Word tlabInfo = getTlabInfoPtr(); + if (probability(FAST_PATH_PROBABILITY, tlabInfo.notEqual(0))) { + Word top = atomicGetAndAddTlabInfoTop(tlabInfo, allocationSize); + Word end = readTlabInfoEnd(tlabInfo); + Word newTop = top.add(allocationSize); + if (probability(FAST_PATH_PROBABILITY, newTop.belowOrEqual(end))) { + return addressToFormattedArray(top, allocationSize, length, headerSize, hub, prototypeMarkWord, fillContents, maybeUnroll, typeContext); + } else { + Word addr = allocateFromTlabSlowPath(tlabInfo, allocationSize, top, end); + if (addr.notEqual(0)) { + return addressToFormattedArray(addr, allocationSize, length, headerSize, hub, prototypeMarkWord, fillContents, maybeUnroll, typeContext); + } } - // useless logic but see notes on deopt path below - haveResult = newTop.belowOrEqual(end); } } + // we could not allocate from tlab, try allocating directly from eden if (hsailUseEdenAllocate) { // false for no logging - Word memory = NewInstanceStub.edenAllocate(Word.unsigned(allocationSize), false); - if (memory.notEqual(0)) { + Word addr = NewInstanceStub.edenAllocate(Word.unsigned(allocationSize), false); + if (addr.notEqual(0)) { newarray_eden.inc(); - // we are not in a stub so we can set useSnippetCounters to true - Object result = formatArray(hub, allocationSize, length, headerSize, memory, prototypeMarkWord, fillContents, maybeUnroll, true); - profileAllocation("array", allocationSize, typeContext); - return piArrayCast(verifyOop(result), length, StampFactory.forNodeIntrinsic()); + return addressToFormattedArray(addr, allocationSize, length, headerSize, hub, prototypeMarkWord, 
fillContents, maybeUnroll, typeContext); } } if (!haveResult) { @@ -250,6 +338,7 @@ private static final SnippetCounter new_eden = new SnippetCounter(countersNew, "eden", "used edenAllocate"); private static final SnippetCounter.Group countersNewArray = SnippetCounters.getValue() ? new SnippetCounter.Group("NewArray") : null; - private static final SnippetCounter newarray_loopInit = new SnippetCounter(countersNewArray, "tlabLoopInit", "TLAB alloc with zeroing in a loop"); + // private static final SnippetCounter newarray_loopInit = new SnippetCounter(countersNewArray, + // "tlabLoopInit", "TLAB alloc with zeroing in a loop"); private static final SnippetCounter newarray_eden = new SnippetCounter(countersNewArray, "eden", "used edenAllocate"); } diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILWorkItemAbsIdNode.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.hotspot.hsail/src/com/oracle/graal/hotspot/hsail/replacements/HSAILWorkItemAbsIdNode.java Tue Jun 10 22:36:26 2014 +0200 @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package com.oracle.graal.hotspot.hsail.replacements; + +import com.oracle.graal.api.meta.*; +import com.oracle.graal.compiler.common.type.*; +import com.oracle.graal.nodes.*; +import com.oracle.graal.nodes.spi.*; +import com.oracle.graal.hotspot.hsail.*; + +public class HSAILWorkItemAbsIdNode extends FixedWithNextNode implements LIRLowerable { + + public HSAILWorkItemAbsIdNode() { + super(StampFactory.forKind(Kind.Int)); + } + + @Override + public void generate(NodeLIRBuilderTool gen) { + HSAILHotSpotLIRGenerator hsailgen = (HSAILHotSpotLIRGenerator) (gen.getLIRGeneratorTool()); + Value result = hsailgen.emitWorkItemAbsId(); + gen.setResult(this, result); + } + + @NodeIntrinsic + public static native int getWorkItemAbsId(); + +} diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java --- a/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.hotspot/src/com/oracle/graal/hotspot/HotSpotVMConfig.java Tue Jun 10 22:36:26 2014 +0200 @@ -1029,9 +1029,10 @@ */ @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_notice_safepoints", type = "jint*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailNoticeSafepointsOffset; @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_deopt_occurred", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptOccurredOffset; - @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_never_ran_array", type = "jboolean *", get = HotSpotVMField.Type.OFFSET) 
@Stable public int hsailNeverRanArrayOffset; + @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_never_ran_array", type = "jboolean*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailNeverRanArrayOffset; @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_deopt_next_index", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptNextIndexOffset; - @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_donor_threads", type = "JavaThread**", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDonorThreadsOffset; + @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_alloc_info", type = "HSAILAllocationInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoOffset; + @HotSpotVMField(name = "Hsail::HSAILDeoptimizationInfo::_cur_tlab_info", type = "HSAILTlabInfo**", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailCurTlabInfoOffset; @HotSpotVMField(name = "Hsail::HSAILKernelDeoptimization::_workitemid", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptimizationWorkItem; @HotSpotVMField(name = "Hsail::HSAILKernelDeoptimization::_actionAndReason", type = "jint", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailDeoptimizationReason; @@ -1043,6 +1044,20 @@ @HotSpotVMType(name = "Hsail::HSAILKernelDeoptimization", get = HotSpotVMType.Type.SIZE) @Stable public int hsailKernelDeoptimizationHeaderSize; @HotSpotVMType(name = "Hsail::HSAILDeoptimizationInfo", get = HotSpotVMType.Type.SIZE) @Stable public int hsailDeoptimizationInfoHeaderSize; + @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_infos_pool_start", type = "HSAILTlabInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabInfosPoolStartOffset; + @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_infos_pool_next", type = "HSAILTlabInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabInfosPoolNextOffset; + @HotSpotVMField(name = 
"HSAILAllocationInfo::_tlab_infos_pool_end", type = "HSAILTlabInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabInfosPoolEndOffset; + @HotSpotVMField(name = "HSAILAllocationInfo::_tlab_align_reserve_bytes", type = "size_t", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailAllocInfoTlabAlignReserveBytesOffset; + + @HotSpotVMField(name = "HSAILTlabInfo::_start", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoStartOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoTopOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_end", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoEndOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_last_good_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoLastGoodTopOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_original_top", type = "HeapWord*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoOriginalTopOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_donor_thread", type = "JavaThread*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoDonorThreadOffset; + @HotSpotVMField(name = "HSAILTlabInfo::_alloc_info", type = "HSAILAllocationInfo*", get = HotSpotVMField.Type.OFFSET) @Stable public int hsailTlabInfoAllocInfoOffset; + @HotSpotVMType(name = "HSAILTlabInfo", get = HotSpotVMType.Type.SIZE) @Stable public int hsailTlabInfoSize; + /** * Mark word right shift to get identity hash code. 
*/ diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.lir.hsail/src/com/oracle/graal/lir/hsail/HSAILMove.java --- a/graal/com.oracle.graal.lir.hsail/src/com/oracle/graal/lir/hsail/HSAILMove.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.lir.hsail/src/com/oracle/graal/lir/hsail/HSAILMove.java Tue Jun 10 22:36:26 2014 +0200 @@ -33,6 +33,7 @@ import com.oracle.graal.lir.*; import com.oracle.graal.lir.StandardOp.MoveOp; import com.oracle.graal.lir.asm.*; +import com.oracle.graal.hsail.*; /** * Implementation of move instructions. @@ -167,6 +168,25 @@ HSAILAddress addr = address.toAddress(); masm.emitLoad(kind, result, addr); } + + public boolean usesThreadRegister() { + return (address.toAddress().getBase() == HSAIL.threadRegister); + } + } + + /** + * A LoadOp that uses the HSAIL ld_acq instruction + */ + public static class LoadAcquireOp extends LoadOp { + public LoadAcquireOp(Kind kind, AllocatableValue result, HSAILAddressValue address, LIRFrameState state) { + super(kind, result, address, state); + } + + @Override + public void emitMemAccess(HSAILAssembler masm) { + HSAILAddress addr = address.toAddress(); + masm.emitLoadAcquire(result, addr); + } } public static class StoreOp extends MemOp { @@ -186,6 +206,22 @@ } } + /** + * A StoreOp that uses the HSAIL st_rel instruction + */ + public static class StoreReleaseOp extends StoreOp { + public StoreReleaseOp(Kind kind, HSAILAddressValue address, AllocatableValue input, LIRFrameState state) { + super(kind, address, input, state); + } + + @Override + public void emitMemAccess(HSAILAssembler masm) { + assert isRegister(input); + HSAILAddress addr = address.toAddress(); + masm.emitStoreRelease(input, addr); + } + } + public static class StoreConstantOp extends MemOp { protected final Constant input; @@ -465,4 +501,18 @@ } } + public static class WorkItemAbsIdOp extends HSAILLIRInstruction { + + @Def({REG}) protected AllocatableValue result; + + public WorkItemAbsIdOp(AllocatableValue result) 
{ + this.result = result; + } + + @Override + public void emitCode(CompilationResultBuilder crb, HSAILAssembler masm) { + masm.emitWorkItemAbsId(result); + } + } + } diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectReadNode.java --- a/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectReadNode.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectReadNode.java Tue Jun 10 22:36:26 2014 +0200 @@ -45,6 +45,10 @@ this.readKind = readKind; } + protected ValueNode getAddress() { + return address; + } + @Override public void generate(NodeLIRBuilderTool gen) { gen.setResult(this, gen.getLIRGeneratorTool().emitLoad(readKind, gen.operand(address), null)); diff -r b6ab7e7fa0a5 -r 06eedda53e14 graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectStoreNode.java --- a/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectStoreNode.java Tue Jun 10 19:08:33 2014 +0200 +++ b/graal/com.oracle.graal.replacements/src/com/oracle/graal/replacements/nodes/DirectStoreNode.java Tue Jun 10 22:36:26 2014 +0200 @@ -53,6 +53,14 @@ gen.getLIRGeneratorTool().emitStore(kind, gen.operand(address), v, null); } + protected ValueNode getAddress() { + return address; + } + + protected ValueNode getValue() { + return value; + } + /* * The kind of the store is provided explicitly in these intrinsics because it is not always * possible to determine the kind from the given value during compilation (because stack kinds diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/gpu_hsail.cpp --- a/src/gpu/hsail/vm/gpu_hsail.cpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/gpu_hsail.cpp Tue Jun 10 22:36:26 2014 +0200 @@ -69,8 +69,8 @@ {CC"executeKernel0", CC"("HS_INSTALLED_CODE"I["OBJECT"["OBJECT"["JLTHREAD"I[I)Z", FN_PTR(Hsail::execute_kernel_void_1d)}, }; -void * 
Hsail::_device_context = NULL; -jint Hsail::_notice_safepoints = false; +void* Hsail::_device_context = NULL; +jint Hsail::_notice_safepoints = false; Hsail::okra_create_context_func_t Hsail::_okra_create_context; Hsail::okra_create_kernel_func_t Hsail::_okra_create_kernel; @@ -85,43 +85,6 @@ Hsail::okra_clearargs_func_t Hsail::_okra_clearargs; Hsail::okra_register_heap_func_t Hsail::_okra_register_heap; -struct Stats { - int _dispatches; - int _deopts; - int _overflows; - bool _changeSeen; - -public: - Stats() { - _dispatches = _deopts = _overflows = 0; - _changeSeen = false; - } - - void incDeopts() { - _deopts++; - _changeSeen = true; - } - void incOverflows() { - _overflows++; - _changeSeen = true; - } - - void finishDispatch() { - _dispatches++; - if (_changeSeen) { - // print(); - _changeSeen = false; - } - } - - void print() { - tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows); - } - -}; - -static Stats kernelStats; - //static jint in_kernel = 0; void Hsail::notice_safepoints() { @@ -165,7 +128,7 @@ return execute_kernel_void_1d_internal((address) kernel, dimX, args, mh, nm, oops_save, donor_threads, allocBytesPerWorkitem, oop_map_array, CHECK_0); GPU_END -static void showRanges(jboolean *a, int len) { +static void showRanges(jboolean* a, int len) { // show ranges bool lookFor = true; for (int i = 0; i < len; i++) { @@ -182,38 +145,6 @@ } } -// fill and retire old tlab and get a new one -// if we can't get one, no problem someone will eventually do a gc -void Hsail::getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail) { - tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) - - // get a size for a new tlab that is at least tlabMinHsail. 
- size_t new_tlab_size = tlab->compute_size(tlabMinHsail); - if (new_tlab_size == 0) return; - - HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size); - if (tlab_start == NULL) return; - - // ..and clear it if required - if (ZeroTLAB) { - Copy::zero_to_words(tlab_start, new_tlab_size); - } - // and init the tlab pointers - tlab->fill(tlab_start, tlab_start, new_tlab_size); -} - -static void printTlabInfo (ThreadLocalAllocBuffer* tlab) { - HeapWord *start = tlab->start(); - HeapWord *top = tlab->top(); - HeapWord *end = tlab->end(); - // sizes are in bytes - size_t tlabFree = tlab->free() * HeapWordSize; - size_t tlabUsed = tlab->used() * HeapWordSize; - size_t tlabSize = tlabFree + tlabUsed; - double freePct = 100.0 * (double) tlabFree/(double) tlabSize; - tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); -} - class OopSaver : public StackObj { private: objArrayOop _oopsSaveArray; @@ -260,21 +191,21 @@ _oopMapArray = (typeArrayOop) JNIHandles::resolve(_oop_map_array); } - void * getOopForBit(HSAILFrame * hsailFrame, int bit) { + void* getOopForBit(HSAILFrame* hsailFrame, int bit) { assert(isOop(hsailFrame, bit), ""); - void *oop; + void* oop; if (bit < hsailFrame->num_d_regs()) { // d register oop = (void*) hsailFrame->get_d_reg(bit); } else { // stack slot int stackOffset = (bit - hsailFrame->num_d_regs()) * 8; // 8 bytes per stack slot - oop = (void *) hsailFrame->get_stackslot64(stackOffset); + oop = (void*) hsailFrame->get_stackslot64(stackOffset); } return oop; } - void putOopForBit(HSAILFrame * hsailFrame, int bit, void *oop) { + void putOopForBit(HSAILFrame* hsailFrame, int bit, void* oop) { assert(isOop(hsailFrame, bit), ""); if (bit < hsailFrame->num_d_regs()) { // d register @@ -286,7 +217,7 @@ } } - void saveOopsFromFrame(HSAILFrame * hsailFrame, int deoptSlot){ + void saveOopsFromFrame(HSAILFrame* hsailFrame, int deoptSlot){ // as used, no need to resolve arrays on each call int 
oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); @@ -300,7 +231,7 @@ } } - void restoreOopsToFrame(HSAILFrame * hsailFrame, int deoptSlot, int workitem){ + void restoreOopsToFrame(HSAILFrame* hsailFrame, int deoptSlot, int workitem){ // need to re-resolve on each restore resolveArrays(); int oopsPerDeopt = hsailFrame->num_d_regs() + hsailFrame->num_stack_slots(); @@ -310,13 +241,13 @@ if (isOop(hsailFrame, bit)) { // the dregister or stack slot at this bit is an oop, retrieve it from array and put back in frame int saveArrayIndex = deoptSlot * oopsPerDeopt + bit; - void * newValue = (void *) _oopsSaveArray->obj_at(saveArrayIndex); - void * oldValue = getOopForBit(hsailFrame, bit); + void* newValue = (void*) _oopsSaveArray->obj_at(saveArrayIndex); + void* oldValue = getOopForBit(hsailFrame, bit); assert((oldValue != 0 ? newValue != 0 : newValue == 0), "bad dregValue retrieved"); if (newValue != oldValue) { if (TraceGPUInteraction) { int numDRegs = hsailFrame->num_d_regs(); - const char *name = (bit < numDRegs ? "$d" : "stk"); + const char* name = (bit < numDRegs ? "$d" : "stk"); int num = (bit < numDRegs ? 
bit : bit - numDRegs); tty->print_cr("oop moved for %s%d, workitem %d, slot %d, old=%p, new=%p", name, num, workitem, deoptSlot, oldValue, newValue); @@ -327,7 +258,7 @@ } } - bool isOop(HSAILFrame * hsailFrame, int bit){ + bool isOop(HSAILFrame* hsailFrame, int bit){ // re-resolve on each access resolveArrays(); if (bit > hsailFrame->num_d_regs() + hsailFrame->num_stack_slots()) { @@ -347,47 +278,15 @@ }; -jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oops_save, +jboolean Hsail::execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oops_save, jobject donor_threads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS) { ResourceMark rm(THREAD); objArrayOop argsArray = (objArrayOop) JNIHandles::resolve(args); - // TODO: avoid donor thread logic if kernel does not allocate - objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads); - int numDonorThreads = donorThreadObjects->length(); - guarantee(numDonorThreads > 0, "need at least one donor thread"); - JavaThread** donorThreads = NEW_RESOURCE_ARRAY(JavaThread*, numDonorThreads); - for (int i = 0; i < numDonorThreads; i++) { - donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); - } - - - // compute tlabMinHsail based on number of workitems, number of donor - // threads, allocBytesPerWorkitem rounded up - size_t tlabMinHsail = (allocBytesPerWorkitem * dimX + (numDonorThreads - 1)) / numDonorThreads; - if (TraceGPUInteraction) { - tty->print_cr("computed tlabMinHsail = %d", tlabMinHsail); - } - - for (int i = 0; i < numDonorThreads; i++) { - JavaThread* donorThread = donorThreads[i]; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); - if (TraceGPUInteraction) { - tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); - printTlabInfo(tlab); - } - - // note: this used vs. 
free limit checking should be based on some - // heuristic where we see how much this kernel tends to allocate - if ((tlab->end() == NULL) || (tlab->free() * HeapWordSize < tlabMinHsail)) { - getNewTlabForDonorThread(tlab, tlabMinHsail); - if (TraceGPUInteraction) { - tty->print("donorThread %d, refilled tlab, -> ", i); - printTlabInfo(tlab); - } - } - } - + // We avoid HSAILAllocationInfo logic if kernel does not allocate + // in which case the donor_thread array passed in will be null + HSAILAllocationInfo* allocInfo = (donor_threads == NULL ? NULL : new HSAILAllocationInfo(donor_threads, dimX, allocBytesPerWorkitem)); + // Reset the kernel arguments _okra_clearargs(kernel); @@ -400,7 +299,11 @@ int numStackSlots = (saveAreaCounts >> 16); int bytesPerSaveArea = numSRegs * 4 + (numDRegs + numStackSlots) * 8; - e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, donorThreads); + e = new (MAX_DEOPT_SLOTS, bytesPerSaveArea) HSAILDeoptimizationInfo(MAX_DEOPT_SLOTS, bytesPerSaveArea, dimX, allocInfo); + // copy cur_tlab_infos + if (allocInfo != NULL) { + e->setCurTlabInfos(allocInfo->getCurTlabInfos()); + } } // This object sets up the kernel arguments @@ -409,8 +312,8 @@ tty->print_cr("[HSAIL] range=%d", dimX); } - // if any object passed was null, throw an exception here - // doing this means the kernel code can avoid null checks on the object parameters. + // If any object passed was null, throw an exception here. Doing this + // means the kernel code can avoid null checks on the object parameters. 
if (hka.getFirstNullParameterIndex() >= 0) { char buf[64]; sprintf(buf, "Null Kernel Parameter seen, Parameter Index: %d", hka.getFirstNullParameterIndex()); @@ -431,23 +334,9 @@ //in_kernel = 0; } - // fix up any tlab tops that overflowed - bool anyOverflows = false; - for (int i = 0; i < numDonorThreads; i++) { - JavaThread * donorThread = donorThreads[i]; - ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); - if (tlab->top() > tlab->end()) { - anyOverflows = true; - long overflowAmount = (long) tlab->top() - (long) tlab->pf_top(); - // tlab->set_top is private this ugly hack gets around that - *(long *)((char *)tlab + in_bytes(tlab->top_offset())) = (long) tlab->pf_top(); - if (TraceGPUInteraction) { - tty->print_cr("donorThread %d at %p overflowed by %ld bytes, setting last good top to %p", i, donorThread, overflowAmount, tlab->top()); - } - } - } - if (anyOverflows) { - kernelStats.incOverflows(); + // avoid HSAILAllocationInfo logic if kernel does not allocate + if (allocInfo != NULL) { + allocInfo->postKernelCleanup(); } if (UseHSAILDeoptimization) { @@ -465,13 +354,11 @@ guarantee(deoptcode == 1, msg); } } else { - kernelStats.incDeopts(); - { TraceTime t3("handle deoptimizing workitems", TraceGPUInteraction); if (TraceGPUInteraction) { tty->print_cr("deopt happened."); - HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(0); + HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(0); tty->print_cr("first deopter was workitem %d", pdeopt->workitem()); } @@ -485,7 +372,7 @@ // since slots are allocated from the beginning, we know how far to look assert(e->num_deopts() < e->num_slots(), "deopt save state overflow"); for (int k = 0; k < e->num_deopts(); k++) { - HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k); + HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k); assert (pdeopt->workitem() >= 0, "bad workitem in deopt"); // this is a workitem that deopted oopSaver.saveOopsFromFrame(pdeopt->first_frame(), k); @@ 
-494,15 +381,15 @@ // Handle any deopting workitems. int count_deoptimized = 0; for (int k = 0; k < e->num_deopts(); k++) { - HSAILKernelDeoptimization * pdeopt = e->get_deopt_save_state(k); + HSAILKernelDeoptimization* pdeopt = e->get_deopt_save_state(k); jint workitem = pdeopt->workitem(); if (workitem != -1) { int deoptId = pdeopt->pc_offset(); - HSAILFrame *hsailFrame = pdeopt->first_frame(); + HSAILFrame* hsailFrame = pdeopt->first_frame(); - // update the hsailFrame from the oopsSaveArray - // will re-resolve the handles each time + // Update the hsailFrame from the oopsSaveArray + // will re-resolve the handles each time. oopSaver.restoreOopsToFrame(hsailFrame, k, workitem); JavaValue result(T_VOID); @@ -511,7 +398,7 @@ javaArgs.push_int(deoptId); javaArgs.push_long((jlong) hsailFrame); - // override the deoptimization action with Action_none until we decide + // Override the deoptimization action with Action_none until we decide // how to handle the other actions. int myActionReason = Deoptimization::make_trap_request(Deoptimization::trap_request_reason(pdeopt->reason()), Deoptimization::Action_none); javaArgs.push_int(myActionReason); @@ -551,7 +438,7 @@ // turn off verbose trace stuff for javacall arg setup bool savedTraceGPUInteraction = TraceGPUInteraction; TraceGPUInteraction = false; - jboolean *never_ran_array = e->never_ran_array(); + jboolean* never_ran_array = e->never_ran_array(); if (handleNeverRansHere) { for (int k = 0; k < dimX; k++) { if (never_ran_array[k]) { @@ -562,9 +449,10 @@ JavaCallArguments javaArgs; // re-resolve the args_handle here objArrayOop resolvedArgsArray = (objArrayOop) JNIHandles::resolve(args); - // This object sets up the javaCall arguments - // the way argsArray is set up, this should work for instance methods as well - // (the receiver will be the first oop pushed) + + // This object sets up the javaCall arguments. 
The way + // argsArray is set up, this should work for instance + // methods as well (the receiver will be the first oop pushed) HSAILJavaCallArguments hjca(&javaArgs, k, mh->signature(), resolvedArgsArray, mh->is_static()); if (mh->is_static()) { JavaCalls::call_static(&result, methKlass, mh->name(), mh->signature(), &javaArgs, THREAD); @@ -583,19 +471,19 @@ } delete e; + delete allocInfo; } - kernelStats.finishDispatch(); return success; } -GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle)) +GPU_ENTRY(jlong, Hsail::generate_kernel, (JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle)) guarantee(_okra_create_kernel != NULL, "[HSAIL] Okra not linked"); ResourceMark rm; jsize name_len = env->GetStringLength(name_handle); jsize code_len = env->GetArrayLength(code_handle); char* name = NEW_RESOURCE_ARRAY(char, name_len + 1); - unsigned char *code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1); + unsigned char* code = NEW_RESOURCE_ARRAY(unsigned char, code_len + 1); code[code_len] = 0; name[name_len] = 0; @@ -631,7 +519,7 @@ return false; \ } \ -GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv *env, jclass)) +GPU_ENTRY(jboolean, Hsail::initialize, (JNIEnv* env, jclass)) if (okra_library_name == NULL) { if (TraceGPUInteraction) { tty->print_cr("Unsupported HSAIL platform"); @@ -641,14 +529,14 @@ // here we know we have a valid okra_library_name to try to load char ebuf[O_BUFLEN]; - char *okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_"); + char* okra_lib_name_from_env_var = getenv("_OKRA_SIM_LIB_PATH_"); if (okra_lib_name_from_env_var != NULL) { okra_library_name = okra_lib_name_from_env_var; } if (TraceGPUInteraction) { tty->print_cr("[HSAIL] library is %s", okra_library_name); } - void *okra_lib_handle = NULL; + void* okra_lib_handle = NULL; #if defined(LINUX) // Check first if the Okra library is already loaded. // TODO: Figure out how to do this on other OSes. 
@@ -668,8 +556,8 @@ guarantee(_okra_create_context == NULL, "cannot repeat GPU initialization"); - // at this point we know okra_lib_handle is valid whether we loaded - // here or earlier. In either case, we can lookup the functions + // At this point we know okra_lib_handle is valid whether we loaded + // here or earlier. In either case, we can lookup the functions. LOOKUP_OKRA_FUNCTION(okra_create_context, okra_create_context); LOOKUP_OKRA_FUNCTION(okra_create_kernel, okra_create_kernel); LOOKUP_OKRA_FUNCTION(okra_push_object, okra_push_object); diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/gpu_hsail.hpp --- a/src/gpu/hsail/vm/gpu_hsail.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/gpu_hsail.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -22,12 +22,47 @@ * */ -#ifndef GPU_HSAIL_HPP -#define GPU_HSAIL_HPP +#ifndef GPU_HSAIL_VM_GPU_HSAIL_HPP +#define GPU_HSAIL_VM_GPU_HSAIL_HPP #include "utilities/exceptions.hpp" #include "graal/graalEnv.hpp" #include "gpu_hsail_Frame.hpp" +#include "gpu_hsail_Tlab.hpp" + +struct HSAILKernelStats { // simple counters, reported on tty by print() + int _dispatches; // incremented by finishDispatch() + int _deopts; // incremented by incDeopts() + int _overflows; // incremented by incOverflows() + bool _changeSeen; // set by incDeopts()/incOverflows(), cleared by finishDispatch() + +public: + HSAILKernelStats() { + _dispatches = _deopts = _overflows = 0; + _changeSeen = false; + } + + void incDeopts() { + _deopts++; + _changeSeen = true; + } + void incOverflows() { + _overflows++; + _changeSeen = true; + } + + void finishDispatch() { + _dispatches++; + if (_changeSeen) { + // print(); (reporting is currently disabled; only the flag is reset) + _changeSeen = false; + } + } + + void print() { + tty->print_cr("Disp=%d, Deopts=%d, Ovflows=%d", _dispatches, _deopts, _overflows); + } +}; class Hsail : public Gpu { @@ -46,9 +81,9 @@ inline jint workitem() { return _workitemid; } inline jint reason() { return _actionAndReason; } inline jint pc_offset() { return first_frame()->pc_offset(); } - inline HSAILFrame *first_frame() { + inline HSAILFrame* first_frame() { // starts after the "header" fields - return (HSAILFrame *) (((jbyte *) this) + sizeof(*this)); + return (HSAILFrame*) (((jbyte*) this)
+ sizeof(*this)); } }; @@ -56,38 +91,41 @@ // TODO: query the device to get this number #define MAX_DEOPT_SLOTS (8 * 40 * 64) + class HSAILDeoptimizationInfo : public CHeapObj { friend class VMStructs; private: jint* _notice_safepoints; jint _deopt_occurred; jint _deopt_next_index; - JavaThread** _donor_threads; jint _num_slots; jint _deopt_span; + HSAILTlabInfo** _cur_tlab_info; // copy of what was in the HSAILAllocationInfo, to avoid an extra indirection + HSAILAllocationInfo* _alloc_info; char _ignore; // keep a pointer last so save area following it is word aligned - jboolean * _never_ran_array; + jboolean* _never_ran_array; public: + // static HSAILKernelStats kernelStats; HSAILKernelDeoptimization _deopt_save_states[1]; // number and size of these can vary per kernel static inline size_t hdr_size() { return sizeof(HSAILDeoptimizationInfo); } - inline jbyte * save_area_start() { + inline jbyte* save_area_start() { return (jbyte*) (this) + hdr_size(); } - inline HSAILDeoptimizationInfo(int numSlots, int bytesPerSaveArea, int dimX, JavaThread** donorThreads) { + inline HSAILDeoptimizationInfo(int numSlots, int bytesPerSaveArea, int dimX, HSAILAllocationInfo* allocInfo) { _notice_safepoints = &Hsail::_notice_safepoints; _deopt_occurred = 0; _deopt_next_index = 0; _num_slots = numSlots; _never_ran_array = NEW_C_HEAP_ARRAY(jboolean, dimX, mtInternal); memset(_never_ran_array, 0, dimX * sizeof(jboolean)); - _donor_threads = donorThreads; + _alloc_info = allocInfo; _deopt_span = sizeof(HSAILKernelDeoptimization) + sizeof(HSAILFrame) + bytesPerSaveArea; if (TraceGPUInteraction) { tty->print_cr("HSAILDeoptimizationInfo allocated, %d slots of size %d, total size = 0x%lx bytes", _num_slots, _deopt_span, (_num_slots * _deopt_span + sizeof(HSAILDeoptimizationInfo))); @@ -102,21 +140,25 @@ return _deopt_occurred; } inline jint num_deopts() { return _deopt_next_index; } - inline jboolean *never_ran_array() { return _never_ran_array; } + inline jboolean* never_ran_array() { 
return _never_ran_array; } inline jint num_slots() {return _num_slots;} - inline HSAILKernelDeoptimization * get_deopt_save_state(int slot) { + inline HSAILKernelDeoptimization* get_deopt_save_state(int slot) { // use _deopt_span to index into _deopt_states - return (HSAILKernelDeoptimization *) (save_area_start() + _deopt_span * slot); + return (HSAILKernelDeoptimization*) (save_area_start() + _deopt_span * slot); } - void * operator new (size_t hdrSize, int numSlots, int bytesPerSaveArea) { + void setCurTlabInfos(HSAILTlabInfo** ptlabInfos) { + _cur_tlab_info = ptlabInfos; + } + + void* operator new (size_t hdrSize, int numSlots, int bytesPerSaveArea) { assert(hdrSize <= hdr_size(), ""); size_t totalSizeBytes = hdr_size() + numSlots * (sizeof(HSAILKernelDeoptimization) + sizeof(HSAILFrame) + bytesPerSaveArea); return NEW_C_HEAP_ARRAY(char, totalSizeBytes, mtInternal); } - void operator delete (void *ptr) { + void operator delete (void* ptr) { FREE_C_HEAP_ARRAY(char, ptr, mtInternal); } }; @@ -126,21 +168,16 @@ static JNINativeMethod HSAIL_methods[]; // static native boolean initialize(); - JNIEXPORT static jboolean initialize(JNIEnv *env, jclass); + JNIEXPORT static jboolean initialize(JNIEnv* env, jclass); // static native long generateKernel(byte[] targetCode, String name); - JNIEXPORT static jlong generate_kernel(JNIEnv *env, jclass, jbyteArray code_handle, jstring name_handle); + JNIEXPORT static jlong generate_kernel(JNIEnv* env, jclass, jbyteArray code_handle, jstring name_handle); // static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args); - JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv *env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args, jobject oopsSave, + JNIEXPORT static jboolean execute_kernel_void_1d(JNIEnv* env, jclass, jobject hotspotInstalledCode, jint dimX, jobject args, jobject oopsSave, jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array); - // static native void 
getThreadPointers(Object[] donorThreads, long[] threadPointersOut); - JNIEXPORT static void get_thread_pointers(JNIEnv *env, jclass, jobject donor_threads_handle, jobject thread_ptrs_handle); - - static void getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, size_t tlabMinHsail); - - static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod *nm, jobject oopsSave, + static jboolean execute_kernel_void_1d_internal(address kernel, int dimX, jobject args, methodHandle& mh, nmethod* nm, jobject oopsSave, jobject donorThreads, int allocBytesPerWorkitem, jobject oop_map_array, TRAPS); static void register_heap(); @@ -165,7 +202,7 @@ private: typedef void* (*okra_create_context_func_t)(); - typedef void* (*okra_create_kernel_func_t)(void*, unsigned char *, const char *); + typedef void* (*okra_create_kernel_func_t)(void*, unsigned char*, const char*); typedef bool (*okra_push_object_func_t)(void*, void*); typedef bool (*okra_push_boolean_func_t)(void*, jboolean); typedef bool (*okra_push_byte_func_t)(void*, jbyte); @@ -197,4 +234,4 @@ // true if safepoints are activated static jint _notice_safepoints; }; -#endif // GPU_HSAIL_HPP +#endif // GPU_HSAIL_VM_GPU_HSAIL_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/gpu_hsail_Frame.hpp --- a/src/gpu/hsail/vm/gpu_hsail_Frame.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/gpu_hsail_Frame.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -22,8 +22,8 @@ * */ -#ifndef GPU_HSAIL_FRAME_HPP -#define GPU_HSAIL_FRAME_HPP +#ifndef GPU_HSAIL_VM_GPU_HSAIL_FRAME_HPP +#define GPU_HSAIL_VM_GPU_HSAIL_FRAME_HPP #include "graal/graalEnv.hpp" #include "code/debugInfo.hpp" @@ -43,31 +43,31 @@ jint num_s_regs() {return _num_s_regs; } jint num_d_regs() {return _num_d_regs; } jint num_stack_slots() {return _num_stack_slots; } - jbyte * data_start() {return (jbyte *) this + sizeof(*this); } + jbyte* data_start() {return (jbyte*) this + sizeof(*this); } jlong get_d_reg(int idx) { int 
ofst = num_s_regs() * 4 + idx * 8; - return(*(jlong *) (data_start() + ofst)); + return(*(jlong*) (data_start() + ofst)); } jint get_s_reg(int idx) { int ofst = idx * 4; - return(*(jint *) (data_start() + ofst)); + return(*(jint*) (data_start() + ofst)); } void put_d_reg(int idx, jlong val) { int ofst = num_s_regs() * 4 + idx * 8; - (*(jlong *) (data_start() + ofst)) = val; + (*(jlong*) (data_start() + ofst)) = val; } jint get_stackslot32(int stackOffset) { int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset; - return(*(jint *) (data_start() + ofst)); + return(*(jint*) (data_start() + ofst)); } jlong get_stackslot64(int stackOffset) { int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset; - return(*(jlong *) (data_start() + ofst)); + return(*(jlong*) (data_start() + ofst)); } void put_stackslot64(int stackOffset, jlong val) { int ofst = num_s_regs() * 4 + num_d_regs() * 8 + stackOffset; - (*(jlong *) (data_start() + ofst)) = val; + (*(jlong*) (data_start() + ofst)) = val; } }; -#endif // GPU_HSAIL_FRAME_HPP +#endif // GPU_HSAIL_VM_GPU_HSAIL_FRAME_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/gpu_hsail_Tlab.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP +#define GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP + +#include "graal/graalEnv.hpp" +#include "code/debugInfo.hpp" +#include "code/location.hpp" +#include "gpu_hsail.hpp" + +class HSAILAllocationInfo; + +class HSAILTlabInfo VALUE_OBJ_CLASS_SPEC { + friend class VMStructs; +public: + // uses only the necessary fields from a full TLAB + HeapWord* _start; + HeapWord* _top; + HeapWord* _end; + HeapWord* _last_good_top; // top to fall back to when _top has run past _end (see postKernelCleanup) + HeapWord* _original_top; // value of _top when initialize() was called + JavaThread* _donor_thread; // donor thread associated with this tlabInfo + HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo + + // Accessors + HeapWord* start() { return _start; } + HeapWord* top() { return _top; } + HeapWord* end() { return _end; } + HeapWord* last_good_top() { return _last_good_top; } + HeapWord* original_top() { return _original_top; } + void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) { + _start = start; + _top = _original_top = _last_good_top = top; // nothing allocated yet, so top is also the last good top + _end = end; + _donor_thread = donorThread; + _alloc_info = allocInfo; + } +}; + +// Allocation state for one kernel dispatch: the donor threads, one current HSAILTlabInfo per donor thread, and a pool of further HSAILTlabInfo records. +class HSAILAllocationInfo : public CHeapObj { + friend class VMStructs; +private: + JavaThread** donorThreads; // C-heap array of _num_donor_threads entries + jint _num_donor_threads; + size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() + HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per donor_thread + HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos + HSAILTlabInfo* _tlab_infos_pool_next; // where next will be
allocated from + HSAILTlabInfo* _tlab_infos_pool_end; // one past the last tlab_info in the pool + +public: + HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) { + // fill in the donorThreads array + objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj); + _num_donor_threads = donorThreadObjects->length(); + guarantee(_num_donor_threads > 0, "need at least one donor thread"); + donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal); + for (int i = 0; i < _num_donor_threads; i++) { + donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); + } + + // Compute max_tlab_infos based on amount of free heap space + size_t max_tlab_infos; + { + JavaThread* donorThread = donorThreads[0]; + ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + size_t new_tlab_size = tlab->compute_size(0); + size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread); // NOTE(review): this is in bytes while new_tlab_size looks like a word count - confirm the units of the division below + if (new_tlab_size != 0) { + max_tlab_infos = MAX2((size_t)_num_donor_threads, MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads))); // the pool must at least hold the per-donor slots filled below + } else { + max_tlab_infos = 8 * _num_donor_threads; // an arbitrary multiple + } + if (TraceGPUInteraction) { + tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos); + } + } + + _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal); + _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal); + _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads]; + _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos]; + _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes(); + + // we will fill the first N tlabInfos from the donor threads + for (int i = 0; i < _num_donor_threads; i++) { + JavaThread* donorThread = donorThreads[i]; + ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + if (TraceGPUInteraction) { +
tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); + printTlabInfoFromThread(tlab); + } + + // Here we try to get a new tlab if current one is null. Note: + // eventually we may want to test if the size is too small based + // on some heuristic where we see how much this kernel tends to + // allocate, but for now we can just let it overflow and let the + // GPU allocate new tlabs. Actually, if we can't prime a tlab + // here, it might make sense to do a gc now rather than to start + // the kernel and have it deoptimize. How to do that? + if (tlab->end() == NULL) { + bool success = getNewTlabForDonorThread(tlab, i); + if (TraceGPUInteraction) { + if (success) { + tty->print("donorThread %d, refilled tlab, -> ", i); + printTlabInfoFromThread(tlab); + } else { + tty->print("donorThread %d, could not refill tlab, left as ", i); + printTlabInfoFromThread(tlab); + } + } + } + + // extract the necessary tlab fields into a TlabInfo record + HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i]; + _cur_tlab_infos[i] = pTlabInfo; + pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this); + } + } + + ~HSAILAllocationInfo() { + FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal); + FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal); + FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal); + } + + void postKernelCleanup() { + // go thru all the tlabInfos, fix up any tlab tops that overflowed + // complete the tlabs if they overflowed + // update the donor threads tlabs when appropriate + bool anyOverflows = false; + size_t bytesAllocated = 0; + // if there was an overflow in allocating tlabInfos, correct it here + if (_tlab_infos_pool_next > _tlab_infos_pool_end) { + if (TraceGPUInteraction) { + int overflowAmount = _tlab_infos_pool_next - _tlab_infos_pool_end; + tty->print_cr("tlabInfo allocation overflowed by %d units", overflowAmount); + } + _tlab_infos_pool_next = _tlab_infos_pool_end; + } +
for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) { + if (TraceGPUInteraction) { + tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, + tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top()); + } + JavaThread* donorThread = tlabInfo->_donor_thread; + ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); + bool overflowed = false; + // a tlabInfo has NULL fields when we could not prime it on entry + // or could not get a tlab from the gpu; such a tlabInfo is ignored here + if (tlabInfo->start() != NULL) { + if (tlabInfo->top() > tlabInfo->end()) { + anyOverflows = true; + overflowed = true; + if (TraceGPUInteraction) { + long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); + tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top()); + } + tlabInfo->_top = tlabInfo->last_good_top(); + } + + // fill the donor thread tlab with the tlabInfo information + // we do this even if it will get overwritten by a later tlabinfo + // because it helps with tlab statistics for that donor thread + tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve()); + + // if there was an overflow, make it parsable with retire = true + if (overflowed) { + tlab->make_parsable(true); + } + + size_t delta = (long)(tlabInfo->top()) - (long)(tlabInfo->original_top()); + if (TraceGPUInteraction) { + tty->print_cr("%ld bytes were allocated by tlabInfo %p (start %p, top %p, end %p", delta, tlabInfo, + tlabInfo->start(), tlabInfo->top(), tlabInfo->end()); + } + bytesAllocated += delta; + } + } + if (TraceGPUInteraction) { + tty->print_cr("%ld total bytes were allocated in this kernel", bytesAllocated); + } + if (anyOverflows) { + // Hsail::kernelStats.incOverflows(); + } + } + + HSAILTlabInfo**
getCurTlabInfos() { + return _cur_tlab_infos; + } + +private: + // fill and retire old tlab and get a new one + // if we can't get one, no problem someone will eventually do a gc + bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) { + + tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) + + // get a size for a new tlab that is based on the desired_size + size_t new_tlab_size = tlab->compute_size(0); + if (new_tlab_size == 0) return false; + + HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size); + if (tlab_start == NULL) return false; + + // ..and clear it if required + if (ZeroTLAB) { + Copy::zero_to_words(tlab_start, new_tlab_size); + } + // and init the tlab pointers + tlab->fill(tlab_start, tlab_start, new_tlab_size); + return true; + } + + void printTlabInfoFromThread (ThreadLocalAllocBuffer* tlab) { + HeapWord* start = tlab->start(); + HeapWord* top = tlab->top(); + HeapWord* end = tlab->end(); + // sizes are in bytes + size_t tlabFree = tlab->free() * HeapWordSize; + size_t tlabUsed = tlab->used() * HeapWordSize; + size_t tlabSize = tlabFree + tlabUsed; + double freePct = (tlabSize == 0 ? 0.0 : 100.0 * (double) tlabFree/(double) tlabSize); // an empty or null tlab would otherwise print 0/0 + tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); + } + +}; + +#endif // GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/hsailArgumentsBase.hpp --- a/src/gpu/hsail/vm/hsailArgumentsBase.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/hsailArgumentsBase.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -22,19 +22,16 @@ * */ -#ifndef BASE_ARGUMENTS_HSAIL_HPP -#define BASE_ARGUMENTS_HSAIL_HPP +#ifndef GPU_HSAIL_VM_HSAIL_ARGUMENTS_BASE_HPP +#define GPU_HSAIL_VM_HSAIL_ARGUMENTS_BASE_HPP #include "runtime/signature.hpp" -/*** - * Base class which iterates thru a signature and pulls from a - * objArrayOop of boxed values.
Used as base for HSAILKernelArguments - * and HSAILJavaCallArguments The derived classes specify how to push - * args onto their data structure - ***/ - +// Base class which iterates thru a signature and pulls from a +// objArrayOop of boxed values. Used as base for HSAILKernelArguments +// and HSAILJavaCallArguments The derived classes specify how to push +// args onto their data structure class HSAILArgumentsBase : public SignatureIterator { public: @@ -49,7 +46,7 @@ // number of parameters in the signature int _parameter_count; - Symbol * _signature; + Symbol* _signature; bool _is_static; // records first null parameter seen @@ -58,8 +55,8 @@ // Get next java argument oop next_arg(BasicType expectedType); - virtual char *argsBuilderName() = 0; - virtual void pushObject(void * obj) = 0; + virtual char* argsBuilderName() = 0; + virtual void pushObject(void* obj) = 0; virtual void pushBool(jboolean z) = 0; virtual void pushByte(jbyte b) = 0; virtual void pushDouble(jdouble d) = 0; @@ -67,7 +64,7 @@ virtual void pushInt(jint i) = 0; virtual void pushLong(jlong j) = 0; virtual void handleFinalIntParameter() = 0; - virtual void handleFinalObjParameter(void *obj) = 0; + virtual void handleFinalObjParameter(void* obj) = 0; virtual void pushTrailingArgs() = 0; void recordNullObjectParameter() { @@ -143,4 +140,4 @@ }; -#endif // BASE_ARGUMENTS_HSAIL_HPP +#endif // GPU_HSAIL_VM_HSAIL_ARGUMENTS_BASE_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/hsailJavaCallArguments.hpp --- a/src/gpu/hsail/vm/hsailJavaCallArguments.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/hsailJavaCallArguments.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -22,8 +22,8 @@ * */ -#ifndef JAVACALL_ARGUMENTS_HSAIL_HPP -#define JAVACALL_ARGUMENTS_HSAIL_HPP +#ifndef GPU_HSAIL_VM_HSAIL_JAVACALL_ARGUMENTS_HPP +#define GPU_HSAIL_VM_HSAIL_JAVACALL_ARGUMENTS_HPP #include "hsailArgumentsBase.hpp" #include "runtime/javaCalls.hpp" @@ -33,17 +33,17 @@ public: private: - // JavaCall Args to push into - 
JavaCallArguments *_javaArgs; + // JavaCall args to push into + JavaCallArguments* _javaArgs; int _workitemid; public: - HSAILJavaCallArguments(JavaCallArguments *javaArgs, int workitemid, Symbol* signature, objArrayOop args, bool is_static) : HSAILArgumentsBase(signature, args, is_static) { + HSAILJavaCallArguments(JavaCallArguments* javaArgs, int workitemid, Symbol* signature, objArrayOop args, bool is_static) : HSAILArgumentsBase(signature, args, is_static) { _javaArgs = javaArgs; _workitemid = workitemid; collectArgs(); } - virtual char *argsBuilderName() {return (char *)"HSAILJavaCallArguments";} - virtual void pushObject(void *obj) { _javaArgs->push_oop((oop) obj); } + virtual char* argsBuilderName() {return (char*)"HSAILJavaCallArguments";} + virtual void pushObject(void* obj) { _javaArgs->push_oop((oop) obj); } virtual void pushBool(jboolean z) { pushInt(z); } virtual void pushByte(jbyte b) { pushInt(b); } virtual void pushDouble(jdouble d) { _javaArgs->push_double(d); } @@ -64,7 +64,7 @@ // stream source array (already checked in the base class) so for // a javacall we need to extract the correct obj from it based on // the workitemid - virtual void handleFinalObjParameter(void *arg) { + virtual void handleFinalObjParameter(void* arg) { objArrayOop objArrayArg = (objArrayOop) arg; oop extractedObj = objArrayArg->obj_at(_workitemid); if (TraceGPUInteraction) { @@ -77,5 +77,5 @@ }; -#endif // JAVACALL_ARGUMENTS_HSAIL_HPP +#endif // GPU_HSAIL_VM_HSAIL_JAVACALL_ARGUMENTS_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/hsailKernelArguments.hpp --- a/src/gpu/hsail/vm/hsailKernelArguments.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/hsailKernelArguments.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -22,8 +22,8 @@ * */ -#ifndef KERNEL_ARGUMENTS_HSAIL_HPP -#define KERNEL_ARGUMENTS_HSAIL_HPP +#ifndef GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP +#define GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP #include "gpu_hsail.hpp" #include "runtime/signature.hpp" @@ 
-37,7 +37,7 @@ private: // Kernel to push into address _kernel; - void * _exceptionHolder; + void* _exceptionHolder; public: HSAILKernelArguments(address kernel, Symbol* signature, objArrayOop args, bool is_static, void* exceptionHolder) : HSAILArgumentsBase(signature, args, is_static) { @@ -45,8 +45,8 @@ _exceptionHolder = exceptionHolder; collectArgs(); } - virtual char *argsBuilderName() {return (char *)"HSAILKernelArguments";} - virtual void pushObject(void *obj) { + virtual char* argsBuilderName() {return (char*)"HSAILKernelArguments";} + virtual void pushObject(void* obj) { bool pushed = Hsail::_okra_push_object(_kernel, obj); assert(pushed == true, "arg push failed"); } @@ -98,9 +98,9 @@ // for kernel arguments, final obj parameter should be an object // stream source array (already checked in the base class) so here we just pass it - virtual void handleFinalObjParameter(void *arg) { + virtual void handleFinalObjParameter(void* arg) { pushObject(arg); } }; -#endif // KERNEL_ARGUMENTS_HSAIL_HPP +#endif // GPU_HSAIL_VM_HSAIL_KERNEL_ARGUMENTS_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/gpu/hsail/vm/vmStructs_hsail.hpp --- a/src/gpu/hsail/vm/vmStructs_hsail.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/gpu/hsail/vm/vmStructs_hsail.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -41,16 +41,32 @@ nonstatic_field(Hsail::HSAILKernelDeoptimization, _workitemid, jint) \ nonstatic_field(Hsail::HSAILKernelDeoptimization, _actionAndReason, jint) \ \ - nonstatic_field(Hsail::HSAILDeoptimizationInfo, _notice_safepoints, jint*) \ + nonstatic_field(Hsail::HSAILDeoptimizationInfo, _notice_safepoints, jint*) \ nonstatic_field(Hsail::HSAILDeoptimizationInfo, _deopt_occurred, jint) \ nonstatic_field(Hsail::HSAILDeoptimizationInfo, _deopt_next_index, jint) \ - nonstatic_field(Hsail::HSAILDeoptimizationInfo, _donor_threads, JavaThread**) \ - nonstatic_field(Hsail::HSAILDeoptimizationInfo, _never_ran_array, jboolean *) \ + nonstatic_field(Hsail::HSAILDeoptimizationInfo, _cur_tlab_info, 
HSAILTlabInfo**) \ + nonstatic_field(Hsail::HSAILDeoptimizationInfo, _alloc_info, HSAILAllocationInfo*) \ + nonstatic_field(Hsail::HSAILDeoptimizationInfo, _never_ran_array, jboolean*) \ + \ + nonstatic_field(HSAILAllocationInfo, _tlab_infos_pool_start, HSAILTlabInfo*) \ + nonstatic_field(HSAILAllocationInfo, _tlab_infos_pool_next, HSAILTlabInfo*) \ + nonstatic_field(HSAILAllocationInfo, _tlab_infos_pool_end, HSAILTlabInfo*) \ + nonstatic_field(HSAILAllocationInfo, _tlab_align_reserve_bytes, size_t) \ + \ + nonstatic_field(HSAILTlabInfo, _start, HeapWord*) \ + nonstatic_field(HSAILTlabInfo, _top, HeapWord*) \ + nonstatic_field(HSAILTlabInfo, _end, HeapWord*) \ + nonstatic_field(HSAILTlabInfo, _last_good_top, HeapWord*) \ + nonstatic_field(HSAILTlabInfo, _original_top, HeapWord*) \ + nonstatic_field(HSAILTlabInfo, _donor_thread, JavaThread*) \ + nonstatic_field(HSAILTlabInfo, _alloc_info, HSAILAllocationInfo*) \ -#define VM_TYPES_GPU_HSAIL(declare_type, declare_toplevel_type) \ +#define VM_TYPES_GPU_HSAIL(declare_type, declare_toplevel_type) \ declare_toplevel_type(HSAILFrame) \ declare_toplevel_type(HSAILFrame*) \ declare_toplevel_type(Hsail::HSAILKernelDeoptimization) \ + declare_toplevel_type(HSAILAllocationInfo) \ + declare_toplevel_type(HSAILTlabInfo) \ declare_toplevel_type(Hsail::HSAILDeoptimizationInfo) #endif // GPU_HSAIL_VM_VMSTRUCTS_HSAIL_HPP diff -r b6ab7e7fa0a5 -r 06eedda53e14 src/share/vm/gc_interface/collectedHeap.hpp --- a/src/share/vm/gc_interface/collectedHeap.hpp Tue Jun 10 19:08:33 2014 +0200 +++ b/src/share/vm/gc_interface/collectedHeap.hpp Tue Jun 10 22:36:26 2014 +0200 @@ -84,7 +84,7 @@ class CollectedHeap : public CHeapObj { friend class VMStructs; friend class IsGCActiveMark; // Block structured external access to _is_gc_active - friend class Hsail; // access to allocate_new_tlab + friend class HSAILAllocationInfo; // access to allocate_new_tlab #ifdef ASSERT static int _fire_out_of_memory_count;