# HG changeset patch # User Morris Meyer # Date 1380416772 14400 # Node ID d8659ad83fccbb64b007452b1dbdeb31e756b40b # Parent aeeab846e98c0b122ac31c11a2aadc3858efeb05 PTX single-threaded array store, Warp annotation diff -r aeeab846e98c -r d8659ad83fcc graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java --- a/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java Fri Sep 27 19:51:01 2013 +0200 +++ b/graal/com.oracle.graal.asm.ptx/src/com/oracle/graal/asm/ptx/PTXAssembler.java Sat Sep 28 21:06:12 2013 -0400 @@ -757,6 +757,6 @@ @Override public PTXAddress getPlaceholder() { - throw GraalInternalError.unimplemented("PTXAddress.getPlaceholder()"); + return null; } } diff -r aeeab846e98c -r d8659ad83fcc graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java --- a/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java Fri Sep 27 19:51:01 2013 +0200 +++ b/graal/com.oracle.graal.compiler.ptx.test/src/com/oracle/graal/compiler/ptx/test/ArrayPTXTest.java Sat Sep 28 21:06:12 2013 -0400 @@ -22,20 +22,22 @@ */ package com.oracle.graal.compiler.ptx.test; -import java.lang.reflect.Method; +import static com.oracle.graal.lir.ptx.Warp.ThreadDimension.*; -import org.junit.*; +import com.oracle.graal.lir.ptx.Warp; +import java.lang.reflect.Method; +import java.util.Arrays; +import org.junit.Test; public class ArrayPTXTest extends PTXTestBase { - @Ignore @Test public void testArray() { int[] arrayI = { - 1, 2, 3, 4, 5 + 1, 2, 3, 4, 5, 6, 7, 8, 9, }; - Integer resI = (Integer) invoke(compile("testArray1I"), arrayI, 3); - printReport("testArray1I: " + resI); + invoke(compile("testStoreArray1I"), arrayI, 2); + printReport("testStoreArray1I: " + Arrays.toString(arrayI)); // compile("testArray1J"); // compile("testArray1B"); // compile("testArray1S"); @@ -83,8 +85,8 @@ return array[i]; } - public static void testStoreArray1I(int[] array, int i, int val) { - 
array[i] = val; + public static void testStoreArray1I(int[] array, @Warp(dimension = X) int i) { + array[i] = 42; } public static void testStoreArray1B(byte[] array, int i, byte val) { @@ -111,7 +113,6 @@ // CheckStyle: stop system..print check System.out.println(message); // CheckStyle: resume system..print check - } public static void main(String[] args) { diff -r aeeab846e98c -r d8659ad83fcc graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java --- a/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java Fri Sep 27 19:51:01 2013 +0200 +++ b/graal/com.oracle.graal.compiler.ptx/src/com/oracle/graal/compiler/ptx/PTXLIRGenerator.java Sat Sep 28 21:06:12 2013 -0400 @@ -62,6 +62,9 @@ import com.oracle.graal.nodes.calc.ConvertNode.Op; import com.oracle.graal.nodes.java.*; +import java.lang.annotation.*; + + /** * This class implements the PTX specific portion of the LIR generator. */ @@ -132,22 +135,46 @@ public void emitPrologue() { // Need to emit .param directives based on incoming arguments and return value CallingConvention incomingArguments = cc; - int argCount = incomingArguments.getArgumentCount(); - // Additional argument for return value. - Variable[] params = new Variable[argCount + 1]; - for (int i = 0; i < argCount; i++) { - params[i] = (Variable) incomingArguments.getArgument(i); + Object returnObject = incomingArguments.getReturn(); + AllocatableValue[] params; + int argCount; + + if (returnObject == Value.ILLEGAL) { + params = incomingArguments.getArguments(); + } else { + argCount = incomingArguments.getArgumentCount(); + params = new Variable[argCount + 1]; + for (int i = 0; i < argCount; i++) { + params[i] = incomingArguments.getArgument(i); + } + params[argCount] = (Variable) returnObject; } - // Add the return value as the last parameter. 
- params[argCount] = (Variable) incomingArguments.getReturn(); append(new PTXParameterOp(params)); for (LocalNode local : graph.getNodes(LocalNode.class)) { Value param = params[local.index()]; + Annotation[] annos = graph.method().getParameterAnnotations()[local.index()]; + Warp warpAnnotation = null; + + if (annos != null) { + for (int a = 0; a < annos.length; a++) { + if (annos[a].annotationType().equals(Warp.class)) { + warpAnnotation = (Warp) annos[a]; + } + } + } + if (warpAnnotation != null) { + // setResult(local, emitWarpParam(param.getKind(), warpAnnotation)); + } setResult(local, emitLoadParam(param.getKind(), param, null)); } } + public Variable emitWarpParam(Kind kind, @SuppressWarnings("unused") Warp annotation) { + Variable result = newVariable(kind); + return result; + } + @Override public Variable emitMove(Value input) { Variable result = newVariable(input.getKind()); @@ -168,38 +195,47 @@ public PTXAddressValue emitAddress(Value base, long displacement, Value index, int scale) { AllocatableValue baseRegister; long finalDisp = displacement; + if (isConstant(base)) { if (asConstant(base).isNull()) { baseRegister = Value.ILLEGAL; - } else if (asConstant(base).getKind() != Kind.Object) { + } else if (asConstant(base).getKind() != Kind.Object && !runtime.needsDataPatch(asConstant(base))) { finalDisp += asConstant(base).asLong(); baseRegister = Value.ILLEGAL; } else { baseRegister = load(base); } + } else if (base.equals(Value.ILLEGAL)) { + baseRegister = Value.ILLEGAL; } else { baseRegister = asAllocatable(base); } - @SuppressWarnings("unused") Value indexRegister; - if (!index.equals(Value.ILLEGAL) && scale != 0) { + if (!index.equals(Value.ILLEGAL)) { if (isConstant(index)) { finalDisp += asConstant(index).asLong() * scale; - indexRegister = Value.ILLEGAL; } else { + Value convertedIndex; + Value indexRegister; + + convertedIndex = emitConvert(Op.I2L, index); if (scale != 1) { - Variable longIndex = emitConvert(Op.I2L, index); if 
(CodeUtil.isPowerOf2(scale)) { - indexRegister = emitShl(longIndex, Constant.forLong(CodeUtil.log2(scale))); + indexRegister = emitShl(convertedIndex, Constant.forInt(CodeUtil.log2(scale))); } else { - indexRegister = emitMul(longIndex, Constant.forLong(scale)); + indexRegister = emitMul(convertedIndex, Constant.forInt(scale)); } } else { - indexRegister = asAllocatable(index); + indexRegister = convertedIndex; + } + if (baseRegister.equals(Value.ILLEGAL)) { + baseRegister = asAllocatable(indexRegister); + } else { + Variable longBaseRegister = newVariable(Kind.Long); + emitMove(longBaseRegister, baseRegister); + baseRegister = emitAdd(longBaseRegister, indexRegister); } } - } else { - indexRegister = Value.ILLEGAL; } return new PTXAddressValue(target().wordKind, baseRegister, finalDisp); @@ -504,7 +540,7 @@ append(new Op2Stack(ISHL, result, a, loadNonConst(b))); break; case Long: - append(new Op1Stack(LSHL, result, loadNonConst(b))); + append(new Op2Stack(LSHL, result, a, loadNonConst(b))); break; default: throw GraalInternalError.shouldNotReachHere(); @@ -520,7 +556,7 @@ append(new Op2Stack(ISHR, result, a, loadNonConst(b))); break; case Long: - append(new Op1Stack(LSHR, result, loadNonConst(b))); + append(new Op2Stack(LSHR, result, a, loadNonConst(b))); break; default: throw GraalInternalError.shouldNotReachHere(); diff -r aeeab846e98c -r d8659ad83fcc graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXArithmetic.java --- a/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXArithmetic.java Fri Sep 27 19:51:01 2013 +0200 +++ b/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/PTXArithmetic.java Sat Sep 28 21:06:12 2013 -0400 @@ -75,7 +75,7 @@ case D2I: case D2L: case D2F: - break; // cvt handles the move + break; // cvt handles the move default: PTXMove.move(tasm, masm, result, x); } @@ -422,11 +422,73 @@ } private static void verifyKind(PTXArithmetic opcode, Value result, Value x, Value y) { - if (((opcode.name().startsWith("I") 
&& result.getKind() == Kind.Int && x.getKind().getStackKind() == Kind.Int && y.getKind().getStackKind() == Kind.Int) - || (opcode.name().startsWith("L") && result.getKind() == Kind.Long && x.getKind() == Kind.Long && y.getKind() == Kind.Long) - || (opcode.name().startsWith("F") && result.getKind() == Kind.Float && x.getKind() == Kind.Float && y.getKind() == Kind.Float) - || (opcode.name().startsWith("D") && result.getKind() == Kind.Double && x.getKind() == Kind.Double && y.getKind() == Kind.Double)) == false) { - throw GraalInternalError.shouldNotReachHere("opcode: " + opcode.name() + " x: " + x.getKind() + " y: " + y.getKind()); + Kind rk; + Kind xk; + Kind yk; + Kind xsk; + Kind ysk; + + switch (opcode) { + case IADD: + case ISUB: + case IMUL: + case IDIV: + case IREM: + case IAND: + case IOR: + case IXOR: + case ISHL: + case ISHR: + case IUSHR: + rk = result.getKind(); + xsk = x.getKind().getStackKind(); + ysk = y.getKind().getStackKind(); + assert rk == Kind.Int && xsk == Kind.Int && ysk == Kind.Int; + break; + case LADD: + case LSUB: + case LMUL: + case LDIV: + case LREM: + case LAND: + case LOR: + case LXOR: + rk = result.getKind(); + xk = x.getKind(); + yk = y.getKind(); + assert rk == Kind.Long && xk == Kind.Long && yk == Kind.Long; + break; + case LSHL: + case LSHR: + case LUSHR: + rk = result.getKind(); + xk = x.getKind(); + yk = y.getKind(); + assert rk == Kind.Long && xk == Kind.Long && (yk == Kind.Int || yk == Kind.Long); + break; + case FADD: + case FSUB: + case FMUL: + case FDIV: + case FREM: + rk = result.getKind(); + xk = x.getKind(); + yk = y.getKind(); + assert rk == Kind.Float && xk == Kind.Float && yk == Kind.Float; + break; + case DADD: + case DSUB: + case DMUL: + case DDIV: + case DREM: + rk = result.getKind(); + xk = x.getKind(); + yk = y.getKind(); + assert rk == Kind.Double && xk == Kind.Double && yk == Kind.Double : + "opcode=" + opcode + ", result kind=" + rk + ", x kind=" + xk + ", y kind=" + yk; + break; + default: + throw 
GraalInternalError.shouldNotReachHere("missing: " + opcode); } } } diff -r aeeab846e98c -r d8659ad83fcc graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/Warp.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graal/com.oracle.graal.lir.ptx/src/com/oracle/graal/lir/ptx/Warp.java Sat Sep 28 21:06:12 2013 -0400 @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package com.oracle.graal.lir.ptx; + +import static com.oracle.graal.lir.ptx.Warp.ThreadDimension.*; + +import java.lang.annotation.*; + +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.PARAMETER}) +public @interface Warp { + public enum ThreadDimension { + X, + Y, + Z + } + + String value() default ""; + + ThreadDimension dimension() default X; +} diff -r aeeab846e98c -r d8659ad83fcc src/gpu/ptx/vm/gpu_ptx.cpp --- a/src/gpu/ptx/vm/gpu_ptx.cpp Fri Sep 27 19:51:01 2013 +0200 +++ b/src/gpu/ptx/vm/gpu_ptx.cpp Sat Sep 28 21:06:12 2013 -0400 @@ -282,7 +282,7 @@ // Get the result. TODO: Move this code to get_return_oop() BasicType return_type = ptxka.get_ret_type(); switch (return_type) { - case T_INT : + case T_INT: { int return_val; status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_INT_BYTE_SIZE); @@ -293,7 +293,7 @@ ret.set_jint(return_val); } break; - case T_LONG : + case T_LONG: { long return_val; status = gpu::Ptx::_cuda_cu_memcpy_dtoh(&return_val, ptxka._return_value_ptr, T_LONG_BYTE_SIZE); @@ -304,10 +304,14 @@ ret.set_jlong(return_val); } break; + case T_VOID: + break; default: - tty->print_cr("[CUDA] TODO *** Unhandled return type"); + tty->print_cr("[CUDA] TODO *** Unhandled return type: %d", return_type); } + // handle post-invocation object and array arguments + ptxka.reiterate(); // Free device memory allocated for result status = gpu::Ptx::_cuda_cu_memfree(ptxka._return_value_ptr); diff -r aeeab846e98c -r d8659ad83fcc src/gpu/ptx/vm/ptxKernelArguments.cpp --- a/src/gpu/ptx/vm/ptxKernelArguments.cpp Fri Sep 27 19:51:01 2013 +0200 +++ b/src/gpu/ptx/vm/ptxKernelArguments.cpp Sat Sep 28 21:06:12 2013 -0400 @@ -32,12 +32,18 @@ // Get next java argument oop PTXKernelArguments::next_arg(BasicType expectedType) { assert(_index < _args->length(), "out of bounds"); - oop arg=((objArrayOop) (_args))->obj_at(_index++); - assert(expectedType == T_OBJECT || java_lang_boxing_object::is_instance(arg, expectedType), "arg type 
mismatch"); + + oop arg = ((objArrayOop) (_args))->obj_at(_index++); + assert(expectedType == T_OBJECT || + java_lang_boxing_object::is_instance(arg, expectedType), "arg type mismatch"); + return arg; } -void PTXKernelArguments::do_int() { +void PTXKernelArguments::do_int() { + if (is_after_invocation()) { + return; + } // If the parameter is a return value, if (is_return_type()) { // Allocate device memory for T_INT return value pointer on device. Size in bytes @@ -50,8 +56,7 @@ // Push _return_value_ptr to _kernelBuffer *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr; _bufferOffset += sizeof(_return_value_ptr); - } - else { + } else { // Get the next java argument and its value which should be a T_INT oop arg = next_arg(T_INT); // Copy the java argument value to kernelArgBuffer @@ -67,7 +72,10 @@ return; } -void PTXKernelArguments::do_long() { +void PTXKernelArguments::do_long() { + if (is_after_invocation()) { + return; + } // If the parameter is a return value, if (is_return_type()) { // Allocate device memory for T_LONG return value pointer on device. Size in bytes @@ -80,8 +88,7 @@ // Push _return_value_ptr to _kernelBuffer *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr; _bufferOffset += sizeof(_return_value_ptr); - } - else { + } else { // Get the next java argument and its value which should be a T_LONG oop arg = next_arg(T_LONG); // Copy the java argument value to kernelArgBuffer @@ -97,34 +104,81 @@ return; } -void PTXKernelArguments::do_byte() { - // If the parameter is a return value, - if (is_return_type()) { - // Allocate device memory for T_BYTE return value pointer on device. Size in bytes - int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_BYTE_SIZE); +void PTXKernelArguments::do_byte() { + if (is_after_invocation()) { + return; + } + // If the parameter is a return value, + if (is_return_type()) { + // Allocate device memory for T_BYTE return value pointer on device. 
Size in bytes + int status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, T_BYTE_SIZE); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; + } + // Push _return_value_ptr to _kernelBuffer + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr; + _bufferOffset += sizeof(_return_value_ptr); + } else { + // Get the next java argument and its value which should be a T_BYTE + oop arg = next_arg(T_BYTE); + // Copy the java argument value to kernelArgBuffer + jvalue val; + if (java_lang_boxing_object::get_value(arg, &val) != T_BYTE) { + tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE"); + _success = false; + return; + } + *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b; + _bufferOffset += sizeof(val.b); + } + return; +} + +void PTXKernelArguments::do_array(int begin, int end) { + gpu::Ptx::CUdeviceptr _array_ptr; + int status; + + // Get the next java argument and its value which should be a T_ARRAY + oop arg = next_arg(T_OBJECT); + int array_size = arg->size() * HeapWordSize; + + if (is_after_invocation()) { + _array_ptr = *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]); + status = gpu::Ptx::_cuda_cu_memcpy_dtoh(arg, _array_ptr, array_size); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy array argument to host", status); + _success = false; + return; + } else { + // tty->print_cr("device: %x host: %x size: %d", _array_ptr, arg, array_size); + } + return; + } + // Allocate device memory for T_ARRAY return value pointer on device. 
Size in bytes + status = gpu::Ptx::_cuda_cu_memalloc(&_return_value_ptr, array_size); if (status != GRAAL_CUDA_SUCCESS) { - tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); - _success = false; - return; + tty->print_cr("[CUDA] *** Error (%d) Failed to allocate memory for return value pointer on device", status); + _success = false; + return; + } + status = gpu::Ptx::_cuda_cu_memcpy_htod(_return_value_ptr, arg, array_size); + if (status != GRAAL_CUDA_SUCCESS) { + tty->print_cr("[CUDA] *** Error (%d) Failed to copy array to device argument", status); + _success = false; + return; + } else { + // tty->print_cr("host: %x device: %x size: %d", arg, _return_value_ptr, array_size); } // Push _return_value_ptr to _kernelBuffer *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = _return_value_ptr; _bufferOffset += sizeof(_return_value_ptr); - } - else { - // Get the next java argument and its value which should be a T_BYTE - oop arg = next_arg(T_BYTE); - // Copy the java argument value to kernelArgBuffer - jvalue val; - if (java_lang_boxing_object::get_value(arg, &val) != T_BYTE) { - tty->print_cr("[CUDA] *** Error: Unexpected argument type; expecting T_BYTE"); - _success = false; - return; - } - *((gpu::Ptx::CUdeviceptr*) &_kernelArgBuffer[_bufferOffset]) = val.b; - _bufferOffset += sizeof(val.b); - } - return; + return; +} + +void PTXKernelArguments::do_void() { + return; } // TODO implement other do_* diff -r aeeab846e98c -r d8659ad83fcc src/gpu/ptx/vm/ptxKernelArguments.hpp --- a/src/gpu/ptx/vm/ptxKernelArguments.hpp Fri Sep 27 19:51:01 2013 +0200 +++ b/src/gpu/ptx/vm/ptxKernelArguments.hpp Sat Sep 28 21:06:12 2013 -0400 @@ -31,6 +31,7 @@ #define T_BYTE_SIZE 1 #define T_INT_BYTE_SIZE 4 #define T_LONG_BYTE_SIZE 8 +#define T_ARRAY_BYTE_SIZE 8 class PTXKernelArguments : public SignatureIterator { public: @@ -46,6 +47,8 @@ int _index; // Flag to indicate successful creation of kernel argument buffer bool 
_success; + + bool _afterInvoocation; // Get next java argument oop next_arg(BasicType expectedType); @@ -74,6 +77,17 @@ return _bufferOffset; } + void reiterate() { + _afterInvoocation = true; + _bufferOffset = 0; + _index = 0; + iterate(); + } + + inline bool is_after_invocation() { + return _afterInvoocation; + } + // Get the return oop value oop get_return_oop(); @@ -86,44 +100,40 @@ void do_byte(); void do_int(); void do_long(); + void do_array(int begin, int end); + void do_void(); inline void do_bool() { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_bool:NYI"); } inline void do_char() { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_char:NYI"); } inline void do_short() { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_short:NYI"); } inline void do_float() { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_float:NYI"); } inline void do_double() { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_double:NYI"); } inline void do_object() { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_object:NYI"); } + inline void do_object(int begin, int end) { /* TODO : To be implemented */ - guarantee(false, "NYI"); + guarantee(false, "do_object(II):NYI"); } - inline void do_array(int begin, int end) { - /* TODO : To be implemented */ - guarantee(false, "NYI"); - } - inline void do_void() { - /* TODO : To be implemented */ - guarantee(false, "NYI"); - } + }; #endif // KERNEL_ARGUMENTS_HPP