Mercurial > hg > graal-jvmci-8
changeset 23018:ab2353694ea7 jdk8u60-b16
Merge
author | amurillo |
---|---|
date | Tue, 19 May 2015 09:16:26 -0700 |
parents | 75b0573e0a5d (current diff) 4fdda95243c4 (diff) |
children | a20bd9718799 |
files | |
diffstat | 8 files changed, 234 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
--- a/.hgtags Thu May 14 20:13:34 2015 -0700 +++ b/.hgtags Tue May 19 09:16:26 2015 -0700 @@ -641,3 +641,4 @@ c9f8b7319d0a5ab07310cf53507642a8fd91589b jdk8u60-b14 4187dc92e90b16b4097627b8af4f5e6e63f3b497 hs25.60-b15 b99f1bf208f385277b03a985d35b6614b4095f3e jdk8u60-b15 +f5800068c61d0627c14e99836e9ce5cf0ef00075 hs25.60-b16
--- a/make/hotspot_version Thu May 14 20:13:34 2015 -0700 +++ b/make/hotspot_version Tue May 19 09:16:26 2015 -0700 @@ -35,7 +35,7 @@ HS_MAJOR_VER=25 HS_MINOR_VER=60 -HS_BUILD_NUMBER=15 +HS_BUILD_NUMBER=16 JDK_MAJOR_VER=1 JDK_MINOR_VER=8
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp Thu May 14 20:13:34 2015 -0700 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Tue May 19 09:16:26 2015 -0700 @@ -6690,7 +6690,7 @@ subl(cnt2, stride2); jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); // clean upper bits of YMM registers - vzeroupper(); + vpxor(vec1, vec1); // compare wide vectors tail bind(COMPARE_WIDE_TAIL); @@ -6705,7 +6705,7 @@ // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. bind(VECTOR_NOT_EQUAL); // clean upper bits of YMM registers - vzeroupper(); + vpxor(vec1, vec1); lea(str1, Address(str1, result, scale)); lea(str2, Address(str2, result, scale)); jmp(COMPARE_16_CHARS); @@ -6964,7 +6964,8 @@ bind(DONE); if (UseAVX >= 2) { // clean upper bits of YMM registers - vzeroupper(); + vpxor(vec1, vec1); + vpxor(vec2, vec2); } } @@ -7098,7 +7099,8 @@ BIND(L_check_fill_8_bytes); // clean upper bits of YMM registers - vzeroupper(); + movdl(xtmp, value); + pshufd(xtmp, xtmp, 0); } else { // Fill 32-byte chunks pshufd(xtmp, xtmp, 0); @@ -7261,7 +7263,11 @@ bind(L_copy_16_chars_exit); if (UseAVX >= 2) { // clean upper bits of YMM registers - vzeroupper(); + vpxor(tmp2Reg, tmp2Reg); + vpxor(tmp3Reg, tmp3Reg); + vpxor(tmp4Reg, tmp4Reg); + movdl(tmp1Reg, tmp5); + pshufd(tmp1Reg, tmp1Reg, 0); } subptr(len, 8); jccb(Assembler::greater, L_copy_8_chars_exit);
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Thu May 14 20:13:34 2015 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue May 19 09:16:26 2015 -0700 @@ -837,7 +837,8 @@ if (UseUnalignedLoadStores && (UseAVX >= 2)) { // clean upper bits of YMM registers - __ vzeroupper(); + __ vpxor(xmm0, xmm0); + __ vpxor(xmm1, xmm1); } __ addl(qword_count, 8); __ jccb(Assembler::zero, L_exit);
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Thu May 14 20:13:34 2015 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue May 19 09:16:26 2015 -0700 @@ -1328,7 +1328,8 @@ __ BIND(L_end); if (UseAVX >= 2) { // clean upper bits of YMM registers - __ vzeroupper(); + __ vpxor(xmm0, xmm0); + __ vpxor(xmm1, xmm1); } } else { // Copy 32-bytes per iteration @@ -1405,7 +1406,8 @@ __ BIND(L_end); if (UseAVX >= 2) { // clean upper bits of YMM registers - __ vzeroupper(); + __ vpxor(xmm0, xmm0); + __ vpxor(xmm1, xmm1); } } else { // Copy 32-bytes per iteration
--- a/src/share/vm/opto/superword.cpp Thu May 14 20:13:34 2015 -0700 +++ b/src/share/vm/opto/superword.cpp Tue May 19 09:16:26 2015 -0700 @@ -232,6 +232,13 @@ // if unaligned memory access is not allowed because number of // iterations in pre-loop will be not enough to align it. create_pack = false; + } else { + SWPointer p2(best_align_to_mem_ref, this); + if (align_to_ref_p.invar() != p2.invar()) { + // Do not vectorize memory accesses with different invariants + // if unaligned memory accesses are not allowed. + create_pack = false; + } } } } else { @@ -445,29 +452,57 @@ int preloop_stride = pre_end->stride_con(); int span = preloop_stride * p.scale_in_bytes(); - - // Stride one accesses are alignable. - if (ABS(span) == p.memory_size()) + int mem_size = p.memory_size(); + int offset = p.offset_in_bytes(); + // Stride one accesses are alignable if offset is aligned to memory operation size. + // Offset can be unaligned when UseUnalignedAccesses is used. + if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) { return true; - - // If initial offset from start of object is computable, - // compute alignment within the vector. + } + // If the initial offset from start of the object is computable, + // check if the pre-loop can align the final offset accordingly. + // + // In other words: Can we find an i such that the offset + // after i pre-loop iterations is aligned to vw? + // (init_offset + pre_loop) % vw == 0 (1) + // where + // pre_loop = i * span + // is the number of bytes added to the offset by i pre-loop iterations. + // + // For this to hold we need pre_loop to increase init_offset by + // pre_loop = vw - (init_offset % vw) + // + // This is only possible if pre_loop is divisible by span because each + // pre-loop iteration increases the initial offset by 'span' bytes: + // (vw - (init_offset % vw)) % span == 0 + // int vw = vector_width_in_bytes(p.mem()); assert(vw > 1, "sanity"); - if (vw % span == 0) { - Node* init_nd = pre_end->init_trip(); - if (init_nd->is_Con() && p.invar() == NULL) { - int init = init_nd->bottom_type()->is_int()->get_con(); - - int init_offset = init * p.scale_in_bytes() + p.offset_in_bytes(); - assert(init_offset >= 0, "positive offset from object start"); - + Node* init_nd = pre_end->init_trip(); + if (init_nd->is_Con() && p.invar() == NULL) { + int init = init_nd->bottom_type()->is_int()->get_con(); + int init_offset = init * p.scale_in_bytes() + offset; + assert(init_offset >= 0, "positive offset from object start"); + if (vw % span == 0) { + // If vm is a multiple of span, we use formula (1). if (span > 0) { return (vw - (init_offset % vw)) % span == 0; } else { assert(span < 0, "nonzero stride * scale"); return (init_offset % vw) % -span == 0; } + } else if (span % vw == 0) { + // If span is a multiple of vw, we can simplify formula (1) to: + // (init_offset + i * span) % vw == 0 + // => + // (init_offset % vw) + ((i * span) % vw) == 0 + // => + // init_offset % vw == 0 + // + // Because we add a multiple of vw to the initial offset, the final + // offset is a multiple of vw if and only if init_offset is a multiple. + // + return (init_offset % vw) == 0; } } return false; @@ -479,17 +514,23 @@ SWPointer align_to_ref_p(mem_ref, this); int offset = align_to_ref_p.offset_in_bytes(); int scale = align_to_ref_p.scale_in_bytes(); + int elt_size = align_to_ref_p.memory_size(); int vw = vector_width_in_bytes(mem_ref); assert(vw > 1, "sanity"); - int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; - // At least one iteration is executed in pre-loop by default. As result - // several iterations are needed to align memory operations in main-loop even - // if offset is 0. - int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); - int elt_size = align_to_ref_p.memory_size(); - assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), - err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); - int iv_adjustment = iv_adjustment_in_bytes/elt_size; + int iv_adjustment; + if (scale != 0) { + int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; + // At least one iteration is executed in pre-loop by default. As result + // several iterations are needed to align memory operations in main-loop even + // if offset is 0. + int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); + assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), + err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); + iv_adjustment = iv_adjustment_in_bytes/elt_size; + } else { + // This memory op is not dependent on iv (scale == 0) + iv_adjustment = 0; + } #ifndef PRODUCT if (TraceSuperWord) @@ -2247,6 +2288,11 @@ } // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant) Node* base = adr->in(AddPNode::Base); + // The base address should be loop invariant + if (!invariant(base)) { + assert(!valid(), "base address is loop variant"); + return; + } //unsafe reference could not be aligned appropriately without runtime checking if (base == NULL || base->bottom_type() == Type::TOP) { assert(!valid(), "unsafe access");
--- a/src/share/vm/opto/superword.hpp Thu May 14 20:13:34 2015 -0700 +++ b/src/share/vm/opto/superword.hpp Tue May 19 09:16:26 2015 -0700 @@ -41,7 +41,7 @@ // Exploiting SuperWord Level Parallelism with // Multimedia Instruction Sets // by -// Samuel Larsen and Saman Amarasighe +// Samuel Larsen and Saman Amarasinghe // MIT Laboratory for Computer Science // date // May 2000 @@ -432,7 +432,7 @@ Node* _base; // NULL if unsafe nonheap reference Node* _adr; // address pointer - jint _scale; // multipler for iv (in bytes), 0 if no loop iv + jint _scale; // multiplier for iv (in bytes), 0 if no loop iv jint _offset; // constant offset (in bytes) Node* _invar; // invariant offset (in bytes), NULL if none bool _negate_invar; // if true then use: (0 - _invar)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java Tue May 19 09:16:26 2015 -0700 @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +import com.oracle.java.testlibrary.*; +import sun.misc.Unsafe; + +/** + * @test + * @bug 8078497 + * @summary Tests correct alignment of vectors with loop invariant offset. + * @library /testlibrary + * @run main TestVectorizationWithInvariant + */ +public class TestVectorizationWithInvariant { + + private static Unsafe unsafe; + private static final long BYTE_ARRAY_OFFSET; + private static final long CHAR_ARRAY_OFFSET; + + static { + unsafe = Utils.getUnsafe(); + BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class); + CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class); + } + + public static void main(String[] args) throws Exception { + byte[] byte_array1 = new byte[1000]; + byte[] byte_array2 = new byte[1000]; + char[] char_array = new char[1000]; + + for (int i = 0; i < 20_000; ++i) { + copyByteToChar(byte_array1, byte_array2, char_array, 1); + copyCharToByte(char_array, byte_array1, 1); + copyCharToByteAligned(char_array, byte_array1); + copyCharToByteUnaligned(char_array, byte_array1); + } + } + + /* + * Copy multiple consecutive chars from a byte array to a given offset in a char array + * to trigger C2's superword optimization. The offset in the byte array is independent + * of the loop induction variable and can be set to an arbitrary value. It may then not + * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization + * should only be done in this case if unaligned memory accesses are allowed. + */ + public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) { + off = (int) BYTE_ARRAY_OFFSET + (off << 1); + byte[] src = src1; + for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0)); + unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2)); + unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4)); + unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6)); + unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8)); + unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10)); + unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12)); + unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14)); + + // Prevent loop invariant code motion of char read. + src = (src == src1) ? src2 : src1; + } + } + + /* + * Copy multiple consecutive chars from a char array to a given offset in a byte array + * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'. + */ + public static void copyCharToByte(char[] src, byte[] dst, int off) { + off = (int) BYTE_ARRAY_OFFSET + (off << 1); + for (int i = 0; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, off + 0, src[i + 0]); + unsafe.putChar(dst, off + 2, src[i + 1]); + unsafe.putChar(dst, off + 4, src[i + 2]); + unsafe.putChar(dst, off + 6, src[i + 3]); + unsafe.putChar(dst, off + 8, src[i + 4]); + unsafe.putChar(dst, off + 10, src[i + 5]); + unsafe.putChar(dst, off + 12, src[i + 6]); + unsafe.putChar(dst, off + 14, src[i + 7]); + } + } + + /* + * Variant of copyCharToByte with a constant destination array offset. + * The loop should always be vectorized because both the LoadUS and StoreC + * operations can be aligned. + */ + public static void copyCharToByteAligned(char[] src, byte[] dst) { + final int off = (int) BYTE_ARRAY_OFFSET; + for (int i = 8; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, off + 0, src[i + 0]); + unsafe.putChar(dst, off + 2, src[i + 1]); + unsafe.putChar(dst, off + 4, src[i + 2]); + unsafe.putChar(dst, off + 6, src[i + 3]); + unsafe.putChar(dst, off + 8, src[i + 4]); + unsafe.putChar(dst, off + 10, src[i + 5]); + unsafe.putChar(dst, off + 12, src[i + 6]); + unsafe.putChar(dst, off + 14, src[i + 7]); + } + } + + /* + * Variant of copyCharToByte with a constant destination array offset. The + * loop should only be vectorized if unaligned memory operations are allowed + * because not both the LoadUS and the StoreC can be aligned. + */ + public static void copyCharToByteUnaligned(char[] src, byte[] dst) { + final int off = (int) BYTE_ARRAY_OFFSET + 2; + for (int i = 0; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, off + 0, src[i + 0]); + unsafe.putChar(dst, off + 2, src[i + 1]); + unsafe.putChar(dst, off + 4, src[i + 2]); + unsafe.putChar(dst, off + 6, src[i + 3]); + unsafe.putChar(dst, off + 8, src[i + 4]); + unsafe.putChar(dst, off + 10, src[i + 5]); + unsafe.putChar(dst, off + 12, src[i + 6]); + unsafe.putChar(dst, off + 14, src[i + 7]); + } + } +}